In [2]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.metrics import TopKCategoricalAccuracy




In [3]:
# Load the training data
df = pd.read_csv('cefr_leveled_texts.csv')

In [4]:
df.head()

Unnamed: 0,text,label
0,Hi!\nI've been meaning to write for ages and f...,B2
1,﻿It was not so much how hard people found the ...,B2
2,Keith recently came back from a trip to Chicag...,B2
3,"The Griffith Observatory is a planetarium, and...",B2
4,-LRB- The Hollywood Reporter -RRB- It's offici...,B2


In [5]:
# Preprocess the text data
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['text'])
sequences = tokenizer.texts_to_sequences(df['text'])
data = pad_sequences(sequences, maxlen=200)

# Convert the categories to one-hot encoded vectors
lb = LabelBinarizer()
labels = lb.fit_transform(df['label'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

# Define the model
model = Sequential()
model.add(Embedding(5000, 64, input_length=200))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
model.add(LSTM(32, dropout=0.2, recurrent_dropout=0.2))  # another LSTM layer
model.add(Dense(32, activation='relu'))  # a Dense layer
model.add(Dense(len(lb.classes_), activation='softmax'))

# Compile the model with the top-2 accuracy metric
model.compile(loss='categorical_crossentropy', optimizer='adam', 
              metrics=['accuracy', TopKCategoricalAccuracy(k=2)])

# Train the model
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=128)

# Evaluate the model
loss, accuracy, top_2_accuracy = model.evaluate(X_test, y_test)
print(f'Loss: {loss}\nAccuracy: {accuracy}\nTop-2 Accuracy: {top_2_accuracy}')



Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss: 1.2038021087646484
Accuracy: 0.4882943034172058
Top-2 Accuracy: 0.7759197354316711


The above model performs at accuracy 0.49 and top-2 accuracy of 0.78, suggesting that it would be beneficial to fine tune it substantially. Maybe a larger amount of added layers could be tested. There might be other types of models that perform better, such as a BERT model. There are a variety of other ways that a model like this might be improved and the current accuracy appears too low to meaningfully deploy it. However, a top-2 accuracy of 0.76 suggests that the category in most cases should not be more than one level off. Given that it is difficult to definitively assess an article's CEFR level and some articles may fall between two adjacent categories, it may be sufficient to increase the top-2 accuracy some more. 