<a href="https://colab.research.google.com/github/ThanhHung2112/LMS/blob/main/CEFR_Predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab setup: install Hugging Face Transformers and SentencePiece.
# NOTE(review): versions are unpinned, so a fresh runtime may resolve to
# releases whose APIs differ from the ones this notebook was written against.
!pip install -q transformers
!pip install -q sentencepiece

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m48.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Upgrade accelerate and install transformers with its torch extra.
# NOTE(review): `-U` without a version pin makes the environment drift over time.
!pip install -q accelerate -U
!pip install -q transformers[torch] -U

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/261.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━[0m [32m204.8/261.0 kB[0m [31m5.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.0/261.0 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Mount Google Drive; the CSV inputs and saved models below live under
# /content/drive/MyDrive/CEFR.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# RoBERTa For CEFR Predictor

In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import (
    Trainer,
    TrainingArguments,
    get_cosine_schedule_with_warmup,
    AutoTokenizer,
    RobertaForSequenceClassification,
    BertModel,
    AdamW,

)
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, random_split
from sklearn.metrics import accuracy_score

# Seed torch's global RNG.
torch.manual_seed(0)
# Prefer the GPU when one is visible, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MODEL_CHECKPOINT = "roberta-base"  # checkpoint name passed to from_pretrained
SAVE_DIR = "/content/Robert-cefr"  # used as the Trainer output_dir
SEQ_LEN = 512  # max_length for tokenizer padding/truncation

In [None]:
class CEFRDataset(Dataset):
    """Map-style dataset pairing raw texts with integer-encoded labels.

    Labels are encoded with a ``LabelEncoder`` fitted on the labels passed to
    the constructor. Each item is tokenized on access using the notebook-level
    ``tokenizer`` and ``SEQ_LEN``.
    """

    def __init__(self, texts, labels):
        self.texts = texts.tolist()
        self.labels = LabelEncoder().fit_transform(labels)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for one sample."""
        sample_text = self.texts[index]
        sample_label = self.labels[index]
        encoding = tokenizer(
            sample_text,
            padding="max_length",
            max_length=SEQ_LEN,
            truncation=True,
            return_tensors="pt",
        )
        # squeeze() drops the size-1 batch dimension added by return_tensors="pt".
        return {
            "input_ids": encoding["input_ids"].squeeze(),
            "attention_mask": encoding["attention_mask"].squeeze(),
            "labels": torch.tensor(sample_label),
        }

    def get_labels(self):
        """Return the encoded labels for the whole dataset."""
        return self.labels

def get_dataset(data):
    """Shuffle ``data`` with a fixed seed and wrap its text/cefr columns in a CEFRDataset."""
    shuffled = data.sample(frac=1, random_state=200)
    texts, levels = shuffled["text"], shuffled["cefr"]
    return CEFRDataset(texts, levels)

def train(train_set, valid_set, epochs, warmup_size=0.1, lr=1e-3, batch_size=16):
    """Fine-tune a 6-class RoBERTa sequence classifier with the HF ``Trainer``.

    Args:
        train_set: dataset used for optimisation.
        valid_set: dataset evaluated at the end of every epoch.
        epochs: number of training epochs.
        warmup_size: fraction of all optimizer steps used for warm-up.
        lr: learning rate passed to AdamW.
        batch_size: per-device training batch size.

    Returns:
        The ``Trainer`` after training; the model is also saved to ``SAVE_DIR``.
    """
    model = RobertaForSequenceClassification.from_pretrained(
        MODEL_CHECKPOINT,
        num_labels=6
    )
    model.to(device)
    optim = AdamW(model.parameters(), lr=lr)

    # The scheduler counts optimizer steps (batches), not samples. The last,
    # possibly partial, batch of an epoch is still one step, hence the ceiling.
    # Assumes a single device and no gradient accumulation (the defaults used
    # below) -- TODO confirm if this is ever run multi-GPU.
    steps_per_epoch = -(-len(train_set) // batch_size)
    total_steps = steps_per_epoch * epochs
    scheduler = get_cosine_schedule_with_warmup(
        optim,
        num_warmup_steps=round(total_steps * warmup_size),
        num_training_steps=total_steps,
    )
    training_args = TrainingArguments(
        output_dir=SAVE_DIR,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        logging_steps=50,
        # Mixed precision needs CUDA; stay consistent with the CPU fallback of `device`.
        fp16=torch.cuda.is_available(),
        evaluation_strategy="epoch",
        eval_accumulation_steps=1
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_set,
        eval_dataset=valid_set,
        optimizers=(optim, scheduler),
        compute_metrics=compute_accuracy,
    )
    trainer.train()
    trainer.save_model()
    return trainer

def compute_accuracy(pred):
    """Metric callback for ``Trainer``: accuracy of the arg-max class over the last axis."""
    predicted = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(pred.label_ids, predicted)}

In [None]:
from collections import Counter

# Load the combined CEFR corpus from Drive.
df = pd.read_csv("/content/drive/MyDrive/CEFR/cefr_combine.csv")

# Count how many rows carry each CEFR label.
labels = df["cefr"]
label_datatset_distribution = Counter(labels)

# Counter iterates in first-seen order, so labels print in the order they appear in the file.
print("Dataset:")
for label, count in label_datatset_distribution.items():
    print(f"{label}: {count} sample")


Dataset:
A1: 191951 sample
A2: 129863 sample
B1: 61711 sample
B2: 18473 sample
C1: 5356 sample
C2: 202 sample


In [None]:
# Bare expression: display the loaded frame as the cell output.
df

Unnamed: 0,cefr,text
0,A1,"Hi, My name's Leon. I don't speak english, but..."
1,A1,Hi.... From: To: Date: Hi! How are you? My nam...
2,A1,\n\t Dear Anna: I'm Xudong. I'm 33 years old....
3,A1,"\n\t Hello! Nice to meet you. I am fine, than..."
4,A1,\n\t Hi teacher! How are you? My name's Marco...
...,...,...
407551,C2,Light propagating in the vicinity of astrophys...
407552,C2,Future of dentistry has become one of the most...
407553,C2,ï»¿The forests â€“ and suburbs â€“ of Europe a...
407554,C2,Hedge funds are turning bullish on oil once ag...


In [None]:
# Minimum number of whitespace-separated words a text must exceed to be kept.
quantitive = 100
# .copy() makes the filtered rows an independent frame, so adding columns to it
# later does not raise SettingWithCopyWarning.
long_text_rows = df[df["text"].str.split().str.len() > quantitive].copy()

# Report the threshold actually used instead of a hard-coded "100".
print(f"Number of rows with word count > {quantitive}: {len(long_text_rows)}")

Number of rows with word count > 100: 41051


In [None]:
# Label distribution after restricting to the long texts.
labels = long_text_rows["cefr"]
label_datatset_distribution = Counter(labels)

print("Dataset:")
for label, count in label_datatset_distribution.items():
    print(f"{label}: {count} sample")

Dataset:
A1: 476 sample
A2: 3840 sample
B1: 14936 sample
B2: 16298 sample
C1: 5299 sample
C2: 202 sample


In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

dataset = get_dataset(long_text_rows)

# 75% train / 5% validation / remainder (~20%) test.
train_size = int(0.75 * len(dataset))
eval_size = int(0.05 * len(dataset))
test_size = len(dataset) - train_size - eval_size

# A dedicated, seeded generator makes the split independent of how much of the
# global RNG earlier cells consumed, so re-running this cell reproduces the same
# train/validation/test membership.
split_generator = torch.Generator().manual_seed(0)
train_set, valid_set, test_set = torch.utils.data.random_split(
    dataset, [train_size, eval_size, test_size], generator=split_generator
)

In [None]:
# Size of each split.
print("train_set:",len(train_set))
print("valid_set:",len(valid_set))
print("test_set:",len(test_set))

def distribution_level(set):
    """Print the number of samples per encoded label in a ``Subset``.

    ``set`` must expose ``dataset.labels`` (indexable by a list of positions)
    and ``indices``. The labels printed are the integer codes, not CEFR names.
    """
    subset_labels = set.dataset.labels[set.indices]
    for encoded_label, n_samples in Counter(subset_labels).items():
        print(f"{encoded_label}: {n_samples} sample")
# Per-split label distribution (labels are shown as encoded integers).
print("Train_set:")
distribution_level(train_set)
print("Valid_set:")
distribution_level(valid_set)
print("Test_set:")
distribution_level(test_set)

train_set: 30788
valid_set: 2052
test_set: 8211
Train_set:
3: 12241 sample
4: 3947 sample
2: 11202 sample
1: 2886 sample
5: 148 sample
0: 364 sample
Valid_set:
3: 799 sample
1: 195 sample
4: 278 sample
2: 738 sample
0: 31 sample
5: 11 sample
Test_set:
3: 3258 sample
2: 2996 sample
4: 1074 sample
1: 759 sample
0: 81 sample
5: 43 sample


In [None]:
# Fine-tune for one epoch; 20% of the schedule is warm-up.
trainer = train(train_set, valid_set, epochs=1, warmup_size=0.2, lr=1e-5, batch_size=8)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1083,0.104557,0.975146


In [None]:
# Run inference on the held-out split and show the metrics dict
# (the third field of the returned PredictionOutput).
predictions = trainer.predict(test_set)
predictions.metrics

{'test_loss': 0.11688675731420517,
 'test_accuracy': 0.968944099378882,
 'test_runtime': 107.8751,
 'test_samples_per_second': 76.116,
 'test_steps_per_second': 9.52}

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Arg-max over the class axis gives the predicted encoded label per test sample.
predicted_labels = predictions.predictions.argmax(axis=1)
# Look up the true encoded labels of the test Subset through its indices.
true_labels = test_set.dataset.labels[test_set.indices]
# NOTE(review): assumes the label encoder mapped A1..C2 to 0..5 in this order -- confirm.
cefr_levels = ["A1", "A2", "B1", "B2", "C1", "C2"]
classification_rep = classification_report(true_labels, predicted_labels, target_names=cefr_levels)
print(classification_rep)

              precision    recall  f1-score   support

          A1       0.82      0.97      0.89       104
          A2       0.95      0.96      0.96       806
          B1       0.97      0.98      0.98      2984
          B2       0.99      0.97      0.98      3266
          C1       0.94      0.96      0.95      1016
          C2       0.64      0.71      0.68        35

    accuracy                           0.97      8211
   macro avg       0.89      0.93      0.90      8211
weighted avg       0.97      0.97      0.97      8211



In [None]:
from sklearn.metrics import confusion_matrix

# Recompute predicted and true encoded labels for the test split.
predicted_labels = predictions.predictions.argmax(axis=1)
true_labels = test_set.dataset.labels[test_set.indices]

# confusion_matrix is called as (true, predicted): rows are true levels, columns predicted.
confusion = confusion_matrix(true_labels, predicted_labels)

# Relies on `cefr_levels` defined in the classification-report cell.
confusion_df = pd.DataFrame(confusion, index=cefr_levels, columns=cefr_levels)

print(confusion_df)

     A1   A2    B1    B2   C1  C2
A1  101    2     1     0    0   0
A2   15  775    13     3    0   0
B1    4   19  2925    23   11   2
B2    1   16    57  3152   37   3
C1    2    5     9    13  978   9
C2    0    0     0     1    9  25


In [None]:
# Persist the fine-tuned model and its tokenizer to Drive.
trainer.save_model("/content/drive/MyDrive/CEFR/cefr100_model")
tokenizer.save_pretrained("/content/drive/MyDrive/CEFR/cefr100_token")

('/content/drive/MyDrive/CEFR/cefr100_token/tokenizer_config.json',
 '/content/drive/MyDrive/CEFR/cefr100_token/special_tokens_map.json',
 '/content/drive/MyDrive/CEFR/cefr100_token/vocab.json',
 '/content/drive/MyDrive/CEFR/cefr100_token/merges.txt',
 '/content/drive/MyDrive/CEFR/cefr100_token/added_tokens.json',
 '/content/drive/MyDrive/CEFR/cefr100_token/tokenizer.json')

In [None]:
import torch

# Load a previously saved classifier and tokenizer for inference.
# NOTE(review): this reads "cefr999_*", but the cell above saved "cefr100_*" and
# "cefr999_*" is only written by a later cell -- on a fresh top-to-bottom run these
# paths must already exist on Drive. Confirm which checkpoint is intended.
model = RobertaForSequenceClassification.from_pretrained(
    "/content/drive/MyDrive/CEFR/cefr999_model",
    num_labels=6
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/CEFR/cefr999_token")

def predict_english_level(text):
    """Classify one text and return ``(level, probabilities)``.

    Uses the notebook-level ``tokenizer``, ``model``, ``device`` and ``SEQ_LEN``.

    Returns:
        A tuple of the predicted CEFR level name and the list of softmax
        probabilities, one per class, in A1..C2 order.
    """
    encoding = tokenizer(
        text,
        padding="max_length",
        max_length=SEQ_LEN,
        truncation=True,
        return_tensors="pt",
    )

    with torch.no_grad():
        # squeeze() followed by unsqueeze(0) normalises the input to a batch of one.
        batch_ids = encoding["input_ids"].squeeze().to(device).unsqueeze(0)
        batch_mask = encoding["attention_mask"].squeeze().to(device).unsqueeze(0)
        logits = model(input_ids=batch_ids, attention_mask=batch_mask).logits
        predicted_probs = torch.softmax(logits, dim=1).squeeze().tolist()
        class_index = logits.argmax().item()

    # Class index -> CEFR level name.
    level_names = ['A1', 'A2', 'B1', 'B2', 'C1', 'C2']
    return level_names[class_index], predicted_probs

In [None]:
def predict_average_english_level_for_paragraphs(text):
    """Predict a CEFR level per paragraph and average the results.

    The text is split on newlines; every paragraph longer than two characters
    that is not whitespace-only is classified with ``predict_english_level``
    and its level is printed.

    Returns:
        ``(final_level, average_level)`` where ``average_level`` is the mean of
        the per-paragraph levels on a 1 (A1) .. 6 (C2) scale and ``final_level``
        is the level name for ``round(average_level)``. When no paragraph
        qualifies, ``(None, None)`` is returned so callers can always unpack
        two values.
    """
    level_numbers = {'A1': 1,'A2': 2,'B1': 3,'B2': 4,'C1': 5,'C2': 6}

    level_values = []
    for paragraph in text.split("\n"):
        # Skip very short and whitespace-only paragraphs: they carry no signal.
        if len(paragraph) <= 2 or not paragraph.strip():
            continue
        predicted_level, _probs = predict_english_level(paragraph)
        print(predicted_level)
        if predicted_level is not None:
            level_values.append(predicted_level)

    if not level_values:
        return None, None

    numeric_levels = [level_numbers[level] for level in level_values]

    # Arithmetic mean of the predicted levels.
    average_level = sum(numeric_levels) / len(numeric_levels)

    # Map the rounded mean back to a level name. The mean of values in 1..6
    # always rounds to a key of `level_numbers`, so the lookup cannot fail.
    rounded_level = round(average_level)
    final_level = next(name for name, value in level_numbers.items() if value == rounded_level)

    return final_level, average_level


In [None]:
text = """

The line graph compares the percentage of Britain households reagularly using their own private transport over a period of 36 years.

Overall, the population of people having one car was the highest and stayed almost unchanged, while the number of household with three or more cars was the least. It is also clear that the figures for no car and two cars dramatically changed.

In 1971, almost half of the population of people in the UK had no car and the households having one car only were the second most populated group. Those with three and more cars had by far the least proportion of population with roughly 2 per cent of them and only 8% of households used two cars.

Over the following 36 years, The percentage of families with one car almost leveled off in about 45% and remained the most popular group, while the less number pf people had no car until in 2007 there were only 25% of them. household started havin two cars with a dramatic rise of around 20%, although three cars and more category had an unconsiderable increase stayed under 8%.

"""

words = text.split()

# Count the words in the list
print(len(words))

# Whole-text prediction.
predicted_level, predicted_probs = predict_english_level(text)

print(f"Predicted English Level: {predicted_level}")
print(f"Predicted Probabilities: {predicted_probs}")
# Paragraph-wise prediction; the result is unpacked into exactly two values.
final_predicted_level,avg = predict_average_english_level_for_paragraphs(text)
print(f"Final Predicted English Level: {final_predicted_level,avg}")

188
Predicted English Level: C1
Predicted Probabilities: [0.0004077534831594676, 0.001156926155090332, 0.004624940920621157, 0.008474103175103664, 0.9836565256118774, 0.0016797943972051144]
C1
C1
B1
C1
Final Predicted English Level: ('B2', 4.5)


# More Features?


In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

def get_pos_tags(text):
    """Return the coarse POS tag (``token.pos_``) of every token the ``nlp`` pipeline yields for ``text``."""
    return [token.pos_ for token in nlp(text)]

# assign() returns a new frame with the extra column instead of writing into
# `long_text_rows` in place; the in-place write raised SettingWithCopyWarning
# because `long_text_rows` is a filtered selection of `df`.
long_text_rows = long_text_rows.assign(pos_tags=long_text_rows['text'].apply(get_pos_tags))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  long_text_rows['pos_tags'] = long_text_rows['text'].apply(get_pos_tags)


In [None]:
# Readability metrics library used for the hand-crafted features below (unpinned).
!pip install textstat

Collecting textstat
  Downloading textstat-0.7.3-py3-none-any.whl (105 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyphen (from textstat)
  Downloading pyphen-0.14.0-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.14.0 textstat-0.7.3


In [None]:
import textstat
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize

def avg_words_per_sentence(text):
    """Mean number of word tokens per sentence; 0 when no sentence is found."""
    sentences = sent_tokenize(text)
    total_words = 0
    for sentence in sentences:
        total_words += len(word_tokenize(sentence))
    if not sentences:
        return 0
    return total_words / len(sentences)
# Each column is named after the textstat function that fills it.
# NOTE(review): szigriszt_pazos, gutierrez_polini and crawford appear to be
# Spanish-oriented formulas and osman an Arabic one -- confirm they are wanted
# as features for English text.
_readability_metrics = (
    "smog_index",
    "automated_readability_index",
    "dale_chall_readability_score",
    "difficult_words",
    "linsear_write_formula",
    "gunning_fog",
    "szigriszt_pazos",
    "gutierrez_polini",
    "crawford",
    "osman",
)
for _metric in _readability_metrics:
    df[_metric] = df["text"].apply(getattr(textstat, _metric))
df["avg_words"] = df["text"].apply(avg_words_per_sentence)
# Plain alias (not a copy): df_original and df refer to the same frame.
df_original = df

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import (
    Trainer,
    TrainingArguments,
    AutoTokenizer,
    AutoModel,
    AutoModelForSequenceClassification,
    RobertaForSequenceClassification,
    AdamW,
    get_cosine_schedule_with_warmup,
)
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import numpy as np
import spacy
nlp = spacy.load("en_core_web_sm")

# Re-seed torch and re-declare the configuration for the multi-feature experiment.
torch.manual_seed(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

MODEL_CHECKPOINT = "roberta-base"
SAVE_DIR = "/content/T-Robert-cefr"  # separate Trainer output_dir for this experiment
SEQ_LEN = 512


In [None]:
class CEFRDataset(Dataset):
    """Dataset returning tokenized text plus POS-tag and readability features.

    This definition replaces the earlier two-argument ``CEFRDataset``.

    NOTE(review): ``train`` feeds these items to a plain
    ``RobertaForSequenceClassification``; whether the ``pos_tags`` and
    ``textstat_features`` entries ever reach the model should be confirmed.
    """

    def __init__(self, texts, labels, pos_tags, textstat_features):
        self.texts = texts.tolist()
        self.labels = LabelEncoder().fit_transform(labels)
        self.pos_tags = pos_tags
        self.textstat_features = textstat_features

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        """Return the model inputs, extra features and label for one sample."""
        sample_text = self.texts[index]
        sample_label = self.labels[index]
        sample_pos = self.pos_tags[index]
        sample_stats = self.textstat_features[index]

        encoding = tokenizer(
            sample_text,
            padding="max_length",
            max_length=SEQ_LEN,
            truncation=True,
            return_tensors="pt",
        )

        # squeeze() drops the size-1 batch dimension added by return_tensors="pt".
        return {
            "input_ids": encoding["input_ids"].squeeze(),
            "attention_mask": encoding["attention_mask"].squeeze(),
            "pos_tags": sample_pos,
            "textstat_features": torch.tensor(sample_stats, dtype=torch.float32),
            "labels": torch.tensor(sample_label),
        }

In [None]:
def get_dataset(data):
    """Shuffle ``data`` and build a multi-feature ``CEFRDataset`` from it.

    Args:
        data: DataFrame with ``text``, ``cefr``, ``pos_tags`` and the eleven
            readability columns selected below. ``pos_tags`` cells may be
            lists of tags or the string form of such a list (which is what a
            CSV round-trip produces).

    Returns:
        A ``CEFRDataset`` holding the texts, labels, tokenized POS-tag
        sentences and the readability feature matrix, all in shuffled order.
    """
    import ast  # local import: only needed to parse stringified tag lists

    data = data.sample(frac=1, random_state=200)

    textstat_features = data[
        ['smog_index',
         'automated_readability_index', 'dale_chall_readability_score', 'difficult_words',
         'linsear_write_formula', 'gunning_fog','szigriszt_pazos',
         'gutierrez_polini', 'crawford','osman','avg_words']
    ].values

    def _tags_to_sentence(tags):
        # Joining a *string* would interleave spaces between its characters,
        # so turn "['NOUN', 'VERB']" back into a list first.
        if isinstance(tags, str):
            try:
                tags = ast.literal_eval(tags)
            except (ValueError, SyntaxError):
                tags = tags.split()
        return " ".join(tags)

    # Encode every POS-tag sequence as a space-separated "sentence".
    pos_tags = data['pos_tags'].apply(_tags_to_sentence)
    encoded_pos_tags = tokenizer(
        pos_tags.tolist(),
        padding="max_length",
        max_length=SEQ_LEN,
        truncation=True,
        return_tensors="pt"
    )

    # The texts themselves are tokenized lazily in CEFRDataset.__getitem__, so
    # no eager tokenization of the whole corpus is done here.
    return CEFRDataset(data['text'], data['cefr'], encoded_pos_tags, textstat_features)


In [None]:
# Train
def train(train_set, valid_set, epochs=10, warmup_size=0.1, lr=1e-3, batch_size=16):
    """Fine-tune a 6-class RoBERTa sequence classifier with the HF ``Trainer``.

    Args:
        train_set: dataset used for optimisation.
        valid_set: dataset evaluated at the end of every epoch.
        epochs: number of training epochs.
        warmup_size: fraction of all optimizer steps used for warm-up.
        lr: learning rate passed to AdamW.
        batch_size: per-device training batch size.

    Returns:
        The ``Trainer`` after training; the model is also saved to ``SAVE_DIR``.
    """
    model = RobertaForSequenceClassification.from_pretrained(
        MODEL_CHECKPOINT,
        num_labels=6,
    )
    model.to(device)
    optim = AdamW(model.parameters(), lr=lr)

    # The scheduler counts optimizer steps (batches), not samples. The last,
    # possibly partial, batch of an epoch is still one step, hence the ceiling.
    # Assumes a single device and no gradient accumulation (the defaults used
    # below) -- TODO confirm if this is ever run multi-GPU.
    steps_per_epoch = -(-len(train_set) // batch_size)
    total_steps = steps_per_epoch * epochs
    scheduler = get_cosine_schedule_with_warmup(
        optim,
        num_warmup_steps=round(total_steps * warmup_size),
        num_training_steps=total_steps,
    )
    training_args = TrainingArguments(
        output_dir=SAVE_DIR,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        logging_steps=50,
        # Mixed precision needs CUDA; stay consistent with the CPU fallback of `device`.
        fp16=torch.cuda.is_available(),
        evaluation_strategy="epoch",
        eval_accumulation_steps=1
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_set,
        eval_dataset=valid_set,
        optimizers=(optim, scheduler),
        compute_metrics=compute_accuracy,
    )
    trainer.train()
    trainer.save_model()
    return trainer

def compute_accuracy(pred):
    """Metric callback for ``Trainer``: accuracy of the arg-max class along axis 1."""
    predicted = np.argmax(pred.predictions, axis=1)
    return {'accuracy': accuracy_score(pred.label_ids, predicted)}

In [None]:
# Reload the pre-computed feature table from Drive; this replaces any in-memory
# `df_original`.
df_original = pd.read_csv("/content/drive/MyDrive/CEFR/multi_f.csv")
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# NOTE(review): [:5000] takes the first 5000 rows in file order, not a random
# sample -- confirm this subset contains every CEFR level.
dataset = get_dataset(df_original[:5000])

# 75% train / 5% validation / remainder (~20%) test.
train_size = int(0.75 * len(dataset))
eval_size = int(0.05 * len(dataset))
test_size = len(dataset) - train_size - eval_size

# A dedicated, seeded generator keeps the split reproducible regardless of how
# much of the global RNG earlier cells consumed.
split_generator = torch.Generator().manual_seed(0)
train_set, valid_set, test_set = torch.utils.data.random_split(
    dataset, [train_size, eval_size, test_size], generator=split_generator
)

1
2
3


In [None]:
# Drop the names that are no longer needed; the Subsets keep their own reference
# to the underlying dataset.
del dataset,train_size,eval_size,test_size

In [None]:
# Fine-tune for one epoch on the multi-feature dataset; 20% of the schedule is warm-up.
trainer = train(train_set, valid_set, epochs=1, warmup_size=0.2, lr=1e-5, batch_size=8)

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.002,0.025373,0.996


In [None]:
# Run inference on the held-out split and show the metrics dict
# (the third field of the returned PredictionOutput).
predictions = trainer.predict(test_set)
predictions.metrics

{'test_loss': 0.10978385806083679,
 'test_accuracy': 0.9711362805991962,
 'test_runtime': 108.5205,
 'test_samples_per_second': 75.663,
 'test_steps_per_second': 9.464}

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Arg-max over the class axis gives the predicted encoded label per test sample.
predicted_labels = predictions.predictions.argmax(axis=1)
# Look up the true encoded labels of the test Subset through its indices.
true_labels = test_set.dataset.labels[test_set.indices]
# NOTE(review): assumes the label encoder mapped A1..C2 to 0..5 in this order -- confirm.
cefr_levels = ["A1", "A2", "B1", "B2", "C1", "C2"]
classification_rep = classification_report(true_labels, predicted_labels, target_names=cefr_levels)
print(classification_rep)

              precision    recall  f1-score   support

          A1       0.82      0.93      0.87        81
          A2       0.97      0.96      0.97       759
          B1       0.98      0.98      0.98      2996
          B2       0.98      0.98      0.98      3258
          C1       0.94      0.96      0.95      1074
          C2       0.74      0.47      0.57        43

    accuracy                           0.97      8211
   macro avg       0.91      0.88      0.89      8211
weighted avg       0.97      0.97      0.97      8211



In [None]:
import pandas as pd
from sklearn.metrics import confusion_matrix

# Recompute predicted and true encoded labels for the test split.
predicted_labels = predictions.predictions.argmax(axis=1)
true_labels = test_set.dataset.labels[test_set.indices]

# confusion_matrix is called as (true, predicted): rows are true levels, columns predicted.
confusion = confusion_matrix(true_labels, predicted_labels)

# Relies on `cefr_levels` defined in the classification-report cell.
confusion_df = pd.DataFrame(confusion, index=cefr_levels, columns=cefr_levels)

print(confusion_df)

    A1   A2    B1    B2    C1  C2
A1  75    2     3     0     1   0
A2  13  729    14     3     0   0
B1   3   12  2939    34     8   0
B2   0    1    41  3179    36   1
C1   0    4    12    20  1032   6
C2   0    0     3     0    20  20


In [None]:
# Persist this run's model and tokenizer; the inference cell earlier in the
# notebook loads from these same "cefr999_*" paths.
trainer.save_model("/content/drive/MyDrive/CEFR/cefr999_model")
tokenizer.save_pretrained("/content/drive/MyDrive/CEFR/cefr999_token")

('/content/drive/MyDrive/CEFR/cefr999_token/tokenizer_config.json',
 '/content/drive/MyDrive/CEFR/cefr999_token/special_tokens_map.json',
 '/content/drive/MyDrive/CEFR/cefr999_token/vocab.json',
 '/content/drive/MyDrive/CEFR/cefr999_token/merges.txt',
 '/content/drive/MyDrive/CEFR/cefr999_token/added_tokens.json',
 '/content/drive/MyDrive/CEFR/cefr999_token/tokenizer.json')

In [None]:
# Second (quiet, unpinned) install of textstat; it is also installed in an earlier cell.
!pip install -q textstat

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25h

# Config

In [None]:
# Display-only: drop() returns a new frame and the result is not assigned, so
# `df_original` itself keeps these columns.
df_original.drop(["Unnamed: 0", "word_count"], axis=1)

Unnamed: 0,cefr,text,pos_tags,smog_index,automated_readability_index,dale_chall_readability_score,difficult_words,linsear_write_formula,gunning_fog,szigriszt_pazos,gutierrez_polini,crawford,osman,avg_words
0,A1,"\n\t From : May To: Maria Data : August 28, 2...","['SPACE', 'ADP', 'PUNCT', 'PROPN', 'PART', 'PU...",7.0,3.9,5.84,5,4.000000,3.62,115.86,52.18,0.5,82.98,8.285714
1,A1,\n\t My name is Erica . I am thirty-two year...,"['SPACE', 'PRON', 'NOUN', 'AUX', 'PROPN', 'PUN...",6.9,3.3,5.92,5,5.000000,4.83,117.06,53.42,0.8,87.00,12.111111
2,A1,\n\t Hi ! My name is Jiang Shiying . I'm thir...,"['SPACE', 'INTJ', 'PUNCT', 'PRON', 'NOUN', 'AU...",5.6,2.5,5.69,3,4.200000,3.08,119.22,54.38,0.2,91.39,7.857143
3,A1,\n\t Hi teacher ! My name's Areej . I'm twent...,"['SPACE', 'INTJ', 'NOUN', 'PUNCT', 'PRON', 'NO...",6.8,1.6,6.90,6,3.583333,3.38,116.52,55.09,0.3,92.82,7.642857
4,A1,\n\t Hi my name's Marcelo and my favorite day...,"['SPACE', 'INTJ', 'PRON', 'NOUN', 'PART', 'PRO...",5.8,1.3,6.49,4,4.777778,4.64,123.39,58.23,0.5,101.87,13.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41046,C2,Light propagating in the vicinity of astrophys...,"['NOUN', 'VERB', 'ADP', 'DET', 'NOUN', 'ADP', ...",14.6,14.8,9.46,81,13.000000,13.94,80.69,38.48,3.8,45.37,27.533333
41047,C2,Future of dentistry has become one of the most...,"['NOUN', 'ADP', 'NOUN', 'AUX', 'VERB', 'NUM', ...",15.6,17.5,10.12,151,14.200000,14.79,72.18,31.37,4.5,26.23,22.344828
41048,C2,ï»¿The forests â€“ and suburbs â€“ of Europe a...,"['PRON', 'NOUN', 'NUM', 'PUNCT', 'CCONJ', 'VER...",13.5,17.7,9.78,155,20.000000,13.64,83.12,34.85,3.5,32.93,32.840000
41049,C2,Hedge funds are turning bullish on oil once ag...,"['NOUN', 'NOUN', 'AUX', 'VERB', 'ADJ', 'ADP', ...",12.2,12.1,10.16,142,12.400000,10.35,93.62,40.57,2.9,50.85,27.964286


In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
SEQ_LEN = 512  # max_length used when tokenizing below


# Encode text using RoBERTa
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
# NOTE(review): no num_labels is passed and this model is not fine-tuned in this
# section, so its classification head is freshly initialised -- confirm that the
# logits produced by it are the intended text features.
model = RobertaForSequenceClassification.from_pretrained("roberta-base")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
# Load the dataset and shuffle all rows with a fixed seed.
df_original = pd.read_csv("/content/drive/MyDrive/CEFR/multi_f.csv")
df_original = df_original.sample(frac=1, random_state=200)

def encode_text(text):
    """Run ``text`` through the notebook-level ``model`` and return its logits.

    Only ``input_ids`` are passed to the model (no attention mask), and the
    result stays on ``device``.
    """
    encoding = tokenizer(text, return_tensors="pt", max_length=SEQ_LEN, padding=True, truncation=True)
    # The ids must live on the same device as the model.
    ids_on_device = encoding["input_ids"].to(device)
    with torch.no_grad():
        return model(ids_on_device).logits

# One model forward pass per row; each cell of the new column holds a logits tensor.
df_original["text_encoded"] = df_original["text"].apply(encode_text)

In [None]:
# Combine pos_tags into a single sentence and encode it
def encode_pos_tags(pos_tags):
    """Join POS tags into one sentence, run it through ``model`` and return the logits.

    Args:
        pos_tags: a list of tag strings, or the string form of such a list
            (e.g. ``"['NOUN', 'VERB']"``, which is what a CSV round-trip yields).

    Returns:
        The logits tensor produced by the notebook-level ``model`` (on ``device``).
    """
    import ast  # local import: only needed to parse stringified tag lists

    # Joining a *string* would interleave spaces between its characters, so
    # recover the list first.
    if isinstance(pos_tags, str):
        try:
            pos_tags = ast.literal_eval(pos_tags)
        except (ValueError, SyntaxError):
            pos_tags = pos_tags.split()

    pos_sentence = " ".join(pos_tags)
    input_ids = tokenizer(pos_sentence, return_tensors="pt",max_length=SEQ_LEN, padding=True, truncation=True)["input_ids"]
    input_ids = input_ids.to(device)  # Move input_ids to the same device as the model
    with torch.no_grad():
        output = model(input_ids)
    return output.logits

# One model forward pass per row over the POS-tag "sentence".
df_original["pos_tags_encoded"] = df_original["pos_tags"].apply(encode_pos_tags)

# Encode textstat_features
textstat_features = df_original[['smog_index', 'automated_readability_index', 'dale_chall_readability_score',
                                'difficult_words', 'linsear_write_formula', 'gunning_fog', 'szigriszt_pazos',
                                'gutierrez_polini', 'crawford', 'osman', 'avg_words']].values
# NOTE(review): the scaler is fitted on all rows, before the train/validation split
# made in a later cell -- confirm this leakage of validation statistics is acceptable.
scaler = StandardScaler()
textstat_features = scaler.fit_transform(textstat_features)

In [None]:
# Sanity check: which CEFR labels are present.
df_original["cefr"].unique()

array(['C1', 'B2', 'B1', 'A2', 'A1', 'C2'], dtype=object)

In [None]:
# Concatenate the encoded vectors column-wise: text logits, POS-tag logits and the
# scaled readability features. Tensors are moved to CPU before stacking into arrays.
encoded_data = np.concatenate((np.vstack(df_original["text_encoded"].apply(lambda x: x.cpu()).values),
                               np.vstack(df_original["pos_tags_encoded"].apply(lambda x: x.cpu()).values),
                               textstat_features), axis=1)


In [None]:
import torch.nn as nn
import torch.optim as optim

class CEFR_Model(nn.Module):
    """Three-layer fully connected classifier: Linear-ReLU-Linear-ReLU-Linear.

    Args:
        input_size: number of input features.
        hidden_size: width of both hidden layers.
        num_classes: number of output logits.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the raw class logits for ``x``."""
        hidden = self.relu1(self.fc1(x))
        hidden = self.relu2(self.fc2(hidden))
        return self.fc3(hidden)

# The input width is taken from the concatenated feature matrix.
input_size = encoded_data.shape[1]
hidden_size = 128
num_classes = 6

# NOTE: this rebinds `model`, which the encode_* functions above also read;
# re-running those cells after this one would call the MLP instead of RoBERTa.
model = CEFR_Model(input_size, hidden_size, num_classes)

# Data for training step
X = torch.tensor(encoded_data, dtype=torch.float32)
cefr_mapping = {"A1": 0, "A2": 1, "B1": 2, "B2": 3, "C1": 4, "C2": 5}

# Convert the "cefr" column to numeric labels
df_original["cefr_numeric"] = df_original["cefr"].map(cefr_mapping)


y = torch.tensor(df_original["cefr_numeric"].values, dtype=torch.long)


# 80/20 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Calculate class weights
# The weights are computed from all rows, not only the training split.
class_weights = compute_class_weight("balanced", classes=np.unique(df_original["cefr_numeric"]), y=df_original["cefr_numeric"])
class_weights = torch.tensor(class_weights, dtype=torch.float32)

criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
num_epochs = 10
batch_size = 8

# Plain mini-batch training: consecutive slices of the training tensors, in the
# same order every epoch.
for epoch in range(num_epochs):
    for start in range(0, len(X_train), batch_size):
        stop = start + batch_size
        batch_X, batch_y = X_train[start:stop], y_train[start:stop]

        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

In [None]:
# Evaluate the model on the validation split without tracking gradients.
with torch.no_grad():
    val_outputs = model(X_val)
    # Index of the largest logit per row is the predicted class.
    val_preds = val_outputs.argmax(dim=1)

print(classification_report(y_val, val_preds))