In [None]:
# In the beginning let's install transformers library
!pip install transformers
!pip install SentencePiece



In [None]:
# Importing standard libraries for every machine/deep learning pipeline
import pandas as pd
import torch
from tqdm import tqdm, trange
import numpy as np


# Importing specific libraries for data preprocessing, model architecture choice, training and evaluation
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import CamembertTokenizer, CamembertForSequenceClassification
from transformers import AdamW
# import torch.optim as optim
# from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# import seaborn as sns

In [None]:
# Training configuration.
SEED = 42        # one seed for every random number generator seeded in this cell
epochs = 20      # number of full passes over the training split
MAX_LEN = 128    # token sequences are truncated / padded to this length
batch_size = 32  # examples per optimisation step

# Seed NumPy's and PyTorch's global RNGs so the stochastic steps of the notebook
# (row sampling, batch shuffling, initialisation of new weights) repeat across runs.
np.random.seed(SEED)
torch.manual_seed(SEED)

# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Load the augmented training set and keep a random subset of at most N_SAMPLES rows
# because of memory limitations.
DATA_PATH = '/content/drive/MyDrive/Master/DATA SCIENCE AND MACHINE LEARNING/Apple-Project/full_augemented_training_data.csv'
N_SAMPLES = 4800

df_full = pd.read_csv(DATA_PATH)
# random_state pins the subset so every run trains on the same rows; min() keeps
# .sample from raising when the file holds fewer than N_SAMPLES rows.
df = (
    df_full
    .sample(n=min(N_SAMPLES, len(df_full)), random_state=42)
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,id,sentence,difficulty
0,7544,La culture est une source intarissable d'inspi...,B1
1,2433,"Dans un premier temps, les locuteurs subissent...",C1
2,8312,Nous devons prendre soin de notre environnemen...,B1
3,6386,La Culture est un moyen d'enrichir la vie et d...,A2
4,7315,Le voyage est une expérience enrichissante qui...,B1


In [None]:
# Load the pretrained tokenizer that belongs to the 'camembert-base' checkpoint.
# NOTE(review): confirm that do_lower_case actually has an effect on
# CamembertTokenizer and that lower-casing is wanted for this checkpoint.
tokenizer = CamembertTokenizer.from_pretrained(
    'camembert-base',
    do_lower_case=True,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Pull the raw sentences and their difficulty labels out of the dataframe.
text = df['sentence'].to_list()
labels = df['difficulty'].to_list()

# Encode every sentence to token ids. truncation=True makes the cut at MAX_LEN
# explicit; without it the tokenizer truncates anyway but emits a warning.
input_ids = [
    tokenizer.encode(sent, add_special_tokens=True, max_length=MAX_LEN, truncation=True)
    for sent in text
]

# Right-pad every sequence to MAX_LEN with the tokenizer's own padding id.
# pad_sequences pads with 0 by default, which differs from the padding index (1)
# the CamemBERT embedding layers are configured with.
pad_id = tokenizer.pad_token_id
input_ids = pad_sequences(
    input_ids,
    maxlen=MAX_LEN,
    dtype="long",
    truncating="post",
    padding="post",
    value=pad_id,
)

# Attention masks: 1.0 for every real token, 0.0 for every padding position.
attention_masks = [[float(tok != pad_id) for tok in seq] for seq in input_ids]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Hold out 20% of the examples for validation. Token ids, labels and attention masks
# go through a single train_test_split call so the three stay aligned row-for-row.
(
    train_inputs, validation_inputs,
    train_labels, validation_labels,
    train_masks, validation_masks,
) = train_test_split(input_ids, labels, attention_masks, test_size=0.2, random_state=42)

# Difficulty label -> integer class index.
label_mapping = dict(zip(['A1', 'A2', 'B1', 'B2', 'C1', 'C2'], range(6)))

# Encode the string labels as class indices and turn every split into a torch tensor,
# the datatype the model consumes.
train_labels = torch.tensor([label_mapping[level] for level in train_labels])
validation_labels = torch.tensor([label_mapping[level] for level in validation_labels])
train_inputs, validation_inputs = torch.tensor(train_inputs), torch.tensor(validation_inputs)
train_masks, validation_masks = torch.tensor(train_masks), torch.tensor(validation_masks)


def _build_loader(dataset, sampler_cls):
    """Return (sampler, DataLoader) for `dataset`, batching with the global batch_size."""
    sampler = sampler_cls(dataset)
    return sampler, DataLoader(dataset, sampler=sampler, batch_size=batch_size)


# Training batches are drawn in random order each epoch.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler, train_dataloader = _build_loader(train_data, RandomSampler)

# Validation batches are served in a fixed, sequential order.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler, validation_dataloader = _build_loader(validation_data, SequentialSampler)

In [None]:
# Load pretrained CamemBERT with a sequence-classification head producing six outputs
# (one per difficulty class) and move it onto the selected device in one step.
model = CamembertForSequenceClassification.from_pretrained("camembert-base", num_labels=6).to(device)

# Echo the module tree as the cell output.
model

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


CamembertForSequenceClassification(
  (roberta): CamembertModel(
    (embeddings): CamembertEmbeddings(
      (word_embeddings): Embedding(32005, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): CamembertEncoder(
      (layer): ModuleList(
        (0-11): 12 x CamembertLayer(
          (attention): CamembertAttention(
            (self): CamembertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): CamembertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=Tru

In [None]:
# Split the parameters into two groups: those that receive weight decay, and the
# biases / LayerNorm weights that are exempt from it.
param_optimizer = list(model.named_parameters())
# The LayerNorm modules of this model are named 'LayerNorm', so their weights are
# matched with 'LayerNorm.weight' ('bias' already covers 'LayerNorm.bias').
# The previous 'gamma' / 'beta' patterns matched no parameter name.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    # The per-group option is spelled 'weight_decay'; a 'weight_decay_rate' key is
    # not read by the optimizer, which would silently fall back to its default.
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]

# torch.optim.AdamW replaces the deprecated AdamW shipped with transformers.
# eps keeps its original value: 10e-8 == 1e-7.
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=5e-5, eps=10e-8)

def flat_accuracy(preds, labels):
    """Return the fraction of examples whose highest-scoring class equals the label.

    preds:  2-D array of per-class scores, one row per example.
    labels: array of integer class ids; flattened before the comparison.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    hits = predicted_classes == true_classes
    return np.sum(hits) / true_classes.shape[0]



In [None]:
# Per-step training losses, kept so the loss curve can be plotted after training.
train_loss_set = []

# trange is a tqdm progress-bar wrapper around the normal python range.
for _ in trange(epochs, desc="Epoch"):
    # ---------------- Training pass ----------------
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    model.train()
    for step, batch in enumerate(train_dataloader):
        # Move the batch to the CPU / GPU the model lives on, then unpack it.
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Gradients accumulate by default, so clear them before every step.
        optimizer.zero_grad()
        # Passing labels makes the model return the loss as the first output.
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        loss.backward()
        optimizer.step()

        # Read the scalar loss once and reuse it for both bookkeeping targets.
        step_loss = loss.item()
        train_loss_set.append(step_loss)
        tr_loss += step_loss
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    print("Train loss: {}".format(tr_loss/nb_tr_steps))

    # ---------------- Validation pass ----------------
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    nb_eval_correct = 0

    model.eval()
    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # No gradients are needed for evaluation; skipping them saves memory and time.
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            loss, logits = outputs[:2]

        # Count correct predictions per example rather than averaging per-batch
        # accuracies, so a shorter final batch is not over-weighted.
        nb_eval_correct += (torch.argmax(logits, dim=1) == b_labels).sum().item()
        nb_eval_examples += b_labels.size(0)
        eval_loss += loss.item()
        nb_eval_steps += 1

    eval_accuracy = nb_eval_correct / nb_eval_examples
    print("Validation loss: {}".format(eval_loss/nb_eval_steps))
    print("Validation Accuracy: {}".format(eval_accuracy))

Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

Train loss: 1.4294845869143804


Epoch:   5%|▌         | 1/20 [00:26<08:18, 26.26s/it]

Validation Accuracy: 0.43333333333333335
Train loss: 1.0978318671385447


Epoch:  10%|█         | 2/20 [00:52<07:53, 26.29s/it]

Validation Accuracy: 0.4791666666666667
Train loss: 0.921097997824351


Epoch:  15%|█▌        | 3/20 [01:18<07:27, 26.34s/it]

Validation Accuracy: 0.44895833333333335
Train loss: 0.7983109131455421


Epoch:  20%|██        | 4/20 [01:45<07:01, 26.37s/it]

Validation Accuracy: 0.515625
Train loss: 0.6774102129042149


Epoch:  25%|██▌       | 5/20 [02:11<06:35, 26.39s/it]

Validation Accuracy: 0.509375
Train loss: 0.5190336155394714


Epoch:  30%|███       | 6/20 [02:38<06:09, 26.40s/it]

Validation Accuracy: 0.5052083333333334
Train loss: 0.4173870916167895


Epoch:  35%|███▌      | 7/20 [03:04<05:43, 26.40s/it]

Validation Accuracy: 0.5208333333333334
Train loss: 0.5056240543723106


Epoch:  40%|████      | 8/20 [03:31<05:16, 26.41s/it]

Validation Accuracy: 0.53125
Train loss: 0.29615128816415864


Epoch:  45%|████▌     | 9/20 [03:57<04:50, 26.43s/it]

Validation Accuracy: 0.53125
Train loss: 0.21365870364631215


Epoch:  50%|█████     | 10/20 [04:23<04:24, 26.42s/it]

Validation Accuracy: 0.5135416666666667
Train loss: 0.17319938695679107


Epoch:  55%|█████▌    | 11/20 [04:50<03:57, 26.44s/it]

Validation Accuracy: 0.5177083333333333
Train loss: 0.16911057266406715


Epoch:  60%|██████    | 12/20 [05:16<03:31, 26.43s/it]

Validation Accuracy: 0.521875
Train loss: 0.26146552118783195


Epoch:  65%|██████▌   | 13/20 [05:43<03:05, 26.45s/it]

Validation Accuracy: 0.5197916666666667
Train loss: 0.14610974807292224


Epoch:  70%|███████   | 14/20 [06:09<02:38, 26.45s/it]

Validation Accuracy: 0.503125
Train loss: 0.13745978549122811


Epoch:  75%|███████▌  | 15/20 [06:36<02:12, 26.47s/it]

Validation Accuracy: 0.5270833333333333
Train loss: 0.10705614272349824


Epoch:  80%|████████  | 16/20 [07:02<01:45, 26.48s/it]

Validation Accuracy: 0.5208333333333334
Train loss: 0.08565266483929009


Epoch:  85%|████████▌ | 17/20 [07:29<01:19, 26.48s/it]

Validation Accuracy: 0.5260416666666666
Train loss: 0.08596920877074202


Epoch:  90%|█████████ | 18/20 [07:55<00:52, 26.48s/it]

Validation Accuracy: 0.5333333333333333
Train loss: 0.09299269036855548


Epoch:  95%|█████████▌| 19/20 [08:22<00:26, 26.47s/it]

Validation Accuracy: 0.5291666666666667
Train loss: 0.073733564640861


Epoch: 100%|██████████| 20/20 [08:48<00:00, 26.43s/it]

Validation Accuracy: 0.5239583333333333





In [None]:
# Predict a difficulty class for every batch produced by `unlabelled_loader`.
# NOTE(review): `unlabelled_loader` is not defined in any cell visible here. This loop
# expects it to yield dicts with 'input_ids' and 'attention_mask' tensors — confirm
# where it is built and that it uses the same tokenizer settings as training.
model.eval()  # Set the model to evaluation mode
predictions = []

with torch.no_grad():
    for batch in unlabelled_loader:
        # Batch-local names, so the `input_ids` array built for training is not overwritten.
        b_input_ids = batch['input_ids'].to(device)
        b_attention_mask = batch['attention_mask'].to(device)
        outputs = model(b_input_ids, attention_mask=b_attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        # .tolist() yields plain Python ints, which are used as dictionary keys below.
        predictions.extend(preds.cpu().tolist())

# Map numeric predictions back to difficulty labels by inverting label_mapping, the
# mapping that encoded the training labels.
inverse_difficulty_mapping = {v: k for k, v in label_mapping.items()}
predicted_labels = [inverse_difficulty_mapping[pred] for pred in predictions]

In [None]:
# Preview table: 1-based ids next to each unlabelled sentence and its predicted difficulty.
preview_columns = {
    'id': range(1, len(predicted_labels) + 1),
    'sentence': unlabelled_data['sentence'],
    'difficulty': predicted_labels,
}
result_df = pd.DataFrame(preview_columns)

# Display at most the first 100 rows as the cell output.
result_df.head(100)

In [None]:
from google.colab import files

# Table to export: 0-based ids paired with the predicted difficulty only.
submission_path = 'predicted_difficulties_only.csv'
result_df = pd.DataFrame(
    {
        'id': range(len(predicted_labels)),
        'difficulty': predicted_labels,
    }
)

# Write the CSV without the dataframe index, then trigger a browser download of it.
result_df.to_csv(submission_path, index=False)
files.download(submission_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>