<a href="https://colab.research.google.com/github/RebeccaKessler/Machine_Learning/blob/main/Codes/Final_CamemBert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#install libraries
!pip install sentencepiece
!pip install accelerate -U
!pip install pandas numpy matplotlib
!pip install scikit-learn seaborn

Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.w

In [2]:
#import packages
import torch
from torch.utils.data import Dataset
import pandas as pd
from transformers import CamembertConfig, CamembertForSequenceClassification, CamembertTokenizer, Trainer, TrainingArguments
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix
import seaborn as sns
import joblib
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, precision_recall_fscore_support

In [None]:
class CustomDataset(Dataset):
    """Dataset that tokenizes one labelled sentence per item.

    Items are read by position from ``data`` (via ``.iloc``); each row must
    provide a ``'sentence'`` and an ``'encoded_labels'`` entry.
    """

    def __init__(self, data, tokenizer, max_length=128):
        """Keep the data, the tokenizer and the fixed sequence length."""
        self.data, self.tokenizer, self.max_length = data, tokenizer, max_length

    def __len__(self):
        """Return the number of rows in ``data``."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return input ids, attention mask and label tensor for row ``idx``."""
        row = self.data.iloc[idx]
        text = str(row['sentence'])
        target = int(row['encoded_labels'])

        # Pad/truncate every sentence to ``max_length`` and request PyTorch tensors.
        encoded = self.tokenizer.encode_plus(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # The tokenizer output is flattened so each field is a 1-D tensor.
        item = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(target, dtype=torch.long)
        return item

In [None]:
# Read the labelled training sentences
data = pd.read_csv('training_data.csv')

# Fit the label encoder on the difficulty column, then store the integer codes
label_encoder = LabelEncoder()
label_encoder.fit(data['difficulty'])
data['encoded_labels'] = label_encoder.transform(data['difficulty'])

# Persist the fitted encoder so predictions can be decoded later
joblib.dump(label_encoder, 'label_encoder.pkl')

# Hold out 20% of the rows for validation (fixed seed for a repeatable split)
train_data, val_data = train_test_split(data, random_state=42, test_size=0.2)

In [None]:
# Define compute metrics function
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` must support
            ``.argmax(-1)``; the arg-max over the last axis is taken as the
            predicted class.

    Returns:
        dict: ``accuracy``, ``precision``, ``recall`` and ``f1``; the last
        three are weighted averages over the classes.
    """
    logits, labels = eval_pred
    predictions = logits.argmax(-1)
    accuracy = accuracy_score(labels, predictions)
    # zero_division=0 keeps the score at 0 for classes that are never predicted
    # (the same value as the default) without emitting an
    # UndefinedMetricWarning on every evaluation step.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted', zero_division=0
    )
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import CamembertForSequenceClassification, CamembertTokenizer
import torch
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    """Tokenizing dataset over a table of labelled sentences.

    NOTE: this re-declares the ``CustomDataset`` class defined in an earlier
    cell of this notebook and replaces it once this cell has run.
    """

    def __init__(self, data, tokenizer, max_length=128):
        """Remember the rows, the tokenizer and the padded sequence length."""
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.data = data

    def __len__(self):
        """Number of rows available."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize the sentence at position ``idx`` and attach its label."""
        record = self.data.iloc[idx]
        sentence, label = str(record['sentence']), int(record['encoded_labels'])

        # Fixed-length encoding: pad to max_length, truncate anything longer.
        tokenizer_kwargs = dict(
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        encoding = self.tokenizer.encode_plus(sentence, **tokenizer_kwargs)

        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()
        labels = torch.tensor(label, dtype=torch.long)
        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}

# Load and preprocess the data
data = pd.read_csv('training_data.csv')
label_encoder = LabelEncoder()
data['encoded_labels'] = label_encoder.fit_transform(data['difficulty'])

# Size the classification head from the classes actually present in the data
# instead of hard-coding the count.
num_labels = len(label_encoder.classes_)

# fp16 mixed precision needs a CUDA device; fall back to full precision on a
# runtime without one so the cell still runs.
use_fp16 = torch.cuda.is_available()

# Define the tokenizer
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

# K-Fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list = []

for fold, (train_idx, val_idx) in enumerate(kf.split(data)):
    print(f"Training fold {fold+1}")
    train_data = data.iloc[train_idx]
    val_data = data.iloc[val_idx]

    train_dataset = CustomDataset(train_data, tokenizer)
    eval_dataset = CustomDataset(val_data, tokenizer)

    # Set up the Trainer. A fresh pre-trained model is loaded for every fold
    # so no fine-tuned weights carry over between folds.
    model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=num_labels)
    training_args = TrainingArguments(
        output_dir=f'./results_fold_{fold}',
        learning_rate=0.00015,
        num_train_epochs=5,
        per_device_train_batch_size=16,
        # NOTE(review): 1000 warmup steps may cover most of a fold's training
        # run, depending on dataset size — confirm this is intended.
        warmup_steps=1000,
        weight_decay=0.01,
        logging_dir=f'./logs_fold_{fold}',
        logging_steps=10,
        evaluation_strategy="steps",
        eval_steps=100,
        save_strategy="steps",
        save_steps=500,
        fp16=use_fp16,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics
    )

    # Train and evaluate the model; the accuracy recorded for the fold is the
    # one measured after the final training step.
    trainer.train()
    eval_result = trainer.evaluate()
    accuracy_list.append(eval_result['eval_accuracy'])

# Compute overall accuracy (mean of the per-fold accuracies).
# After the loop, `model`, `trainer`, `train_data` and `val_data` still refer
# to the objects of the last fold.
overall_accuracy = sum(accuracy_list) / len(accuracy_list)
print(f"Overall Accuracy: {overall_accuracy:.4f}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Training fold 1


Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,1.7625,1.746788,0.251042,0.323099,0.251042,0.189834
200,1.376,1.343789,0.433333,0.411332,0.433333,0.374762
300,1.169,1.181431,0.516667,0.527705,0.516667,0.50995
400,1.0212,1.142577,0.509375,0.501465,0.509375,0.493581
500,0.931,1.053807,0.557292,0.561923,0.557292,0.53489
600,1.0239,1.415723,0.439583,0.468503,0.439583,0.42582
700,1.0289,1.064568,0.533333,0.551986,0.533333,0.514536
800,0.8593,1.256282,0.494792,0.531045,0.494792,0.491594
900,0.9295,1.256602,0.501042,0.506913,0.501042,0.487639
1000,0.7951,1.172967,0.546875,0.57429,0.546875,0.531053


  _warn_prf(average, modifier, msg_start, len(result))


Training fold 2


Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,1.7476,1.72927,0.377083,0.328516,0.377083,0.282811
200,1.3866,1.348997,0.439583,0.369356,0.439583,0.377652
300,1.1596,1.187079,0.505208,0.499252,0.505208,0.491712
400,1.1568,1.232394,0.49375,0.513497,0.49375,0.494747
500,1.0671,1.120191,0.546875,0.563082,0.546875,0.547257
600,0.9458,1.137229,0.54375,0.557239,0.54375,0.53984
700,0.8912,1.118351,0.55625,0.554093,0.55625,0.554207
800,0.7134,1.250609,0.510417,0.529598,0.510417,0.506783
900,0.8104,1.322904,0.530208,0.535175,0.530208,0.523291
1000,0.7162,1.365992,0.534375,0.578464,0.534375,0.525254


  _warn_prf(average, modifier, msg_start, len(result))


Training fold 3


Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,1.7442,1.719622,0.36875,0.432593,0.36875,0.317798
200,1.4419,1.314427,0.488542,0.491187,0.488542,0.473017
300,1.1308,1.287582,0.442708,0.446885,0.442708,0.427753
400,1.1599,1.114265,0.508333,0.563427,0.508333,0.482687
500,0.9688,1.154742,0.508333,0.530888,0.508333,0.496286
600,0.8241,1.114434,0.540625,0.569739,0.540625,0.532539
700,0.9309,1.067625,0.544792,0.541336,0.544792,0.535099
800,0.669,1.141535,0.534375,0.550691,0.534375,0.527017
900,0.8777,1.198354,0.540625,0.561772,0.540625,0.529319
1000,0.5987,1.226343,0.553125,0.564675,0.553125,0.540784


  _warn_prf(average, modifier, msg_start, len(result))


Training fold 4


Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,1.743,1.720396,0.34375,0.251131,0.34375,0.241407
200,1.3815,1.341563,0.416667,0.408261,0.416667,0.351025
300,1.1873,1.235245,0.466667,0.478001,0.466667,0.459745
400,1.0157,1.118172,0.490625,0.470778,0.490625,0.446868
500,0.907,1.050945,0.542708,0.552827,0.542708,0.534028
600,0.9754,1.049733,0.546875,0.564542,0.546875,0.545171
700,0.9777,1.154662,0.521875,0.538877,0.521875,0.509854
800,0.6741,1.153913,0.54375,0.548572,0.54375,0.535031
900,1.0794,1.152477,0.555208,0.587959,0.555208,0.555205
1000,0.6631,1.416458,0.486458,0.528357,0.486458,0.485245


  _warn_prf(average, modifier, msg_start, len(result))


Training fold 5


Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,1.7426,1.72841,0.314583,0.281681,0.314583,0.219623
200,1.3784,1.341336,0.402083,0.321922,0.402083,0.30956
300,1.0612,1.231717,0.426042,0.439829,0.426042,0.391197
400,1.0893,1.101427,0.517708,0.492246,0.517708,0.488441
500,0.9154,1.078788,0.530208,0.520619,0.530208,0.506899
600,0.9538,1.052017,0.555208,0.566479,0.555208,0.556032
700,0.9826,1.08434,0.540625,0.561981,0.540625,0.529595
800,0.7941,1.111412,0.559375,0.56381,0.559375,0.555199
900,0.8467,1.184233,0.526042,0.580802,0.526042,0.507168
1000,0.801,1.218878,0.5125,0.5591,0.5125,0.518438


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Overall Accuracy: 0.5762


In [7]:
# Save the trained model and the tokenizer into the same directory.
# NOTE: assuming cells run in notebook order, `model` was last assigned inside
# the K-fold loop, so this stores the model of the final fold only (not an
# average over folds and not the best-scoring fold).
model.save_pretrained('./saved_model')
tokenizer.save_pretrained('./saved_model')

('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/sentencepiece.bpe.model',
 './saved_model/added_tokens.json')

Retraining using the extended dataset we generated with ChatGPT

In [27]:
# Load the extended dataset
full_data = pd.read_csv('combined_random_french_sentences.csv')

# Encode the difficulty labels as integers. The encoder is re-fitted on this
# dataset, so it is the one that must be used to decode this model's predictions.
label_encoder = LabelEncoder()
full_data['encoded_labels'] = label_encoder.fit_transform(full_data['difficulty'])

# Load the tokenizer (once; a second identical load only repeated the work)
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

# Load a fresh pre-trained CamemBERT model; the classification head is sized
# from the classes found in the data instead of a hard-coded count.
model = CamembertForSequenceClassification.from_pretrained(
    'camembert-base',
    num_labels=len(label_encoder.classes_),
)

# Creating the dataset object using the same tokenizer and configurations as before
full_dataset = CustomDataset(full_data, tokenizer)

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [28]:
# Training configuration for the run on the extended dataset: evaluation is
# disabled, a checkpoint is written every epoch and at most two are kept.
training_args = TrainingArguments(
    output_dir='./results_full',
    learning_rate=0.00015,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    warmup_steps=1000,
    weight_decay=0.01,
    logging_dir='./logs_full',
    logging_steps=10,
    evaluation_strategy="no",
    save_strategy="epoch",
    save_total_limit=2,
    # fp16 mixed precision needs a CUDA device; use full precision otherwise
    # so the cell does not fail on a runtime without one.
    fp16=torch.cuda.is_available(),
)

In [29]:
from transformers import Trainer

# Build a Trainer over the full dataset. No metrics function is supplied
# (compute_metrics keeps its default of None).
trainer = Trainer(model=model, args=training_args, train_dataset=full_dataset)

# Fine-tune; the value returned by train() is shown as the cell output
trainer.train()


Step,Training Loss
10,1.789
20,1.7936
30,1.788
40,1.7879
50,1.7783
60,1.774
70,1.7521
80,1.7298
90,1.7033
100,1.6891


TrainOutput(global_step=3065, training_loss=0.4586385465173892, metrics={'train_runtime': 509.0747, 'train_samples_per_second': 96.253, 'train_steps_per_second': 6.021, 'total_flos': 3223226184192000.0, 'train_loss': 0.4586385465173892, 'epoch': 5.0})

In [30]:
# Directory for the model trained on the extended dataset; the prediction cell
# below reloads both the model and the tokenizer from this path.
model_path_full = "./fine_tuned_model_full"
# Save model and tokenizer side by side so the directory is self-contained.
model.save_pretrained(model_path_full)
tokenizer.save_pretrained(model_path_full)

('./fine_tuned_model_full/tokenizer_config.json',
 './fine_tuned_model_full/special_tokens_map.json',
 './fine_tuned_model_full/sentencepiece.bpe.model',
 './fine_tuned_model_full/added_tokens.json')

In [31]:
from transformers import pipeline

# Load the unlabelled data
unlabelled_data = pd.read_csv('unlabelled_test_data.csv')

# Load the newly trained full model and tokenizer for predictions
model = CamembertForSequenceClassification.from_pretrained(model_path_full)
tokenizer = CamembertTokenizer.from_pretrained(model_path_full)

# Prediction pipeline; run on the first GPU when one is available, else on CPU
classifier = pipeline(
    'text-classification',
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

# Predict on new data. Sentences are cast to str and truncated to 128 tokens
# to mirror the preprocessing applied by CustomDataset during training; without
# truncation an over-long sentence would exceed the model's maximum input size.
sentences = unlabelled_data['sentence'].astype(str).tolist()
predictions = classifier(sentences, truncation=True, max_length=128)

# Map the pipeline's label names back to class ids through the model config
# (no dependence on the "LABEL_<n>" string format), then decode all ids in a
# single call. `label_encoder` must be the encoder fitted on the data this
# model was trained on.
class_ids = [model.config.label2id[pred['label']] for pred in predictions]
predicted_labels = label_encoder.inverse_transform(class_ids)

# Create DataFrame and save
results_df = pd.DataFrame({
    'id': unlabelled_data['id'],
    'difficulty': predicted_labels
})
results_df.to_csv('predicted_difficulties_full.csv', index=False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Retrain a fresh CamemBERT model on the full provided training dataset (training and validation rows combined)

In [17]:
# Combine training and validation data.
# NOTE: `train_data` and `val_data` come from earlier cells (the hold-out split
# or the last K-fold iteration); in both cases the two parts together contain
# every row of the provided training set.
combined_data = pd.concat([train_data, val_data])

# Size the classification head from the encoded labels (ids run from 0 to
# max) instead of hard-coding the number of classes.
num_labels = int(combined_data['encoded_labels'].max()) + 1

# Load CamemBERT model pre-trained
model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=num_labels)

# Load the tokenizer
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

# Create a new dataset object with the entire data
final_dataset = CustomDataset(combined_data, tokenizer)

# Modify training arguments: no evaluation (there is no held-out data left),
# checkpoints every 500 steps.
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=0.00015,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    warmup_steps=1000,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="no",
    save_strategy="steps",
    save_steps=500,
    # fp16 mixed precision needs a CUDA device; use full precision otherwise.
    fp16=torch.cuda.is_available(),
    )

# Re-initialize and train the Trainer with the new combined dataset
final_trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=final_dataset,
    compute_metrics=None
)

# Retrain the model on the whole dataset
final_trainer.train()

# Save the final trained model and tokenizer
model.save_pretrained('./final_model')
tokenizer.save_pretrained('./final_model')

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Step,Training Loss
10,1.7935
20,1.7925
30,1.7887
40,1.7923
50,1.783
60,1.7839
70,1.7802
80,1.7701
90,1.7542
100,1.7463


('./final_model/tokenizer_config.json',
 './final_model/special_tokens_map.json',
 './final_model/sentencepiece.bpe.model',
 './final_model/added_tokens.json')

In [18]:
import pandas as pd
from transformers import pipeline

# Load the unlabelled data
unlabelled_data = pd.read_csv('unlabelled_test_data.csv')

# Load the saved model and tokenizer
model_path = './final_model'
model = CamembertForSequenceClassification.from_pretrained(model_path)
tokenizer = CamembertTokenizer.from_pretrained(model_path)

# Load the LabelEncoder written by the data-loading cell of this notebook.
# joblib files are pickle-based: only load files you created yourself.
label_encoder = joblib.load('label_encoder.pkl')

# Create a prediction pipeline; use the first GPU when one is available and
# fall back to CPU instead of failing on a runtime without CUDA.
classifier = pipeline(
    'text-classification',
    model=model,
    tokenizer=tokenizer,
    framework='pt',
    device=0 if torch.cuda.is_available() else -1,
)

# Predict labels for the unlabelled data. Sentences are cast to str and
# truncated to 128 tokens to mirror the preprocessing applied by CustomDataset
# during training; without truncation an over-long sentence would exceed the
# model's maximum input size.
sentences = unlabelled_data['sentence'].astype(str).tolist()
predictions = classifier(sentences, truncation=True, max_length=128)

# Decode the numeric labels to original labels using the loaded LabelEncoder.
# Class ids are looked up through the model config (no dependence on the
# "LABEL_<n>" string format) and decoded in a single call.
class_ids = [model.config.label2id[pred['label']] for pred in predictions]
predicted_labels = label_encoder.inverse_transform(class_ids)

# Create a DataFrame to export
results_df = pd.DataFrame({
    'id': unlabelled_data['id'],
    'difficulty': predicted_labels
})

# Save the results to a new CSV file
results_df.to_csv('predicted_difficulties.csv', index=False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
