<a href="https://colab.research.google.com/github/RebeccaKessler/Machine_Learning/blob/main/Codes/CamemBert_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sentencepiece
!pip install accelerate -U
!pip install pandas numpy matplotlib
!pip install scikit-learn seaborn
!pip install optuna

Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.w

## Import Packages

In [None]:
from transformers import Trainer, TrainingArguments, CamembertTokenizer, CamembertForSequenceClassification, CamembertConfig
import torch
from torch.utils.data import Dataset
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import pipeline
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import pipeline
import optuna

## Define Functions

In [None]:
# Define compute metrics function
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision / recall / F1 for an evaluation pass.

    Args:
        eval_pred: A pair ``(logits, labels)``. ``logits`` holds one score per
            class on its last axis; ``labels`` holds the true class ids.

    Returns:
        dict with the keys ``'accuracy'``, ``'precision'``, ``'recall'`` and
        ``'f1'``. Precision, recall and F1 are support-weighted averages over
        the classes.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest score on the last axis.
    predictions = logits.argmax(-1)
    accuracy = accuracy_score(labels, predictions)
    # zero_division=0 keeps the values scikit-learn already returns for classes
    # that are never predicted (0.0) but stops it from emitting an
    # UndefinedMetricWarning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted', zero_division=0
    )
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [None]:
 # Define the dataset class that performs the pre-processing (tokenization)
 class CustomDataset(Dataset):
     """Dataset that tokenizes one DataFrame row per item, on access.

     Args:
         data: DataFrame holding the text and the integer-encoded label.
         tokenizer: Callable tokenizer; it is invoked with ``max_length``,
             ``padding='max_length'``, ``truncation=True`` and
             ``return_tensors='pt'`` and must return a mapping with
             ``'input_ids'`` and ``'attention_mask'`` tensors.
         max_length: Length every sequence is padded / truncated to.
         text_column: Name of the column that holds the text.
         label_column: Name of the column that holds the integer label.
     """

     def __init__(self, data, tokenizer, max_length=128,
                  text_column='sentence', label_column='encoded_labels'):
         self.data = data
         self.tokenizer = tokenizer
         self.max_length = max_length
         self.text_column = text_column
         self.label_column = label_column

     def __len__(self):
         """Return the number of rows in the wrapped DataFrame."""
         return len(self.data)

     def __getitem__(self, idx):
         """Return the tokenized sentence and label found at position ``idx``.

         Returns:
             dict with ``'input_ids'`` and ``'attention_mask'`` (flattened to
             1-D tensors) and ``'labels'`` (scalar ``torch.long`` tensor).
         """
         # Positional lookup, done once per item instead of once per column.
         row = self.data.iloc[idx]
         sentence = str(row[self.text_column])
         label = int(row[self.label_column])

         # Calling the tokenizer directly is the supported replacement for the
         # deprecated encode_plus method; the arguments are unchanged.
         encoding = self.tokenizer(
             sentence,
             max_length=self.max_length,
             padding='max_length',
             truncation=True,
             return_tensors='pt'
         )

         return {
             # return_tensors='pt' adds a leading batch axis; flatten removes it.
             'input_ids': encoding['input_ids'].flatten(),
             'attention_mask': encoding['attention_mask'].flatten(),
             'labels': torch.tensor(label, dtype=torch.long)
         }

## Hyperparameter Optimization

In [None]:
# Load the data
# Reads the labelled training CSV from the project's GitHub repository into
# `data`; later cells use its 'sentence' and 'difficulty' columns.
url = 'https://raw.githubusercontent.com/RebeccaKessler/Machine_Learning/main/training_data.csv'
data = pd.read_csv(url)

In [None]:
# Apply LabelEncoder to labels
# Each distinct value of 'difficulty' is mapped to an integer id, stored in the
# new 'encoded_labels' column; `label_encoder` is kept to map ids back later.
label_encoder = LabelEncoder()
data['encoded_labels'] = label_encoder.fit_transform(data['difficulty'])

# Split the dataset into training/validation and final test set
# 20% of the rows are held out as the test set; the split is stratified on the
# encoded label and uses a fixed random_state so it is the same on every run.
train_val_data, test_data = train_test_split(data, test_size=0.2, stratify=data['encoded_labels'], random_state=42)

In [None]:
# Load the tokenizer
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

# Define objective function for hyperparameter optimization with Optuna
def objective(trial):
    """Fine-tune CamemBERT once with trial-sampled hyperparameters and score it.

    Reads the notebook-level ``train_val_data`` and ``tokenizer``.

    Args:
        trial: Optuna trial used to sample ``learning_rate``,
            ``per_device_train_batch_size`` and ``num_train_epochs``.

    Returns:
        The ``eval_accuracy`` reported by ``trainer.evaluate()`` on the
        validation split after training has finished.
    """
    # Define hyperparameter search space.
    # suggest_float(..., log=True) is the supported replacement for the
    # deprecated suggest_loguniform; the bounds are unchanged
    # (1e-4 == 10e-5, 2e-4 == 20e-5).
    learning_rate = trial.suggest_float("learning_rate", 1e-4, 2e-4, log=True)
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32])
    num_train_epochs = trial.suggest_int("num_train_epochs", 3, 7)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=num_train_epochs,
        learning_rate=learning_rate,
        per_device_train_batch_size=per_device_train_batch_size,
        logging_dir='./logs',
        logging_steps=10,
        # NOTE(review): a fixed 1000-step warm-up may be longer than the whole
        # run for large batch sizes / few epochs, in which case the sampled
        # learning rate is never reached -- confirm, or consider warmup_ratio.
        warmup_steps=1000,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        # Checkpoints are never reloaded by this function, so do not write one
        # per epoch for every trial into the shared output directory.
        save_strategy="no",
        # Mixed precision is only usable on CUDA; fall back to fp32 elsewhere.
        fp16=torch.cuda.is_available()
    )


    # Split the training and validation sets from the training data
    # (fixed random_state: every trial is scored on the same validation rows)
    train_data, val_data = train_test_split(train_val_data, test_size=0.2, stratify=train_val_data['encoded_labels'], random_state=42)

    # Tokenize datasets
    train_dataset = CustomDataset(train_data, tokenizer)
    eval_dataset = CustomDataset(val_data, tokenizer)

    # Seed torch's global RNG before loading so the newly initialised
    # classification head starts from the same weights in every trial;
    # otherwise trials differ in initialisation as well as in hyperparameters.
    torch.manual_seed(42)

    # Load the pre-trained model
    model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=6)

    # Initialize and define the trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics
    )

    # Train and evaluate the model
    trainer.train()
    eval_result = trainer.evaluate()
    return eval_result["eval_accuracy"]

# Create and optimize the Optuna study
# TPE is Optuna's default sampler; seeding it makes the sequence of suggested
# hyperparameters reproducible from one run of the notebook to the next.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=25)

# Extract the best hyperparameters
best_params = study.best_trial.params

# Print the best combination of parameters
print(f"Best trial accuracy: {study.best_trial.value}")
print(f"Best parameters: {study.best_trial.params}")

# Training configuration for the final fit, built from the tuned hyperparameters
best_training_args = TrainingArguments(
    # output locations
    output_dir='./best_results',
    logging_dir='./best_logs',
    # values selected by the Optuna study
    num_train_epochs=best_params['num_train_epochs'],
    per_device_train_batch_size=best_params['per_device_train_batch_size'],
    learning_rate=best_params['learning_rate'],
    # fixed optimisation settings
    weight_decay=0.01,
    warmup_steps=1000,
    fp16=True,
    # logging / evaluation / checkpoint cadence
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

# Wrap the train+validation rows and the held-out test rows for tokenization
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
train_dataset, test_dataset = CustomDataset(train_val_data, tokenizer), CustomDataset(test_data, tokenizer)

# Start again from the pre-trained checkpoint
model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=6)

# Trainer for the final fit; the test set doubles as the per-epoch evaluation set
trainer = Trainer(
    model=model, args=best_training_args,
    train_dataset=train_dataset, eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Fit with the best hyperparameters, then score the held-out test set
trainer.train()
test_result = trainer.evaluate(test_dataset)

# Report the test metrics
print(f"Test accuracy: {test_result['eval_accuracy']}")
print(f"Test loss: {test_result['eval_loss']}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.40M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[I 2024-05-23 08:43:04,882] A new study created in memory with name: no-name-e65c78a5-4ccd-4be6-8c40-8d6f46c4b7d7
  learning_rate = trial.suggest_loguniform("learning_rate", 10e-5, 20e-5)


model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2296,1.171459,0.488281,0.511476,0.488281,0.476334
2,1.2298,1.140524,0.484375,0.506924,0.484375,0.459022
3,1.2234,1.488401,0.425781,0.353819,0.425781,0.373075
4,0.9429,1.145452,0.53125,0.546917,0.53125,0.532119


  _warn_prf(average, modifier, msg_start, len(result))


[I 2024-05-23 08:47:07,165] Trial 0 finished with value: 0.53125 and parameters: {'learning_rate': 0.000191567736651308, 'per_device_train_batch_size': 8, 'num_train_epochs': 4}. Best is trial 0 with value: 0.53125.
  learning_rate = trial.suggest_loguniform("learning_rate", 10e-5, 20e-5)
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.212,1.189257,0.520833,0.52538,0.520833,0.513904
2,1.425,1.209592,0.484375,0.51509,0.484375,0.465008
3,1.2305,1.371067,0.44401,0.426093,0.44401,0.390919
4,1.0836,1.287486,0.47526,0.489019,0.47526,0.444736
5,1.1785,1.417755,0.466146,0.511215,0.466146,0.466341
6,0.5537,1.420153,0.546875,0.563609,0.546875,0.5476
7,0.2835,1.718878,0.548177,0.563944,0.548177,0.550385


[I 2024-05-23 08:54:39,531] Trial 1 finished with value: 0.5481770833333334 and parameters: {'learning_rate': 0.00019018657617818227, 'per_device_train_batch_size': 8, 'num_train_epochs': 7}. Best is trial 1 with value: 0.5481770833333334.
  learning_rate = trial.suggest_loguniform("learning_rate", 10e-5, 20e-5)
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3059,1.353601,0.361979,0.419974,0.361979,0.295723
2,1.2007,1.117474,0.536458,0.542866,0.536458,0.534921
3,1.1143,1.258795,0.454427,0.480208,0.454427,0.443558
4,0.9467,1.134984,0.546875,0.551107,0.546875,0.542684
5,0.878,1.276667,0.549479,0.581009,0.549479,0.553982
6,0.4705,1.460112,0.548177,0.561733,0.548177,0.550092


[I 2024-05-23 09:01:19,882] Trial 2 finished with value: 0.5481770833333334 and parameters: {'learning_rate': 0.0001718827423525375, 'per_device_train_batch_size': 8, 'num_train_epochs': 6}. Best is trial 1 with value: 0.5481770833333334.
  learning_rate = trial.suggest_loguniform("learning_rate", 10e-5, 20e-5)
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3926,1.414889,0.38151,0.374157,0.38151,0.33131
2,1.1834,1.124683,0.523438,0.528716,0.523438,0.514435
3,0.9314,1.125369,0.514323,0.497949,0.514323,0.494151


[I 2024-05-23 09:04:34,289] Trial 3 finished with value: 0.5143229166666666 and parameters: {'learning_rate': 0.0001470459736756901, 'per_device_train_batch_size': 16, 'num_train_epochs': 3}. Best is trial 1 with value: 0.5481770833333334.
  learning_rate = trial.suggest_loguniform("learning_rate", 10e-5, 20e-5)
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7558,1.718001,0.423177,0.419075,0.423177,0.350419
2,1.3634,1.337304,0.447917,0.432551,0.447917,0.415287
3,1.1705,1.224859,0.463542,0.469743,0.463542,0.455871
4,0.8966,1.254222,0.460938,0.501674,0.460938,0.456405


[I 2024-05-23 09:09:18,338] Trial 4 finished with value: 0.4609375 and parameters: {'learning_rate': 0.00014228666698955732, 'per_device_train_batch_size': 32, 'num_train_epochs': 4}. Best is trial 1 with value: 0.5481770833333334.


Best trial accuracy: 0.5481770833333334
Best parameters: {'learning_rate': 0.00019018657617818227, 'per_device_train_batch_size': 8, 'num_train_epochs': 7}


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2206,1.214878,0.475,0.466016,0.475,0.457842
2,1.3902,1.248128,0.436458,0.345197,0.436458,0.360764
3,1.0031,1.205222,0.514583,0.536569,0.514583,0.499778
4,0.8215,1.250303,0.517708,0.517131,0.517708,0.513728
5,0.6794,1.39248,0.547917,0.558301,0.547917,0.539413
6,0.3242,1.697584,0.551042,0.559273,0.551042,0.552212
7,0.1509,1.966278,0.566667,0.586113,0.566667,0.571075


  _warn_prf(average, modifier, msg_start, len(result))


Test accuracy: 0.5666666666666667
Test loss: 1.966277837753296


## Fine-tune the CamemBERT Model (with optimized parameters)

In [None]:
# Load the data
# Re-reads the labelled training CSV and rebinds `data` to the fresh frame.
url = 'https://raw.githubusercontent.com/RebeccaKessler/Machine_Learning/main/training_data.csv'
data = pd.read_csv(url)

In [None]:
# Encode data
# Fit a new LabelEncoder on 'difficulty' and store the integer ids in the
# 'encoded_labels' column that CustomDataset reads.
label_encoder = LabelEncoder()
data['encoded_labels'] = label_encoder.fit_transform(data['difficulty'])

In [None]:
# Load the tokenizer
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

# Create empty lists for statistics
# (one value is appended to each list per fold)
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []


# K-Fold cross-validation
# Five stratified, shuffled folds with a fixed seed, so the fold assignment is
# the same on every run.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(kf.split(data, data['encoded_labels'])):
    # `test_data` is this fold's held-out part (the rows selected by val_idx).
    # NOTE: both names stay bound to the last fold's split after the loop ends.
    train_data = data.iloc[train_idx]
    test_data = data.iloc[val_idx]

    # Tokenize datasets
    train_dataset = CustomDataset(train_data, tokenizer)
    test_dataset = CustomDataset(test_data, tokenizer)

    # Load pre-trained model
    # (a fresh copy per fold, so no weights carry over between folds)
    model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=6)

    # Define training arguments
    # Outputs and logs go to per-fold directories; the held-out part is
    # evaluated every 100 steps and a checkpoint is written every 500 steps.
    training_args = TrainingArguments(
        output_dir=f'./results_fold_{fold}',
        learning_rate=0.00015,  # manually adjusted
        num_train_epochs=7,
        per_device_train_batch_size=16,
        warmup_steps=1000,
        weight_decay=0.02,  # weight decay increased to control overfitting
        logging_dir=f'./logs_fold_{fold}',
        logging_steps=10,
        evaluation_strategy="steps",
        eval_steps=100,
        save_strategy="steps",
        save_steps=500,
        fp16=True,
    )

    # Initialize and define trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics
    )

    # Train and evaluate the model
    # The recorded metrics come from the evaluate() call made after training
    # has finished, i.e. they describe the final state of the model.
    trainer.train()
    eval_result = trainer.evaluate()
    accuracy_list.append(eval_result['eval_accuracy'])
    precision_list.append(eval_result['eval_precision'])
    recall_list.append(eval_result['eval_recall'])
    f1_list.append(eval_result['eval_f1'])

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,1.7615,1.732193,0.394792,0.423927,0.394792,0.361972
200,1.3994,1.346618,0.416667,0.431343,0.416667,0.344877
300,1.187,1.182873,0.471875,0.467929,0.471875,0.464877
400,1.1204,1.120708,0.508333,0.498647,0.508333,0.477427
500,0.894,1.260015,0.465625,0.48474,0.465625,0.457919
600,1.0024,1.157585,0.5125,0.500446,0.5125,0.487288
700,1.0044,1.251707,0.497917,0.516105,0.497917,0.484788
800,0.9577,1.640252,0.455208,0.500307,0.455208,0.437991
900,0.8586,1.413117,0.507292,0.523554,0.507292,0.502523
1000,0.8532,1.294814,0.528125,0.534899,0.528125,0.518675


Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,1.7562,1.735944,0.325,0.261548,0.325,0.220452
200,1.3517,1.318539,0.486458,0.475052,0.486458,0.436722
300,1.1344,1.129538,0.5625,0.560605,0.5625,0.555625
400,1.0604,1.078498,0.542708,0.552866,0.542708,0.517471
500,1.0273,1.02982,0.583333,0.60754,0.583333,0.587413
600,0.9194,1.025854,0.578125,0.582881,0.578125,0.573433
700,1.0554,1.137699,0.507292,0.505512,0.507292,0.488782
800,0.8189,1.028609,0.58125,0.581273,0.58125,0.57854
900,0.9266,1.070752,0.585417,0.586854,0.585417,0.58022
1000,0.7728,1.162071,0.542708,0.57641,0.542708,0.53752


  _warn_prf(average, modifier, msg_start, len(result))


Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,1.7485,1.730184,0.346875,0.491938,0.346875,0.256051
200,1.3563,1.348517,0.4125,0.425292,0.4125,0.349985
300,1.1564,1.388955,0.385417,0.429374,0.385417,0.346332
400,1.2059,1.222035,0.465625,0.487483,0.465625,0.454852
500,0.9548,1.141804,0.507292,0.496066,0.507292,0.489342
600,1.0335,1.19247,0.5,0.498294,0.5,0.482043
700,0.8529,1.169837,0.510417,0.518592,0.510417,0.500988
800,0.7708,1.212782,0.525,0.542333,0.525,0.517352
900,0.9314,1.228996,0.539583,0.530976,0.539583,0.527594
1000,0.7549,1.484882,0.510417,0.51072,0.510417,0.500325


Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,1.7592,1.73995,0.360417,0.37756,0.360417,0.277469
200,1.4072,1.33927,0.442708,0.426927,0.442708,0.403478
300,1.2244,1.215503,0.483333,0.491042,0.483333,0.47883
400,1.0995,1.138976,0.4875,0.551207,0.4875,0.483728
500,0.9098,1.062393,0.552083,0.564707,0.552083,0.552624
600,0.9284,1.038775,0.551042,0.548342,0.551042,0.536471
700,0.9192,1.114752,0.510417,0.54006,0.510417,0.501299
800,0.9686,1.149027,0.526042,0.590328,0.526042,0.520523
900,0.8482,1.133928,0.54375,0.537257,0.54375,0.522601
1000,0.6879,1.200999,0.510417,0.580871,0.510417,0.505146


Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,1.7505,1.732699,0.414583,0.513511,0.414583,0.363501
200,1.3884,1.315899,0.46875,0.453964,0.46875,0.437149
300,1.2088,1.159067,0.525,0.520521,0.525,0.513986
400,1.0535,1.098582,0.546875,0.540853,0.546875,0.540612
500,0.9151,1.049877,0.54375,0.55469,0.54375,0.531808
600,0.8837,1.073721,0.535417,0.5334,0.535417,0.530446
700,1.0066,1.241957,0.469792,0.503051,0.469792,0.45886
800,0.7663,1.171304,0.553125,0.573099,0.553125,0.551977
900,0.7899,1.125248,0.570833,0.625019,0.570833,0.571724
1000,0.6298,1.218406,0.552083,0.592893,0.552083,0.556223


In [None]:
# Average every metric over the cross-validation folds (arithmetic mean)
overall_accuracy, overall_precision, overall_recall, overall_f1 = (
    sum(fold_scores) / len(fold_scores)
    for fold_scores in (accuracy_list, precision_list, recall_list, f1_list)
)

# Report the averages with four decimals
print(f"Overall Accuracy: {overall_accuracy:.4f}")
print(f"Overall Precision: {overall_precision:.4f}")
print(f"Overall Recall: {overall_recall:.4f}")
print(f"Overall F1 Score: {overall_f1:.4f}")

## Re-train on Full Dataset

In [None]:
# Use the complete labelled dataset for the final fit.
# This is taken straight from `data` instead of being re-assembled from
# train/validation frames left over from an earlier cell, so the cell no longer
# depends on which cells happened to run before it. The rows are the same set;
# only their order may differ.
combined_data = data.copy()

In [None]:
# Start from the pre-trained checkpoint with a six-class classification head
model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=6)

# Tokenizer that matches the checkpoint
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

# Wrap the combined data; sentences are tokenized when an item is accessed
final_dataset = CustomDataset(combined_data, tokenizer)

# Training configuration: no evaluation pass, one checkpoint every 500 steps
training_args = TrainingArguments(
    # output locations
    output_dir='./results',
    logging_dir='./logs',
    # optimisation settings
    num_train_epochs=7,
    per_device_train_batch_size=16,
    learning_rate=0.00015,
    weight_decay=0.02,
    warmup_steps=1000,
    fp16=True,
    # logging / evaluation / checkpoint cadence
    logging_steps=10,
    evaluation_strategy="no",
    save_strategy="steps",
    save_steps=500,
)

# Trainer without an evaluation set; compute_metrics is left at its default (None)
final_trainer = Trainer(model=model, args=training_args, train_dataset=final_dataset)

# Fit on the whole dataset
final_trainer.train()

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Step,Training Loss
10,1.7994
20,1.7907
30,1.7965
40,1.7918
50,1.7885
60,1.7826
70,1.7799
80,1.7822
90,1.7709
100,1.7544


Step,Training Loss
10,1.7994
20,1.7907
30,1.7965
40,1.7918
50,1.7885
60,1.7826
70,1.7799
80,1.7822
90,1.7709
100,1.7544


TrainOutput(global_step=2100, training_loss=0.774715739545368, metrics={'train_runtime': 365.7318, 'train_samples_per_second': 91.871, 'train_steps_per_second': 5.742, 'total_flos': 2210212240588800.0, 'train_loss': 0.774715739545368, 'epoch': 7.0})

In [None]:
# Save the final fine-tuned model and tokenizer
# Both are written to the same directory so that they can be reloaded together
# with from_pretrained('./final_model').
model.save_pretrained('./final_model')
tokenizer.save_pretrained('./final_model')

('./final_model/tokenizer_config.json',
 './final_model/special_tokens_map.json',
 './final_model/sentencepiece.bpe.model',
 './final_model/added_tokens.json')

## Make Predictions

In [None]:
# Load the unlabelled data
# The prediction cell reads the 'sentence' column as model input and copies the
# 'id' column into the results file.
url = 'https://raw.githubusercontent.com/RebeccaKessler/Machine_Learning/main/unlabelled_test_data.csv'
unlabelled_data = pd.read_csv(url)

In [None]:
# Load the saved fine-tuned model and tokenizer
# Reloading from disk rebinds `model` and `tokenizer` to the saved artefacts in
# './final_model'.
model_path = './final_model'
model = CamembertForSequenceClassification.from_pretrained(model_path)
tokenizer = CamembertTokenizer.from_pretrained(model_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Create a prediction pipeline (first GPU if one is available, otherwise CPU)
device = 0 if torch.cuda.is_available() else -1
classifier = pipeline('text-classification', model=model, tokenizer=tokenizer, framework='pt', device=device)

# Predict labels for the unlabelled data.
# Truncate to 128 tokens, the max_length CustomDataset uses by default during
# training, so over-long sentences are cut the same way at inference time
# instead of being fed to the model at full length.
predictions = classifier(unlabelled_data['sentence'].tolist(), truncation=True, max_length=128)

# Decode the numeric labels to original labels using the LabelEncoder.
# The class id is parsed from the text after the last '_' of each predicted
# label; all ids are then decoded with a single inverse_transform call rather
# than one call per prediction.
predicted_ids = [int(pred['label'].split('_')[-1]) for pred in predictions]
predicted_labels = label_encoder.inverse_transform(predicted_ids).tolist()

# Create a new DataFrame with predictions
results_df = pd.DataFrame({
    'id': unlabelled_data['id'],
    'difficulty': predicted_labels
})

# Save the results to a new CSV file
results_df.to_csv('predicted_difficulties.csv', index=False)

print("Predictions saved to 'predicted_difficulties.csv'")


Predictions saved to 'predicted_difficulties.csv'


## Re-train on Extended Dataset

In [None]:
# Load extended data set
# Reads the extended CSV from the project's GitHub repository into `full_data`;
# the following cells use its 'sentence' and 'difficulty' columns.
url = 'https://raw.githubusercontent.com/RebeccaKessler/Machine_Learning/main/combined_random_french_sentences.csv'
full_data = pd.read_csv(url)

In [None]:
# Apply LabelEncoder to labels
# NOTE: this rebinds `label_encoder` to an encoder fitted on the extended
# dataset; any later inverse_transform call uses this new mapping.
label_encoder = LabelEncoder()
full_data['encoded_labels'] = label_encoder.fit_transform(full_data['difficulty'])

In [None]:
# Start from the pre-trained checkpoint with a six-class classification head
model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=6)

# Tokenizer that matches the checkpoint
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

# Wrap the extended data; sentences are tokenized when an item is accessed
full_dataset = CustomDataset(full_data, tokenizer)

# Training configuration: no evaluation pass, one checkpoint every 500 steps
training_args = TrainingArguments(
    # output locations
    output_dir='./results',
    logging_dir='./logs',
    # optimisation settings
    num_train_epochs=7,
    per_device_train_batch_size=16,
    learning_rate=0.00015,
    weight_decay=0.02,
    warmup_steps=1000,
    fp16=True,
    # logging / evaluation / checkpoint cadence
    logging_steps=10,
    evaluation_strategy='no',
    save_strategy="steps",
    save_steps=500,
)

# Trainer without an evaluation set; compute_metrics is left at its default (None)
final_trainer = Trainer(model=model, args=training_args, train_dataset=full_dataset)

# Fit on the extended dataset
final_trainer.train()


Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Step,Training Loss
10,1.7909
20,1.7934
30,1.7926
40,1.7895
50,1.7803
60,1.7822
70,1.7603
80,1.7481
90,1.7281
100,1.7163


In [None]:
# Save the model and the tokenizer trained on the extended dataset.
# Both go to the same directory so they can be reloaded together from
# `model_path_full`.
model_path_full = "./fine_tuned_model_full"
model.save_pretrained(model_path_full)
tokenizer.save_pretrained(model_path_full)

## Make Predictions

In [None]:
# Load the unlabelled data
# Re-reads the unlabelled CSV; the 'sentence' column is the model input and the
# 'id' column is copied into the results file.
url = 'https://raw.githubusercontent.com/RebeccaKessler/Machine_Learning/main/unlabelled_test_data.csv'
unlabelled_data = pd.read_csv(url)

In [None]:
# Load the fine-tuned model and tokenizer for predictions
model = CamembertForSequenceClassification.from_pretrained(model_path_full)
tokenizer = CamembertTokenizer.from_pretrained(model_path_full)

# Define prediction pipeline.
# Run on the first GPU when one is available (otherwise CPU), matching the
# device handling of the other prediction pipeline in this notebook.
classifier = pipeline(
    'text-classification',
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

# Predict the labels of the unlabelled data.
# Truncate to 128 tokens, the max_length CustomDataset uses by default during
# training, so over-long sentences are cut the same way at inference time.
predictions = classifier(unlabelled_data['sentence'].tolist(), truncation=True, max_length=128)

# The class id is parsed from the text after the last '_' of each predicted
# label; all ids are decoded with a single inverse_transform call on the
# LabelEncoder currently bound to `label_encoder`.
predicted_ids = [int(pred['label'].split('_')[-1]) for pred in predictions]
predicted_labels = label_encoder.inverse_transform(predicted_ids).tolist()

# Create a new dataframe with predictions
results_df = pd.DataFrame({
    'id': unlabelled_data['id'],
    'difficulty': predicted_labels
})
results_df.to_csv('predicted_difficulties_full.csv', index=False)