<a href="https://colab.research.google.com/github/RebeccaKessler/Machine_Learning/blob/main/Codes/Flaubert_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sentencepiece
!pip install accelerate -U
!pip install optuna
!pip install pandas numpy matplotlib
!pip install scikit-learn seaborn
!pip install sacremoses

Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.w

In [None]:
#import packages
import torch
from torch.utils.data import Dataset
import pandas as pd
from transformers import CamembertConfig, CamembertForSequenceClassification, CamembertTokenizer, Trainer, TrainingArguments, FlaubertTokenizer, FlaubertForSequenceClassification, Trainer, TrainingArguments, FlaubertModel, FlaubertTokenizer
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import StratifiedKFold
from transformers import pipeline
import optuna

## Define functions

In [None]:
# Define compute metrics function
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation run.

    Args:
        eval_pred: pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over its last axis.

    Returns:
        dict with the keys ``'accuracy'``, ``'precision'``, ``'recall'`` and
        ``'f1'``. Precision, recall and F1 are support-weighted averages over
        the classes.
    """
    logits, labels = eval_pred
    predictions = logits.argmax(-1)
    accuracy = accuracy_score(labels, predictions)
    # zero_division=0 yields the same numbers as the default ("warn") but does
    # not emit an UndefinedMetricWarning when a class is never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted', zero_division=0
    )
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [None]:
 # Define the dataset class that tokenizes one sentence per item
 class CustomDataset(Dataset):
    """Dataset that tokenizes the sentences of a DataFrame on access.

    Args:
        data: DataFrame with a 'sentence' column and an 'encoded_labels'
            column; rows are read by position with ``.iloc``.
        tokenizer: tokenizer object that is called with the sentence and the
            ``max_length`` / ``padding`` / ``truncation`` / ``return_tensors``
            keyword arguments.
        max_length: length every sentence is padded or truncated to.
    """

    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized sentence and label stored at position ``idx``.

        Returns:
            dict with 1-D tensors ``'input_ids'`` and ``'attention_mask'`` and
            a scalar ``torch.long`` tensor ``'labels'``.
        """
        # Fetch the row once instead of once per column.
        row = self.data.iloc[idx]
        sentence = str(row['sentence'])
        label = int(row['encoded_labels'])

        # Call the tokenizer directly: `encode_plus` is deprecated in favour
        # of `__call__`, which takes the same arguments.
        encoding = self.tokenizer(
            sentence,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            # flatten() turns the tokenizer output into one 1-D tensor per item
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

## Hyperparameter Optimization

In [None]:
# Load the data
# Reads the training CSV from the project's GitHub repository into `data`.
# Later cells use its 'sentence' and 'difficulty' columns.
url = 'https://raw.githubusercontent.com/RebeccaKessler/Machine_Learning/main/training_data.csv'
data = pd.read_csv(url)

In [None]:
# Map the 'difficulty' values to integer class ids, stored in 'encoded_labels'
label_encoder = LabelEncoder().fit(data['difficulty'])
data['encoded_labels'] = label_encoder.transform(data['difficulty'])

# Hold out a stratified 20% of the rows as the final test set; the other 80%
# is the pool used for training and validation
train_val_data, test_data = train_test_split(
    data,
    stratify=data['encoded_labels'],
    test_size=0.2,
    random_state=42,
)


In [None]:
# Load the tokenizer
# Loaded once here; `objective` below reads this module-level `tokenizer`
# on every trial.
tokenizer = FlaubertTokenizer.from_pretrained('flaubert/flaubert_base_cased')

# Define objective function for the hyperoptimization
def objective(trial):
    """Optuna objective: fine-tune one classifier and return its accuracy.

    Samples a learning rate, a per-device train batch size and a number of
    epochs from `trial`, trains on the training part of a stratified 80/20
    split of `train_val_data`, and evaluates on the validation part.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The ``eval_accuracy`` value reported by ``trainer.evaluate()`` after
        training.
    """
    # Define hyperparameter search space
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform;
    # the range (1e-5 to 1e-4, log scale) is unchanged.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-4, log=True)
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32])
    num_train_epochs = trial.suggest_int("num_train_epochs", 3, 7)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=num_train_epochs,
        learning_rate=learning_rate,
        per_device_train_batch_size=per_device_train_batch_size,
        logging_dir='./logs',
        logging_steps=10,
        warmup_steps=500,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        # The search never reloads a checkpoint, so none are written; saving
        # one per epoch for every trial only fills up the disk.
        save_strategy="no",
        fp16=True
    )

    # Split the dataset into training and validation sets (80/20)
    # random_state is fixed, so every trial is scored on the same split.
    train_data, val_data = train_test_split(
        train_val_data,
        test_size=0.2,
        stratify=train_val_data['encoded_labels'],
        random_state=42
    )

    # Tokenize datasets
    train_dataset = CustomDataset(train_data, tokenizer)
    eval_dataset = CustomDataset(val_data, tokenizer)

    # Load pre-trained model (a fresh copy for every trial)
    model = FlaubertForSequenceClassification.from_pretrained('flaubert/flaubert_base_cased', num_labels=6)

    # Define and initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics
    )

    # Train and evaluate
    trainer.train()
    eval_result = trainer.evaluate()
    return eval_result["eval_accuracy"]

# Create and optimize the Optuna study
# The sampler is seeded so that the sequence of suggested hyperparameters is
# reproducible from one run of the notebook to the next.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42)
)
study.optimize(objective, n_trials=25)

# Extract the best hyperparameters
best_params = study.best_trial.params

# Print the best combination of parameters
print(f"Best trial accuracy: {study.best_trial.value}")
print(f"Best parameters: {study.best_trial.params}")

# Define the best training arguments using the best hyperparameters
best_training_args = TrainingArguments(
    output_dir='./best_results',
    num_train_epochs=best_params['num_train_epochs'],
    learning_rate=best_params['learning_rate'],
    per_device_train_batch_size=best_params['per_device_train_batch_size'],
    logging_dir='./best_logs',
    logging_steps=10,
    warmup_steps=500,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    fp16=True
)

# Tokenize the full training dataset
# The tokenizer loaded at the top of this cell is reused; loading it a second
# time would return the same tokenizer.
train_dataset = CustomDataset(train_val_data, tokenizer)
test_dataset = CustomDataset(test_data, tokenizer)

# Load the pre-trained model
model = FlaubertForSequenceClassification.from_pretrained('flaubert/flaubert_base_cased', num_labels=6)

# Initialize the trainer with the best hyperparameters
# The held-out test set is only used for reporting here; no model selection
# is configured in `best_training_args`.
trainer = Trainer(
    model=model,
    args=best_training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Train the model with the best hyperparameters
trainer.train()

# Evaluate on the test dataset
test_result = trainer.evaluate(test_dataset)

# Print the evaluation results
print(f"Test accuracy: {test_result['eval_accuracy']}")
print(f"Test loss: {test_result['eval_loss']}")

[I 2024-05-22 18:44:02,191] A new study created in memory with name: no-name-39ba26e6-7258-49e7-bf22-8752f4e6856e
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 10e-5)
Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_base_cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.787,1.712246,0.238281,0.226435,0.238281,0.214672
2,1.5572,1.553309,0.325521,0.340766,0.325521,0.290583
3,1.4661,1.369633,0.429688,0.426299,0.429688,0.407252
4,1.1143,1.2253,0.488281,0.516617,0.488281,0.487109


[I 2024-05-22 18:46:59,154] Trial 0 finished with value: 0.48828125 and parameters: {'learning_rate': 1.945527124650321e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 4}. Best is trial 0 with value: 0.48828125.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 10e-5)
Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_base_cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6085,1.53465,0.339844,0.335105,0.339844,0.329
2,1.356,1.289185,0.451823,0.501637,0.451823,0.451836
3,1.0503,1.122532,0.50651,0.525393,0.50651,0.507696


[I 2024-05-22 18:49:38,363] Trial 1 finished with value: 0.5065104166666666 and parameters: {'learning_rate': 2.358174046185184e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 3}. Best is trial 1 with value: 0.5065104166666666.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 10e-5)
Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_base_cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6681,1.643159,0.303385,0.296004,0.303385,0.278812
2,1.5076,1.354641,0.420573,0.454374,0.420573,0.395692
3,1.1015,1.131903,0.5,0.508836,0.5,0.480716
4,0.8332,1.128642,0.53125,0.549748,0.53125,0.527358
5,0.8097,1.079497,0.566406,0.570231,0.566406,0.566672


[I 2024-05-22 18:53:57,633] Trial 2 finished with value: 0.56640625 and parameters: {'learning_rate': 2.2075355727268254e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 5}. Best is trial 2 with value: 0.56640625.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 10e-5)
Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_base_cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6435,1.648632,0.307292,0.29779,0.307292,0.298056
2,1.4434,1.437122,0.391927,0.409508,0.391927,0.376414
3,1.1875,1.137735,0.503906,0.520567,0.503906,0.488805
4,0.9489,1.102865,0.548177,0.564737,0.548177,0.549791
5,0.9588,1.04265,0.578125,0.583823,0.578125,0.576897
6,0.7935,1.119884,0.557292,0.563754,0.557292,0.557353
7,0.6273,1.155784,0.561198,0.568204,0.561198,0.560681


[I 2024-05-22 18:59:49,524] Trial 3 finished with value: 0.5611979166666666 and parameters: {'learning_rate': 1.3217028971201137e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 7}. Best is trial 2 with value: 0.56640625.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 10e-5)
Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_base_cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2903,1.267247,0.458333,0.514825,0.458333,0.433914
2,1.0494,1.195323,0.49349,0.496285,0.49349,0.485077
3,0.9891,1.140626,0.529948,0.533142,0.529948,0.502604
4,0.8093,1.300826,0.545573,0.593108,0.545573,0.524057
5,0.7492,1.65703,0.542969,0.572267,0.542969,0.544903
6,0.311,1.851874,0.574219,0.579756,0.574219,0.574656
7,0.2504,2.129099,0.606771,0.613325,0.606771,0.607776


[I 2024-05-22 19:07:42,361] Trial 4 finished with value: 0.6067708333333334 and parameters: {'learning_rate': 4.0496777151282306e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 7}. Best is trial 4 with value: 0.6067708333333334.


Best trial accuracy: 0.6067708333333334
Best parameters: {'learning_rate': 4.0496777151282306e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 7}


Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_base_cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4781,1.296681,0.39375,0.494532,0.39375,0.369915
2,1.1996,1.179287,0.479167,0.522648,0.479167,0.4495


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4781,1.296681,0.39375,0.494532,0.39375,0.369915
2,1.1996,1.179287,0.479167,0.522648,0.479167,0.4495
3,0.9495,1.191887,0.54375,0.549306,0.54375,0.540155
4,0.5869,1.246589,0.579167,0.583504,0.579167,0.578261
5,0.4144,1.852916,0.576042,0.58994,0.576042,0.57497
6,0.2648,2.411003,0.580208,0.578724,0.580208,0.578561
7,0.0649,2.68153,0.588542,0.594998,0.588542,0.589763


Training accuracy: 0.9872395833333333
Training loss: 0.04846487194299698
Test accuracy: 0.5885416666666666
Test loss: 2.6815295219421387


## Fine-tune the Flaubert model (with optimized parameters)

In [None]:
# Load the data
# Re-reads the training CSV; `data` is replaced by a fresh DataFrame, so the
# 'encoded_labels' column has to be added again by the next cell.
url = 'https://raw.githubusercontent.com/RebeccaKessler/Machine_Learning/main/training_data.csv'
data = pd.read_csv(url)

In [None]:
# Encode data
# Fits a new LabelEncoder on the 'difficulty' column and stores the resulting
# integer class ids in 'encoded_labels'.
label_encoder = LabelEncoder()
data['encoded_labels'] = label_encoder.fit_transform(data['difficulty'])

In [None]:
# Define the tokenizer
tokenizer = FlaubertTokenizer.from_pretrained('flaubert/flaubert_base_cased')

# Create empty lists for statistics (one entry is appended per fold)
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

# K-Fold cross-validation (5 stratified, shuffled folds)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(kf.split(data, data['encoded_labels'])):
    print(f"Training fold {fold + 1}")
    # `test_data` holds the rows held out for validation in this fold
    train_data = data.iloc[train_idx]
    test_data = data.iloc[val_idx]

    # Tokenize data
    train_dataset = CustomDataset(train_data, tokenizer)
    eval_dataset = CustomDataset(test_data, tokenizer)

    # Set up the model (a fresh pre-trained copy for every fold)
    model = FlaubertForSequenceClassification.from_pretrained('flaubert/flaubert_base_cased', num_labels=6)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=f'./results_fold_{fold}',
        learning_rate=0.00005, #manually adjusted
        num_train_epochs=7,
        per_device_train_batch_size=16,
        warmup_steps=500,
        weight_decay=0.05, #increase in weight_decay to control for overfitting
        logging_dir=f'./logs_fold_{fold}',
        logging_steps=10,
        evaluation_strategy="steps",
        eval_steps=100,
        save_strategy="steps",
        save_steps=500,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        greater_is_better=True,
    )

    #  Define and initialize trainer
    # Evaluate on this fold's held-out rows (`eval_dataset`). Passing a dataset
    # left over from another cell would score the model on rows that are part
    # of this fold's training data.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # Train and evaluate
    trainer.train()
    eval_result = trainer.evaluate()
    accuracy_list.append(eval_result['eval_accuracy'])
    precision_list.append(eval_result['eval_precision'])
    recall_list.append(eval_result['eval_recall'])
    f1_list.append(eval_result['eval_f1'])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.56M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/896k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.50k [00:00<?, ?B/s]

Training fold 1


model.safetensors:   0%|          | 0.00/553M [00:00<?, ?B/s]

Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_base_cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,1.7159,1.647942,0.28125,0.228929,0.28125,0.227875
200,1.4713,1.359013,0.446875,0.470667,0.446875,0.438528
300,1.2921,1.153873,0.508333,0.538362,0.508333,0.507524
400,1.1019,1.093724,0.5375,0.539147,0.5375,0.536495
500,0.8992,1.105448,0.536458,0.538425,0.536458,0.534242
600,1.1096,1.094859,0.504167,0.555526,0.504167,0.497031
700,1.0203,1.077677,0.542708,0.582121,0.542708,0.541111
800,0.7107,1.317592,0.508333,0.52926,0.508333,0.489216
900,0.6619,1.164242,0.547917,0.56531,0.547917,0.548464
1000,0.4383,1.527905,0.585417,0.599042,0.585417,0.582527


Training fold 2


Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_base_cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,1.7596,1.662878,0.271875,0.330251,0.271875,0.216066
200,1.5915,1.518087,0.364583,0.331075,0.364583,0.324439
300,1.3356,1.287375,0.4375,0.447667,0.4375,0.39253
400,1.2892,1.302755,0.461458,0.471538,0.461458,0.414483
500,1.2033,1.253181,0.459375,0.456832,0.459375,0.439384
600,1.0537,1.042088,0.548958,0.564404,0.548958,0.548062
700,1.0765,1.022714,0.547917,0.575025,0.547917,0.49895
800,0.7595,1.001837,0.598958,0.600352,0.598958,0.595633
900,0.9013,1.066564,0.585417,0.595775,0.585417,0.583555
1000,0.6644,1.196671,0.577083,0.613362,0.577083,0.579695


  _warn_prf(average, modifier, msg_start, len(result))


Training fold 3


Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_base_cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,1.7056,1.689954,0.25,0.216491,0.25,0.199159
200,1.6117,1.576373,0.363542,0.37216,0.363542,0.315756
300,1.3066,1.381589,0.432292,0.474225,0.432292,0.423663
400,1.3184,1.232349,0.458333,0.493014,0.458333,0.38344
500,1.1802,1.296611,0.421875,0.480169,0.421875,0.364449
600,1.1627,1.116387,0.534375,0.530636,0.534375,0.523282
700,1.1348,1.078242,0.542708,0.567075,0.542708,0.520564
800,0.953,1.270321,0.539583,0.557553,0.539583,0.537853
900,0.9161,1.102558,0.551042,0.558216,0.551042,0.548201
1000,0.624,1.316211,0.570833,0.5705,0.570833,0.569118


Training fold 4


Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_base_cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,1.83,1.706415,0.2625,0.271974,0.2625,0.240141
200,1.6232,1.526733,0.354167,0.381964,0.354167,0.31615
300,1.2219,1.303463,0.486458,0.486199,0.486458,0.47182
400,1.2629,1.152657,0.482292,0.540685,0.482292,0.46338
500,1.0673,1.244184,0.471875,0.53997,0.471875,0.422171
600,1.1202,1.071484,0.538542,0.548782,0.538542,0.50963
700,0.9992,1.016283,0.575,0.575114,0.575,0.564687
800,0.875,1.154294,0.534375,0.567924,0.534375,0.534982
900,0.8316,1.038841,0.566667,0.576106,0.566667,0.565402
1000,0.6292,1.229847,0.578125,0.59989,0.578125,0.578411


Training fold 5


Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_base_cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,1.7328,1.663436,0.291667,0.30429,0.291667,0.269078
200,1.5297,1.581565,0.322917,0.372689,0.322917,0.304034
300,1.3617,1.290285,0.45625,0.421996,0.45625,0.422043
400,1.2067,1.155341,0.48125,0.540481,0.48125,0.444447
500,1.126,1.163797,0.48125,0.528079,0.48125,0.440349
600,1.0895,1.143971,0.525,0.56447,0.525,0.489724
700,1.1432,1.119962,0.510417,0.543533,0.510417,0.501425
800,0.893,1.089232,0.56875,0.585323,0.56875,0.567597
900,0.7902,1.119636,0.564583,0.576311,0.564583,0.563127
1000,0.5953,1.307438,0.55,0.552948,0.55,0.546097


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,1.7328,1.663436,0.291667,0.30429,0.291667,0.269078
200,1.5297,1.581565,0.322917,0.372689,0.322917,0.304034
300,1.3617,1.290285,0.45625,0.421996,0.45625,0.422043
400,1.2067,1.155341,0.48125,0.540481,0.48125,0.444447
500,1.126,1.163797,0.48125,0.528079,0.48125,0.440349
600,1.0895,1.143971,0.525,0.56447,0.525,0.489724
700,1.1432,1.119962,0.510417,0.543533,0.510417,0.501425
800,0.893,1.089232,0.56875,0.585323,0.56875,0.567597
900,0.7902,1.119636,0.564583,0.576311,0.564583,0.563127
1000,0.5953,1.307438,0.55,0.552948,0.55,0.546097


Accuracy: [0.5854166666666667, 0.6010416666666667, 0.6, 0.5916666666666667, 0.5822916666666667]
Precision: [0.5990421616195472, 0.6069583980576044, 0.605396885675242, 0.6040783389733825, 0.5982624650081628]
Recall: [0.5854166666666667, 0.6010416666666667, 0.6, 0.5916666666666667, 0.5822916666666667]
F1 Score: [0.5825271146219662, 0.6026966543639529, 0.6013547687493201, 0.5948939354171586, 0.5834866944690326]


In [None]:
# Average every metric over the folds and report the cross-validated scores
def _fold_mean(scores):
    """Return the arithmetic mean of the per-fold scores."""
    return sum(scores) / len(scores)


overall_accuracy = _fold_mean(accuracy_list)
overall_precision = _fold_mean(precision_list)
overall_recall = _fold_mean(recall_list)
overall_f1 = _fold_mean(f1_list)

print(f"Overall Accuracy: {overall_accuracy:.4f}")
print(f"Overall Precision: {overall_precision:.4f}")
print(f"Overall Recall: {overall_recall:.4f}")
print(f"Overall F1 Score: {overall_f1:.4f}")

Overall Accuracy: 0.5921
Overall Precision: 0.6027
Overall Recall: 0.5921
Overall F1 Score: 0.5930


## Re-train on full dataset

In [None]:
# Use every labelled row for the final training run
# `data` already contains all rows together with their 'encoded_labels', so it
# is used directly instead of re-assembling it from the `train_data` /
# `test_data` variables left behind by whichever cell ran last.
combined_data = data

# Load model and tokenizer
model = FlaubertForSequenceClassification.from_pretrained('flaubert/flaubert_base_cased', num_labels=6)
tokenizer = FlaubertTokenizer.from_pretrained('flaubert/flaubert_base_cased')

# Tokenize dataset
final_dataset = CustomDataset(combined_data, tokenizer)

# Define training arguments
# No evaluation is run here: all labelled rows are used for training.
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=0.00005,
    num_train_epochs=7,
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.05,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="no",
    save_strategy="steps",
    save_steps=500,
    fp16=True,
    )

# Re-initialize and define trainer
final_trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=final_dataset,
    compute_metrics=None
)

# Retrain the model
final_trainer.train()


Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_base_cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,2.0274
20,2.0182
30,1.9043
40,1.8817
50,1.8411
60,1.8444
70,1.843
80,1.8242
90,1.8566
100,1.8119


TrainOutput(global_step=2100, training_loss=0.7711642868320148, metrics={'train_runtime': 415.6248, 'train_samples_per_second': 80.842, 'train_steps_per_second': 5.053, 'total_flos': 2194972132147200.0, 'train_loss': 0.7711642868320148, 'epoch': 7.0})

In [None]:
# Save the fine-tuned model and tokenizer
# Both are written to the same directory so that the prediction section can
# load them back from './final_model'.
model.save_pretrained('./final_model')
tokenizer.save_pretrained('./final_model')

## Make predictions

In [None]:
# Load the unlabelled data
# The prediction cell below reads the 'sentence' and 'id' columns of this file.
url = 'https://raw.githubusercontent.com/RebeccaKessler/Machine_Learning/main/unlabelled_test_data.csv'
unlabelled_data = pd.read_csv(url)

In [None]:
# Load the saved fine-tuned model and tokenizer
# Reads both back from the directory they were saved to ('./final_model').
model_path = './final_model'
model = FlaubertForSequenceClassification.from_pretrained(model_path)
tokenizer = FlaubertTokenizer.from_pretrained(model_path)

In [None]:
# Create a prediction pipeline (first GPU if one is available, otherwise CPU)
device = 0 if torch.cuda.is_available() else -1
classifier = pipeline('text-classification', model=model, tokenizer=tokenizer, framework='pt', device=device)

# Predict labels for the unlabelled data
# Sentences are truncated to 128 tokens, the default `max_length` that
# CustomDataset applies to the training sentences.
predictions = classifier(unlabelled_data['sentence'].tolist(), truncation=True, max_length=128)

# Decode the numeric labels to original labels using the previously fitted LabelEncoder
# The class id is the integer after the last '_' of the pipeline label
# (assumes the default "LABEL_<id>" naming). All ids are decoded in one call.
predicted_ids = [int(pred['label'].split('_')[-1]) for pred in predictions]
predicted_labels = label_encoder.inverse_transform(predicted_ids).tolist()

# Create a DataFrame to export the predictions
results_df = pd.DataFrame({
    'id': unlabelled_data['id'],
    'difficulty': predicted_labels
})

# Save the results to a new CSV file
results_df.to_csv('predicted_difficulties.csv', index=False)

print("Predictions saved to 'predicted_difficulties.csv'")

Predictions saved to 'predicted_difficulties.csv'


## Re-train on extended dataset

In [None]:
# Import extended dataset
# Read into `full_data`; the cells below use its 'sentence' and 'difficulty'
# columns.
url = 'https://raw.githubusercontent.com/RebeccaKessler/Machine_Learning/main/final_combined_training_data.csv'
full_data = pd.read_csv(url)

In [None]:
# Load model and tokenizer
# Starts again from the pre-trained 'flaubert/flaubert_base_cased' checkpoint
# (not from './final_model'), with a 6-class classification head.
tokenizer = FlaubertTokenizer.from_pretrained('flaubert/flaubert_base_cased')
model = FlaubertForSequenceClassification.from_pretrained('flaubert/flaubert_base_cased', num_labels=6)

In [None]:
# Apply label encoder and tokenizer to the dataset
# `label_encoder` is re-created and fitted on the extended data, replacing the
# encoder fitted earlier in the notebook.
label_encoder = LabelEncoder()
full_data['encoded_labels'] = label_encoder.fit_transform(full_data['difficulty'])
full_dataset = CustomDataset(full_data, tokenizer)

In [None]:
# Training configuration for the run on the extended dataset
training_args = TrainingArguments(
    # output locations
    output_dir='./results',
    logging_dir='./logs',
    # optimisation settings
    num_train_epochs=7,
    per_device_train_batch_size=16,
    learning_rate=5e-5,
    weight_decay=0.05,
    warmup_steps=500,
    fp16=True,
    # no evaluation; log every 10 steps and checkpoint every 500 steps
    evaluation_strategy="no",
    logging_steps=10,
    save_strategy="steps",
    save_steps=500,
)

# Build the trainer on the full extended dataset (no metrics are computed)
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=full_dataset,
    compute_metrics=None,
)

# Run the fine-tuning
trainer.train()

In [None]:
# Save the fine-tuned model and tokenizer
# The directory is defined once and used for both saves, so the save location
# and the `model_path_full` used for loading cannot drift apart.
model_path_full = './final_model_extended'
model.save_pretrained(model_path_full)
tokenizer.save_pretrained(model_path_full)

## Make predictions (model trained on the extended dataset)

In [None]:
# Load the unlabelled data
# Same file as in the first prediction section; `unlabelled_data` is replaced
# by a freshly read DataFrame.
url = 'https://raw.githubusercontent.com/RebeccaKessler/Machine_Learning/main/unlabelled_test_data.csv'
unlabelled_data = pd.read_csv(url)

In [None]:
# Load the fine-tuned model and tokenizer for predictions
# Both are read from `model_path_full`, which is set in the cell that saves
# the extended model.
model = FlaubertForSequenceClassification.from_pretrained(model_path_full)
tokenizer = FlaubertTokenizer.from_pretrained(model_path_full)

In [None]:
# Define prediction pipeline
# Use the first GPU if one is available, otherwise the CPU, as in the first
# prediction section of this notebook.
device = 0 if torch.cuda.is_available() else -1
classifier = pipeline('text-classification', model=model, tokenizer=tokenizer, framework='pt', device=device)

# Make prediction on unlabelled dataset
# Sentences are truncated to 128 tokens, the default `max_length` that
# CustomDataset applies to the training sentences.
predictions = classifier(unlabelled_data['sentence'].tolist(), truncation=True, max_length=128)

# The class id is the integer after the last '_' of the pipeline label
# (assumes the default "LABEL_<id>" naming). All ids are decoded in one call.
predicted_ids = [int(pred['label'].split('_')[-1]) for pred in predictions]
predicted_labels = label_encoder.inverse_transform(predicted_ids).tolist()

# Create a new dataframe with the predictions
results_df = pd.DataFrame({
    'id': unlabelled_data['id'],
    'difficulty': predicted_labels
})

# Save the dataframe
results_df.to_csv('predicted_difficulties_full.csv', index=False)