<a href="https://colab.research.google.com/github/RebeccaKessler/Machine_Learning/blob/main/Codes/Flaubert_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install sentencepiece
!pip install accelerate -U
!pip install optuna
!pip install pandas numpy matplotlib
!pip install scikit-learn seaborn
!pip install sacremoses

Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.w

In [14]:
#import packages
import torch
from torch.utils.data import Dataset
import pandas as pd
from transformers import CamembertConfig, CamembertForSequenceClassification, CamembertTokenizer, Trainer, TrainingArguments, FlaubertTokenizer, FlaubertForSequenceClassification, Trainer, TrainingArguments, FlaubertModel, FlaubertTokenizer
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import StratifiedKFold, KFold
from transformers import pipeline
import optuna

## Define functions

In [3]:
# Metric callback passed to the Trainer
def compute_metrics(eval_pred):
    """Turn raw evaluation output into summary classification metrics.

    Args:
        eval_pred: A pair ``(logits, labels)``; the predicted class is the
            arg-max of ``logits`` over its last axis.

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
        Precision, recall and F1 are support-weighted averages over classes.
    """
    scores, truth = eval_pred
    predicted = scores.argmax(-1)
    acc = accuracy_score(truth, predicted)
    # The fourth element (support) is not reported.
    prec, rec, f_score, _ = precision_recall_fscore_support(
        truth, predicted, average='weighted'
    )
    return dict(accuracy=acc, precision=prec, recall=rec, f1=f_score)

In [4]:
 # Dataset wrapper that tokenizes one sentence per requested item
 class CustomDataset(Dataset):
     """Map-style dataset over a DataFrame of labelled sentences.

     Each row must provide a 'sentence' column (the text) and an
     'encoded_labels' column (the integer class id).
     """

     def __init__(self, data, tokenizer, max_length=128):
         """Keep references to the frame, the tokenizer and the padded length."""
         self.data, self.tokenizer, self.max_length = data, tokenizer, max_length

     def __len__(self):
         """Number of rows in the wrapped DataFrame."""
         return len(self.data)

     def __getitem__(self, idx):
         """Tokenize the row at position ``idx`` and return model inputs.

         Returns:
             dict with flattened 'input_ids' and 'attention_mask' tensors and
             a scalar long tensor under 'labels'.
         """
         row = self.data.iloc[idx]
         text = str(row['sentence'])
         target = int(row['encoded_labels'])

         # Pad/truncate to exactly max_length tokens so items can be stacked.
         tokens = self.tokenizer.encode_plus(
             text,
             max_length=self.max_length,
             padding='max_length',
             truncation=True,
             return_tensors='pt',
         )

         item = {key: tokens[key].flatten() for key in ('input_ids', 'attention_mask')}
         item['labels'] = torch.tensor(target, dtype=torch.long)
         return item

## Hyperparameter Optimization

In [5]:
# Load the labelled training data (CSV hosted in the project's GitHub repository).
# Later cells read its 'sentence' and 'difficulty' columns.
url = 'https://raw.githubusercontent.com/RebeccaKessler/Machine_Learning/main/training_data.csv'
data = pd.read_csv(url)

In [6]:
# Encode the difficulty labels as integer class ids; the result is stored in
# 'encoded_labels', the column CustomDataset reads as the target.
label_encoder = LabelEncoder()
data['encoded_labels'] = label_encoder.fit_transform(data['difficulty'])

# Hold out 20% of the rows as the final test set. Stratifying on the encoded
# label keeps the class proportions equal in both parts, and the fixed
# random_state makes the split repeatable.
train_val_data, test_data = train_test_split(data, test_size=0.2, stratify=data['encoded_labels'], random_state=42)


In [7]:
# Load the tokenizer
tokenizer = FlaubertTokenizer.from_pretrained('flaubert/flaubert_base_cased')

# Define objective function for the hyperoptimization
def objective(trial):
    """Fine-tune FlauBERT once with hyperparameters sampled from ``trial``.

    Args:
        trial: Optuna trial used to sample the learning rate, the per-device
            training batch size and the number of epochs.

    Returns:
        Accuracy on the validation split after training (the value the
        study maximizes).
    """
    # Define hyperparameter search space.
    # suggest_float(..., log=True) is the non-deprecated equivalent of
    # suggest_loguniform and samples from the same log-uniform range.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 10e-5, log=True)
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32])
    num_train_epochs = trial.suggest_int("num_train_epochs", 3, 7)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=num_train_epochs,
        learning_rate=learning_rate,
        per_device_train_batch_size=per_device_train_batch_size,
        logging_dir='./logs',
        logging_steps=10,
        warmup_steps=500,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        # fp16 mixed precision needs a GPU; fall back to full precision on
        # CPU-only runtimes instead of failing at argument validation.
        fp16=torch.cuda.is_available()
    )

    # Split the dataset into training and validation sets (80/20).
    # The fixed random_state gives every trial the same split, so trial
    # scores are comparable.
    train_data, val_data = train_test_split(train_val_data, test_size=0.2, stratify=train_val_data['encoded_labels'], random_state=42)

    # Tokenize datasets
    train_dataset = CustomDataset(train_data, tokenizer)
    eval_dataset = CustomDataset(val_data, tokenizer)

    # Load a fresh pre-trained model so trials do not share weights
    model = FlaubertForSequenceClassification.from_pretrained('flaubert/flaubert_base_cased', num_labels=6)

    # Define and initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics
    )

    # Train and evaluate
    trainer.train()
    eval_result = trainer.evaluate()
    return eval_result["eval_accuracy"]

# Create and optimize the Optuna study (maximizes the accuracy returned by objective)
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=25)

# Extract the best hyperparameters
best_params = study.best_trial.params

# Print the best combination of parameters
print(f"Best trial accuracy: {study.best_trial.value}")
print(f"Best parameters: {study.best_trial.params}")

# Define the best training arguments using the best hyperparameters
best_training_args = TrainingArguments(
    output_dir='./best_results',
    num_train_epochs=best_params['num_train_epochs'],
    learning_rate=best_params['learning_rate'],
    per_device_train_batch_size=best_params['per_device_train_batch_size'],
    logging_dir='./best_logs',
    logging_steps=10,
    warmup_steps=500,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # fp16 mixed precision needs a GPU; fall back to full precision on
    # CPU-only runtimes instead of failing at argument validation.
    fp16=torch.cuda.is_available()
)

# Tokenize the full training/validation portion and the held-out test set
tokenizer = FlaubertTokenizer.from_pretrained('flaubert/flaubert_base_cased')
train_dataset = CustomDataset(train_val_data, tokenizer)
test_dataset = CustomDataset(test_data, tokenizer)

# Load the pre-trained model
model = FlaubertForSequenceClassification.from_pretrained('flaubert/flaubert_base_cased', num_labels=6)

# Initialize the trainer with the best hyperparameters.
# The test set is passed as eval_dataset, so the per-epoch metrics logged
# during training are test-set metrics.
trainer = Trainer(
    model=model,
    args=best_training_args,
    train_dataset=train_dataset,
    eval_dataset = test_dataset,
    compute_metrics=compute_metrics
)

# Train the model with the best hyperparameters
trainer.train()

# Evaluate on the test dataset
test_result = trainer.evaluate(test_dataset)

# Print the evaluation results
print(f"Test accuracy: {test_result['eval_accuracy']}")
print(f"Test loss: {test_result['eval_loss']}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.56M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/896k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.50k [00:00<?, ?B/s]

[I 2024-05-23 10:21:20,271] A new study created in memory with name: no-name-820181f6-39d4-4f6b-85fb-7f3d18dc84a2
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 10e-5)


model.safetensors:   0%|          | 0.00/553M [00:00<?, ?B/s]

Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_base_cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7986,1.665155,0.270833,0.264224,0.270833,0.265285
2,1.5189,1.52396,0.364583,0.400851,0.364583,0.333673
3,1.3162,1.221896,0.486979,0.498065,0.486979,0.486016
4,1.0163,1.143434,0.532552,0.55138,0.532552,0.532996
5,1.0054,1.126067,0.532552,0.555697,0.532552,0.532467
6,0.7007,1.124112,0.565104,0.576835,0.565104,0.564372


[I 2024-05-23 10:25:48,406] Trial 0 finished with value: 0.5651041666666666 and parameters: {'learning_rate': 2.941379672886861e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 6}. Best is trial 0 with value: 0.5651041666666666.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 10e-5)
Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_base_cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7743,1.64554,0.273438,0.250704,0.273438,0.25284
2,1.4298,1.466114,0.369792,0.37271,0.369792,0.343133
3,1.263,1.194343,0.480469,0.509567,0.480469,0.481205
4,1.0061,1.143491,0.501302,0.51502,0.501302,0.501764


[I 2024-05-23 10:28:55,814] Trial 1 finished with value: 0.5013020833333334 and parameters: {'learning_rate': 3.4976991890013584e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 4}. Best is trial 0 with value: 0.5651041666666666.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 10e-5)
Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_base_cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8073,1.701572,0.259115,0.268132,0.259115,0.260712
2,1.5582,1.549788,0.31901,0.316406,0.31901,0.302635
3,1.4763,1.317841,0.436198,0.431252,0.436198,0.429689
4,1.1337,1.277428,0.458333,0.494111,0.458333,0.456023


[I 2024-05-23 10:31:47,496] Trial 2 finished with value: 0.4583333333333333 and parameters: {'learning_rate': 1.847654947502767e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 4}. Best is trial 0 with value: 0.5651041666666666.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 10e-5)
Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_base_cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6867,1.556075,0.309896,0.282087,0.309896,0.287947
2,1.3103,1.217065,0.486979,0.501843,0.486979,0.484377
3,1.1791,1.06026,0.522135,0.52208,0.522135,0.519715
4,0.9764,1.168909,0.507812,0.551115,0.507812,0.505542


[I 2024-05-23 10:35:53,187] Trial 3 finished with value: 0.5078125 and parameters: {'learning_rate': 8.076529230244487e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 4}. Best is trial 0 with value: 0.5651041666666666.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 10e-5)
Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_base_cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7374,1.624765,0.28125,0.262375,0.28125,0.264238
2,1.4544,1.452995,0.395833,0.35656,0.395833,0.35684
3,1.3109,1.182178,0.498698,0.505708,0.498698,0.498704
4,1.0378,1.11137,0.526042,0.54796,0.526042,0.518691
5,1.0515,1.13383,0.542969,0.579803,0.542969,0.546497
6,0.7809,1.096228,0.561198,0.570259,0.561198,0.559336


[I 2024-05-23 10:42:27,475] Trial 4 finished with value: 0.5611979166666666 and parameters: {'learning_rate': 3.727014943145028e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 6}. Best is trial 0 with value: 0.5651041666666666.


Best trial accuracy: 0.5651041666666666
Best parameters: {'learning_rate': 2.941379672886861e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 6}


Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_base_cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7111,1.657323,0.291667,0.321326,0.291667,0.266765
2,1.4327,1.354155,0.435417,0.443496,0.435417,0.429238
3,1.1282,1.25371,0.459375,0.517836,0.459375,0.425973
4,1.0779,1.178297,0.501042,0.501175,0.501042,0.478395
5,0.7643,1.11706,0.55,0.569014,0.55,0.55386
6,0.5587,1.160894,0.578125,0.588223,0.578125,0.579868


Test accuracy: 0.578125
Test loss: 1.1608940362930298


## Fine-tune the Flaubert model (with optimized parameters)

In [18]:
# Reload the labelled training data from the project's GitHub repository;
# this rebinds `data` to a fresh DataFrame.
url = 'https://raw.githubusercontent.com/RebeccaKessler/Machine_Learning/main/training_data.csv'
data = pd.read_csv(url)

In [19]:
# Fit a fresh LabelEncoder on the 'difficulty' column and store the integer
# class ids in 'encoded_labels' (the column CustomDataset reads as the target).
label_encoder = LabelEncoder()
data['encoded_labels'] = label_encoder.fit_transform(data['difficulty'])

In [None]:
# Define the tokenizer
tokenizer = FlaubertTokenizer.from_pretrained('flaubert/flaubert_base_cased')

# Per-fold evaluation metrics, appended in fold order
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

# Stratified 5-fold cross-validation: each fold keeps the class proportions,
# and the fixed random_state makes the fold assignment repeatable.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, test_idx) in enumerate(kf.split(data, data['encoded_labels'])):
    print(f"Training fold {fold + 1}")
    train_data = data.iloc[train_idx]
    test_data = data.iloc[test_idx]

    # Tokenize data
    train_dataset = CustomDataset(train_data, tokenizer)
    test_dataset = CustomDataset(test_data, tokenizer)

    # Start every fold from the pre-trained checkpoint so folds are independent
    model = FlaubertForSequenceClassification.from_pretrained('flaubert/flaubert_base_cased', num_labels=6)

    # Define training arguments (outputs and logs go to fold-specific directories)
    training_args = TrainingArguments(
        output_dir=f'./results_fold_{fold}',
        learning_rate=0.00005, #manually adjusted
        num_train_epochs=7,
        per_device_train_batch_size=16,
        warmup_steps=500,
        weight_decay=0.05, #increased weight_decay to control for overfitting
        logging_dir=f'./logs_fold_{fold}',
        logging_steps=10,
        evaluation_strategy="steps",
        eval_steps=100,
        save_strategy="steps",
        save_steps=500,
        # fp16 mixed precision needs a GPU; fall back to full precision on
        # CPU-only runtimes instead of failing at argument validation.
        fp16=torch.cuda.is_available()
    )

    #  Define and initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
    )

    # Train, then record the final metrics on this fold's held-out part
    trainer.train()
    eval_result = trainer.evaluate()
    accuracy_list.append(eval_result['eval_accuracy'])
    precision_list.append(eval_result['eval_precision'])
    recall_list.append(eval_result['eval_recall'])
    f1_list.append(eval_result['eval_f1'])


Training fold 1


Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_base_cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,1.7165,1.683906,0.298958,0.3211,0.298958,0.255965
200,1.5066,1.388259,0.397917,0.459285,0.397917,0.385201
300,1.2236,1.201431,0.4875,0.512351,0.4875,0.486056
400,1.2942,1.142046,0.486458,0.506679,0.486458,0.467723
500,0.9691,1.144278,0.516667,0.536732,0.516667,0.519378
600,1.1656,1.067505,0.519792,0.552279,0.519792,0.513801
700,1.0751,1.097603,0.50625,0.542598,0.50625,0.509957
800,0.7967,1.277684,0.516667,0.536573,0.516667,0.509306
900,0.7995,1.103632,0.558333,0.565299,0.558333,0.552067
1000,0.4882,1.402304,0.561458,0.573861,0.561458,0.56149


Training fold 2


Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_base_cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,1.7806,1.703732,0.26875,0.362461,0.26875,0.215336
200,1.6012,1.581099,0.328125,0.32819,0.328125,0.277859
300,1.3102,1.290212,0.458333,0.463988,0.458333,0.398762
400,1.1899,1.116922,0.505208,0.515375,0.505208,0.501522
500,1.5631,1.235337,0.455208,0.470198,0.455208,0.407919
600,1.0929,1.135384,0.516667,0.515052,0.516667,0.496251
700,1.1312,1.07422,0.546875,0.544638,0.546875,0.528021
800,1.0482,1.072295,0.555208,0.55039,0.555208,0.543994
900,1.0276,1.129772,0.576042,0.586685,0.576042,0.575728
1000,0.8342,1.109903,0.580208,0.603565,0.580208,0.581342


Training fold 3


Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_base_cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,1.7093,1.66699,0.282292,0.244975,0.282292,0.233071
200,1.6112,1.507816,0.364583,0.472278,0.364583,0.325066
300,1.3552,1.295911,0.438542,0.498933,0.438542,0.412575
400,1.3367,1.201927,0.472917,0.506108,0.472917,0.406756
500,1.1223,1.12241,0.515625,0.540113,0.515625,0.497471
600,1.2129,1.205147,0.509375,0.543278,0.509375,0.477263
700,0.9841,1.097981,0.53125,0.57719,0.53125,0.522042
800,0.7729,1.285239,0.541667,0.552722,0.541667,0.53514
900,0.815,1.133876,0.546875,0.556234,0.546875,0.546641
1000,0.6582,1.168853,0.571875,0.577543,0.571875,0.567754


Training fold 4


Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_base_cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,1.8499,1.717129,0.258333,0.261045,0.258333,0.23675
200,1.6273,1.607498,0.327083,0.342712,0.327083,0.283446
300,1.35,1.431342,0.454167,0.469007,0.454167,0.434449
400,1.2116,1.281218,0.419792,0.514408,0.419792,0.366525
500,1.0629,1.206463,0.501042,0.528267,0.501042,0.458485
600,1.0884,1.135593,0.53125,0.539879,0.53125,0.506097
700,1.0129,1.085269,0.546875,0.544834,0.546875,0.536634
800,0.9539,1.108753,0.535417,0.564245,0.535417,0.534588
900,0.9013,1.043639,0.565625,0.578363,0.565625,0.562557
1000,0.6683,1.346654,0.561458,0.585914,0.561458,0.55821


In [None]:
# Average every metric over the folds that were evaluated
overall_accuracy, overall_precision, overall_recall, overall_f1 = (
    sum(fold_scores) / len(fold_scores)
    for fold_scores in (accuracy_list, precision_list, recall_list, f1_list)
)

# Report the cross-validated means, one metric per line
print(
    f"Overall Accuracy: {overall_accuracy:.4f}",
    f"Overall Precision: {overall_precision:.4f}",
    f"Overall Recall: {overall_recall:.4f}",
    f"Overall F1 Score: {overall_f1:.4f}",
    sep="\n",
)

## Re-train on full dataset

In [None]:
# Train on every labelled row.
# `data` already holds the full labelled set with 'encoded_labels', so use it
# directly instead of re-assembling it from the `train_data` / `test_data`
# variables left over from the last cross-validation iteration (those only
# exist, and only add up to the full set, if the CV cell ran just before).
# Row order differs from the old concat, but it is the same set of rows.
combined_data = data

# Load model and tokenizer
model = FlaubertForSequenceClassification.from_pretrained('flaubert/flaubert_base_cased', num_labels=6)
tokenizer = FlaubertTokenizer.from_pretrained('flaubert/flaubert_base_cased')

# Tokenize dataset
final_dataset = CustomDataset(combined_data, tokenizer)

# Define training arguments (no evaluation: there is no held-out data left)
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=0.00005,
    num_train_epochs=7,
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.05,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="no",
    save_strategy="steps",
    save_steps=500,
    # fp16 mixed precision needs a GPU; fall back to full precision on
    # CPU-only runtimes instead of failing at argument validation.
    fp16=torch.cuda.is_available(),
    )

# Re-initialize and define trainer
final_trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=final_dataset,
    compute_metrics=None
)

# Retrain the model
final_trainer.train()


In [None]:
# Save the fine-tuned model and its tokenizer into the same directory so that
# both can later be reloaded from './final_model' with from_pretrained.
model.save_pretrained('./final_model')
tokenizer.save_pretrained('./final_model')

## Make predictions

In [None]:
# Load the unlabelled data to predict on; the prediction cell reads its
# 'sentence' and 'id' columns.
url = 'https://raw.githubusercontent.com/RebeccaKessler/Machine_Learning/main/unlabelled_test_data.csv'
unlabelled_data = pd.read_csv(url)

In [None]:
# Load the saved fine-tuned model and tokenizer from the directory they were
# written to; this rebinds `model` and `tokenizer` to the reloaded objects.
model_path = './final_model'
model = FlaubertForSequenceClassification.from_pretrained(model_path)
tokenizer = FlaubertTokenizer.from_pretrained(model_path)

In [None]:
# Create a prediction pipeline on the first GPU when CUDA is available,
# otherwise on the CPU (device=-1).
device = 0 if torch.cuda.is_available() else -1
classifier = pipeline('text-classification', model=model, tokenizer=tokenizer, framework='pt', device=device)

# Predict labels for the unlabelled data (one result dict per sentence)
predictions = classifier(unlabelled_data['sentence'].tolist())

# Decode the numeric labels to original labels using the previously fitted LabelEncoder.
# The class id is parsed from the text after the last '_' in each predicted
# label — assumes the pipeline reports labels in the 'LABEL_<id>' form; confirm
# if the model config defines custom label names.
predicted_labels = [label_encoder.inverse_transform([int(pred['label'].split('_')[-1])])[0] for pred in predictions]

# Create a DataFrame to export the predictions
results_df = pd.DataFrame({
    'id': unlabelled_data['id'],
    'difficulty': predicted_labels
})

# Save the results to a new CSV file (without the DataFrame index column)
results_df.to_csv('predicted_difficulties.csv', index=False)

print("Predictions saved to 'predicted_difficulties.csv'")

## Re-train on extended dataset

In [None]:
# Import the extended training dataset from the project's GitHub repository;
# the following cells read its 'sentence' and 'difficulty' columns.
url = 'https://raw.githubusercontent.com/RebeccaKessler/Machine_Learning/main/final_combined_training_data.csv'
full_data = pd.read_csv(url)

In [None]:
# Load the tokenizer and a model from the pre-trained checkpoint (not from the
# previously fine-tuned './final_model'), configured for 6 output classes.
tokenizer = FlaubertTokenizer.from_pretrained('flaubert/flaubert_base_cased')
model = FlaubertForSequenceClassification.from_pretrained('flaubert/flaubert_base_cased', num_labels=6)

In [None]:
# Fit a new label encoder on the extended data and wrap the frame in a dataset.
# This rebinds `label_encoder`, so predictions decoded afterwards use the
# class mapping learned from `full_data`.
label_encoder = LabelEncoder()
full_data['encoded_labels'] = label_encoder.fit_transform(full_data['difficulty'])
full_dataset = CustomDataset(full_data, tokenizer)

In [None]:
# Define training arguments (no evaluation: all extended data is used for training)
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=0.00005,
    num_train_epochs=7,
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.05,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="no",
    save_strategy="steps",
    save_steps=500,
    # fp16 mixed precision needs a GPU; fall back to full precision on
    # CPU-only runtimes instead of failing at argument validation.
    fp16=torch.cuda.is_available(),
    )

# Initialize and define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=full_dataset,
    compute_metrics=None
)

# Train model
trainer.train()

In [None]:
# Persist the model trained on the extended data together with its tokenizer;
# both are written to the single directory named by model_path_full.
model_path_full = './final_model_extended'
model.save_pretrained(model_path_full)
tokenizer.save_pretrained(model_path_full)

## Make predictions with the model trained on the extended dataset

In [None]:
# Load the unlabelled data to predict on; the prediction cell reads its
# 'sentence' and 'id' columns.
url = 'https://raw.githubusercontent.com/RebeccaKessler/Machine_Learning/main/unlabelled_test_data.csv'
unlabelled_data = pd.read_csv(url)

In [None]:
# Load the fine-tuned model and tokenizer for predictions from the directory
# named by model_path_full; this rebinds `model` and `tokenizer`.
model = FlaubertForSequenceClassification.from_pretrained(model_path_full)
tokenizer = FlaubertTokenizer.from_pretrained(model_path_full)

In [None]:
# Define prediction pipeline on the first GPU when CUDA is available,
# otherwise on the CPU (device=-1); without an explicit device the pipeline
# would run on the CPU even when a GPU is present.
device = 0 if torch.cuda.is_available() else -1
classifier = pipeline('text-classification', model=model, tokenizer=tokenizer, framework='pt', device=device)

# Make prediction on unlabelled dataset (one result dict per sentence)
predictions = classifier(unlabelled_data['sentence'].tolist())

# The class id is parsed from the text after the last '_' in each predicted
# label — assumes the pipeline reports labels in the 'LABEL_<id>' form.
predicted_ids = [int(pred['label'].split('_')[-1]) for pred in predictions]

# Decode all ids back to the original difficulty labels in one call
predicted_labels = label_encoder.inverse_transform(predicted_ids).tolist()

# Create a new dataframe with the predictions
results_df = pd.DataFrame({
    'id': unlabelled_data['id'],
    'difficulty': predicted_labels
})

# Save the dataframe (without the DataFrame index column)
results_df.to_csv('predicted_difficulties_full.csv', index=False)