<a href="https://colab.research.google.com/github/RebeccaKessler/Machine_Learning/blob/main/Codes/Flaubert_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install sentencepiece
!pip install accelerate -U
!pip install optuna
!pip install pandas numpy matplotlib
!pip install scikit-learn seaborn
!pip install sacremoses

Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.w

In [2]:
#import packages
import torch
from torch.utils.data import Dataset
import pandas as pd
from transformers import CamembertConfig, CamembertForSequenceClassification, CamembertTokenizer, Trainer, TrainingArguments, FlaubertTokenizer, FlaubertForSequenceClassification, Trainer, TrainingArguments, FlaubertModel, FlaubertTokenizer
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import StratifiedKFold
from transformers import pipeline

## Define functions

In [3]:
# Define compute metrics function
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision / recall / F1 for one evaluation.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` holds one score per
            class for every example (class axis last); ``labels`` holds the
            true class ids.

    Returns:
        dict with the keys ``'accuracy'``, ``'precision'``, ``'recall'`` and
        ``'f1'``. Precision, recall and F1 are support-weighted averages over
        the classes.
    """
    logits, labels = eval_pred
    # Defensive: if the predictions arrive as a tuple of model outputs, the
    # logits are taken to be its first element.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    # zero_division=0 yields the same scores as the default but without the
    # UndefinedMetricWarning raised when a class is never predicted (common in
    # the first evaluation steps of fine-tuning).
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted', zero_division=0
    )
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [4]:
 # Dataset wrapper that tokenizes one sentence at a time, on access
 class CustomDataset(Dataset):
    """Torch dataset pairing tokenized sentences with their encoded labels.

    Args:
        data: DataFrame with a ``sentence`` column and an ``encoded_labels``
            column; rows are accessed by position.
        tokenizer: Object exposing ``encode_plus`` that returns
            ``input_ids`` and ``attention_mask`` tensors.
        max_length: Length every sentence is padded / truncated to.
    """

    def __init__(self, data, tokenizer, max_length=128):
        self.data, self.tokenizer, self.max_length = data, tokenizer, max_length

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the model inputs and label tensor for the row at position ``idx``."""
        row = self.data.iloc[idx]
        text = str(row['sentence'])
        target = int(row['encoded_labels'])

        # Pad / truncate to a fixed length and ask for PyTorch tensors.
        encoded = self.tokenizer.encode_plus(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # Flatten the tensors returned by the tokenizer to one dimension.
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

## Fine-tune the model

In [5]:
# Load the data
# Reads the labelled training CSV straight from the project's GitHub repository.
# Later cells read its `sentence` and `difficulty` columns.
url = 'https://raw.githubusercontent.com/RebeccaKessler/Machine_Learning/main/training_data.csv'
data = pd.read_csv(url)

In [6]:
# Encode data
# Map every distinct `difficulty` value to an integer id and store it in a new
# `encoded_labels` column (this adds the column to `data` in place). The fitted
# encoder is kept so predicted ids can be mapped back with `inverse_transform`.
label_encoder = LabelEncoder()
data['encoded_labels'] = label_encoder.fit_transform(data['difficulty'])

In [None]:
# Tokenizer shared by all folds
tokenizer = FlaubertTokenizer.from_pretrained('flaubert/flaubert_base_cased')

# Per-fold metric collectors: one value is appended to each list per fold
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

# Stratified 5-fold cross-validation, stratified on the encoded labels
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_splits = kf.split(data, data['encoded_labels'])

for fold, (train_idx, val_idx) in enumerate(fold_splits):
    print(f"Training fold {fold + 1}")
    train_data, val_data = data.iloc[train_idx], data.iloc[val_idx]

    # Wrap both partitions so sentences are tokenized on access
    train_dataset = CustomDataset(train_data, tokenizer)
    eval_dataset = CustomDataset(val_data, tokenizer)

    # Every fold starts again from the pre-trained checkpoint
    model = FlaubertForSequenceClassification.from_pretrained(
        'flaubert/flaubert_base_cased',
        num_labels=6,
    )

    # Training configuration; outputs and logs go to fold-specific directories
    training_args = TrainingArguments(
        # where to write
        output_dir=f'./results_fold_{fold}',
        logging_dir=f'./logs_fold_{fold}',
        logging_steps=10,
        # optimisation
        learning_rate=5e-5,
        num_train_epochs=7,
        per_device_train_batch_size=16,
        warmup_steps=500,
        weight_decay=0.05,
        # evaluate every 100 steps, checkpoint every 500 steps
        evaluation_strategy="steps",
        eval_steps=100,
        save_strategy="steps",
        save_steps=500,
        # reload the checkpoint with the highest accuracy once training ends
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        greater_is_better=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # Fine-tune, then score the fold on its validation partition
    trainer.train()
    eval_result = trainer.evaluate()
    for collector, metric_key in (
        (accuracy_list, 'eval_accuracy'),
        (precision_list, 'eval_precision'),
        (recall_list, 'eval_recall'),
        (f1_list, 'eval_f1'),
    ):
        collector.append(eval_result[metric_key])

# Show the per-fold scores
print(f"Accuracy: {accuracy_list}")
print(f"Precision: {precision_list}")
print(f"Recall: {recall_list}")
print(f"F1 Score: {f1_list}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.56M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/896k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.50k [00:00<?, ?B/s]

Training fold 1


model.safetensors:   0%|          | 0.00/553M [00:00<?, ?B/s]

Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_base_cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,1.7458,1.696405,0.2875,0.274851,0.2875,0.252951
200,1.5197,1.450535,0.389583,0.414352,0.389583,0.383226
300,1.3362,1.220984,0.458333,0.484587,0.458333,0.458269
400,1.1813,1.114768,0.513542,0.505111,0.513542,0.502402
500,1.0482,1.245608,0.5,0.527556,0.5,0.495113
600,1.1591,1.128651,0.452083,0.536297,0.452083,0.427287
700,1.0585,1.101231,0.535417,0.579174,0.535417,0.509965
800,0.7551,1.358135,0.519792,0.558532,0.519792,0.509692
900,0.8048,1.162963,0.56875,0.579446,0.56875,0.561448
1000,0.4522,1.536388,0.540625,0.574238,0.540625,0.535471


Training fold 2


Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_base_cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,1.7596,1.662878,0.271875,0.330251,0.271875,0.216066
200,1.5915,1.518087,0.364583,0.331075,0.364583,0.324439
300,1.3356,1.287375,0.4375,0.447667,0.4375,0.39253
400,1.2892,1.302755,0.461458,0.471538,0.461458,0.414483
500,1.2033,1.253181,0.459375,0.456832,0.459375,0.439384
600,1.0537,1.042088,0.548958,0.564404,0.548958,0.548062
700,1.0765,1.022714,0.547917,0.575025,0.547917,0.49895
800,0.7595,1.001837,0.598958,0.600352,0.598958,0.595633
900,0.9013,1.066564,0.585417,0.595775,0.585417,0.583555
1000,0.6644,1.196671,0.577083,0.613362,0.577083,0.579695


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Average every metric over the cross-validation folds (arithmetic mean of the
# per-fold values collected in the *_list variables)
overall_accuracy, overall_precision, overall_recall, overall_f1 = (
    sum(fold_scores) / len(fold_scores)
    for fold_scores in (accuracy_list, precision_list, recall_list, f1_list)
)

# Report the averages with four decimals
print(f"Overall Accuracy: {overall_accuracy:.4f}")
print(f"Overall Precision: {overall_precision:.4f}")
print(f"Overall Recall: {overall_recall:.4f}")
print(f"Overall F1 Score: {overall_f1:.4f}")

## Re-train on full dataset

In [None]:
# Train on every labelled example.
# `data` is used directly: `train_data` / `val_data` only exist as leftovers of
# the last cross-validation fold, so building the full set from them made this
# cell depend on hidden loop state (and fail if the CV cell was skipped).
# The rows are the same; only their order differs.
combined_data = data.copy()

# Load model and tokenizer (fresh pre-trained checkpoint, 6 output classes)
model = FlaubertForSequenceClassification.from_pretrained('flaubert/flaubert_base_cased', num_labels=6)
tokenizer = FlaubertTokenizer.from_pretrained('flaubert/flaubert_base_cased')

# Tokenize dataset (tokenization happens lazily, per accessed row)
final_dataset = CustomDataset(combined_data, tokenizer)

# Define training arguments
# No evaluation is run here because no held-out data is left.
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=0.00005,
    num_train_epochs=7,
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.05,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="no",
    save_strategy="steps",
    save_steps=500,
    # Mixed precision needs a CUDA device; requesting it unconditionally makes
    # TrainingArguments raise on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    )

# Re-initialize and define trainer
final_trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=final_dataset,
    compute_metrics=None
)

# Retrain the model
final_trainer.train()


In [None]:
# Save the fine-tuned model and tokenizer
# Both are written to the same directory so they can be reloaded together from
# './final_model' with `from_pretrained`.
model.save_pretrained('./final_model')
tokenizer.save_pretrained('./final_model')

## Make predictions

In [None]:
# Load the unlabelled data
# The prediction cell reads its `sentence` column as model input and its `id`
# column to key the exported predictions.
url = 'https://raw.githubusercontent.com/RebeccaKessler/Machine_Learning/main/unlabelled_test_data.csv'
unlabelled_data = pd.read_csv(url)

In [None]:
# Load the saved fine-tuned model and tokenizer
# Reloading from disk (rather than reusing the in-memory objects) rebinds
# `model` and `tokenizer` to the artefacts stored under './final_model'.
model_path = './final_model'
model = FlaubertForSequenceClassification.from_pretrained(model_path)
tokenizer = FlaubertTokenizer.from_pretrained(model_path)

In [None]:
# Create a prediction pipeline (GPU 0 when CUDA is available, otherwise CPU)
device = 0 if torch.cuda.is_available() else -1
classifier = pipeline('text-classification', model=model, tokenizer=tokenizer, framework='pt', device=device)

# Predict labels for the unlabelled data.
# Truncate to 128 tokens, the default `max_length` CustomDataset uses during
# training, so long sentences are handled the same way at inference time
# instead of overflowing the model's maximum input length.
predictions = classifier(unlabelled_data['sentence'].tolist(), truncation=True, max_length=128)

# Decode the numeric labels to original labels using the previously fitted LabelEncoder.
# Each prediction's `label` string ends in '_<class id>'; the ids are collected
# first and decoded in a single `inverse_transform` call.
label_ids = [int(pred['label'].split('_')[-1]) for pred in predictions]
predicted_labels = list(label_encoder.inverse_transform(label_ids))

# Create a DataFrame to export the predictions
results_df = pd.DataFrame({
    'id': unlabelled_data['id'],
    'difficulty': predicted_labels
})

# Save the results to a new CSV file
results_df.to_csv('predicted_difficulties.csv', index=False)

print("Predictions saved to 'predicted_difficulties.csv'")

## Re-train on extended dataset

In [None]:
# Import extended dataset
# Read from the same GitHub repository as the original training data; its
# `difficulty` column is label-encoded in a later cell.
url = 'https://raw.githubusercontent.com/RebeccaKessler/Machine_Learning/main/final_combined_training_data.csv'
full_data = pd.read_csv(url)

In [None]:
# Load model and tokenizer
# Training on the extended dataset starts again from the pre-trained
# 'flaubert/flaubert_base_cased' checkpoint (not from './final_model'), with a
# 6-class classification head.
tokenizer = FlaubertTokenizer.from_pretrained('flaubert/flaubert_base_cased')
model = FlaubertForSequenceClassification.from_pretrained('flaubert/flaubert_base_cased', num_labels=6)

In [None]:
# Apply label encoder and tokenizer to the dataset
# NOTE: this rebinds `label_encoder` to a new encoder fitted on `full_data`,
# replacing the one fitted on the original training data. Predictions made
# after this cell are decoded with this encoder.
label_encoder = LabelEncoder()
full_data['encoded_labels'] = label_encoder.fit_transform(full_data['difficulty'])
full_dataset = CustomDataset(full_data, tokenizer)

In [None]:
# Define training arguments
# Same hyperparameters as the earlier full-dataset run; no evaluation is run
# because the whole extended dataset is used for training.
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=0.00005,
    num_train_epochs=7,
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.05,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="no",
    save_strategy="steps",
    save_steps=500,
    # Mixed precision needs a CUDA device; requesting it unconditionally makes
    # TrainingArguments raise on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    )

# Initialize and define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=full_dataset,
    compute_metrics=None
)

# Train model
trainer.train()

In [None]:
# Save the fine-tuned model and tokenizer
# `model_path_full` is the single source of truth for the directory: the same
# variable is used when the model is reloaded for prediction, so the save and
# load locations cannot drift apart.
model_path_full = './final_model_extended'
model.save_pretrained(model_path_full)
tokenizer.save_pretrained(model_path_full)

('./final_model/tokenizer_config.json',
 './final_model/special_tokens_map.json',
 './final_model/vocab.json',
 './final_model/merges.txt',
 './final_model/added_tokens.json')

## Make predictions

In [None]:
# Load the unlabelled data
# Same file as in the first prediction section; reloaded so this section does
# not depend on that earlier cell having been run.
url = 'https://raw.githubusercontent.com/RebeccaKessler/Machine_Learning/main/unlabelled_test_data.csv'
unlabelled_data = pd.read_csv(url)

In [None]:
# Load the fine-tuned model and tokenizer for predictions
# `model_path_full` is the directory the extended model was saved to.
model = FlaubertForSequenceClassification.from_pretrained(model_path_full)
tokenizer = FlaubertTokenizer.from_pretrained(model_path_full)

In [None]:
# Define prediction pipeline.
# Run on GPU 0 when CUDA is available (otherwise CPU), like the first
# prediction pipeline in this notebook; without `device` the pipeline always
# runs on the CPU.
device = 0 if torch.cuda.is_available() else -1
classifier = pipeline('text-classification', model=model, tokenizer=tokenizer, device=device)

# Make prediction on unlabelled dataset.
# Truncate to 128 tokens, the default `max_length` CustomDataset uses during
# training, so long sentences cannot overflow the model's maximum input length.
predictions = classifier(unlabelled_data['sentence'].tolist(), truncation=True, max_length=128)

# Each prediction's `label` string ends in '_<class id>'; collect the ids and
# decode them in a single `inverse_transform` call.
# NOTE(review): `label_encoder` must be the encoder fitted on the extended
# dataset at this point — confirm the cells were run in order.
label_ids = [int(pred['label'].split('_')[-1]) for pred in predictions]
predicted_labels = list(label_encoder.inverse_transform(label_ids))

# Create a new dataframe with the predictions
results_df = pd.DataFrame({
    'id': unlabelled_data['id'],
    'difficulty': predicted_labels
})

# Save the dataframe
results_df.to_csv('predicted_difficulties_full.csv', index=False)

## Hyperparameter Optimization

In [None]:
import optuna

In [None]:
# Load the data
# Reloads the original training CSV, rebinding `data` to a fresh DataFrame
# without the `encoded_labels` column (it is added again in the next cell).
url = 'https://raw.githubusercontent.com/RebeccaKessler/Machine_Learning/main/training_data.csv'
data = pd.read_csv(url)

In [None]:
# Encode the labels
# Rebinds `label_encoder` to a new encoder fitted on the original training
# data and adds the integer `encoded_labels` column that CustomDataset reads.
label_encoder = LabelEncoder()
data['encoded_labels'] = label_encoder.fit_transform(data['difficulty'])

In [None]:
# Load the tokenizer
# Use the same checkpoint id as the rest of the notebook
# ('flaubert/flaubert_base_cased'), so the hyperparameters are tuned for the
# model that is actually fine-tuned above.
tokenizer = FlaubertTokenizer.from_pretrained('flaubert/flaubert_base_cased')

In [None]:
# Define objective function for the hyperoptimization

def objective(trial):
    """Fine-tune and score one hyperparameter configuration for Optuna.

    Args:
        trial: Optuna trial used to sample the learning rate, the per-device
            training batch size and the number of training epochs.

    Returns:
        The ``eval_accuracy`` obtained on a stratified 20% validation split of
        the module-level ``data`` after training.
    """
    # Define hyperparameter search space
    # `suggest_float(..., log=True)` replaces the deprecated `suggest_loguniform`.
    # NOTE(review): 10e-5..20e-5 is 1e-4..2e-4, which does not contain the 5e-5
    # used for the other training runs in this notebook — confirm the range.
    learning_rate = trial.suggest_float("learning_rate", 10e-5, 20e-5, log=True)
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32])
    num_train_epochs = trial.suggest_int("num_train_epochs", 3, 7)

    # Define training arguments
    # NOTE(review): warmup_steps=1000 may exceed the total number of optimizer
    # steps for large batch sizes / few epochs — confirm against dataset size.
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=num_train_epochs,
        learning_rate=learning_rate,
        per_device_train_batch_size=per_device_train_batch_size,
        logging_dir='./logs',
        logging_steps=10,
        warmup_steps=1000,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        # Mixed precision needs a CUDA device; requesting it unconditionally
        # makes TrainingArguments raise on a CPU-only runtime.
        fp16=torch.cuda.is_available()
    )

    # Split the dataset into training and validation sets (80/20).
    # The fixed random_state gives every trial the same split, so trials are
    # compared on identical validation data.
    train_data, val_data = train_test_split(data, test_size=0.2, stratify=data['encoded_labels'], random_state=42)

    # Tokenize datasets
    train_dataset = CustomDataset(train_data, tokenizer)
    eval_dataset = CustomDataset(val_data, tokenizer)

    # Load pre-trained model: every trial starts from the same checkpoint id
    # that the rest of the notebook fine-tunes.
    model = FlaubertForSequenceClassification.from_pretrained('flaubert/flaubert_base_cased', num_labels=6)

    # Define and initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics
    )

    # Train and evaluate
    trainer.train()
    eval_result = trainer.evaluate()
    return eval_result["eval_accuracy"]

# Run the search: 20 trials, looking for the highest value returned by `objective`
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

# Report the winning trial
best_trial = study.best_trial
print(f"Best trial accuracy: {best_trial.value}")
print(f"Best parameters: {best_trial.params}")