<a href="https://colab.research.google.com/github/RebeccaKessler/Machine_Learning/blob/main/Codes/Model_0_59.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sentencepiece
!pip install accelerate -U
!pip install pandas numpy matplotlib
!pip install scikit-learn seaborn
!pip install optuna

Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.w

## Import Packages

In [None]:
from transformers import Trainer, TrainingArguments, CamembertTokenizer, CamembertForSequenceClassification, CamembertConfig
import torch
from torch.utils.data import Dataset
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import pipeline
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import pipeline

## Define Functions

In [None]:
# Define compute metrics function
def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` carries one row of
            per-class scores per example; ``labels`` carries the true class ids.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``;
        the last three are support-weighted averages over the classes.
    """
    logits, labels = eval_pred
    # Defensive: if the predictions arrive as a tuple of arrays, score the first
    # one (assumed to be the class logits — confirm if the model changes).
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.asarray(logits).argmax(-1)

    accuracy = accuracy_score(labels, predictions)
    # zero_division=0 yields the same value (0) as the default for a class that
    # is never predicted, but without an UndefinedMetricWarning on every
    # evaluation while the model is still collapsing onto a few classes.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted', zero_division=0
    )
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [None]:
 class CustomDataset(Dataset):
    """Dataset that tokenizes one DataFrame row per lookup.

    Rows are read positionally from ``data``; the text comes from the
    ``sentence`` column and the target from the ``encoded_labels`` column.
    Every item is a dict with the keys ``input_ids``, ``attention_mask`` and
    ``labels``.
    """

    def __init__(self, data, tokenizer, max_length=128):
        # data: DataFrame holding the 'sentence' and 'encoded_labels' columns
        # tokenizer: object exposing encode_plus(...)
        # max_length: length every sentence is padded / truncated to
        self.data, self.tokenizer, self.max_length = data, tokenizer, max_length

    def __len__(self):
        # One item per DataFrame row.
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        text = str(row['sentence'])
        target = int(row['encoded_labels'])

        encoded = self.tokenizer.encode_plus(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # flatten() collapses each returned tensor to one dimension.
        item = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(target, dtype=torch.long)
        return item

## CamemBERT Model

In [None]:
# Fetch the labelled training set and attach an integer-encoded copy of the
# 'difficulty' column as 'encoded_labels'
url = 'https://raw.githubusercontent.com/RebeccaKessler/Machine_Learning/main/training_data.csv'
label_encoder = LabelEncoder()
data = pd.read_csv(url)
data = data.assign(encoded_labels=label_encoder.fit_transform(data['difficulty']))

In [None]:
# One tokenizer instance is shared by every fold
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

# 5-fold cross-validation; each list collects one score per fold
kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

for fold, (train_rows, val_rows) in enumerate(kf.split(data)):
    print(f"Training fold {fold+1}")
    train_data, val_data = data.iloc[train_rows], data.iloc[val_rows]

    train_dataset = CustomDataset(train_data, tokenizer)
    eval_dataset = CustomDataset(val_data, tokenizer)

    # Every fold starts again from the pre-trained checkpoint
    model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=6)

    # Output and log directories are per fold, so folds do not share checkpoints
    training_args = TrainingArguments(
        output_dir=f'./results_fold_{fold}',
        logging_dir=f'./logs_fold_{fold}',
        learning_rate=0.00015,
        weight_decay=0.05,
        warmup_steps=1000,
        num_train_epochs=7,
        per_device_train_batch_size=16,
        logging_steps=10,
        evaluation_strategy="steps",
        eval_steps=100,
        save_strategy="steps",
        save_steps=500,
        fp16=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # Fit on the training split, then record the held-out scores of this fold
    trainer.train()
    eval_result = trainer.evaluate()
    for metric_key, fold_scores in (
        ('eval_accuracy', accuracy_list),
        ('eval_precision', precision_list),
        ('eval_recall', recall_list),
        ('eval_f1', f1_list),
    ):
        fold_scores.append(eval_result[metric_key])

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Training fold 1


Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


In [None]:
# Average every metric over the cross-validation folds
overall_accuracy, overall_precision, overall_recall, overall_f1 = (
    sum(fold_scores) / len(fold_scores)
    for fold_scores in (accuracy_list, precision_list, recall_list, f1_list)
)

# Report the fold averages to four decimals
print(f"Overall Accuracy: {overall_accuracy:.4f}")
print(f"Overall Precision: {overall_precision:.4f}")
print(f"Overall Recall: {overall_recall:.4f}")
print(f"Overall F1 Score: {overall_f1:.4f}")

In [None]:
# Save the trained model and the tokenizer
# Both are written to the same directory so the checkpoint can be reloaded from
# a single path. When the cells are run in order, `model` is the model trained
# in the last cross-validation fold.
model.save_pretrained('./saved_model')
tokenizer.save_pretrained('./saved_model')

('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/sentencepiece.bpe.model',
 './saved_model/added_tokens.json')

## Making Predictions

Retraining on the full training dataset

In [None]:
# Use every labelled row for the final fit.
# `data` already holds the complete training set, so it is copied directly
# instead of re-joining `train_data` and `val_data`, which only exist as
# leftovers of the last cross-validation fold. The rows are the same; only
# their order differs.
combined_data = data.copy()

In [None]:
# Tokenizer and dataset covering the combined training + validation rows
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
final_dataset = CustomDataset(combined_data, tokenizer)

# Start again from the pre-trained CamemBERT checkpoint
model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=6)

# No evaluation during this run: there is no held-out split left
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    learning_rate=0.00015,
    weight_decay=0.05,
    warmup_steps=1000,
    num_train_epochs=7,
    per_device_train_batch_size=16,
    logging_steps=10,
    evaluation_strategy="no",
    save_strategy="steps",
    save_steps=500,
    fp16=True,
)

# Trainer without an eval set or metric callback
final_trainer = Trainer(model=model, args=training_args, train_dataset=final_dataset)

# Fit on the whole dataset
final_trainer.train()

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Step,Training Loss
10,1.7854
20,1.7926
30,1.7836
40,1.8019
50,1.7841
60,1.7889
70,1.7787
80,1.7774
90,1.7718
100,1.7627


('./final_model/tokenizer_config.json',
 './final_model/special_tokens_map.json',
 './final_model/sentencepiece.bpe.model',
 './final_model/added_tokens.json')

In [None]:
# Persist the retrained model and its tokenizer side by side in one directory
final_model_dir = './final_model'
model.save_pretrained(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

Make predictions on the unlabelled test data

In [None]:
# Fetch the unlabelled test set that predictions are made for
url = 'https://raw.githubusercontent.com/RebeccaKessler/Machine_Learning/main/unlabelled_test_data.csv'
unlabelled_data = pd.read_csv(filepath_or_buffer=url)

In [None]:
# Reload tokenizer and model from the directory the final model was saved to
model_path = './final_model'
tokenizer = CamembertTokenizer.from_pretrained(model_path)
model = CamembertForSequenceClassification.from_pretrained(model_path)

In [None]:
# Create a prediction pipeline (first GPU when available, otherwise CPU)
device = 0 if torch.cuda.is_available() else -1
classifier = pipeline('text-classification', model=model, tokenizer=tokenizer, framework='pt', device=device)

# Predict labels for the unlabelled data.
# Inputs are truncated to 128 tokens, the same max_length CustomDataset uses by
# default during training, so over-long sentences are handled like the training
# data instead of being fed to the model at full length.
predictions = classifier(unlabelled_data['sentence'].tolist(), truncation=True, max_length=128)

# Decode the numeric labels to original labels using the fitted LabelEncoder.
# The class id is taken from the text after the last '_' of each predicted label
# (assumes the model config keeps the default 'LABEL_<id>' names); all ids are
# decoded in one inverse_transform call rather than one call per row.
predicted_ids = [int(pred['label'].split('_')[-1]) for pred in predictions]
predicted_labels = list(label_encoder.inverse_transform(predicted_ids))

# Create a DataFrame to export
results_df = pd.DataFrame({
    'id': unlabelled_data['id'],
    'difficulty': predicted_labels
})

# Save the results to a new CSV file
results_df.to_csv('predicted_difficulties.csv', index=False)

print("Predictions saved to 'predicted_difficulties.csv'")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Predictions saved to 'predicted_difficulties.csv'


Retrain on the extended dataset

In [None]:
# Fetch the extended training set and encode its 'difficulty' column with a
# freshly fitted LabelEncoder (this rebinds `label_encoder`)
url = 'https://raw.githubusercontent.com/RebeccaKessler/Machine_Learning/main/final_combined_training_data.csv'
label_encoder = LabelEncoder()
full_data = pd.read_csv(url).assign(
    encoded_labels=lambda frame: label_encoder.fit_transform(frame['difficulty'])
)

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Load CamemBERT model pre-trained
model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=6)

# Load the tokenizer (once) and prepare the dataset over the extended data
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
full_dataset = CustomDataset(full_data, tokenizer)


# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=0.00015,
    # 7 epochs, as in the other training runs of this notebook; the previous
    # value was an undefined name and raised NameError.
    num_train_epochs=7,
    per_device_train_batch_size=16,
    warmup_steps=1000,
    weight_decay=0.05,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='no',
    save_strategy="steps",
    save_steps=500,
    fp16=True,
    )

# Re-initialize the Trainer with the extended dataset (no eval set, no metrics)
final_trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=full_dataset,
    compute_metrics=None
)

# Retrain the model on the whole dataset
final_trainer.train()


Step,Training Loss
10,1.7909
20,1.7934
30,1.7925
40,1.7895
50,1.7799
60,1.7815
70,1.7601
80,1.7441
90,1.7268
100,1.7128


Step,Training Loss
10,1.7909
20,1.7934
30,1.7925
40,1.7895
50,1.7799
60,1.7815
70,1.7601
80,1.7441
90,1.7268
100,1.7128


TrainOutput(global_step=4291, training_loss=0.35238448654582966, metrics={'train_runtime': 696.188, 'train_samples_per_second': 98.537, 'train_steps_per_second': 6.164, 'total_flos': 4512516657868800.0, 'train_loss': 0.35238448654582966, 'epoch': 7.0})

In [None]:
# Save the model trained on the extended dataset.
# Model and tokenizer are written to the same directory; `model_path_full` is
# reused by the prediction cell to load both back.
model_path_full = "./fine_tuned_model_full"
model.save_pretrained(model_path_full)
tokenizer.save_pretrained(model_path_full)

('./fine_tuned_model_full/tokenizer_config.json',
 './fine_tuned_model_full/special_tokens_map.json',
 './fine_tuned_model_full/sentencepiece.bpe.model',
 './fine_tuned_model_full/added_tokens.json')

Make predictions on the unlabelled test data

In [None]:
# Load the unlabelled data
url = 'https://raw.githubusercontent.com/RebeccaKessler/Machine_Learning/main/unlabelled_test_data.csv'
unlabelled_data = pd.read_csv(url)

# Load the newly trained full model and tokenizer for predictions
model = CamembertForSequenceClassification.from_pretrained(model_path_full)
tokenizer = CamembertTokenizer.from_pretrained(model_path_full)

# Prediction pipeline on the first GPU when available, otherwise CPU — the same
# device selection as the earlier prediction cell
device = 0 if torch.cuda.is_available() else -1
classifier = pipeline('text-classification', model=model, tokenizer=tokenizer, device=device)

# Predict on new data, truncating to 128 tokens like CustomDataset does by
# default during training
predictions = classifier(unlabelled_data['sentence'].tolist(), truncation=True, max_length=128)

# The class id is taken from the text after the last '_' of each predicted label
# (assumes the default 'LABEL_<id>' names); all ids are decoded in one
# inverse_transform call rather than one call per row.
predicted_ids = [int(pred['label'].split('_')[-1]) for pred in predictions]
predicted_labels = list(label_encoder.inverse_transform(predicted_ids))

# Create DataFrame and save
results_df = pd.DataFrame({
    'id': unlabelled_data['id'],
    'difficulty': predicted_labels
})
results_df.to_csv('predicted_difficulties_full.csv', index=False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Hyperparameter Optimization

In [None]:
import optuna

# Objective function for Optuna
def objective(trial):
    """Train CamemBERT once with trial-sampled hyperparameters and score it.

    Samples a learning rate (log scale, 1e-4 to 2e-4), a per-device batch size
    (8, 16 or 32) and a number of epochs (3 to 7), fine-tunes 'camembert-base'
    on a stratified 80 % split of the training data and evaluates on the
    remaining 20 %.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The ``eval_accuracy`` reported by the final evaluation.

    Side effects:
        Writes checkpoints to './results', logs to './logs', and the fitted
        label encoder to 'label_encoder.pkl'.
    """
    # Local stdlib import: the notebook never imports joblib, so the original
    # joblib.dump call raised NameError.
    import pickle

    # Hyperparameter search space.
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform;
    # the bounds are the same values as before (10e-5 and 20e-5).
    learning_rate = trial.suggest_float("learning_rate", 1e-4, 2e-4, log=True)
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32])
    num_train_epochs = trial.suggest_int("num_train_epochs", 3, 7)

    # Modify training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=num_train_epochs,
        learning_rate=learning_rate,
        per_device_train_batch_size=per_device_train_batch_size,
        logging_dir='./logs',
        logging_steps=10,
        warmup_steps=1000,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        fp16=True
    )

    # Load the tokenizer and dataset.
    # The CSV is read from the same URL as the data-loading cell, so the search
    # also runs on a fresh kernel with no local copy of the file.
    tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
    data = pd.read_csv('https://raw.githubusercontent.com/RebeccaKessler/Machine_Learning/main/training_data.csv')

    # Encode the labels and persist the fitted encoder
    label_encoder = LabelEncoder()
    data['encoded_labels'] = label_encoder.fit_transform(data['difficulty'])
    with open('label_encoder.pkl', 'wb') as encoder_file:
        pickle.dump(label_encoder, encoder_file)

    # Split the dataset into training and validation sets (stratified by class,
    # fixed seed so every trial is scored on the same validation rows)
    train_data, val_data = train_test_split(data, test_size=0.2, stratify=data['encoded_labels'], random_state=42)

    # Create datasets
    train_dataset = CustomDataset(train_data, tokenizer)
    eval_dataset = CustomDataset(val_data, tokenizer)

    # Load Camembert model pre-trained
    model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=6)

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics
    )

    # Train and evaluate
    trainer.train()
    eval_result = trainer.evaluate()
    return eval_result["eval_accuracy"]

# Run the hyperparameter search: 20 trials, maximizing the value returned by
# `objective`
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

# Report the best trial's score and its hyperparameters, one per line
print(
    f"Best trial accuracy: {study.best_trial.value}",
    f"Best parameters: {study.best_trial.params}",
    sep="\n",
)