<a href="https://colab.research.google.com/github/RebeccaKessler/Machine_Learning/blob/main/Codes_Model_0_59.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install sentencepiece
!pip install accelerate -U
!pip install pandas numpy matplotlib
!pip install scikit-learn seaborn

Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.w

In [2]:
from transformers import Trainer, TrainingArguments, CamembertTokenizer, CamembertForSequenceClassification, CamembertConfig
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import pipeline
import joblib
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, precision_recall_fscore_support

In [3]:
# Define compute metrics function
def compute_metrics(eval_pred):
    """Compute classification metrics for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` must support
            ``.argmax(-1)``; the index of the largest value along the last
            axis is used as the predicted class id.

    Returns:
        dict: ``accuracy``, ``precision``, ``recall`` and ``f1``. The last
        three are support-weighted averages over the classes.
    """
    logits, labels = eval_pred
    predictions = logits.argmax(-1)
    accuracy = accuracy_score(labels, predictions)
    # zero_division=0 yields the same value sklearn already falls back to for
    # a class that is never predicted (0.0), but without emitting an
    # UndefinedMetricWarning on every evaluation step.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted', zero_division=0
    )
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import Trainer, TrainingArguments, CamembertForSequenceClassification, CamembertTokenizer
import torch
from torch.utils.data import Dataset
from transformers import CamembertConfig, CamembertForSequenceClassification

class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Rows are read positionally (``.iloc``) from the 'sentence' and
    'encoded_labels' columns of ``data``. Each item is a dict holding the
    flattened 'input_ids' and 'attention_mask' tensors returned by the
    tokenizer plus a scalar ``torch.long`` 'labels' tensor.
    """

    def __init__(self, data, tokenizer, max_length=128):
        self.data, self.tokenizer, self.max_length = data, tokenizer, max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        text = str(row['sentence'])
        target = int(row['encoded_labels'])

        # Pad/truncate every sentence to exactly `max_length` tokens.
        encoded = self.tokenizer.encode_plus(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # The tokenizer output carries a leading batch axis; flatten it away.
        item = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(target, dtype=torch.long)
        return item

# Load and preprocess the data
data = pd.read_csv('training_data.csv')
label_encoder = LabelEncoder()
data['encoded_labels'] = label_encoder.fit_transform(data['difficulty'])
# Persist the fitted encoder: the prediction cell further down reloads it from
# 'label_encoder.pkl', so it has to exist before that cell runs.
joblib.dump(label_encoder, 'label_encoder.pkl')
# Size the classification head from the labels actually present in the data
# instead of hard-coding the class count.
num_labels = len(label_encoder.classes_)

# Define the tokenizer
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')


# K-Fold cross-validation: 5 shuffled folds with a fixed seed so the splits
# are the same on every run.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list = []

for fold, (train_idx, val_idx) in enumerate(kf.split(data)):
    print(f"Training fold {fold+1}")
    train_data = data.iloc[train_idx]
    val_data = data.iloc[val_idx]

    train_dataset = CustomDataset(train_data, tokenizer)
    eval_dataset = CustomDataset(val_data, tokenizer)

    # Set up the Trainer. A fresh pre-trained model is loaded for every fold
    # so no weights leak from one fold into the next.
    model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=num_labels)
    training_args = TrainingArguments(
        output_dir=f'./results_fold_{fold}',
        learning_rate=0.00015,
        num_train_epochs=7,
        per_device_train_batch_size=16,
        warmup_steps=1000,
        weight_decay=0.05,
        logging_dir=f'./logs_fold_{fold}',
        logging_steps=10,
        evaluation_strategy="steps",
        eval_steps=100,
        save_strategy="steps",
        save_steps=500,
        fp16=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics
    )

    # Train and evaluate the model; keep this fold's final validation accuracy.
    trainer.train()
    eval_result = trainer.evaluate()
    accuracy_list.append(eval_result['eval_accuracy'])

# Compute overall accuracy (unweighted mean of the per-fold accuracies)
overall_accuracy = sum(accuracy_list) / len(accuracy_list)
print(f"Overall Accuracy: {overall_accuracy:.4f}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Training fold 1


model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,1.7421,1.729658,0.319792,0.511671,0.319792,0.261062
200,1.3392,1.346746,0.464583,0.497362,0.464583,0.433479
300,1.133,1.228836,0.482292,0.481978,0.482292,0.471087
400,1.0346,1.182704,0.478125,0.454832,0.478125,0.450169
500,0.9201,1.11079,0.501042,0.557221,0.501042,0.475316
600,1.079,1.125303,0.498958,0.520954,0.498958,0.471649
700,0.9416,1.070902,0.534375,0.586118,0.534375,0.515529
800,0.7668,1.200665,0.50625,0.541078,0.50625,0.501641
900,0.8982,1.523273,0.457292,0.498131,0.457292,0.454669
1000,0.6541,1.498165,0.535417,0.550955,0.535417,0.521359


Training fold 2


Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,1.7505,1.740816,0.35625,0.451404,0.35625,0.263141
200,1.3787,1.350074,0.466667,0.527643,0.466667,0.427248
300,1.155,1.195463,0.489583,0.468511,0.489583,0.464459
400,1.0686,1.139733,0.529167,0.536204,0.529167,0.52466
500,1.0115,1.141538,0.527083,0.539088,0.527083,0.527125
600,0.9271,1.178185,0.532292,0.551138,0.532292,0.51431
700,0.9407,1.08383,0.544792,0.547521,0.544792,0.540727
800,0.9134,1.182113,0.554167,0.568148,0.554167,0.550805
900,0.8584,1.215135,0.544792,0.555831,0.544792,0.539701
1000,0.6815,1.338314,0.5375,0.544179,0.5375,0.528739


Training fold 3


Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,1.7533,1.730479,0.402083,0.503023,0.402083,0.340765
200,1.449,1.33216,0.479167,0.493615,0.479167,0.4681
300,1.146,1.327953,0.414583,0.444986,0.414583,0.394914
400,1.239,1.167745,0.482292,0.515603,0.482292,0.465251
500,1.0034,1.234345,0.467708,0.543198,0.467708,0.460758
600,0.9349,1.085659,0.55,0.557837,0.55,0.545169
700,0.9266,1.079704,0.551042,0.558167,0.551042,0.538216
800,0.7589,1.205244,0.517708,0.571726,0.517708,0.522328
900,0.8753,1.185056,0.551042,0.561288,0.551042,0.538934
1000,0.7528,1.196425,0.571875,0.600224,0.571875,0.567252


Training fold 4


Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,1.7518,1.730541,0.375,0.405155,0.375,0.277002
200,1.3777,1.326081,0.466667,0.457903,0.466667,0.428816
300,1.1643,1.184575,0.49375,0.50003,0.49375,0.476795
400,0.9875,1.135844,0.492708,0.492834,0.492708,0.45573
500,0.902,1.092596,0.5375,0.559108,0.5375,0.515293
600,0.9761,1.054222,0.527083,0.553796,0.527083,0.521344
700,0.9735,1.078629,0.53125,0.550828,0.53125,0.5226
800,0.7354,1.15908,0.533333,0.528093,0.533333,0.524604
900,0.8095,1.189128,0.535417,0.573669,0.535417,0.52882
1000,0.6873,1.41905,0.479167,0.540663,0.479167,0.474619


  _warn_prf(average, modifier, msg_start, len(result))


Training fold 5


Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,1.7596,1.745336,0.271875,0.353052,0.271875,0.162137
200,1.3691,1.348234,0.4,0.347845,0.4,0.317172
300,1.0477,1.23661,0.453125,0.470326,0.453125,0.429189
400,1.0727,1.178716,0.477083,0.487083,0.477083,0.450694
500,0.8167,1.064532,0.545833,0.538302,0.545833,0.53428
600,0.879,1.051603,0.536458,0.540424,0.536458,0.533119
700,0.8793,1.093736,0.532292,0.564423,0.532292,0.524744
800,0.7475,1.065691,0.557292,0.585308,0.557292,0.561425
900,0.7902,1.436191,0.44375,0.479044,0.44375,0.411756
1000,0.6078,1.254987,0.513542,0.568733,0.513542,0.515525


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Overall Accuracy: 0.5860


In [None]:
# Save the trained model and the tokenizer
# NOTE(review): `model` is whatever the name is bound to when this cell runs.
# Directly after the cross-validation loop that is the model trained on the
# last fold only — confirm that this is the model intended for './saved_model'.
model.save_pretrained('./saved_model')
tokenizer.save_pretrained('./saved_model')

('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/sentencepiece.bpe.model',
 './saved_model/added_tokens.json')

In [None]:
# Combine training and validation data
# (train_data / val_data are the split left over from the last
# cross-validation fold, so together they cover every row once.)
combined_data = pd.concat([train_data, val_data])

# Load CamemBERT model pre-trained
model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=6)

# Load the tokenizer
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

# Create a new dataset object with the entire data
final_dataset = CustomDataset(combined_data, tokenizer)

# Modify training arguments
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=0.00015,
    num_train_epochs=7,
    per_device_train_batch_size=16,
    warmup_steps=1000,
    weight_decay=0.05,
    logging_dir='./logs',
    logging_steps=10,
    # The Trainer below receives no eval_dataset (all data is used for
    # training), so periodic evaluation must be switched off; requesting
    # step-wise evaluation without an eval_dataset makes the Trainer error.
    evaluation_strategy='no',
    save_strategy="steps",
    save_steps=500,
    fp16=True,
    )

# Re-initialize and train the Trainer with the new combined dataset
final_trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=final_dataset,
    compute_metrics=None
)

# Retrain the model on the whole dataset
final_trainer.train()

# Save the final trained model and tokenizer
model.save_pretrained('./final_model')
tokenizer.save_pretrained('./final_model')

Step,Training Loss
10,0.6826
20,0.5939
30,0.6056
40,0.588
50,0.5489
60,0.5536
70,0.5214
80,0.4613
90,0.4471
100,0.4864


Step,Training Loss
10,0.6826
20,0.5939
30,0.6056
40,0.588
50,0.5489
60,0.5536
70,0.5214
80,0.4613
90,0.4471
100,0.4864


('./final_model/tokenizer_config.json',
 './final_model/special_tokens_map.json',
 './final_model/sentencepiece.bpe.model',
 './final_model/added_tokens.json')

In [None]:
import pandas as pd
import joblib
from transformers import pipeline, CamembertForSequenceClassification, CamembertTokenizer

# Load the unlabelled data
unlabelled_data = pd.read_csv('unlabelled_test_data.csv')

# Load the saved model and tokenizer
model_path = './saved_model'
model = CamembertForSequenceClassification.from_pretrained(model_path)
tokenizer = CamembertTokenizer.from_pretrained(model_path)

# Load the LabelEncoder
label_encoder = joblib.load('label_encoder.pkl')

# Create a prediction pipeline (first GPU when available, otherwise CPU)
device = 0 if torch.cuda.is_available() else -1
classifier = pipeline('text-classification', model=model, tokenizer=tokenizer, framework='pt', device=device)

# Predict labels for the unlabelled data.
# Inputs are prepared the same way CustomDataset prepares training rows:
# cast to str and truncated to 128 tokens, so an over-long or missing
# sentence cannot break inference.
sentences = unlabelled_data['sentence'].astype(str).tolist()
predictions = classifier(sentences, truncation=True, max_length=128)

# Decode the numeric labels to original labels using the loaded LabelEncoder.
# Pipeline labels look like 'LABEL_<id>'; the ids are decoded in one call.
predicted_ids = [int(pred['label'].split('_')[-1]) for pred in predictions]
predicted_labels = label_encoder.inverse_transform(predicted_ids).tolist()

# Create a DataFrame to export
results_df = pd.DataFrame({
    'id': unlabelled_data['id'],
    'difficulty': predicted_labels
})

# Save the results to a new CSV file
results_df.to_csv('predicted_difficulties.csv', index=False)

print("Predictions saved to 'predicted_difficulties.csv'")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Predictions saved to 'predicted_difficulties.csv'


In [None]:
full_data = pd.read_csv('combined_random_french_sentences.csv')

# Encode the difficulty labels as integer class ids
label_encoder = LabelEncoder()
full_data['encoded_labels'] = label_encoder.fit_transform(full_data['difficulty'])

# Load CamemBERT model pre-trained, with one output per class found in the data
model = CamembertForSequenceClassification.from_pretrained(
    'camembert-base', num_labels=len(label_encoder.classes_)
)

# Load the tokenizer (once — a second identical load was redundant)
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

# Creating the dataset object using the same tokenizer and configurations as before
full_dataset = CustomDataset(full_data, tokenizer)

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
from transformers import TrainingArguments

# Hyperparameters for the full-data fine-tuning run
learning_rate, num_train_epochs, per_device_train_batch_size = 0.00015, 7, 16

# Training configuration: no evaluation (every row is used for training),
# a checkpoint every 500 steps, mixed-precision enabled.
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='no',
    save_strategy="steps",
    save_steps=500,
    learning_rate=learning_rate,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    warmup_steps=1000,
    weight_decay=0.05,
    fp16=True,
)

# Build the Trainer on the full dataset; no metrics callback is needed
# because nothing is evaluated.
final_trainer = Trainer(model=model, args=training_args, train_dataset=full_dataset, compute_metrics=None)

# Fine-tune on the whole dataset (the returned TrainOutput is the cell's output)
final_trainer.train()


Step,Training Loss
10,1.7909
20,1.7934
30,1.7926
40,1.7895
50,1.7803
60,1.7822
70,1.7603
80,1.7481
90,1.7281
100,1.7163


TrainOutput(global_step=4291, training_loss=0.35101464429822066, metrics={'train_runtime': 716.4809, 'train_samples_per_second': 95.746, 'train_steps_per_second': 5.989, 'total_flos': 4512516657868800.0, 'train_loss': 0.35101464429822066, 'epoch': 7.0})

In [None]:
# Directory the fine-tuned model and its tokenizer are written to; the
# prediction cell that follows reloads both from this same path.
model_path_full = "./fine_tuned_model_full"
model.save_pretrained(model_path_full)
tokenizer.save_pretrained(model_path_full)

('./fine_tuned_model_full/tokenizer_config.json',
 './fine_tuned_model_full/special_tokens_map.json',
 './fine_tuned_model_full/sentencepiece.bpe.model',
 './fine_tuned_model_full/added_tokens.json')

In [None]:
from transformers import pipeline

# Load the unlabelled data
unlabelled_data = pd.read_csv('unlabelled_test_data.csv')

# Load the newly trained full model and tokenizer for predictions
model = CamembertForSequenceClassification.from_pretrained(model_path_full)
tokenizer = CamembertTokenizer.from_pretrained(model_path_full)

# Prediction pipeline (first GPU when available, otherwise CPU — same device
# selection as the earlier prediction cell)
device = 0 if torch.cuda.is_available() else -1
classifier = pipeline('text-classification', model=model, tokenizer=tokenizer, device=device)

# Predict on new data.
# Inputs are prepared the same way CustomDataset prepares training rows:
# cast to str and truncated to 128 tokens.
sentences = unlabelled_data['sentence'].astype(str).tolist()
predictions = classifier(sentences, truncation=True, max_length=128)

# Pipeline labels look like 'LABEL_<id>'; decode all ids in a single call with
# the in-memory `label_encoder`.
predicted_ids = [int(pred['label'].split('_')[-1]) for pred in predictions]
predicted_labels = label_encoder.inverse_transform(predicted_ids).tolist()

# Create DataFrame and save
results_df = pd.DataFrame({
    'id': unlabelled_data['id'],
    'difficulty': predicted_labels
})
results_df.to_csv('predicted_difficulties_full.csv', index=False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
!pip install sentencepiece
!pip install accelerate -U
!pip install optuna
!pip install pandas numpy matplotlib
!pip install scikit-learn seaborn

In [None]:
import optuna
from transformers import Trainer, TrainingArguments, CamembertTokenizer, CamembertForSequenceClassification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import joblib

# Custom Dataset class
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Rows are read positionally (``.iloc``) from the 'sentence' and
    'encoded_labels' columns of ``data``. Each item is a dict holding the
    flattened 'input_ids' and 'attention_mask' tensors returned by the
    tokenizer plus a scalar ``torch.long`` 'labels' tensor.
    """

    def __init__(self, data, tokenizer, max_length=128):
        self.data, self.tokenizer, self.max_length = data, tokenizer, max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        text = str(row['sentence'])
        target = int(row['encoded_labels'])

        # Pad/truncate every sentence to exactly `max_length` tokens.
        encoded = self.tokenizer.encode_plus(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # The tokenizer output carries a leading batch axis; flatten it away.
        item = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(target, dtype=torch.long)
        return item

# Compute evaluation metrics
def compute_metrics(pred):
    """Compute classification metrics for one evaluation pass.

    Args:
        pred: A ``(predictions, labels)`` pair. ``predictions`` must support
            ``.argmax(axis=1)``; the index of the largest value in each row
            is used as the predicted class id.

    Returns:
        dict: ``accuracy``, ``precision``, ``recall`` and ``f1``. The last
        three are support-weighted averages over the classes.
    """
    predictions, labels = pred
    preds = predictions.argmax(axis=1)
    accuracy = accuracy_score(labels, preds)
    # zero_division=0 yields the same value sklearn already falls back to for
    # a class that is never predicted (0.0), but without emitting an
    # UndefinedMetricWarning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

# Objective function for Optuna
def objective(trial):
    """Optuna objective: fine-tune CamemBERT once and return validation accuracy.

    Each call samples a learning rate and a batch size from ``trial``, reads
    'training_data.csv', fits a LabelEncoder on the 'difficulty' column
    (writing it to 'label_encoder.pkl'), trains for 3 epochs on a stratified
    80/20 split and evaluates on the held-out 20%.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The ``eval_accuracy`` value reported by ``Trainer.evaluate()``.
    """
    # Hyperparameter search space.
    # Learning rate is sampled log-uniformly between 10e-5 (= 1e-4) and
    # 20e-5 (= 2e-4). `suggest_float(..., log=True)` replaces the deprecated
    # `suggest_loguniform` and samples from the same distribution.
    learning_rate = trial.suggest_float("learning_rate", 10e-5, 20e-5, log=True)
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32])

    # Modify training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        learning_rate=learning_rate,
        per_device_train_batch_size=per_device_train_batch_size,
        logging_dir='./logs',
        logging_steps=10,
        warmup_steps=1000,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        fp16=True
    )

    # Load the tokenizer and dataset
    tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
    data = pd.read_csv('training_data.csv')

    # Encode the labels and persist the encoder for later inference
    label_encoder = LabelEncoder()
    data['encoded_labels'] = label_encoder.fit_transform(data['difficulty'])
    joblib.dump(label_encoder, 'label_encoder.pkl')

    # Split the dataset into training and validation sets (stratified on the
    # label, fixed seed so every trial sees the same split)
    train_data, val_data = train_test_split(data, test_size=0.2, stratify=data['encoded_labels'], random_state=42)

    # Create datasets
    train_dataset = CustomDataset(train_data, tokenizer)
    eval_dataset = CustomDataset(val_data, tokenizer)

    # Load Camembert model pre-trained, with one output per class in the data
    model = CamembertForSequenceClassification.from_pretrained(
        'camembert-base', num_labels=len(label_encoder.classes_)
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics
    )

    # Train and evaluate
    trainer.train()
    eval_result = trainer.evaluate()
    return eval_result["eval_accuracy"]

# Run the hyperparameter search: 20 trials, maximising the value returned by
# `objective` (validation accuracy).
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

# Report the winning trial
best_trial = study.best_trial
print(f"Best trial accuracy: {best_trial.value}")
print(f"Best parameters: {best_trial.params}")