<a href="https://colab.research.google.com/github/RebeccaKessler/Machine_Learning/blob/main/Codes/Combined_Final_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#install required packages
!pip install sentencepiece
!pip install accelerate -U
!pip install pandas numpy matplotlib
!pip install scikit-learn seaborn
!pip install transformers torch pandas scikit-learn
!pip install sacremoses

Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.w

## Import Packages & Define Functions

In [None]:
#import required packages
from transformers import Trainer, TrainingArguments, CamembertTokenizer, CamembertForSequenceClassification, CamembertConfig, FlaubertTokenizer, FlaubertForSequenceClassification
from sklearn.metrics import accuracy_score
import torch
import numpy as np
from torch.utils.data import Dataset
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import pipeline
import joblib
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, precision_recall_fscore_support

In [None]:
# Define compute metrics function
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 from model outputs.

    Args:
        eval_pred: A pair ``(logits, labels)``. ``logits`` holds one score per
            class along the last axis; if it is a tuple of model outputs, the
            first element is taken as the logits. ``labels`` holds the true
            class ids.

    Returns:
        dict: ``{'accuracy', 'precision', 'recall', 'f1'}``; precision, recall
        and F1 are support-weighted averages over the classes.
    """
    logits, labels = eval_pred

    # Models that return extra outputs hand over a tuple; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]

    # The predicted class is the index of the highest score on the last axis.
    predictions = np.asarray(logits).argmax(-1)

    accuracy = accuracy_score(labels, predictions)

    # zero_division=0 yields the same 0.0 score as the default for classes that
    # are never predicted, but without an UndefinedMetricWarning on every call.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted', zero_division=0
    )

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [None]:
class CustomDataset(Dataset):
    """Torch dataset that tokenizes one sentence per item for classification.

    Args:
        data: pandas DataFrame with a ``'sentence'`` column and an
            ``'encoded_labels'`` column holding integer-convertible class ids.
            Rows are addressed by position, so the index need not be 0..n-1.
        tokenizer: Callable tokenizer, invoked as ``tokenizer(text, ...)``,
            that returns ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Number of tokens every item is padded or truncated to.
    """

    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized sentence and label at position ``idx``.

        Returns:
            dict: ``'input_ids'`` and ``'attention_mask'`` as 1-D tensors of
            length ``max_length``, and ``'labels'`` as a scalar ``torch.long``
            tensor.
        """
        # Pick the column first and the position second: this avoids building
        # a full (object-dtype) row Series for every field that is read.
        sentence = str(self.data['sentence'].iloc[idx])
        label = int(self.data['encoded_labels'].iloc[idx])

        # Call the tokenizer directly; `encode_plus` is the deprecated spelling
        # of the same single-sequence encoding.
        encoding = self.tokenizer(
            sentence,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # return_tensors='pt' adds a leading batch axis of size 1; drop it so
        # the DataLoader can stack items into (batch, max_length).
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

## Flaubert Model

In [None]:
# Load the labelled training data and encode the difficulty levels as integer ids
url = 'https://raw.githubusercontent.com/RebeccaKessler/Machine_Learning/main/training_data.csv'
full_data = pd.read_csv(url)
label_encoder = LabelEncoder()
full_data['encoded_labels'] = label_encoder.fit_transform(full_data['difficulty'])

# Size the classification head from the fitted encoder instead of hard-coding it
num_labels = len(label_encoder.classes_)

# Load the pre-trained FlauBERT model (the classification head is newly initialised)
model = FlaubertForSequenceClassification.from_pretrained('flaubert/flaubert_base_cased', num_labels=num_labels)

# Load the matching tokenizer
tokenizer = FlaubertTokenizer.from_pretrained('flaubert/flaubert_base_cased')

# Wrap the full labelled set; sentences are tokenized lazily, one item at a time
final_dataset = CustomDataset(full_data, tokenizer)

# Define the training arguments.
# - The model is trained on all labelled data, so there is no evaluation set:
#   evaluation is left at the Trainer default (disabled) and no metric function
#   is passed.
# - Checkpoints and logs go to FlauBERT-specific folders so they cannot be mixed
#   up with those of another model trained in this notebook.
# - Mixed precision is only switched on when a CUDA device is present, so the
#   cell also runs on a CPU-only runtime.
training_args = TrainingArguments(
    output_dir='./results/flaubert',
    learning_rate=0.00005,
    num_train_epochs=7,
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.05,
    logging_dir='./logs/flaubert',
    logging_steps=10,
    save_strategy="steps",
    save_steps=500,
    fp16=torch.cuda.is_available(),
    )

# Initialize the Trainer
final_trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=final_dataset,
)

# Train the model
final_trainer.train()

# Save the final trained model and tokenizer (reloaded by the prediction cell)
model.save_pretrained('./flaubert_final_model')
tokenizer.save_pretrained('./flaubert_final_model')

Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_base_cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,2.0105
20,1.8723
30,1.979
40,1.9164
50,1.911
60,1.8343
70,1.8012
80,1.793
90,1.6786
100,1.6905


('./flaubert_final_model/tokenizer_config.json',
 './flaubert_final_model/special_tokens_map.json',
 './flaubert_final_model/vocab.json',
 './flaubert_final_model/merges.txt',
 './flaubert_final_model/added_tokens.json')

## Camembert Model

In [None]:
# Load the labelled training data and encode the difficulty levels as integer ids
url = 'https://raw.githubusercontent.com/RebeccaKessler/Machine_Learning/main/training_data.csv'
full_data = pd.read_csv(url)
label_encoder = LabelEncoder()
full_data['encoded_labels'] = label_encoder.fit_transform(full_data['difficulty'])

# Size the classification head from the fitted encoder instead of hard-coding it
num_labels = len(label_encoder.classes_)

# Load the pre-trained CamemBERT model (the classification head is newly initialised)
model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=num_labels)

# Load the matching tokenizer
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

# Wrap the full labelled set; sentences are tokenized lazily, one item at a time
final_dataset = CustomDataset(full_data, tokenizer)

# Define the training arguments.
# - The model is trained on all labelled data, so there is no evaluation set:
#   evaluation is left at the Trainer default (disabled) and no metric function
#   is passed.
# - Checkpoints and logs go to CamemBERT-specific folders so they cannot be mixed
#   up with those of another model trained in this notebook.
# - Mixed precision is only switched on when a CUDA device is present, so the
#   cell also runs on a CPU-only runtime.
training_args = TrainingArguments(
    output_dir='./results/camembert',
    learning_rate=0.00015,
    num_train_epochs=7,
    per_device_train_batch_size=16,
    warmup_steps=1000,
    weight_decay=0.02,
    logging_dir='./logs/camembert',
    logging_steps=10,
    save_strategy="steps",
    save_steps=500,
    fp16=torch.cuda.is_available(),
    )

# Initialize the Trainer
final_trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=final_dataset,
)

# Train the model
final_trainer.train()

# Save the final trained model and tokenizer (reloaded by the prediction cell)
model.save_pretrained('./camembert_final_model')
tokenizer.save_pretrained('./camembert_final_model')

config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Step,Training Loss
10,1.7904
20,1.7938
30,1.794
40,1.7913
50,1.7854
60,1.7804
70,1.7781
80,1.7768
90,1.7678
100,1.7491


('./camembert_final_model/tokenizer_config.json',
 './camembert_final_model/special_tokens_map.json',
 './camembert_final_model/sentencepiece.bpe.model',
 './camembert_final_model/added_tokens.json')

## Combined Predictions

In [None]:
def scores_to_array(outputs, label2id):
    """Turn text-classification pipeline output into a probability matrix.

    Args:
        outputs: One list per input text, each holding dicts with the keys
            ``'label'`` and ``'score'`` (one dict per class).
        label2id: Mapping from the pipeline's label names to class ids.

    Returns:
        numpy.ndarray of shape ``(len(outputs), len(label2id))`` in which
        column ``i`` holds the score of class id ``i``.
    """
    probabilities = np.zeros((len(outputs), len(label2id)))
    for row, scores in enumerate(outputs):
        for entry in scores:
            # Place every score by its class id so the result does not depend
            # on the order in which the pipeline lists the classes.
            probabilities[row, label2id[entry['label']]] = entry['score']
    return probabilities


# Sentences were truncated to this many tokens during fine-tuning
# (the CustomDataset default); use the same limit at inference time.
MAX_LENGTH = 128

# Load the unlabelled data
url = 'https://raw.githubusercontent.com/RebeccaKessler/Machine_Learning/main/unlabelled_test_data.csv'
unlabelled_data = pd.read_csv(url)

# Cast to str, as the training dataset does, so non-string cells do not break tokenization
sentences = unlabelled_data['sentence'].astype(str).tolist()

# Rebuild the label encoder from the training labels so this cell does not depend
# on variables left over from the training cells. LabelEncoder sorts its classes,
# so refitting on the same labels reproduces the ids used during training.
train_url = 'https://raw.githubusercontent.com/RebeccaKessler/Machine_Learning/main/training_data.csv'
label_encoder = LabelEncoder().fit(pd.read_csv(train_url)['difficulty'])

# Load the saved CamemBERT model and tokenizer
camembert_model_path = './camembert_final_model'
camembert_model = CamembertForSequenceClassification.from_pretrained(camembert_model_path)
camembert_tokenizer = CamembertTokenizer.from_pretrained(camembert_model_path)

# Load the saved Flaubert model and tokenizer
flaubert_model_path = './flaubert_final_model'
flaubert_model = FlaubertForSequenceClassification.from_pretrained(flaubert_model_path)
flaubert_tokenizer = FlaubertTokenizer.from_pretrained(flaubert_model_path)

# Create prediction pipelines that return the score of every class
device = 0 if torch.cuda.is_available() else -1
camembert_classifier = pipeline('text-classification', model=camembert_model, tokenizer=camembert_tokenizer, framework='pt', device=device, return_all_scores=True)
flaubert_classifier = pipeline('text-classification', model=flaubert_model, tokenizer=flaubert_tokenizer, framework='pt', device=device, return_all_scores=True)

# Predict probabilities for the unlabelled data using both models.
# Truncation keeps long inputs within the length the models were fine-tuned on.
camembert_probs = camembert_classifier(sentences, truncation=True, max_length=MAX_LENGTH)
flaubert_probs = flaubert_classifier(sentences, truncation=True, max_length=MAX_LENGTH)

# Convert the predictions to (n_sentences, n_classes) numpy arrays
camembert_probs_array = scores_to_array(camembert_probs, camembert_model.config.label2id)
flaubert_probs_array = scores_to_array(flaubert_probs, flaubert_model.config.label2id)

# Both models must score the same sentences over the same classes before averaging
if camembert_probs_array.shape != flaubert_probs_array.shape:
    raise ValueError(
        f"Probability shapes differ: CamemBERT {camembert_probs_array.shape} "
        f"vs Flaubert {flaubert_probs_array.shape}"
    )

# Combine predictions using soft voting (average probabilities)
average_probs = (camembert_probs_array + flaubert_probs_array) / 2
final_predictions = np.argmax(average_probs, axis=1)

# Decode the numeric class ids back to the original difficulty labels
predicted_labels = label_encoder.inverse_transform(final_predictions)

# Create a DataFrame to export
results_df = pd.DataFrame({
    'id': unlabelled_data['id'],
    'difficulty': predicted_labels
})

# Save the results to a new CSV file
results_df.to_csv('predicted_difficulties_combined.csv', index=False)

print("Predictions saved to 'predicted_difficulties_combined.csv'")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Predictions saved to 'predicted_difficulties_combined.csv'
