In [None]:
!pip install pandas numpy matplotlib
!pip install scikit-learn seaborn
!pip install transformers torch

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, precision_score, recall_score, f1_score, accuracy_score, precision_recall_fscore_support
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW, get_scheduler

## Define Functions

In [None]:
# Define evaluation function
def evaluate_model(model, data_loader, device):
    """Score ``model`` on every batch of ``data_loader`` with gradients disabled.

    Args:
        model: Callable as ``model(**batch)`` whose result exposes ``.logits``.
        data_loader: Iterable of dict batches; each batch must contain a
            ``'labels'`` entry, and every value must support ``.to(device)``.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        dict with keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1_score"``; the last three are macro-averaged.

    Note:
        The model is switched to eval mode and is left in that mode on return.
    """
    model.eval()
    all_preds = []
    all_targets = []

    with torch.no_grad():
        for raw_batch in data_loader:
            inputs = {name: tensor.to(device) for name, tensor in raw_batch.items()}
            batch_logits = model(**inputs).logits
            # Predicted class = index of the largest logit in each row
            batch_preds = batch_logits.argmax(dim=1)

            all_preds.extend(batch_preds.cpu().numpy())
            all_targets.extend(inputs['labels'].cpu().numpy())

    # Macro averaging: every class contributes equally, regardless of its frequency
    macro_precision, macro_recall, macro_f1, _support = precision_recall_fscore_support(
        all_targets, all_preds, average='macro'
    )

    return dict(
        accuracy=accuracy_score(all_targets, all_preds),
        precision=macro_precision,
        recall=macro_recall,
        f1_score=macro_f1,
    )

In [None]:
# Define pre-processing functions
class FrenchDataset(Dataset):
    """Labelled sentences, tokenized lazily on item access.

    Args:
        sentences: Sequence of raw sentences (list, tuple, pandas Series, ...).
        labels: Sequence of label keys aligned with ``sentences``.
        tokenizer: Callable invoked as ``tokenizer(sentence, add_special_tokens=True,
            truncation=True, padding='max_length', max_length=..., return_tensors="pt")``
            that returns a mapping with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every sentence is padded / truncated to.
        label_map: Optional mapping from label key to integer class id. When omitted,
            the notebook-level ``label_mapping`` is used (it must be defined before
            the dataset is instantiated).

    Raises:
        KeyError: If a label is not present in the mapping.
        ValueError: If ``sentences`` and ``labels`` differ in length.
    """

    def __init__(self, sentences, labels, tokenizer, max_len=512, label_map=None):
        if label_map is None:
            # Resolved at call time, so the global may be defined in a later cell
            label_map = label_mapping
        # Copy into a plain list so that __getitem__ is always positional; a pandas
        # Series would otherwise be indexed by label and fail after a shuffle/split.
        self.sentences = list(sentences)
        self.labels = [label_map[label] for label in labels]
        if len(self.sentences) != len(self.labels):
            raise ValueError(
                f"sentences and labels differ in length: "
                f"{len(self.sentences)} != {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(sentence, add_special_tokens=True, truncation=True, padding='max_length', max_length=self.max_len, return_tensors="pt")
        # flatten() collapses the tokenizer output to 1-D so the DataLoader can stack items
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [None]:
# Define a mapping from current string labels to integers
# Difficulty labels in the order of their class ids: position in the tuple is the id
label_mapping = {
    level: class_id
    for class_id, level in enumerate(('A1', 'A2', 'B1', 'B2', 'C1', 'C2'))
}

In [None]:
# Define pre-processing functions for unlabelled data
class UnlabeledFrenchDataset(Dataset):
    """Unlabelled sentences, tokenized lazily on item access (inference only).

    Args:
        sentences: Sequence of raw sentences (list, tuple, pandas Series, ...).
        tokenizer: Callable invoked as ``tokenizer(sentence, add_special_tokens=True,
            truncation=True, padding='max_length', max_length=..., return_tensors="pt")``
            that returns a mapping with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every sentence is padded / truncated to.
    """

    def __init__(self, sentences, tokenizer, max_len=512):
        # Copy into a plain list so that __getitem__ is always positional; a pandas
        # Series would otherwise be indexed by label rather than by position.
        self.sentences = list(sentences)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        encoding = self.tokenizer(sentence, add_special_tokens=True, truncation=True, padding='max_length', max_length=self.max_len, return_tensors="pt")
        # flatten() collapses the tokenizer output to 1-D so the DataLoader can stack items
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }

## Fine-tune BERT Model

In [None]:
# Fetch the labelled training CSV and pull the text / target columns out as plain lists
url = 'https://raw.githubusercontent.com/RebeccaKessler/Machine_Learning/main/training_data.csv'
df = pd.read_csv(url)
sentences, labels = (df[column].tolist() for column in ('sentence', 'difficulty'))


In [None]:
# Seed PyTorch before the model is built so that the run is repeatable: the
# classification head added by from_pretrained, DataLoader shuffling and dropout
# all draw from torch's global RNG. (GPU kernels may still be non-deterministic.)
torch.manual_seed(42)

# Load pre-trained tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')

# Load the pre-trained model with one output class per entry in label_mapping
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-multilingual-cased',
    num_labels=len(label_mapping),
)

In [None]:
# Prepare dataset
# Hold out 20% of the labelled sentences for validation; the fixed random_state
# keeps the split identical between runs.
(train_sentences, val_sentences,
 train_labels, val_labels) = train_test_split(
    sentences, labels, test_size=0.2, random_state=42
)

# Wrap each split in a dataset that tokenizes on access
train_dataset, val_dataset = (
    FrenchDataset(split_sentences, split_labels, tokenizer)
    for split_sentences, split_labels in (
        (train_sentences, train_labels),
        (val_sentences, val_labels),
    )
)

# Batch the data: shuffle for training, keep the validation order fixed
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16, shuffle=False)

In [None]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Optimizer plus a linear schedule (no warm-up) that spans every optimizer step
# of the run: one step per batch, for num_epochs passes over train_loader.
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 4
num_training_steps = len(train_loader) * num_epochs
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
# Fine-tune the model
model.train()
for epoch in range(num_epochs):
    running_loss = 0.0
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        # The batch carries 'labels', so the forward pass also returns the loss
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()  # the schedule is defined per optimizer step, not per epoch
        optimizer.zero_grad()

        running_loss += loss.item()

    # One averaged figure per epoch instead of one line per batch, which would
    # flood the cell output; max() guards against an empty loader.
    mean_loss = running_loss / max(len(train_loader), 1)
    print(f"Epoch {epoch}, Mean loss: {mean_loss:.4f}")

In [None]:
# Score the fine-tuned model on the held-out validation split and print each metric
evaluation_results = evaluate_model(model, val_loader, device)
for heading, metric_key in (
    ("Accuracy:", 'accuracy'),
    ("Precision:", 'precision'),
    ("Recall:", 'recall'),
    ("F1 Score:", 'f1_score'),
):
    print(heading, evaluation_results[metric_key])


## Re-train on full dataset

In [None]:
# Re-seed PyTorch so the fresh classification head, shuffling and dropout are
# repeatable for the full-data run. (GPU kernels may still be non-deterministic.)
torch.manual_seed(42)

# Load tokenizer and a fresh (not yet fine-tuned) model, one output class per label
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-multilingual-cased',
    num_labels=len(label_mapping),
)
model.to(device)

In [None]:
# Prepare data
# No hold-out this time: every labelled sentence goes into one dataset
full_dataset = FrenchDataset(sentences=sentences, labels=labels, tokenizer=tokenizer)

# Shuffled batches of 16 over the full dataset
full_loader = DataLoader(dataset=full_dataset, batch_size=16, shuffle=True)

In [None]:
# Fresh optimizer and linear schedule (no warm-up) sized for the full-data run:
# one scheduler step per batch, for num_epochs passes over full_loader.
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 4
num_training_steps = len(full_loader) * num_epochs
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
# Train the model
model.train()
for epoch in range(num_epochs):
    running_loss = 0.0
    for batch in full_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        # The batch carries 'labels', so the forward pass also returns the loss
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()  # the schedule is defined per optimizer step, not per epoch
        optimizer.zero_grad()

        running_loss += loss.item()

    # One averaged figure per epoch instead of one line per batch, which would
    # flood the cell output; max() guards against an empty loader.
    mean_loss = running_loss / max(len(full_loader), 1)
    print(f"Epoch {epoch}, Mean loss: {mean_loss:.4f}")

In [None]:
# Persist the weights and the tokenizer files side by side in one directory
model.save_pretrained(save_directory="full_model_directory")
tokenizer.save_pretrained(save_directory="full_model_directory")

## Make Predictions

In [None]:
# Fetch the unlabelled test CSV and keep its sentence column as a plain list
url = 'https://raw.githubusercontent.com/RebeccaKessler/Machine_Learning/main/unlabelled_test_data.csv'
unlabeled_df = pd.read_csv(url)
unlabeled_sentences = unlabeled_df.loc[:, 'sentence'].tolist()

In [None]:
# Wrap the test sentences for inference; shuffle stays off so that predictions
# come back in the same order as unlabeled_sentences.
unlabeled_dataset = UnlabeledFrenchDataset(sentences=unlabeled_sentences, tokenizer=tokenizer)
unlabeled_loader = DataLoader(dataset=unlabeled_dataset, batch_size=16, shuffle=False)

In [None]:
# Make predictions
model.eval()  # disable dropout for inference
predictions = []

with torch.no_grad():
    for batch in unlabeled_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        # Predicted class = index of the largest logit in each row
        predicted_labels = logits.argmax(dim=1)
        predictions.extend(predicted_labels.cpu().tolist())

# Map predictions back to label strings. Inverting label_mapping by value (built
# once) avoids relying on the dict's insertion order matching the class ids and
# avoids rebuilding the key list for every prediction.
index_to_label = {class_id: label for label, class_id in label_mapping.items()}
predicted_difficulties = [index_to_label[class_id] for class_id in predictions]

# Create new dataframe with predictions (row order matches unlabeled_sentences
# because the loader does not shuffle)
data = {
    'sentence': unlabeled_sentences,
    'predicted_difficulty': predicted_difficulties
}
results_df = pd.DataFrame(data)

# Save as a CSV
results_df.to_csv('predictions_bert.csv', index=False)