<a href="https://colab.research.google.com/github/RebeccaKessler/Machine_Learning/blob/main/Codes/Code_Bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas numpy matplotlib
!pip install scikit-learn seaborn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [None]:
# Load the labelled training set and the unlabelled test set from the working directory.
df_training_data = pd.read_csv('training_data.csv')
df_unlabelled_test_data = pd.read_csv('unlabelled_test_data.csv')

In [None]:
# Preview the first five rows of the training data.
df_training_data.head(5)

In [None]:
# Preview the first rows of the unlabelled test data.
df_unlabelled_test_data.head()

# BERT Model (DistilBERT, multilingual)

In [None]:
!pip install transformers torch

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW, get_scheduler
from sklearn.model_selection import train_test_split
import pandas as pd

In [None]:
# Difficulty labels in ascending order; each label's position is its integer class id.
label_mapping = {
    level: class_id
    for class_id, level in enumerate(('A1', 'A2', 'B1', 'B2', 'C1', 'C2'))
}

In [None]:
class FrenchDataset(Dataset):
    """Map-style dataset that tokenizes one labelled sentence per lookup.

    String labels are translated to integer class ids through the module-level
    ``label_mapping`` when the dataset is built; an unknown label raises ``KeyError``.
    Every item is padded/truncated to ``max_len`` tokens.
    """

    def __init__(self, sentences, labels, tokenizer, max_len=512):
        self.sentences = sentences
        # Convert the string labels to integers once, up front
        self.labels = list(map(label_mapping.__getitem__, labels))
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        text, target = self.sentences[idx], self.labels[idx]
        encoded = self.tokenizer(
            text,
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors="pt",
        )
        # The tokenizer output is flattened so each field is a 1-D tensor
        sample = {field: encoded[field].flatten() for field in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

In [None]:
# Read the labelled dataset and pull out the sentences and their difficulty labels
df = pd.read_csv('training_data.csv')
sentences = df['sentence'].tolist()
labels = df['difficulty'].tolist()


# Split the data into training and validation sets (80/20, fixed seed so the split is reproducible)
train_sentences, val_sentences, train_labels, val_labels = train_test_split(sentences, labels, test_size=0.2, random_state=42)

# Load the tokenizer that matches the pretrained checkpoint used below
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')

# Create datasets
train_dataset = FrenchDataset(train_sentences, train_labels, tokenizer)
val_dataset = FrenchDataset(val_sentences, val_labels, tokenizer)

# Data loaders: only the training batches are shuffled
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Prepare the model with a 6-way classification head (one output per entry in label_mapping)
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-multilingual-cased', num_labels=6)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set up training: the linear schedule has no warm-up and spans one scheduler step per training batch
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 4
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

In [None]:
# Training loop: one optimizer step and one scheduler step per batch
model.train()
for epoch in range(num_epochs):
    for batch in train_loader:
        # Move every tensor of the batch to the model's device
        batch = {k: v.to(device) for k, v in batch.items()}
        # The batch includes 'labels', so the loss is read from the model output
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        # Clear gradients so they do not accumulate into the next batch
        optimizer.zero_grad()

        # Logged once per batch (epoch index is zero-based)
        print(f"Epoch {epoch}, Loss: {loss.item()}")

In [None]:
# Save the model and its tokenizer to the same directory.
model.save_pretrained("your_model_directory")
tokenizer.save_pretrained("your_model_directory")

In [None]:
def evaluate_model(model, data_loader, device):
    """Return the accuracy of ``model`` over every batch yielded by ``data_loader``.

    The model is switched to eval mode (and left in it). Each batch is a dict of
    tensors that is moved to ``device`` and passed to the model as keyword
    arguments; it must contain a ``'labels'`` entry.
    """
    model.eval()
    predicted, expected = [], []

    with torch.no_grad():
        for raw_batch in data_loader:
            batch = {name: tensor.to(device) for name, tensor in raw_batch.items()}
            logits = model(**batch).logits
            # The position of the largest logit in each row is the predicted class
            best_classes = torch.max(logits, dim=1).indices

            predicted.extend(best_classes.cpu().numpy())
            expected.extend(batch['labels'].cpu().numpy())

    return accuracy_score(expected, predicted)


# Evaluate the model on the held-out validation split and print its accuracy
evaluation_results = evaluate_model(model, val_loader, device)
print(evaluation_results)

In [None]:
# Load tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')

# Create a dataset with all labelled data (no validation split is held out here)
full_dataset = FrenchDataset(sentences, labels, tokenizer)

# Data loader for the full dataset
full_loader = DataLoader(full_dataset, batch_size=16, shuffle=True)

# Start again from the pretrained checkpoint rather than continuing from the model trained above
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-multilingual-cased', num_labels=6)
model.to(device)

# Set up training with the same hyperparameters as the validation run
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 4
num_training_steps = num_epochs * len(full_loader)
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

In [None]:
# Training loop over the full dataset: one optimizer step and one scheduler step per batch
model.train()
for epoch in range(num_epochs):
    for batch in full_loader:
        # Move every tensor of the batch to the model's device
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        # Clear gradients so they do not accumulate into the next batch
        optimizer.zero_grad()

        # Logged once per batch (epoch index is zero-based)
        print(f"Epoch {epoch}, Loss: {loss.item()}")

In [None]:
# Save the model trained on the full dataset, together with its tokenizer.
model.save_pretrained("full_model_directory")
tokenizer.save_pretrained("full_model_directory")

In [None]:
class UnlabeledFrenchDataset(Dataset):
    """Map-style dataset that tokenizes one unlabelled sentence per lookup.

    Items contain only ``input_ids`` and ``attention_mask`` (no ``labels``),
    each padded/truncated to ``max_len`` tokens and flattened to 1-D.
    """

    def __init__(self, sentences, tokenizer, max_len=512):
        self.sentences = sentences
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.sentences[idx],
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors="pt",
        )
        return {field: encoded[field].flatten() for field in ('input_ids', 'attention_mask')}

In [None]:
# Load the unlabelled test data and extract its sentences as a plain list
unlabeled_df = pd.read_csv('unlabelled_test_data.csv')
unlabeled_sentences = unlabeled_df['sentence'].tolist()

In [None]:
# Prepare unlabeled dataset; shuffle=False keeps predictions in the same order as the sentences
unlabeled_dataset = UnlabeledFrenchDataset(unlabeled_sentences, tokenizer)
unlabeled_loader = DataLoader(unlabeled_dataset, batch_size=16, shuffle=False)


In [None]:
from pathlib import Path

# Predict a difficulty class for every unlabelled sentence
model.eval()
predictions = []

with torch.no_grad():
    for batch in unlabeled_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        _, predicted_labels = torch.max(logits, dim=1)
        predictions.extend(predicted_labels.cpu().numpy())

# Map predictions back to label strings. The mapping is inverted once (class id -> label)
# instead of rebuilding the key list per prediction and relying on dict insertion order.
id_to_label = {class_id: label for label, class_id in label_mapping.items()}
predicted_difficulties = [id_to_label[int(class_id)] for class_id in predictions]

# Combine predictions with sentences
data = {
    'sentence': unlabeled_sentences,
    'predicted_difficulty': predicted_difficulties
}

# Create a DataFrame
results_df = pd.DataFrame(data)

# Save to CSV. to_csv does not create missing directories, so make sure the
# target folder exists before writing.
output_path = Path('/mnt/data/predicted_difficulties.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(output_path, index=False)

# CamemBERT Model

## Version 1 (simple)

In [None]:
!pip install sentencepiece
!pip install accelerate -U

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import CamembertTokenizer, CamembertForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.metrics import accuracy_score
import sentencepiece as spm
from sklearn.preprocessing import LabelEncoder

In [None]:
class FrenchDifficultyDataset(torch.utils.data.Dataset):
    """Dataset over pre-tokenized sentences and their integer class labels.

    Args:
        encodings: mapping of field name (e.g. ``input_ids``) to a per-sentence
            sequence of values, as produced by calling a tokenizer on a list of texts.
        labels: integer labels, indexable by position ``0..len-1``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the ``idx``-th sample as a dict of tensors, including a ``labels`` entry."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Number of samples, taken from the labels."""
        return len(self.labels)


In [None]:
# Load the dataset
data = pd.read_csv('training_data.csv')

label_encoder = LabelEncoder()

# Fit label encoder and store the integer-encoded labels in a new column
data['encoded_labels'] = label_encoder.fit_transform(data['difficulty'])

# Now split the dataset with encoded labels (80/20, fixed seed)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['sentence'], data['encoded_labels'], test_size=0.2, random_state=42
)

# Reset index to ensure proper alignment and avoid KeyError:
# the dataset class indexes labels by position 0..n-1, while the split keeps the original row index
train_labels = train_labels.reset_index(drop=True)
val_labels = val_labels.reset_index(drop=True)


from transformers import CamembertTokenizer

# Load the tokenizer
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

# Tokenize the data up front; padding=True pads each split to its own longest sentence
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True)

# Wrap encodings and labels so they can be fed to the Trainer
train_dataset = FrenchDifficultyDataset(train_encodings, train_labels)
val_dataset = FrenchDifficultyDataset(val_encodings, val_labels)

In [None]:
from transformers import CamembertForSequenceClassification, Trainer, TrainingArguments

# Load the pretrained CamemBERT checkpoint with a 6-way classification head
model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=6)

# Define compute metrics function
def compute_metrics(eval_pred):
    """Accuracy metric for the Trainer.

    ``eval_pred`` must unpack into ``(logits, labels)``; the predicted class is
    the arg-max over the last axis of the logits.
    """
    scores, references = eval_pred
    predicted_classes = scores.argmax(-1)
    return {'accuracy': accuracy_score(references, predicted_classes)}

# Define the training arguments: evaluate and checkpoint every 500 steps,
# with an effective batch of 8 x 2 accumulation steps per device
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    gradient_accumulation_steps=2,
    # Mixed precision needs a CUDA device; fall back to full precision on CPU-only runtimes
    fp16=torch.cuda.is_available()
)

# Initialize the Trainer with the train/validation datasets and the accuracy metric
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
    )

# Fine-tune, then run a final evaluation on the validation set and print its metrics
trainer.train()
evaluation_results = trainer.evaluate()
print(evaluation_results)

## Hyperparameter Optimization

In [None]:
!pip install optuna

In [None]:
!pip install sentencepiece
!pip install accelerate -U

In [None]:
import optuna
from sklearn.model_selection import train_test_split
from transformers import CamembertTokenizer, CamembertForSequenceClassification, Trainer, TrainingArguments, CamembertConfig
from sklearn.metrics import accuracy_score
import torch

In [None]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    """Dataset over a DataFrame with ``sentence`` and ``encoded_labels`` columns.

    Rows are addressed by position; each lookup tokenizes the sentence,
    padded/truncated to ``max_length`` tokens, and returns flattened tensors.
    """

    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        text = str(row['sentence'])
        target = int(row['encoded_labels'])

        encoded = self.tokenizer.encode_plus(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        sample = {field: encoded[field].flatten() for field in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

In [None]:
# Load the dataset
data = pd.read_csv('training_data.csv')

# Define label encoder
label_encoder = LabelEncoder()

# Fit label encoder and store the integer-encoded labels in a new column
data['encoded_labels'] = label_encoder.fit_transform(data['difficulty'])

# Split data into train and validation sets (80/20, fixed seed so every trial sees the same split)
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# Define a function to train and evaluate the model with given hyperparameters
def objective(trial):
    """Optuna objective: fine-tune CamemBERT with sampled settings and return validation accuracy.

    Only the number of epochs and the per-device training batch size are sampled
    from ``trial``; every other training argument is fixed. Reads the module-level
    ``train_data`` and ``val_data`` frames.
    """
    # Search space
    epochs = trial.suggest_int("num_train_epochs", 3, 7)
    batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32])

    # A fresh pretrained model and tokenizer for every trial
    model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=6)
    tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

    def _accuracy(p):
        return {'accuracy': accuracy_score(p.predictions.argmax(axis=1), p.label_ids)}

    trainer = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir='./results',
            learning_rate=1e-5,
            num_train_epochs=epochs,
            per_device_train_batch_size=batch_size,
            warmup_steps=1000,
            weight_decay=0.1,
            logging_dir='./logs',
            logging_steps=10,
            evaluation_strategy="steps",
            eval_steps=100,
            save_strategy="steps",
            save_steps=500,
            gradient_accumulation_steps=2,
            load_best_model_at_end=True
        ),
        train_dataset=CustomDataset(train_data, tokenizer),
        eval_dataset=CustomDataset(val_data, tokenizer),
        compute_metrics=_accuracy
    )

    trainer.train()

    # The value Optuna maximizes is the accuracy of a final evaluation pass
    return trainer.evaluate()["eval_accuracy"]

# Perform hyperparameter optimization: 100 trials, each a full fine-tuning run, maximizing the objective's return value
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

# Print the best hyperparameters
print("Best trial:")
print(study.best_trial.params)

## Version 2 (with optimized parameters)

In [None]:
!pip install sentencepiece
!pip install accelerate -U

In [None]:
import torch
from torch.utils.data import Dataset
import pandas as pd
import spacy
from transformers import CamembertTokenizer, CamembertForSequenceClassification, Trainer, TrainingArguments, CamembertConfig
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
class CustomDataset(Dataset):
    """Dataset over a DataFrame with ``sentence`` and ``encoded_labels`` columns.

    Rows are addressed by position; each lookup tokenizes the sentence,
    padded/truncated to ``max_length`` tokens, and returns flattened tensors.
    """

    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data.iloc[idx]
        text = str(record['sentence'])
        class_id = int(record['encoded_labels'])

        tokens = self.tokenizer.encode_plus(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        features = {name: tokens[name].flatten() for name in ('input_ids', 'attention_mask')}
        features['labels'] = torch.tensor(class_id, dtype=torch.long)
        return features


In [None]:
# Load the dataset
data = pd.read_csv('training_data.csv')

# Define label encoder
label_encoder = LabelEncoder()

# Fit label encoder and store the integer-encoded labels in a new column
data['encoded_labels'] = label_encoder.fit_transform(data['difficulty'])

# Split data into train and validation sets (80/20, fixed seed)
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
# Define compute metrics function
def compute_metrics(eval_pred):
    """Accuracy metric for the Trainer.

    ``eval_pred`` must unpack into ``(logits, labels)``; the predicted class is
    the arg-max over the last axis of the logits.
    """
    model_scores, true_labels = eval_pred
    best_classes = model_scores.argmax(-1)
    return {'accuracy': accuracy_score(true_labels, best_classes)}

In [None]:
from transformers import CamembertConfig, CamembertForSequenceClassification, CamembertTokenizer, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score

In [None]:
# Hyperparameters used for this run
learning_rate = 0.00019389784400015783
num_train_epochs = 5
per_device_train_batch_size = 32

# Training arguments: evaluate every 100 steps, checkpoint every 500 steps
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=learning_rate,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    warmup_steps=1000,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=500,
    gradient_accumulation_steps=2,
    # Mixed precision needs a CUDA device; fall back to full precision on CPU-only runtimes
    fp16=torch.cuda.is_available())

# Load the pretrained CamemBERT checkpoint with a 6-way classification head
model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=6)

# Load the tokenizer
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

# Datasets tokenize lazily, one row per lookup
train_dataset = CustomDataset(train_data, tokenizer)
eval_dataset = CustomDataset(val_data, tokenizer)

# Build the Trainer; accuracy is computed by an inline lambda on the arg-max of the predictions
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=lambda p: {'accuracy': accuracy_score(p.predictions.argmax(axis=1), p.label_ids)}
)


# Fine-tune, then run a final evaluation on the validation set and print its metrics
trainer.train()
eval_result = trainer.evaluate()
print(eval_result)

Retrain on the extended dataset

In [None]:
# Load the extended training set
full_data = pd.read_csv('combined_random_french_sentences.csv')

# Define and fit label encoder, transform labels
# (this encoder is the one later used to decode the test predictions)
label_encoder = LabelEncoder()
full_data['encoded_labels'] = label_encoder.fit_transform(full_data['difficulty'])
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
full_dataset = CustomDataset(full_data, tokenizer)

In [None]:
# Same hyperparameters as the validation run, but without evaluation:
# all labelled data is used for training, so there is no held-out set
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=0.00019389784400015783,
    num_train_epochs=5,
    per_device_train_batch_size=32,
    warmup_steps=1000,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy="steps",
    save_steps=500,
    gradient_accumulation_steps=2,
    # Mixed precision needs a CUDA device; fall back to full precision on CPU-only runtimes
    fp16=torch.cuda.is_available(),
    evaluation_strategy="no"  # Disable evaluation
)


# Initialize the model from the pretrained checkpoint (not from the previously fine-tuned weights)
model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=6)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=full_dataset,  # use the full dataset for training
    compute_metrics=None  # Optional: Define this function if you want metrics
)

# Train the model
trainer.train()

Predict on the unlabelled test data

In [None]:
# Load the unlabelled test data; its 'sentence' and 'id' columns are used below
test_data = pd.read_csv('unlabelled_test_data.csv')

In [None]:
class TestDataset(Dataset):
    """Label-free dataset over a DataFrame with a ``sentence`` column.

    Rows are addressed by position; each lookup tokenizes the sentence,
    padded/truncated to ``max_length`` tokens, and returns flattened tensors.
    """

    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = str(self.data.iloc[idx]['sentence'])

        tokens = self.tokenizer.encode_plus(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {name: tokens[name].flatten() for name in ('input_ids', 'attention_mask')}

In [None]:
# Create test dataset (without labels)
test_dataset = TestDataset(test_data, tokenizer)

# Make predictions
predictions = trainer.predict(test_dataset)

# Decode predictions to labels. The array's own argmax is used so this cell does
# not depend on the `np` alias, which is only imported in the notebook's first section.
predicted_labels = predictions.predictions.argmax(axis=1)
predicted_labels = label_encoder.inverse_transform(predicted_labels)


In [None]:
# Build the submission table (test ids with their predicted difficulty labels) and write it to CSV
output_df = pd.DataFrame({
    'id': test_data['id'],
    'difficulty': predicted_labels
})

output_df.to_csv('submissions_camabert_without_dropout_random.csv', index=False)

## Version 3 (with transfer learning)

In [None]:
!pip install transformers
!pip install accelerate -U
!pip install pandas numpy matplotlib
!pip install scikit-learn seaborn

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from transformers import CamembertTokenizer, CamembertForSequenceClassification, Trainer, TrainingArguments, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Load data from CSV file
df = pd.read_csv("training_data.csv")

# The dataset is expected to have 'sentence' and 'difficulty' columns
sentences = df['sentence'].tolist()
difficulties = df['difficulty'].tolist()

# Split data into training and validation sets (80/20, fixed seed)
train_texts, val_texts, train_labels, val_labels = train_test_split(sentences, difficulties, test_size=0.2, random_state=42)

# Load pre-trained CamemBERT tokenizer
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

# Tokenize training and validation texts; padding=True pads each split to its own longest sentence
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit the encoder on the training labels only, then reuse it for the validation labels
train_labels = label_encoder.fit_transform(train_labels)
val_labels = label_encoder.transform(val_labels)

# Convert labels to tensors. This is done exactly once: wrapping an existing
# tensor in torch.tensor() again only copies it and emits a UserWarning.
train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)

# Create datasets
class CustomDataset(torch.utils.data.Dataset):
    """Dataset over pre-tokenized sentences and their integer class labels.

    Args:
        encodings: mapping of field name (e.g. ``input_ids``) to a per-sentence
            sequence of values, as produced by calling a tokenizer on a list of texts.
        labels: integer labels indexable by position; may be a tensor or a plain sequence.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the ``idx``-th sample as a dict of tensors, including a ``labels`` entry."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # as_tensor passes an existing tensor element through unchanged, whereas
        # torch.tensor() would copy it and emit a UserWarning on every lookup.
        item['labels'] = torch.as_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of samples, taken from the labels."""
        return len(self.labels)

# Wrap the encodings and label tensors for the Trainer
train_dataset = CustomDataset(train_encodings, train_labels)
val_dataset = CustomDataset(val_encodings, val_labels)

import math

# Fine-tune CamemBERT for sequence classification
model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=6)
optimizer = AdamW(model.parameters(), lr=20e-5)

# NOTE(review): an explicit (optimizer, scheduler) pair is passed to the Trainer
# below, so the warmup_steps / weight_decay given here presumably do not affect
# the optimizer actually used — confirm against the Trainer documentation.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    gradient_accumulation_steps=2
)

# Define a function to compute accuracy
def compute_accuracy(p):
    """Accuracy metric for the Trainer: arg-max of the predictions vs. the label ids."""
    preds = p.predictions.argmax(-1)
    return {"accuracy": accuracy_score(preds, p.label_ids)}

# Define the total number of training steps. The scheduler advances once per
# optimizer update, not once per sample, so the count is
# (batches per epoch / gradient accumulation steps) * epochs. Counting samples
# instead would stretch the schedule so far that the learning rate barely decays.
batches_per_epoch = math.ceil(len(train_dataset) / training_args.train_batch_size)
updates_per_epoch = max(math.ceil(batches_per_epoch / training_args.gradient_accumulation_steps), 1)
total_steps = math.ceil(updates_per_epoch * training_args.num_train_epochs)

# Create the learning rate scheduler (linear warm-up, then linear decay to zero at total_steps)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=500,
    num_training_steps=total_steps
)

# Hand the custom optimizer and scheduler to the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_accuracy,
    optimizers=(optimizer, scheduler),  # Include the scheduler
)

# Train the model
train_history = trainer.train()

# Evaluate the fine-tuned model
eval_results = trainer.evaluate(eval_dataset=val_dataset)
print(eval_results)

In [None]:
# Save the fine-tuned model and tokenizer; the next cell reloads them from this directory.
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

In [None]:
# Load the fine-tuned model and tokenizer
model = CamembertForSequenceClassification.from_pretrained('./fine_tuned_model')
tokenizer = CamembertTokenizer.from_pretrained('./fine_tuned_model')

# Load the unlabeled dataset. The file name now matches the one used by every
# other section of this notebook ('unlabelled_test_data.csv').
unlabeled_df = pd.read_csv("unlabelled_test_data.csv")

# Tokenize the sentences in the unlabeled dataset
tokenized_texts = tokenizer(unlabeled_df['sentence'].tolist(), truncation=True, padding=True)

# Create torch dataset
# NOTE: this rebinds the name CustomDataset to a label-free variant that takes only encodings.
class CustomDataset(torch.utils.data.Dataset):
    """Label-free dataset over pre-tokenized sentences."""

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

unlabeled_dataset = CustomDataset(tokenized_texts)

# Set the model to evaluation mode
model.eval()

# Generate predictions for the unlabeled dataset
predictions = []
with torch.no_grad():
    for batch in torch.utils.data.DataLoader(unlabeled_dataset, batch_size=32):
        input_ids = batch['input_ids'].to(model.device)
        attention_mask = batch['attention_mask'].to(model.device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions.extend(logits.argmax(dim=1).cpu().numpy())

# Add predictions to the original DataFrame. The model outputs integer class ids,
# so they are decoded back to the original difficulty labels with the encoder
# fitted on the training labels.
unlabeled_df['difficulty'] = label_encoder.inverse_transform(predictions)

# Export predictions to a CSV file
unlabeled_df[['id', 'difficulty']].to_csv("predictions.csv", index=False)

import matplotlib.pyplot as plt

# trainer.train() returns a TrainOutput and a Trainer cannot be indexed by key,
# so the logged metrics are read from the trainer's log history instead. Each
# entry is a dict; training logs carry 'loss', evaluation logs carry 'eval_accuracy'.
log_history = trainer.state.log_history
loss_logs = [entry for entry in log_history if 'loss' in entry]
eval_logs = [entry for entry in log_history if 'eval_accuracy' in entry]

train_loss_steps = [entry['step'] for entry in loss_logs]
train_loss_values = [entry['loss'] for entry in loss_logs]
eval_accuracy_steps = [entry['step'] for entry in eval_logs]
eval_accuracy_values = [entry['eval_accuracy'] for entry in eval_logs]

# Plot the learning curve against the global training step at which each value was logged.
# A marker is used for accuracy because there may be only a single evaluation point.
plt.plot(train_loss_steps, train_loss_values, label='Training Loss')
plt.plot(eval_accuracy_steps, eval_accuracy_values, marker='o', label='Evaluation Accuracy')
plt.xlabel('Training step')
plt.ylabel('Metric Value')
plt.title('Learning Curve')
plt.legend()
plt.show()