<a href="https://colab.research.google.com/github/Niiingleiii/ML-French-Text-Classification/blob/main/Nvidia_Main_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Basic Models: Logistic Regression, KNN, Random Forest, Decision Tree

In [10]:
pip install datasets evaluate

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━

In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import CamembertTokenizer, CamembertForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import evaluate
import numpy as np

# Set environment variables for offline mode
# NOTE(review): these flags are assigned after `transformers` and `evaluate`
# were imported above, so they may be read too late to have any effect — confirm.
# NOTE(review): the cells below fetch the tokenizer/model by Hub name and read
# CSV files over HTTP, which needs network access; verify that offline mode is
# actually wanted here.
os.environ['HF_DATASETS_OFFLINE'] = '1'
os.environ['TRANSFORMERS_OFFLINE'] = '1'


In [None]:
# Load dataset: the original labelled data plus the augmented set, stacked
# into one frame with a fresh 0..n-1 index.
dataset = pd.read_csv('https://raw.githubusercontent.com/Niiingleiii/ML-data/main/training_data.csv')
aug_dataset = pd.read_csv('https://raw.githubusercontent.com/emilysr2/Data-Science-and-Machine-Learning/main/augmented_training_data_7000_v3%20(1).csv')
dataset = pd.concat([dataset, aug_dataset], ignore_index=True)

# Encode the difficulty as a categorical; the position of each category in
# `class_names` is the integer label id used from here on.
dataset['difficulty'] = dataset['difficulty'].astype('category')
class_names = dataset['difficulty'].cat.categories.tolist()

# Number of labels: taken from the full category list rather than from the
# training split, so it always matches the range of the integer codes even if
# a class happens to be absent from one split.
num_labels = len(class_names)

# Split dataset (10% validation, fixed seed for a repeatable split)
train_set, validation_set = train_test_split(dataset, test_size=0.1, random_state=42)

# For each split: convert 'difficulty' to its integer code, fill any missing
# values with '', rename the columns to the expected feature names and keep
# only the two columns that are needed downstream.
train_set, validation_set = (
    split.assign(difficulty=split['difficulty'].cat.codes)
    .fillna('')
    .rename(columns={'sentence': 'text', 'difficulty': 'labels'})[['text', 'labels']]
    for split in (train_set, validation_set)
)

# Load tokenizer
# Fetched by name ('almanach/camembert-base').
# NOTE(review): a later cell loads 'camembert-base' instead — presumably the
# same checkpoint under its older name; confirm and use one spelling.
tokenizer = CamembertTokenizer.from_pretrained('almanach/camembert-base')

# Preprocess data
def preprocess_function(examples):
    """Tokenize ``examples['text']`` with the module-level ``tokenizer``.

    Every sequence is truncated to at most 512 tokens and padded up to exactly
    512, so all encodings come back with the same length.

    Args:
        examples: Mapping with a ``'text'`` entry holding the sentence(s).

    Returns:
        Whatever the tokenizer call returns for those sentences.
    """
    tokenizer_options = {'padding': 'max_length', 'truncation': True, 'max_length': 512}
    return tokenizer(examples['text'], **tokenizer_options)

# Tokenize the sentences of both splits
train_encodings = preprocess_function({'text': train_set['text'].to_list()})
val_encodings = preprocess_function({'text': validation_set['text'].to_list()})

# Features are the padded token-id sequences; targets are the integer labels.
# The classical models below are fitted directly on these id sequences.
X_train, X_val = train_encodings['input_ids'], val_encodings['input_ids']
y_train, y_val = train_set['labels'].tolist(), validation_set['labels'].tolist()


In [None]:
def evaluation(model, X_test, y_test):
    """Score a fitted classifier on a held-out set.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature rows to predict on.
        y_test: True labels for ``X_test``.

    Returns:
        A list ``[accuracy, precision, recall, f1]`` where the last three are
        macro-averaged over the classes.
    """
    y_pred = model.predict(X_test)
    macro_scorers = (precision_score, recall_score, f1_score)
    return [accuracy_score(y_test, y_pred)] + [
        scorer(y_test, y_pred, average='macro') for scorer in macro_scorers
    ]

In [None]:
# initialize
# Baseline classifiers keyed by the column name they get in the comparison
# table. The tree-based models are seeded so that a fresh run reproduces the
# same scores, in line with the seeded models trained later in the notebook.
baseline_models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'KNN': KNeighborsClassifier(algorithm='kd_tree'),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
}

# Keep the individual model names available for later inspection
logistic_regression_model = baseline_models['Logistic Regression']
knn_model = baseline_models['KNN']
decision_tree_model = baseline_models['Decision Tree']
random_forest_model = baseline_models['Random Forest']

# train
for baseline in baseline_models.values():
    baseline.fit(X_train, y_train)

# results: one column per model, one row per metric returned by evaluation()
model_comparison = pd.DataFrame(index=['Accuracy', 'Precision', 'Recall', 'F1'])
for column_name, baseline in baseline_models.items():
    model_comparison[column_name] = evaluation(baseline, X_val, y_val)

print(model_comparison)

In [None]:
# Random forest with explicitly chosen settings: 300 trees, unlimited depth,
# and a fixed seed so the result is repeatable.
rfc_model_best = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_split=2,
    random_state=42,
).fit(X_train, y_train)

# Add its validation scores as a new column of the comparison table
model_comparison['Random Forest (best parameter)'] = evaluation(rfc_model_best, X_val, y_val)

print(model_comparison)

# Advanced Models: LGBM, Extra Trees, XGBoost, CatBoost

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees ensemble, seeded for a repeatable fit
etc_model = ExtraTreesClassifier(random_state=42).fit(X_train, y_train)

# Record its validation scores and show the updated table
model_comparison['Extra Trees'] = evaluation(etc_model, X_val, y_val)
model_comparison

In [None]:
from lightgbm import LGBMClassifier

# LightGBM with its default settings
lgb_model = LGBMClassifier().fit(X_train, y_train)

# Record its validation scores and show the updated table
model_comparison['LightGBM'] = evaluation(lgb_model, X_val, y_val)
model_comparison

In [None]:
from xgboost import XGBClassifier

# XGBoost with its default settings
xgb_model = XGBClassifier().fit(X_train, y_train)

# Record its validation scores and show the updated table
model_comparison['XGBoost'] = evaluation(xgb_model, X_val, y_val)
model_comparison

In [None]:
# Install CatBoost into the runtime, then import its classifier for the next cell.
!pip install catboost
from catboost import CatBoostClassifier

In [None]:
# CatBoost with its default settings, fitted on the same token-id features
cb_model_bert = CatBoostClassifier().fit(X_train, y_train)

# Record its validation scores and show the updated table
model_comparison['Catboost'] = evaluation(cb_model_bert, X_val, y_val)
model_comparison

# Our Best Model - CamemBERT

In [None]:
# Remove any already-installed `accelerate` before reinstalling it below.
!pip uninstall accelerate -y
# Install necessary packages
# NOTE(review): `-U` pulls the newest release of each package and nothing is
# pinned, so a rerun may install different versions than the original run.
!pip install transformers datasets evaluate accelerate -U

In [None]:
import os
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import CamembertTokenizer, CamembertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, Features, ClassLabel, Value
import evaluate
import numpy as np
from sklearn.metrics import precision_score, accuracy_score, f1_score, recall_score

In [None]:
# Locations of the labelled, augmented and unlabelled CSV files
train_data_path = 'https://raw.githubusercontent.com/Niiingleiii/ML-data/main/training_data.csv'
aug_data_path = 'https://raw.githubusercontent.com/emilysr2/Data-Science-and-Machine-Learning/main/augmented_training_data_7000_v3%20(1).csv'
test_data_path = 'https://raw.githubusercontent.com/Niiingleiii/ML-data/main/unlabelled_test_data.csv'

# Read both labelled sources and stack them into one frame with a fresh index
labelled_frames = [pd.read_csv(train_data_path), pd.read_csv(aug_data_path)]
aug_dataset = labelled_frames[1]
dataset = pd.concat(labelled_frames, ignore_index=True)

# Treat the difficulty as a categorical; category order defines the label ids
dataset = dataset.astype({'difficulty': 'category'})
class_names = list(dataset['difficulty'].cat.categories)

# Quick look at what was loaded
print(f'Number of datapoints: {len(dataset)}')
print(f'Datatable:\n {dataset.head(4)}')
print(f'Datatable columns {dataset.columns}')

In [None]:
# Hold out 10% of the labelled data for validation (fixed seed for a
# repeatable split). The result must be bound to `train_set`: everything below
# reads that name, and a stale `train_set` from an earlier cell no longer has
# a 'difficulty' column.
train_set, validation_set = train_test_split(dataset, test_size=0.1, random_state=42)

# Print the size of each set
print(f'Number of training datapoints: {len(train_set)}')
print(f'Number of validation datapoints: {len(validation_set)}')

# Optionally, print some samples from each set
print(f'Training datatable sample:\n {train_set.head(3)}')
print(f'Validation datatable sample:\n {validation_set.head(3)}')

# Number of labels: taken from the full category list so it always matches the
# range of the integer codes, even if a class is missing from the train split.
num_labels = len(class_names)
print(f'Number of labels: {num_labels}')

# For each split: replace the categorical difficulty with its integer code,
# fill any missing values with '', rename the columns to the expected feature
# names and keep only the two columns needed for the Dataset.
train_set, validation_set = (
    split.assign(difficulty=split['difficulty'].cat.codes)
    .fillna('')
    .rename(columns={'sentence': 'text', 'difficulty': 'labels'})[['text', 'labels']]
    for split in (train_set, validation_set)
)

# Define feature types explicitly
features = Features({
    'text': Value('string'),
    'labels': ClassLabel(num_classes=num_labels)
})

In [None]:
# Load tokenizer (the model itself is instantiated later, in the training cells)
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

# Freeze the whole encoder except its last transformer layer(s) and the classification head
def freeze_layers(model, num_unfrozen_layers=1):
    """Restrict training to the top of the network.

    All parameters under ``model.roberta`` are frozen, then the last
    ``num_unfrozen_layers`` entries of ``model.roberta.encoder.layer`` and all
    parameters of ``model.classifier`` are made trainable again. The model is
    modified in place.

    Args:
        model: Model exposing ``roberta.encoder.layer`` (a sequence of layers)
            and ``classifier``.
        num_unfrozen_layers: How many of the final encoder layers stay
            trainable. Defaults to 1 (only the last layer). 0 freezes the
            whole encoder; values above the layer count unfreeze every layer.

    Raises:
        ValueError: If ``num_unfrozen_layers`` is negative.
    """
    if num_unfrozen_layers < 0:
        raise ValueError('num_unfrozen_layers must be non-negative')

    # Start from a fully frozen backbone
    for param in model.roberta.parameters():
        param.requires_grad = False

    # Re-enable gradients for the trailing encoder layers. The start index is
    # computed explicitly because a plain `[-0:]` slice would select every layer.
    encoder_layers = model.roberta.encoder.layer
    first_trainable = max(len(encoder_layers) - num_unfrozen_layers, 0)
    for layer in encoder_layers[first_trainable:]:
        for param in layer.parameters():
            param.requires_grad = True

    # The classification head is always trained
    for param in model.classifier.parameters():
        param.requires_grad = True


In [None]:
# Preprocess data
def preprocess_function(examples):
    """Tokenize the ``'text'`` entry of ``examples`` with the module-level tokenizer.

    Sequences are truncated to 512 tokens and padded to that same length, so
    every encoded example has an identical size.
    """
    texts = examples['text']
    return tokenizer(texts, max_length=512, truncation=True, padding='max_length')

# Create Hugging Face Datasets from the two pandas splits (dropping the pandas
# index) and tokenize them in batches.
train_dataset, val_dataset = (
    Dataset.from_pandas(frame, features=features, preserve_index=False).map(
        preprocess_function, batched=True
    )
    for frame in (train_set, validation_set)
)

# Evaluation metrics, loaded once and reused by compute_metrics
accuracy_metric, precision_metric, recall_metric, f1_metric = map(
    evaluate.load, ('accuracy', 'precision', 'recall', 'f1')
)

def compute_metrics(p):
    """Turn a prediction object into the metric dictionary reported during evaluation.

    Args:
        p: Object with ``predictions`` (argmax over the last axis gives the
            predicted class) and ``label_ids`` (the reference labels).

    Returns:
        Dict with ``eval_accuracy``, ``eval_precision``, ``eval_recall`` and
        ``eval_f1``; the last three use weighted averaging.
    """
    shared = {'predictions': p.predictions.argmax(-1), 'references': p.label_ids}

    scores = {'eval_accuracy': accuracy_metric.compute(**shared)['accuracy']}
    weighted_metrics = (
        ('precision', precision_metric),
        ('recall', recall_metric),
        ('f1', f1_metric),
    )
    for metric_name, metric in weighted_metrics:
        scores[f'eval_{metric_name}'] = metric.compute(average='weighted', **shared)[metric_name]
    return scores

In [None]:
import dataclasses

# Training configuration for the best model (10e-5 is the same value as 1e-4)
best_training_config = {'seed': 42, 'learning_rate': 10e-5}

# Newer transformers releases name the evaluation-schedule argument
# `eval_strategy`; older ones only know `evaluation_strategy`. The install cell
# upgrades to the latest release, so use whichever name the installed
# TrainingArguments actually declares.
_training_arg_names = {field.name for field in dataclasses.fields(TrainingArguments)}
_eval_strategy_arg = (
    'eval_strategy' if 'eval_strategy' in _training_arg_names else 'evaluation_strategy'
)

# Train the best model: pretrained CamemBERT with a fresh classification head,
# with everything frozen except what freeze_layers() leaves trainable.
best_model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=num_labels)
freeze_layers(best_model)

best_training_args = TrainingArguments(
    output_dir=f'./results_seed_{best_training_config["seed"]}',
    learning_rate=best_training_config['learning_rate'],
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=6,
    weight_decay=0.01,
    logging_dir=f'./logs_seed_{best_training_config["seed"]}',
    logging_steps=15,
    seed=best_training_config['seed'],
    # Evaluate on the validation set at the end of every epoch
    **{_eval_strategy_arg: 'epoch'},
)

best_trainer = Trainer(
    model=best_model,
    args=best_training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

best_trainer.train()

# Evaluate the best model on the validation set and report the metrics
best_metrics = best_trainer.evaluate()
print("Metrics for the best model:")
print(f"Accuracy: {best_metrics['eval_accuracy']}")
print(f"Precision: {best_metrics['eval_precision']}")
print(f"Recall: {best_metrics['eval_recall']}")
print(f"F1 Score: {best_metrics['eval_f1']}")


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Evaluate and get predictions from the best model. `best_trainer` is the
# trainer created above; the name `trainer` is only bound later, inside the
# ensembling loop, so it must not be used here.
predictions = best_trainer.predict(val_dataset)
preds = predictions.predictions.argmax(-1)
labels = predictions.label_ids

# Compute confusion matrix (true labels first, predicted labels second)
cm = confusion_matrix(labels, preds)

# Display confusion matrix with the original difficulty names on the axes
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.show()

# Ensembling Using CamemBERT

In [None]:
# Training configurations for ensembling
import dataclasses

# One entry per ensemble member: each gets its own seed and learning rate.
# NOTE(review): the last entry uses 1e-5 while the others use about 1e-4 —
# confirm this is intentional and not a typo for 10e-5.
ensemble_training_configs = [
    {'seed': 42, 'learning_rate': 10e-5},
    {'seed': 43, 'learning_rate': 11e-5},
    {'seed': 44, 'learning_rate': 10e-5},
    {'seed': 45, 'learning_rate': 10e-5},
    {'seed': 46, 'learning_rate': 1e-5},
]

# Newer transformers releases name the evaluation-schedule argument
# `eval_strategy`; older ones only know `evaluation_strategy`. Use whichever
# name the installed TrainingArguments actually declares.
_training_arg_names = {field.name for field in dataclasses.fields(TrainingArguments)}
_eval_strategy_arg = (
    'eval_strategy' if 'eval_strategy' in _training_arg_names else 'evaluation_strategy'
)

ensemble_models = []
ensemble_trainers = []

# Train the models for ensembling
for config in ensemble_training_configs:
    # Fresh pretrained model per member, frozen the same way as the best model
    model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=num_labels)
    freeze_layers(model)

    # NOTE(review): the seed-42 member writes to the same ./results_seed_42 and
    # ./logs_seed_42 directories as the single best model trained earlier.
    training_args = TrainingArguments(
        output_dir=f'./results_seed_{config["seed"]}',
        learning_rate=config['learning_rate'],
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=6,
        weight_decay=0.01,
        logging_dir=f'./logs_seed_{config["seed"]}',
        logging_steps=15,
        seed=config['seed'],
        # Evaluate on the validation set at the end of every epoch
        **{_eval_strategy_arg: 'epoch'},
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics
    )

    trainer.train()
    ensemble_trainers.append(trainer)
    ensemble_models.append(model)

In [None]:
# Evaluate the ensemble model: gather every member's prediction array on the
# validation set.
ensemble_predictions = [
    member.predict(val_dataset).predictions for member in ensemble_trainers
]

# Average the members' outputs, then pick the highest-scoring class per example
average_val_predictions = np.stack(ensemble_predictions).mean(axis=0)
ensemble_preds = average_val_predictions.argmax(axis=1)

# Calculate ensemble metrics against the validation labels
# (precision / recall / F1 use weighted averaging)
ensemble_labels = val_dataset['labels']
ensemble_accuracy = accuracy_score(ensemble_labels, ensemble_preds)
ensemble_precision, ensemble_recall, ensemble_f1 = (
    scorer(ensemble_labels, ensemble_preds, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

print("Metrics for the ensemble model:")
print(f"Accuracy: {ensemble_accuracy}")
print(f"Precision: {ensemble_precision}")
print(f"Recall: {ensemble_recall}")
print(f"F1 Score: {ensemble_f1}")

In [None]:
# Load the unlabelled test data and prepare it the same way as the training text
test_data = pd.read_csv(test_data_path)
test_data = test_data.rename(columns={'sentence': 'text'})
# Keep the ids for the submission file. Stored as `test_ids` rather than `id`
# so the builtin `id()` is not shadowed for the rest of the kernel session.
test_ids = test_data['id'].tolist()
test_data = test_data[['text']]
test_dataset = Dataset.from_pandas(test_data, preserve_index=False)
test_dataset = test_dataset.map(preprocess_function, batched=True)
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Collect predictions from each model
all_predictions = []

for trainer in ensemble_trainers:
    predictions = trainer.predict(test_dataset)
    all_predictions.append(predictions.predictions)

# Average the predictions to get the final prediction
average_predictions = np.mean(np.stack(all_predictions), axis=0)
final_preds = np.argmax(average_predictions, axis=1)

# Map the predictions to the original labels
pred_labels = [class_names[pred] for pred in final_preds]

# Save the predictions to a submission file
submission = pd.DataFrame({'id': test_ids, 'difficulty': pred_labels})
submission.to_csv('/content/submission_ensemble.csv', index=False)

print("Ensemble predictions saved to submission_ensemble.csv")

# Code to download the file in Google Colab
from google.colab import files
files.download('/content/submission_ensemble.csv')