# In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from transformers import CamembertTokenizer, CamembertForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE

# Disable wandb tracking if not needed
os.environ["WANDB_DISABLED"] = "true"

# Get data
train_data = pd.read_csv('../input/french-climate-change-fake-news/train.csv')
test_data = pd.read_csv('../input/french-climate-change-fake-news/test.csv')

# Check data
print("Train data:")
print(train_data.head())
print("\nTest data:")
print(test_data.head())

# Separate the raw texts from their labels
X_train = train_data['Text']
y_train = train_data['Label']

# Map string labels to integers (ids follow the order of first appearance)
label_mapping = {label: idx for idx, label in enumerate(y_train.unique())}
y_train = y_train.map(label_mapping)

# If the test file ships with labels, encode them with the same mapping
if 'Label' in test_data.columns:
    test_data['Label'] = test_data['Label'].map(label_mapping)

# Perform Train-Test Split.
# The classes are imbalanced, so stratify on the label to keep the class
# proportions the same in the training and validation parts. Stratification
# needs at least two samples per class; fall back to a plain random split
# when that does not hold so the script still runs.
can_stratify = y_train.value_counts().min() >= 2
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train if can_stratify else None,
)

# Balance the training split: every non-majority class is resampled with
# replacement up to the size of the most frequent class.
from collections import Counter
from sklearn.utils import resample

# Keep each text next to its label so rows are resampled as a unit
train_data_combined = pd.DataFrame({'Text': X_train_split, 'Label': y_train_split})

# The most frequent label defines the target size for all other labels
class_counts = Counter(y_train_split)
majority_class = max(class_counts, key=class_counts.get)
minority_classes = [cls for cls in class_counts if cls != majority_class]
target_size = class_counts[majority_class]

# Majority rows are kept untouched and come first; each minority class
# follows, drawn with replacement using the same fixed seed.
majority_rows = train_data_combined[train_data_combined['Label'] == majority_class]
oversampled_data = [majority_rows] + [
    resample(
        train_data_combined[train_data_combined['Label'] == cls],
        replace=True,
        n_samples=target_size,
        random_state=42,
    )
    for cls in minority_classes
]

# Stitch the per-class frames back together and split into texts / labels
oversampled_train_data = pd.concat(oversampled_data)
X_train_resampled, y_train_resampled = (
    oversampled_train_data['Text'],
    oversampled_train_data['Label'],
)

# Custom Dataset for BERT
class NewsDataset(Dataset):
    """Dataset that tokenizes a single text on every item lookup.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of class ids, aligned with ``texts``.
        tokenizer: Callable invoked with the text plus ``max_length``,
            ``padding``, ``truncation`` and ``return_tensors`` keyword
            arguments; its result must expose ``"input_ids"`` and
            ``"attention_mask"`` tensors.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of texts held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return flattened ``input_ids`` / ``attention_mask`` and the label tensor for ``idx``."""
        sample_text, sample_label = self.texts[idx], self.labels[idx]
        encoded = self.tokenizer(
            sample_text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # The tokenizer output is flattened so each item carries 1-D tensors.
        item = {key: encoded[key].flatten() for key in ("input_ids", "attention_mask")}
        item["labels"] = torch.tensor(sample_label, dtype=torch.long)
        return item

# Load tokenizer and model
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
model = CamembertForSequenceClassification.from_pretrained(
    "camembert-base", num_labels=len(np.unique(y_train))
)

# Prepare data.
# Train on the whole oversampled training split and validate on the hold-out
# split that was set aside BEFORE oversampling. Splitting the oversampled data
# again would place copies of the same row in both the training and the
# validation part (data leakage) and would evaluate on an artificially
# balanced class distribution, which inflates the validation metrics.
train_dataset = NewsDataset(
    X_train_resampled.tolist(), y_train_resampled.tolist(), tokenizer, max_len=512
)
val_dataset = NewsDataset(X_val.tolist(), y_val.tolist(), tokenizer, max_len=512)

# Training arguments
training_args = TrainingArguments(
    # Output locations and logging
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    report_to=["none"],  # Disables wandb or other trackers
    # Optimisation settings
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    # Evaluate and checkpoint at the end of every epoch; both strategies are
    # set to "epoch" so the best checkpoint can be reloaded after training.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,  # Load the best model at the end
    metric_for_best_model="eval_loss",  # Evaluation loss picks the best model
)

# Trainer wiring the model, its configuration and both datasets together
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train model
trainer.train()

# The trainer was configured with load_best_model_at_end=True, so the model it
# holds after training is taken as the best one.
best_model = trainer.model

# Bundle everything needed to reuse the model into a single checkpoint.
# NOTE(review): the tokenizer object itself is stored in the dict, so it is
# pickled next to the weights — confirm that consumers of this file can load
# pickled Python objects, or consider tokenizer.save_pretrained instead.
checkpoint = {
    'model_state_dict': best_model.state_dict(),
    'tokenizer': tokenizer,
    'label_mapping': label_mapping,  # Needed to decode predicted ids later
}
torch.save(checkpoint, "./best_model.pt")
print("Best model saved: ./best_model.pt")

# Evaluate and print metrics for the best model
val_results = trainer.evaluate()
print("Best Model Validation Results:", val_results)

# Generate detailed classification report using the best model
val_predictions = trainer.predict(val_dataset)
y_val_pred = np.argmax(val_predictions.predictions, axis=1)
print("\nClassification Report:\n")
print(classification_report(y_val.tolist(), y_val_pred))

# Confusion Matrix, normalised over the true labels (each row sums to 1).
# Passing the full list of class ids fixes the matrix size and row/column
# order, so the tick labels stay aligned even when a class does not occur in
# the validation labels or predictions.
labels = list(label_mapping.keys())
label_ids = list(label_mapping.values())
conf_matrix = confusion_matrix(y_val, y_val_pred, labels=label_ids, normalize='true')
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='.2f', cmap='Blues', xticklabels=labels, yticklabels=labels)
# Title and axis labels are set once; the title states that the values are
# normalised rather than raw counts.
plt.title('Normalized Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()

# Predict test data using the best model.
# The dataset class expects one label per text, so a list of zeros is passed
# as a stand-in.
test_texts = test_data["Text"].tolist()
placeholder_labels = [0] * len(test_data)
test_dataset = NewsDataset(test_texts, placeholder_labels, tokenizer, max_len=512)
test_predictions = trainer.predict(test_dataset)

# Turn the highest-scoring class id of each row back into its original label
predicted_ids = np.argmax(test_predictions.predictions, axis=1)
id_to_label = {idx: name for name, idx in label_mapping.items()}
test_data["Label"] = pd.Series(predicted_ids).map(id_to_label)

# Save test file with predictions
test_data.to_csv("test_with_predictions.csv", index=False)