In [None]:
# Install necessary libraries
!pip install transformers[torch] datasets torch scikit-learn accelerate>=0.26.0 lollms_client

# Fine-tuning BERT for Multi-Class Text Classification
In this notebook, we will fine-tune a pre-trained BERT model for the text classification task in the Frugal AI Challenge. The steps include:
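1. Loading the dataset and splitting it into training and validation sets.
2. Balancing the classes with LLM-generated examples and preprocessing (tokenizing) the text data.
3. Loading BERT with a classification head and a class-weighted loss.
4. Fine-tuning the model.
5. Evaluating the model's performance and publishing it with a model card.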


In [None]:
# Import necessary libraries
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd

In [None]:
# Load the dataset
dataset = load_dataset("QuotaClimat/frugalaichallenge-text-train", "default")

# Display the dataset structure
print(dataset)

# Split the training set into train and validation subsets
from sklearn.model_selection import train_test_split

# Convert the training set to a Pandas DataFrame for splitting
train_df = pd.DataFrame(dataset["train"])

# Drop the duplicate column "__index_level_0__" if it exists
if "__index_level_0__" in train_df.columns:
    train_df = train_df.drop(columns=["__index_level_0__"])

# Define the label text and organize them by their prefixes
label_texts = [
    "0_not_relevant",
    "1_not_happening",
    "2_not_human",
    "3_not_bad",
    "4_solutions_harmful_unnecessary",
    "5_science_unreliable",
    "6_proponents_biased",
    "7_fossil_fuels_needed"
]

# Sort the labels by their numeric prefix (compared as integers, so a prefix
# such as "10" would sort after "9" instead of between "1" and "2")
sorted_labels = sorted(label_texts, key=lambda x: int(x.split("_")[0]))

# Create a mapping from label text to integers based on the sorted order
label_mapping = {label: idx for idx, label in enumerate(sorted_labels)}

# Map the labels in the DataFrame. `Series.map` turns unknown labels into NaN,
# so fail loudly here instead of carrying NaN labels into the split/training.
mapped_labels = train_df["label"].map(label_mapping)
unmapped_labels = train_df.loc[mapped_labels.isna(), "label"].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Labels missing from label_mapping: {sorted(str(label) for label in unmapped_labels)}"
    )
train_df["label"] = mapped_labels.astype(int)

# Perform an 80-20 split for training and validation
train_data, val_data = train_test_split(train_df, test_size=0.2, stratify=train_df["label"], random_state=42)

# Convert the split data back to Hugging Face Dataset format.
# The split frames keep their shuffled pandas index; without preserve_index=False
# `from_pandas` would store it as a new "__index_level_0__" column (the very column
# dropped above).
from datasets import Dataset
train_dataset = Dataset.from_pandas(train_data, preserve_index=False)
val_dataset = Dataset.from_pandas(val_data, preserve_index=False)

# Update the dataset dictionary to include the validation set
dataset = {
    "train": train_dataset,
    "validation": val_dataset
}

# Display the updated dataset structure
print(dataset)

# Display the label mapping for reference
print("Label Mapping:", label_mapping)


In [None]:
from lollms_client import LollmsClient
from datasets import Dataset
import pandas as pd
import json

# Client used to request synthetic examples; it points at http://localhost:9600
lc = LollmsClient("http://localhost:9600")

# Number of rows per label before any synthetic data is added
class_counts = train_df["label"].value_counts()
print("Class Distribution Before Balancing:", class_counts, sep="\n")

# Every class is topped up to the size of the most frequent one
target_count = class_counts.max()

import random

# Function to generate additional examples for a class
def generate_examples_for_class(class_name, existing_examples, num_examples_needed,
                                max_failed_attempts=3, verbose=True):
    """Ask the LLM client (`lc`) for new text examples of one class.

    Up to 10 randomly chosen existing examples are embedded in the prompt as a
    JSON document, and the client is queried in batches of at most 10 until
    enough examples were collected.

    Args:
        class_name (str): Name of the class, inserted verbatim into the prompt.
        existing_examples (list[str]): Real examples of the class to show the LLM.
        num_examples_needed (int): Number of new examples to produce.
        max_failed_attempts (int): Number of consecutive requests that may yield
            no usable example (invalid JSON, unexpected structure, empty list)
            before giving up. Bounds the loop, which otherwise never ends when
            the LLM keeps answering without an "examples" list.
        verbose (bool): Print every prompt and raw response (debug output).

    Returns:
        list[str]: At most `num_examples_needed` generated examples; fewer (possibly
        none) when the client repeatedly fails to return usable data.
    """
    if num_examples_needed <= 0:
        return []

    # Randomly select up to 10 examples from the existing examples
    random_examples = random.sample(existing_examples, min(len(existing_examples), 10))
    batch_size = 10  # Generate at most 10 examples per request

    # json.dumps escapes quotes, backslashes and newlines inside the example
    # texts, so the JSON shown to the LLM is always well-formed.
    json_structure = json.dumps(
        {"class": class_name, "examples": random_examples},
        indent=4,
        ensure_ascii=False,
    )

    # Everything in the prompt except the requested count, which is appended per
    # batch (a textual replace of "10" would also rewrite any "10" occurring in
    # the class name or in the example texts).
    prompt_prefix = (
        "Build a JSON code that contains a list of new text examples in the same class: "
        + class_name
        + ".\nHere are some examples from the class:\n```json\n"
        + json_structure
        + "\n```\n\n"
    )

    generated_examples = []
    failed_attempts = 0
    while len(generated_examples) < num_examples_needed and failed_attempts < max_failed_attempts:
        remaining = num_examples_needed - len(generated_examples)
        current_batch_size = min(batch_size, remaining)
        prompt = (
            prompt_prefix
            + "Generate "
            + str(current_batch_size)
            + " new examples in the same style and tone."
        )

        if verbose:
            print("Prompt being sent:")
            print(prompt)

        # Generate synthetic examples using LollmsClient
        response = lc.generate_code(prompt)

        if verbose:
            print("Response received:")
            print(response)

        # Parse the generated JSON; anything unusable leaves `new_examples` empty
        new_examples = []
        if not isinstance(response, str):
            print(f"Unexpected response format: {response}")
        else:
            try:
                generated_data = json.loads(response.strip())
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON response for class {class_name}: {e}")
            else:
                if isinstance(generated_data, dict) and isinstance(generated_data.get("examples"), list):
                    # Keep only non-empty strings; other items cannot be used as quotes
                    new_examples = [
                        example for example in generated_data["examples"]
                        if isinstance(example, str) and example.strip()
                    ]
                else:
                    print(f"Unexpected response format: {response}")

        if not new_examples:
            failed_attempts += 1
            continue

        failed_attempts = 0
        # The LLM may return more than requested; never exceed the target
        generated_examples.extend(new_examples[:remaining])

    return generated_examples
from tqdm.notebook import tqdm  # Import tqdm for progress bar
import random

# Generate additional examples for underrepresented classes.
#
# Balancing works on the training split only (`train_data`). `train_df` still
# contains the rows that were split off into the validation set, so building the
# balanced training frame from it would leak validation examples into training
# and inflate every validation metric.
class_counts = train_data["label"].value_counts()
target_count = class_counts.max()

# Reverse lookup: integer label -> class name
index_to_class = {idx: name for name, idx in label_mapping.items()}

new_data = []

for label, count in tqdm(class_counts.items(), desc="Balancing Classes"):
    if count >= target_count:
        continue  # Majority class: nothing to add

    class_name = index_to_class[label]
    num_examples_needed = target_count - count
    existing_examples = train_data.loc[train_data["label"] == label, "quote"].tolist()

    # Debug: Check existing examples
    print(f"Class: {class_name}, Existing Examples: {len(existing_examples)}")

    # Generate new examples (the function itself samples up to 10 of the
    # existing examples for the prompt). The slice guards against the generator
    # returning more than requested, which would overshoot the target count.
    generated_examples = generate_examples_for_class(class_name, existing_examples, num_examples_needed)
    generated_examples = generated_examples[:num_examples_needed]

    # Debug: Check generated examples
    print(f"Class: {class_name}, Needed: {num_examples_needed}, Generated: {len(generated_examples)}")

    # Add the generated examples to the new data
    new_data.extend({"quote": example, "label": label} for example in generated_examples)

# Convert the new data to a DataFrame
new_data_df = pd.DataFrame(new_data)

# Debug: Check new data shape
print(f"New Data Shape: {new_data_df.shape}")

# Append the new data to the training split
balanced_train_df = pd.concat([train_data, new_data_df], ignore_index=True)

# Debug: Check final distribution
print("Class Distribution After Balancing:")
print(balanced_train_df["label"].value_counts())


In [None]:
# Show the per-label row counts of the balanced frame
print("Class Distribution After Balancing:", balanced_train_df["label"].value_counts(), sep="\n")

In [None]:
# Size of the majority class (target number of examples per class)
target_count = class_counts.max()

print(balanced_train_df)

# Per-label row counts of the balanced frame
balanced_class_counts = balanced_train_df["label"].value_counts()

# Back to Hugging Face Dataset format; this becomes the new "train" split
balanced_train_dataset = Dataset.from_pandas(balanced_train_df)
dataset["train"] = balanced_train_dataset

# Report the new class distribution
print("Class Distribution After Balancing:", balanced_class_counts, sep="\n")

In [None]:
# Save the balanced training split to the local directory "balanced_train_dataset".
# Only this single split is written; the validation split in `dataset` is not saved here.
balanced_train_dataset.save_to_disk("balanced_train_dataset")

In [None]:
from datasets import Dataset, DatasetDict

# Reload the balanced training split from the directory it was saved to
# ("balanced_train_dataset"). What was saved is a single `Dataset`, so it is read
# back with `Dataset.load_from_disk` and stored as the "train" split; the
# in-memory "validation" split of `dataset` is kept untouched. (Loading a
# `DatasetDict` from "balanced_dataset" fails on a fresh run because nothing in
# this notebook writes that directory.)
dataset["train"] = Dataset.load_from_disk("balanced_train_dataset")


## Class weights
The dataset is imbalanced, so we also compensate for it in the loss function: each class is weighted inversely to its frequency (class weighting).

In [8]:
# From here on `train_df` refers to the balanced frame (existing rows plus the
# LLM-generated rows); the class weights computed below are derived from it.
train_df = balanced_train_df

In [None]:
import numpy as np
from torch.utils.data import DataLoader
from sklearn.utils.class_weight import compute_class_weight

# Integer labels of the training frame as a NumPy array
train_labels = train_df["label"].values

# scikit-learn's "balanced" heuristic gives rarer classes larger weights;
# the result is converted straight to a float tensor for use in the loss.
class_weights = torch.tensor(
    compute_class_weight(
        class_weight="balanced",
        classes=np.unique(train_labels),
        y=train_labels,
    ),
    dtype=torch.float,
)

print("Class Weights:", class_weights)


In [None]:
# Tokenizer matching the pre-trained checkpoint that is fine-tuned below
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def preprocess_data(examples):
    """Tokenize the "quote" texts of a batch of dataset rows.

    Args:
        examples: Mapping with a "quote" entry holding the texts to encode
            (a batch of rows, since the function is applied with ``batched=True``).

    Returns:
        The tokenizer output, with every text truncated or padded to 128 tokens.
    """
    return tokenizer(
        examples["quote"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )


# Training split: tokenize, then expose only model inputs + label as PyTorch tensors
tokenized_train_dataset = dataset["train"].map(preprocess_data, batched=True)
tokenized_train_dataset.set_format(columns=["input_ids", "attention_mask", "label"], type="torch")

# Validation split: same treatment
tokenized_val_dataset = dataset["validation"].map(preprocess_data, batched=True)
tokenized_val_dataset.set_format(columns=["input_ids", "attention_mask", "label"], type="torch")


In [None]:
# Load the pre-trained BERT model with a classification head
num_labels = len(label_mapping)  # Number of unique labels in the dataset
from transformers import Trainer, TrainingArguments
from transformers.modeling_outputs import SequenceClassifierOutput
from torch.nn import CrossEntropyLoss

# Define a custom model with weighted loss
class WeightedBERT(BertForSequenceClassification):
    """BERT sequence classifier whose training loss is a class-weighted cross-entropy.

    Args:
        config: Model configuration, forwarded to `BertForSequenceClassification`.
        class_weights (torch.Tensor | None): One weight per class, passed as
            `weight` to `CrossEntropyLoss`. Optional so that a checkpoint can be
            re-instantiated without weights (e.g. for inference); with `None`
            the loss is a plain, unweighted cross-entropy.
    """

    def __init__(self, config, class_weights=None):
        super().__init__(config)
        # Kept as a plain attribute (not a parameter/buffer), so it is not
        # written into saved checkpoints.
        self.class_weights = class_weights

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        """Run the classifier and, when `labels` are given, attach the weighted loss.

        Returns the parent model's output unchanged when `labels` is None.
        Otherwise returns the same output with the weighted loss as first
        element (tuple outputs) or as `loss` field (`SequenceClassifierOutput`).
        """
        # labels=None: the parent must not compute its own (unweighted) loss,
        # which would be thrown away immediately.
        outputs = super().forward(input_ids=input_ids, attention_mask=attention_mask, labels=None, **kwargs)
        if labels is None:
            return outputs

        # Without a loss, the logits are the first element of both the tuple
        # and the ModelOutput form.
        logits = outputs[0]
        weight = self.class_weights
        if weight is not None:
            weight = weight.to(logits.device)
        loss_fn = CrossEntropyLoss(weight=weight)
        loss = loss_fn(logits.view(-1, self.num_labels), labels.view(-1))

        if isinstance(outputs, tuple):  # return_dict=False
            return (loss,) + outputs
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

# Load the model with class weights
model = WeightedBERT.from_pretrained("bert-base-uncased", num_labels=num_labels, class_weights=class_weights)



In [12]:
# Metric callback handed to the Trainer
def compute_metrics(pred):
    """Compute accuracy and support-weighted precision/recall/F1.

    Args:
        pred: Object exposing `label_ids` (true labels) and `predictions`
            (per-class scores; the arg-max over the last axis is the predicted class).

    Returns:
        dict: Keys "accuracy", "precision", "recall" and "f1".
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    weighted_scores = precision_recall_fscore_support(y_true, y_pred, average="weighted")
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": weighted_scores[0],
        "recall": weighted_scores[1],
        "f1": weighted_scores[2],
    }


In [15]:
# Training configuration (includes gradient clipping and gradient accumulation)
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,               # regularization
    max_grad_norm=1.0,               # gradient clipping
    gradient_accumulation_steps=2,   # accumulate gradients over 2 steps
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,             # upper bound; early stopping may end training sooner
    # Evaluate and checkpoint once per epoch
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Keep the checkpoint with the lowest validation loss
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False
)


In [None]:
from transformers import EarlyStoppingCallback

# Early stopping: give up after 2 evaluations whose monitored metric (set via
# `metric_for_best_model` in the training arguments) improved by less than 0.01
early_stopping = EarlyStoppingCallback(
    early_stopping_threshold=0.01,
    early_stopping_patience=2
)

# Wire model, arguments, data, metrics and callbacks into a Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    callbacks=[early_stopping],
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_val_dataset,
    train_dataset=tokenized_train_dataset
)

# Run fine-tuning
trainer.train()


In [None]:
# Save the fine-tuned model and tokenizer side by side in "./bert-fine-tuned".
# The report and plot cells below write their files into this same directory.
model.save_pretrained("./bert-fine-tuned")
tokenizer.save_pretrained("./bert-fine-tuned")


In [None]:
# Collect the Trainer's log history into a DataFrame (one row per logged entry);
# the plotting cells below read the "epoch", "loss", "eval_loss" and
# "eval_accuracy" columns from it.
logs = pd.DataFrame(trainer.state.log_history)
print(logs)

In [None]:
# Show the class-name -> integer-label mapping used for the axis labels below
print(label_mapping)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Run the fine-tuned model over the validation split
predictions = trainer.predict(tokenized_val_dataset)

# Ground truth and arg-max class per example
true_labels = predictions.label_ids
predicted_labels = predictions.predictions.argmax(axis=-1)

# Confusion matrix of true vs. predicted labels
conf_matrix = confusion_matrix(true_labels, predicted_labels)

# Heatmap with class names on both axes
plt.figure(figsize=(10, 8))
sns.heatmap(
    conf_matrix,
    cmap="Blues",
    annot=True,
    fmt="d",
    xticklabels=list(label_mapping),
    yticklabels=list(label_mapping),
)
plt.title("Confusion Matrix")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.show()


In [None]:
# Display the label mapping as the cell's output
label_mapping

In [None]:
from sklearn.metrics import classification_report

# Predict on the validation split
predictions = trainer.predict(tokenized_val_dataset)

# True labels and arg-max predictions
true_labels = predictions.label_ids
predicted_labels = predictions.predictions.argmax(axis=-1)

# Per-class precision / recall / F1 report, labelled with the class names
report = classification_report(
    true_labels,
    predicted_labels,
    target_names=list(label_mapping),
)
print(report)


In [None]:
# Inspect the "epoch" column of the training logs (x-axis of the plots below)
print(logs["epoch"])

In [None]:
from pathlib import Path

# All artefacts go next to the saved model; create the directory if needed
output_dir = Path("./bert-fine-tuned")
output_dir.mkdir(parents=True, exist_ok=True)

# Save the classification report to a text file
with open(output_dir / "classification_report.txt", "w", encoding="utf-8") as f:
    f.write(report)

# The log history mixes training entries (which carry "loss") and evaluation
# entries (which carry "eval_*"), so each metric column is NaN on the rows of
# the other kind. matplotlib breaks a line at every NaN, which leaves the
# validation curves as unconnected markers; drop the NaN rows per metric first.
train_logs = logs.dropna(subset=["loss"])
eval_loss_logs = logs.dropna(subset=["eval_loss"])
eval_accuracy_logs = logs.dropna(subset=["eval_accuracy"])

# Save the training evolution plots
plt.figure(figsize=(10, 5))
plt.plot(train_logs["epoch"], train_logs["loss"], label="Training Loss", marker="o")
plt.plot(eval_loss_logs["epoch"], eval_loss_logs["eval_loss"], label="Validation Loss", marker="o")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training and Validation Loss")
plt.legend()
plt.grid()
plt.savefig(output_dir / "training_loss_plot.png")

plt.figure(figsize=(10, 5))
plt.plot(eval_accuracy_logs["epoch"], eval_accuracy_logs["eval_accuracy"], label="Validation Accuracy", marker="o", color="green")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.title("Validation Accuracy")
plt.legend()
plt.grid()
plt.savefig(output_dir / "validation_accuracy_plot.png")


In [None]:
# Redraw the confusion-matrix heatmap and write it to disk as a PNG
plt.figure(figsize=(10, 8))
sns.heatmap(
    conf_matrix,
    cmap="Blues",
    annot=True,
    fmt="d",
    xticklabels=list(label_mapping),
    yticklabels=list(label_mapping),
)
plt.title("Confusion Matrix")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.savefig("./bert-fine-tuned/confusion_matrix.png")


In [None]:
# Print the raw confusion matrix values (nothing is saved in this cell)
print(conf_matrix)

In [None]:
#Please open a console and type:
# huggingface-cli login
# log in to hugging face with your credentials

In [33]:
def generate_model_card(
    model_dir, model_name, description, metrics_data, limitations, citation, label_mapping
):
    """
    Generates a model card and saves it as README.md in the specified directory.

    Args:
        model_dir (str | os.PathLike): Directory where the model card will be saved.
            It is created if it does not exist.
        model_name (str): Name of the model (surrounding whitespace is ignored).
        description (str): Description of the model.
        metrics_data (dict): Performance metrics of the model. The entries
            'precision', 'recall', 'f1' and 'support' are indexed by class index;
            a class without an entry is rendered as "n/a". The scalar entries
            'accuracy', 'macro_precision', 'macro_recall', 'macro_f1',
            'weighted_precision', 'weighted_recall' and 'weighted_f1' are required.
        limitations (str): Limitations of the model.
        citation (str): Citation for the model.
        label_mapping (dict): Mapping from class name to model output index.

    Raises:
        KeyError: If one of the required entries of `metrics_data` is missing.
    """
    from pathlib import Path  # local import keeps this cell self-contained

    def _lookup(per_class, idx):
        """Return per_class[idx], or None when the class has no entry."""
        try:
            return per_class[idx]
        except (KeyError, IndexError):
            return None

    def _score(per_class, idx):
        """Format a per-class score with two decimals, or "n/a" when missing."""
        value = _lookup(per_class, idx)
        return "n/a" if value is None else f"{value:.2f}"

    model_name = model_name.strip()

    # Generate the metrics table dynamically
    metrics_table = "| Class | Precision | Recall | F1-Score | Support |\n"
    metrics_table += "|-------|-----------|--------|----------|---------|\n"
    for class_name, idx in label_mapping.items():
        support = _lookup(metrics_data['support'], idx)
        metrics_table += (
            f"| {class_name} "
            f"| {_score(metrics_data['precision'], idx)} "
            f"| {_score(metrics_data['recall'], idx)} "
            f"| {_score(metrics_data['f1'], idx)} "
            f"| {'n/a' if support is None else support} |\n"
        )

    # Add overall metrics
    overall_metrics = (
        f"- **Overall Accuracy**: {metrics_data['accuracy']:.2f}\n"
        f"- **Macro Average**: Precision: {metrics_data['macro_precision']:.2f}, Recall: {metrics_data['macro_recall']:.2f}, F1-Score: {metrics_data['macro_f1']:.2f}\n"
        f"- **Weighted Average**: Precision: {metrics_data['weighted_precision']:.2f}, Recall: {metrics_data['weighted_recall']:.2f}, F1-Score: {metrics_data['weighted_f1']:.2f}\n"
    )

    # "index: class name" pairs; label_mapping maps class name -> index, so the
    # items are unpacked as (class_name, idx)
    class_mapping_text = ', '.join(
        f"{idx}: {class_name}" for class_name, idx in label_mapping.items()
    )

    # Generate the model card content
    model_card_content = f"""
---
license: apache-2.0
datasets:
- QuotaClimat/frugalaichallenge-text-train
language:
- en
metrics:
- accuracy
- f1
base_model:
- google-bert/bert-base-uncased
library_name: transformers
---

# Model Card: {model_name}

## Model Overview
{description}

## Dataset
- **Source**: Frugal AI Challenge Text Task Dataset
- **Classes**: {len(label_mapping)} unique labels representing various categories of text
- **Preprocessing**: Tokenization using `BertTokenizer` with padding and truncation to a maximum sequence length of 128.

## Model Architecture
- **Base Model**: `bert-base-uncased`
- **Classification Head**: Custom head with weighted cross-entropy loss to handle class imbalance.
- **Number of Labels**: {len(label_mapping)}

## Training Details
- **Optimizer**: AdamW
- **Learning Rate**: 2e-5
- **Batch Size**: 16 (for both training and evaluation)
- **Epochs**: 3
- **Weight Decay**: 0.01
- **Evaluation Strategy**: Performed at the end of each epoch
- **Hardware**: Trained on GPUs for efficient computation

## Performance Metrics (Validation Set)
The following metrics were computed on the validation set (not the test set, which remains private for the competition):

{metrics_table}

{overall_metrics}

## Training Evolution
### Training and Validation Loss
The training and validation loss evolution over epochs is shown below:

![Training Loss](./training_loss_plot.png)

### Validation Accuracy
The validation accuracy evolution over epochs is shown below:

![Validation Accuracy](./validation_accuracy_plot.png)

## Confusion Matrix
The confusion matrix below illustrates the model's performance on the validation set, highlighting areas of strength and potential misclassifications:

![Confusion Matrix](./confusion_matrix.png)

## Key Features
- **Class Weighting**: Addressed dataset imbalance by incorporating class weights during training.
- **Custom Loss Function**: Used weighted cross-entropy loss for better handling of underrepresented classes.
- **Evaluation Metrics**: Accuracy, precision, recall, and F1-score were computed to provide a comprehensive understanding of the model's performance.

## Class Mapping
The mapping between model output indices and class names is as follows:
{class_mapping_text}

## Usage
This model can be used for multi-class text classification tasks where the input text needs to be categorized into one of the eight predefined classes. It is particularly suited for datasets with class imbalance, thanks to its weighted loss function.

### Example Usage
```python
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the fine-tuned model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained("{model_name}")
tokenizer = AutoTokenizer.from_pretrained("{model_name}")

# Tokenize input text
text = "Your input text here"
inputs = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=128)

# Perform inference
outputs = model(**inputs)
predicted_class = outputs.logits.argmax(-1).item()

print(f"Predicted Class: {{predicted_class}}")
```

## Limitations
{limitations}

## Citation
{citation}

## Acknowledgments
Special thanks to the Frugal AI Challenge organizers for providing the dataset and fostering innovation in AI research.
    """
    # The YAML front matter must open the file, so drop the blank line in front
    # of the first "---" (and the trailing indentation after the last line).
    model_card_content = model_card_content.strip() + "\n"

    # Save the model card as README.md
    output_dir = Path(model_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    with open(output_dir / "README.md", "w", encoding="utf-8") as f:
        f.write(model_card_content)


In [None]:
from pathlib import Path

# Run the model over the validation set once and derive every metric from that
# single pass (each `trainer.evaluate()` call would be a full evaluation run).
val_predictions = trainer.predict(tokenized_val_dataset)
val_true_labels = val_predictions.label_ids
val_predicted_labels = val_predictions.predictions.argmax(axis=-1)

# Class indices in ascending order; with `labels=` the per-class arrays returned
# below are aligned with this list even if a class is never predicted.
class_ids = sorted(label_mapping.values())

per_class_scores = precision_recall_fscore_support(
    val_true_labels, val_predicted_labels, labels=class_ids, average=None, zero_division=0
)
macro_scores = precision_recall_fscore_support(
    val_true_labels, val_predicted_labels, labels=class_ids, average="macro", zero_division=0
)
weighted_scores = precision_recall_fscore_support(
    val_true_labels, val_predicted_labels, labels=class_ids, average="weighted", zero_division=0
)

# One entry per class index for the per-class table, plus the overall scores
metrics_data = {
    'precision': {idx: float(value) for idx, value in zip(class_ids, per_class_scores[0])},
    'recall': {idx: float(value) for idx, value in zip(class_ids, per_class_scores[1])},
    'f1': {idx: float(value) for idx, value in zip(class_ids, per_class_scores[2])},
    'support': {idx: int(value) for idx, value in zip(class_ids, per_class_scores[3])},
    'accuracy': float(accuracy_score(val_true_labels, val_predicted_labels)),
    'macro_precision': float(macro_scores[0]),
    'macro_recall': float(macro_scores[1]),
    'macro_f1': float(macro_scores[2]),
    'weighted_precision': float(weighted_scores[0]),
    'weighted_recall': float(weighted_scores[1]),
    'weighted_f1': float(weighted_scores[2])
}

# The card links its images with relative paths (./training_loss_plot.png, ...),
# so it is written into the directory that holds the saved model and the plots.
model_dir = Path("./bert-fine-tuned")

generate_model_card(
    model_dir=model_dir,
    model_name="bert-frugal-ai-text-classification",
    description="""This model implements a novel approach to handling class imbalance in text classification 
    by utilizing Large Language Models (LLMs) for data rebalancing. The base architecture uses BERT with 
    custom modifications for handling imbalanced datasets. The model employs early stopping, gradient clipping, 
    and weighted cross-entropy loss to optimize performance.""",
    metrics_data=metrics_data,
    limitations="""- Performance may vary on extremely imbalanced datasets
    - Requires significant computational resources for training
    - Model performance is dependent on the quality of LLM-generated balanced data
    - May not perform optimally on very long text sequences (>128 tokens)""",
    citation="""If you use this model, please cite:
    @article{your_name2024llmrebalanced,
        title={LLM-Rebalanced Transformer for Improved Text Classification},
        author={Your Name},
        year={2024},
        journal={Preprint}
    }""",
    label_mapping=label_mapping
)


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the saved model and tokenizer back from "./bert-fine-tuned".
# Note: this rebinds `model` and `tokenizer` to the freshly loaded objects.
model = AutoModelForSequenceClassification.from_pretrained("./bert-fine-tuned")
tokenizer = AutoTokenizer.from_pretrained("./bert-fine-tuned")

# Push the model and tokenizer to the Hugging Face repository
# "ParisNeo/bert-frugal-ai-text-classification" (requires being logged in).
# NOTE(review): only these two objects are pushed here; the README.md, report and
# PNG files written by earlier cells are not referenced by these calls - confirm
# whether they need a separate upload.
model.push_to_hub("ParisNeo/bert-frugal-ai-text-classification")
tokenizer.push_to_hub("ParisNeo/bert-frugal-ai-text-classification")
