# 3-level causal classification: fine-tuning BERT, SciBERT and RoBERTa

## Download the dependencies and import them

In [None]:
!pip install transformers datasets scikit-learn accelerate peft bitsandbytes

In [None]:
import torch
import pandas as pd
from datasets import load_dataset, DatasetDict
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)

from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import numpy as np
import random
import os


## Configuration of the three models

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
seed_value = 8642

# One row per model: (short name, Hub checkpoint, output_dir, save_safetensors).
# SciBERT is the only entry that turns safetensors off for Trainer checkpoints.
_model_specs = (
    ("BERT", "bert-base-uncased", "final_BERT", True),
    ("RoBERTa", "roberta-base", "final_RoBERTa", True),
    ("SciBERT", "allenai/scibert_scivocab_uncased", "final_SciBERT", False),
)

# `config[name]["model_settings"]` describes the checkpoint to load and
# `config[name]["training_args"]` holds the TrainingArguments for that model.
# Apart from output_dir and save_safetensors, all three models share the same
# hyper-parameters.
config = {
    short_name: {
        "model_settings": {
            "model_type": checkpoint,
            "num_labels": 3,
        },
        "training_args": TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=6,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            # Evaluate and checkpoint once per epoch.
            eval_strategy="epoch",
            save_strategy="epoch",
            logging_steps=1,
            logging_dir='./logs',
            weight_decay=0.1,
            learning_rate=5e-5,
            do_train=True,
            do_eval=True,
            warmup_steps=100,
            lr_scheduler_type="linear",
            # Keep the checkpoint with the best "f1" metric at the end of training.
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            seed=seed_value,
            fp16=False,
            save_safetensors=save_safetensors,
            report_to="wandb",
        ),
    }
    for short_name, checkpoint, output_dir, save_safetensors in _model_specs
}

## Set seeds for reproducibility
The `set_all_seeds()` function ensures reproducibility by setting a fixed seed for all random number generators, including PyTorch, NumPy, and Python's `random` module. It also enforces deterministic behavior in PyTorch's cuDNN backend to ensure consistent results across runs.


In [None]:
def set_all_seeds(seed_value=seed_value):
    """Seed Python, NumPy and PyTorch and switch cuDNN to deterministic mode.

    Args:
        seed_value: Integer seed. The default is the value the module-level
            ``seed_value`` had when this function was defined.
    """
    # Python-level sources of randomness.
    # NOTE(review): PYTHONHASHSEED is presumably only honoured at interpreter
    # start-up, so this may affect child processes only - confirm.
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    random.seed(seed_value)
    np.random.seed(seed_value)

    # PyTorch on the CPU and, when CUDA is available, on every GPU.
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed_value)

    # cuDNN: deterministic mode on, benchmark mode off.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_all_seeds()


## Save the models for inference
The model and tokenizer are saved into every checkpoint folder (one checkpoint per epoch), so the best checkpoint can always be downloaded afterwards.

In [None]:
import os
from transformers import TrainerCallback

class SaveModelAndTokenizerCallback(TrainerCallback):
    """Trainer callback that writes the model and tokenizer into each checkpoint folder.

    On every save event the model and tokenizer given at construction time are
    stored under ``<output_dir>/checkpoint-<global_step>``.
    """

    def __init__(self, tokenizer, model, output_dir):
        """Remember what to save and the directory that holds the checkpoints."""
        self.tokenizer = tokenizer
        self.model = model
        self.output_dir = output_dir

    def on_save(self, args, state, control, **kwargs):
        """Save model and tokenizer into the folder for the current global step."""
        target_dir = os.path.join(self.output_dir, f"checkpoint-{state.global_step}")

        # The folder may or may not exist yet; either case is fine.
        os.makedirs(target_dir, exist_ok=True)

        # Make every parameter contiguous before serialising
        # (presumably required by the save format - TODO confirm).
        self.make_model_contiguous(self.model)

        for artefact in (self.model, self.tokenizer):
            artefact.save_pretrained(target_dir)
        print(f"Model and tokenizer saved to {target_dir}")

    def make_model_contiguous(self, model):
        """Replace the data of each non-contiguous parameter with a contiguous copy."""
        scattered = (p for p in model.parameters() if not p.is_contiguous())
        for p in scattered:
            p.data = p.data.contiguous()


## The finetuner
The early stopping criteria in this code are based on monitoring the F1 score during training. Early stopping is a technique used to prevent overfitting by halting the training process if the model's performance on the validation dataset stops improving after a certain number of evaluation steps.

In this implementation, the `EarlyStoppingCallback` monitors the F1 score during training, and if the score does not improve for a specified number of evaluation steps (determined by `early_stopping_patience=2`), the training will stop early. This helps ensure that the model doesn't continue to train when it has reached its optimal performance based on the F1 score, reducing unnecessary computation and potentially improving generalization.

In [None]:
from transformers import EarlyStoppingCallback, Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import DataCollatorWithPadding
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

class FineTuner:
    """Fine-tune a sequence classifier with the Hugging Face ``Trainer``.

    Holds a model, its tokenizer and the ``TrainingArguments``; after
    ``train`` has run, the ``Trainer`` is available as ``self.trainer``.
    """

    def __init__(self, model, tokenizer, training_args):
        self.model = model
        self.tokenizer = tokenizer
        self.training_args = training_args
        # Created by train().
        self.trainer = None

    def tokenize_dataset(self, dataset):
        """Tokenize the ``'sentence'`` column and drop it from the result.

        Sentences are truncated to at most 512 tokens; padding is left to the
        data collator used in ``train``.
        """
        def encode_batch(batch):
            return self.tokenizer(batch['sentence'], truncation=True, max_length=512)

        return dataset.map(encode_batch, batched=True, remove_columns=['sentence'])

    def train(self, train_dataset, val_dataset):
        """Tokenize both splits, build the ``Trainer`` and run training.

        Args:
            train_dataset: Training split with a ``'sentence'`` column.
            val_dataset: Validation split with a ``'sentence'`` column.
        """
        encoded_train, encoded_val = (
            self.tokenize_dataset(split) for split in (train_dataset, val_dataset)
        )

        # Pads each batch with this tokenizer.
        collator = DataCollatorWithPadding(tokenizer=self.tokenizer)

        def compute_metrics(eval_pred):
            """Accuracy plus macro-averaged precision, recall and F1."""
            y_true = eval_pred.label_ids
            y_pred = eval_pred.predictions.argmax(-1)
            precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro')
            return {
                'accuracy': accuracy_score(y_true, y_pred),
                'f1': f1,
                'precision': precision,
                'recall': recall
            }

        trainer_callbacks = [
            # Early stopping with a patience of 2 evaluations.
            EarlyStoppingCallback(early_stopping_patience=2),
            # Save model and tokenizer into every checkpoint folder.
            SaveModelAndTokenizerCallback(
                tokenizer=self.tokenizer,
                model=self.model,
                output_dir=self.training_args.output_dir
            ),
        ]

        self.trainer = Trainer(
            model=self.model,
            args=self.training_args,
            train_dataset=encoded_train,
            eval_dataset=encoded_val,
            data_collator=collator,
            compute_metrics=compute_metrics,
            callbacks=trainer_callbacks,
        )

        self.trainer.train()

## Initialize models
Each model has its own tokenizer and settings

In [None]:
def initialize_model(model_name, config):
    """Load the pretrained classifier and tokenizer configured for ``model_name``.

    Args:
        model_name: Key of ``config`` to use, e.g. "BERT", "RoBERTa" or
            "SciBERT".
        config: Mapping from model name to a dict whose ``"model_settings"``
            entry holds ``"model_type"`` (checkpoint name) and
            ``"num_labels"``.

    Returns:
        tuple: ``(model, tokenizer)`` loaded from the configured checkpoint.

    Raises:
        ValueError: If ``model_name`` is not a key of ``config``.
    """
    # Fail with a clear message instead of reaching the return statement with
    # unbound locals.
    if model_name not in config:
        raise ValueError(
            f"Unknown model name {model_name!r}; expected one of {sorted(config)}"
        )

    settings = config[model_name]["model_settings"]
    model_type = settings["model_type"]
    model = AutoModelForSequenceClassification.from_pretrained(
        model_type, num_labels=settings["num_labels"]
    )
    tokenizer = AutoTokenizer.from_pretrained(model_type)

    return model, tokenizer

## Load the datasets

In [None]:
from datasets import load_dataset, DatasetDict

def load_datasets(train_file_path,validation_file_path, test_file_path,test_ssc_file_path):
    """
    Load the train, validation, test and SSC test splits from CSV files.

    Args:
        train_file_path (str): Path to the training CSV file.
        validation_file_path (str): Path to the validation CSV file.
        test_file_path (str): Path to the test CSV file.
        test_ssc_file_path (str): Path to the SSC test CSV file.

    Returns:
        DatasetDict with the keys 'train', 'validation', 'test' and
        'test_ssc', or None if anything goes wrong while loading (the error
        is printed).
    """
    # (split key, csv path, message printed once the split is loaded),
    # listed in the order in which the files are read.
    load_plan = (
        ('train', train_file_path, "Train Dataset Loaded Successfully."),
        ('test', test_file_path, "Test Dataset Loaded Successfully."),
        ('test_ssc', test_ssc_file_path, "Test SSC Dataset Loaded Successfully."),
        ('validation', validation_file_path, "Validation Dataset Loaded Successfully."),
    )

    try:
        loaded = {}
        for split_key, csv_path, success_message in load_plan:
            # A single CSV file is exposed by load_dataset under the 'train' key.
            loaded[split_key] = load_dataset('csv', data_files=csv_path)['train']
            print(success_message)

        return DatasetDict({
            split_key: loaded[split_key]
            for split_key in ('train', 'validation', 'test', 'test_ssc')
        })

    except Exception as e:
        print("An error occurred while loading the datasets:", str(e))
        return None


# Example usage: Kaggle input paths of the four CSV splits
train_file_path = '/kaggle/input/3-classfication/3_balanced_train.csv'
validation_file_path = '/kaggle/input/3-classfication/3_balanced_val.csv'
test_file_path = '/kaggle/input/3-classfication/3_balanced_test.csv'
test_ssc_file_path = '/kaggle/input/3-classfication/3_ssc_test.csv'

# Load all splits; `datasets` is None (falsy) when loading failed
datasets = load_datasets(train_file_path, validation_file_path, test_file_path,test_ssc_file_path)

# Report the outcome
print("Datasets loaded successfully!" if datasets else "Failed to load datasets.")

In [None]:
# Show the loaded splits (rendered as the cell output in a notebook).
datasets

In [None]:
# Distinct label values of the train, validation and SSC test splits
for split_name in ('train', 'validation', 'test_ssc'):
    print(set(datasets[split_name]['label']))

In [None]:
def remap_labels(example):
    """Shift the ``'label'`` of one example down by one (1 -> 0, 2 -> 1, 3 -> 2).

    The example is updated in place and the same object is returned.
    """
    shifted_label = example['label'] - 1
    example['label'] = shifted_label
    return example

# Apply remap_labels to every split.
# NOTE: running this cell twice shifts the labels twice.
for split_name in ('train', 'validation', 'test', 'test_ssc'):
    datasets[split_name] = datasets[split_name].map(remap_labels)




In [None]:
# Distinct label values of all four splits
for split_name in ('train', 'validation', 'test', 'test_ssc'):
    print(set(datasets[split_name]['label']))

## Check if GPUs are available

In [None]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
if use_cuda:
    print(f"Training will be done on the GPU: {torch.cuda.get_device_name(device)}")
else:
    print("CUDA is not available. Training will be done on CPU.")

## Log in to Weights & Biases to log training and evaluation

In [None]:
import wandb
# Authenticate with Weights & Biases.
# NOTE(review): "your_API_key" is a placeholder - substitute your own key
# locally and avoid committing a real key with the notebook.
wandb.login(key="your_API_key")

## Choose model type

In [None]:
# Pick the `config` entry to fine-tune and load its pretrained model and tokenizer.
model_name = "SciBERT"  # Change this to "BERT", "RoBERTa", or "SciBERT" to try different models
model, tokenizer = initialize_model(model_name, config)

## Remove files that are still in the output directory

In [None]:
# Destructive: removes everything directly under /kaggle/working.
!rm -rf /kaggle/working/*

## Train the model

In [None]:
import wandb

# Name the W&B run after the selected model so that runs of different models
# do not all show up as "SciBERT_finetune_run".
wandb.init(project="finals", name=f"{model_name}_finetune_run")
print("WandB is connected:", wandb.run)
print("WandB run URL: ", wandb.run.get_url())

# Initialize FineTuner with the training arguments of the selected model
training_args = config[model_name]["training_args"]
fine_tuner = FineTuner(model=model, tokenizer=tokenizer, training_args=training_args)

# Start training on the train split, evaluating on the validation split
fine_tuner.train(datasets['train'], datasets['validation'])

## Zip the best model for download

In [None]:
import shutil

# Define the folder path and the output zip file name.
# NOTE(review): "checkpoint-454" is hard-coded - confirm that this folder
# exists and is the checkpoint you want to download for your run.
folder_path = '/kaggle/working/final_SciBERT/checkpoint-454'
zip_file_path = '/kaggle/working/SciBERT.zip'

# Create a zip file of the folder. make_archive appends the ".zip" extension
# itself, so it is stripped from the base name first.
shutil.make_archive(zip_file_path.replace('.zip', ''), 'zip', folder_path)

print(f"Folder compressed into: {zip_file_path}")

In [None]:
# Checkpoint folder to evaluate.
# NOTE(review): "checkpoint-454" is hard-coded - confirm it matches your run.
model_path = '/kaggle/working/final_SciBERT/checkpoint-454'
model_name = "SciBERT"

# Load SciBERT model and tokenizer from the checkpoint folder
print("Loading SciBERT model and tokenizer...")
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Evaluation

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import PreTrainedTokenizer, PreTrainedModel

class SimpleDataset(Dataset):
    """Index-aligned sentences and labels, optionally with a source per sentence."""

    def __init__(self, texts, labels, sources=None):
        self.texts, self.labels, self.sources = texts, labels, sources

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with 'sentence' and 'label', plus 'source' when sources were given."""
        record = {'sentence': self.texts[idx], 'label': self.labels[idx]}
        if not self.sources:
            return record
        record['source'] = self.sources[idx]
        return record
        
def plot_confusion_matrix(true_labels, predicted_labels, labels=["Correlational", "Conditional Causal", "Direct Causal"]):
    """Plot the confusion matrix of the predictions as an annotated heatmap.

    Args:
        true_labels: Ground-truth class ids.
        predicted_labels: Predicted class ids, aligned with ``true_labels``.
        labels: Display names of the classes, in class-id order. The class
            ids are assumed to be ``0 .. len(labels) - 1``.
    """
    # Pin the class ids so the matrix always has one row and column per
    # display name, even when a class is missing from both vectors.
    # Without this the matrix shrinks and the tick labels no longer line up.
    class_ids = list(range(len(labels)))
    cm = confusion_matrix(true_labels, predicted_labels, labels=class_ids)
    plt.figure(figsize=(8, 6))
    # fmt='g' prints the counts without scientific notation.
    sns.heatmap(cm, annot=True, fmt='g', cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.title('Confusion Matrix')
    plt.show()

def evaluate(
        model: PreTrainedModel,
        tokenizer: PreTrainedTokenizer,
        dataset, batch_size=8
        ):
    """Evaluate ``model`` on ``dataset`` and record the misclassified sentences.

    Prints a per-class classification report and writes every misclassified
    example (sentence, true label, predicted label, source) to
    ``misclassified_sentences_scibert.csv`` in the working directory.

    Args:
        model: Sequence-classification model whose output exposes ``.logits``.
        tokenizer: Tokenizer used to encode each batch of sentences.
        dataset: Iterable of items with ``'sentence'``, ``'label'`` and
            ``'source'`` keys. Labels are assumed to be the class ids 0, 1, 2.
        batch_size: Number of sentences per forward pass.

    Returns:
        tuple: ``(true_labels, predicted_labels)``, two lists aligned by
        example.
    """
    class_names = ["Correlational", "Conditional Causal", "Direct Causal"]
    class_ids = list(range(len(class_names)))

    # Extract sentences, labels, and sources
    texts = [item['sentence'] for item in dataset]
    labels = [item['label'] for item in dataset]
    sources = [item['source'] for item in dataset]
    simple_dataset = SimpleDataset(texts, labels, sources)

    def collate_fn(batch):
        """Tokenize one batch and attach labels, raw sentences and sources."""
        texts = [item['sentence'] for item in batch]
        labels = torch.tensor([item['label'] for item in batch])
        sources = [item['source'] for item in batch]

        encoding = tokenizer(texts, truncation=True, padding=True, max_length=512, return_tensors="pt")
        encoding['labels'] = labels
        encoding['sentences'] = texts  # Retain the original sentences for misclassification tracking
        encoding['sources'] = sources

        return encoding

    dataloader = DataLoader(simple_dataset, batch_size=batch_size, collate_fn=collate_fn)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # The inputs are moved to `device` below, so the model has to live there too.
    model.to(device)
    model.eval()

    true_labels = []
    predicted_labels = []
    misclassified_examples = []

    # Entries of the batch that are bookkeeping, not model inputs.
    bookkeeping_keys = ('labels', 'sentences', 'sources')

    with torch.no_grad():
        for batch in dataloader:
            inputs = {key: val.to(device) for key, val in batch.items() if key not in bookkeeping_keys}
            logits = model(**inputs).logits
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            true_batch_labels = batch['labels'].cpu().numpy()
            true_labels.extend(true_batch_labels)
            predicted_labels.extend(preds)

            rows = zip(batch['sentences'], batch['sources'], true_batch_labels, preds)
            for sentence, source, true_label, pred in rows:
                if pred != true_label:
                    misclassified_examples.append({
                        'sentence': sentence,
                        'true_label': true_label,
                        'predicted_label': pred,
                        'source': source
                    })

    # Passing `labels` keeps the report aligned with the three names even when
    # a class is absent from the data; without it classification_report
    # raises on the size mismatch.
    class_report = classification_report(true_labels, predicted_labels, labels=class_ids, target_names=class_names)
    print("Classification Report:")
    print("=================================")
    print(class_report)
    print("=================================")

    # Fixing the columns means a run without misclassifications still writes
    # a header row, so the CSV can be read back and grouped by source.
    misclassified_df = pd.DataFrame(
        misclassified_examples,
        columns=['sentence', 'true_label', 'predicted_label', 'source'],
    )
    misclassified_df.to_csv('misclassified_sentences_scibert.csv', index=False)
    print("Misclassified sentences saved to 'misclassified_sentences_scibert.csv'")

    return true_labels, predicted_labels

# Evaluate on the 'test' split, then plot the confusion matrix of the result.
true_labels, predicted_labels = evaluate(model, tokenizer, datasets['test'])

plot_confusion_matrix(true_labels, predicted_labels)


In [None]:
# Show full sentences and keep wide frames on one line
pd.set_option('display.max_colwidth', None)
pd.set_option('display.expand_frame_repr', False)
df = pd.read_csv("/kaggle/working/misclassified_sentences_scibert.csv")
# A bare expression in the middle of a cell is evaluated and discarded, so
# the per-source counts have to be printed explicitly to be visible.
print(df.groupby("source").count())
df

In [None]:
import pandas as pd

# Hard-coded counts per source.
# NOTE(review): presumably the number of training sentences per source -
# verify against the training CSV.
train_source_counts = {
    "haber": 1619,
    "press_release": 859,
    "pubmed": 851,
    "ssc": 292
}

# Number of misclassified sentences per source
misclassified_counts = df.groupby("source")['sentence'].count()

# Share of all misclassifications that falls on each source, in percent
# (this is not an error rate per source)
misclassification_percentage = (misclassified_counts / misclassified_counts.sum()) * 100

# Share of each source in the training counts, in percent
total_train_count = sum(train_source_counts.values())
train_source_percentage = {
    source: (count / total_train_count) * 100
    for source, count in train_source_counts.items()
}

# Put counts and shares side by side, one row per source
comparison_columns = {
    'Training Data Count': train_source_counts,
    'Training Data Proportion (%)': train_source_percentage,
    'Misclassification Count': misclassified_counts,
    'Misclassification Proportion (%)': misclassification_percentage
}
comparison_df = pd.DataFrame(comparison_columns)

print("Training Data vs Misclassification Comparison:")
print(comparison_df)

# How often each (true label -> predicted label) confusion occurs per source
misclassification_frequency = (
    df.groupby(['source', 'true_label', 'predicted_label'])
    .size()
    .reset_index(name='count')
)

# One row per (source, true label), one column per predicted label
misclassification_pivot = misclassification_frequency.pivot_table(
    index=['source', 'true_label'],
    columns='predicted_label',
    values='count',
    fill_value=0
)

print("\nMisclassification Frequency per Source:")
print(misclassification_pivot)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import PreTrainedTokenizer, PreTrainedModel

class SimpleDataset(Dataset):
    """Index-aligned sentences and labels exposed as ``{'sentence', 'label'}`` dicts."""

    def __init__(self, texts, labels):
        self.texts, self.labels = texts, labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # The 'sentence' key is what the collate function reads.
        sentence, label = self.texts[idx], self.labels[idx]
        return {'sentence': sentence, 'label': label}
        
def plot_confusion_matrix(true_labels, predicted_labels, labels=["correlational", "conditional causal", "direct causal"]):
    """Plot the confusion matrix of the predictions as an annotated heatmap.

    Args:
        true_labels: Ground-truth class ids.
        predicted_labels: Predicted class ids, aligned with ``true_labels``.
        labels: Display names of the classes, in class-id order. The class
            ids are assumed to be ``0 .. len(labels) - 1``.
    """
    # Pin the class ids so the matrix always has one row and column per
    # display name, even when a class is missing from both vectors.
    # Without this the matrix shrinks and the tick labels no longer line up.
    class_ids = list(range(len(labels)))
    cm = confusion_matrix(true_labels, predicted_labels, labels=class_ids)
    plt.figure(figsize=(8, 6))
    # fmt='g' prints the counts without scientific notation.
    sns.heatmap(cm, annot=True, fmt='g', cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.title('Confusion Matrix')
    plt.show()

def evaluate(
        model: PreTrainedModel,
        tokenizer: PreTrainedTokenizer,
        dataset, batch_size=8
        ):
    """Evaluate ``model`` on ``dataset`` and record the misclassified sentences.

    Prints a per-class classification report and writes every misclassified
    example (sentence, true label, predicted label) to
    ``misclassified_sentences_ssc_scibert.csv`` in the working directory.

    Args:
        model: Sequence-classification model whose output exposes ``.logits``.
        tokenizer: Tokenizer used to encode each batch of sentences.
        dataset: Iterable of items with ``'sentence'`` and ``'label'`` keys.
            Labels are assumed to be the class ids 0, 1, 2.
        batch_size: Number of sentences per forward pass.

    Returns:
        tuple: ``(true_labels, predicted_labels)``, two lists aligned by
        example.
    """
    class_names = ["Correlational", "Conditional Causal", "Direct Causal"]
    class_ids = list(range(len(class_names)))

    texts = [item['sentence'] for item in dataset]
    labels = [item['label'] for item in dataset]
    simple_dataset = SimpleDataset(texts, labels)

    def collate_fn(batch):
        """Tokenize one batch and attach the labels and raw sentences."""
        texts = [item['sentence'] for item in batch]
        labels = torch.tensor([item['label'] for item in batch])

        encoding = tokenizer(texts, truncation=True, padding=True, max_length=512, return_tensors="pt")
        encoding['labels'] = labels
        encoding['sentences'] = texts  # Retain the original sentences for misclassification tracking

        return encoding

    dataloader = DataLoader(simple_dataset, batch_size=batch_size, collate_fn=collate_fn)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    true_labels = []
    predicted_labels = []
    misclassified_examples = []

    # Entries of the batch that are bookkeeping, not model inputs.
    bookkeeping_keys = ('labels', 'sentences')

    with torch.no_grad():
        for batch in dataloader:
            inputs = {key: val.to(device) for key, val in batch.items() if key not in bookkeeping_keys}
            logits = model(**inputs).logits
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            true_batch_labels = batch['labels'].cpu().numpy()
            true_labels.extend(true_batch_labels)
            predicted_labels.extend(preds)

            for sentence, true_label, pred in zip(batch['sentences'], true_batch_labels, preds):
                if pred != true_label:
                    misclassified_examples.append({
                        'sentence': sentence,
                        'true_label': true_label,
                        'predicted_label': pred
                    })

    # Passing `labels` keeps the report aligned with the three names even when
    # a class is absent from the data; without it classification_report
    # raises on the size mismatch.
    class_report = classification_report(true_labels, predicted_labels, labels=class_ids, target_names=class_names)
    print("Classification Report:")
    print("=================================")
    print(class_report)
    print("=================================")

    # Fixing the columns means a run without misclassifications still writes
    # a header row instead of a blank file.
    misclassified_df = pd.DataFrame(
        misclassified_examples,
        columns=['sentence', 'true_label', 'predicted_label'],
    )
    misclassified_df.to_csv('misclassified_sentences_ssc_scibert.csv', index=False)
    print("Misclassified sentences saved to 'misclassified_sentences_ssc_scibert.csv'.")

    return true_labels, predicted_labels

# Evaluate on the 'test_ssc' split, then plot the confusion matrix of the result.
true_labels, predicted_labels = evaluate(model, tokenizer, datasets['test_ssc'])

plot_confusion_matrix(true_labels, predicted_labels)

# Inference

In [None]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch

from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Check device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Load the fine-tuned model and tokenizer from `model_path`. Previously the
# result of initialize_model() was discarded and `model_path` was never used,
# so inference silently ran on whatever `model` and `tokenizer` were left over
# from earlier cells.
# NOTE(review): assumes the folder contains both the model and the tokenizer
# files, as the checkpoints written during training do - confirm for this upload.
model_path = '/kaggle/input/scibert/pytorch/default/1/SciBERTT'
model_name = "SciBERT"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Move the model to the appropriate device and switch it to inference mode
model.to(device)
model.eval()

def classify_sentence(sentence):
    """Return the predicted class index for a single sentence.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.
    """
    # Encode (truncated to 512 tokens) and move every tensor next to the model.
    encoded = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=512)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Forward pass without gradient tracking.
    with torch.no_grad():
        logits = model(**encoded).logits

    # Index of the highest logit is the predicted class.
    return torch.argmax(logits, dim=-1).item()

# Example usage: classify one sentence with the loaded model
sentence = "Mini Nutritional Assessment was able to predict the development of PUs.	"
prediction = classify_sentence(sentence)

# Print the result (the class index returned by classify_sentence)
print(f"The predicted class for the sentence is: {prediction}")
