In [None]:
import pandas as pd
import torch
from datasets import Dataset
from evaluate import load
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    classification_report,
    confusion_matrix,
    f1_score,
)
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

In [None]:
# Load the six cleaned CSV files (one per label, judging by the file names)
DATASET_DIR = "News-Classification-Dataset"
LABEL_FILE_STEMS = ("barely-true", "false", "half-true", "mostly-true", "pants-fire", "true")

df1, df2, df3, df4, df5, df6 = (
    pd.read_csv(f"{DATASET_DIR}/{stem}-cleaned.csv") for stem in LABEL_FILE_STEMS
)


In [None]:
# Stack the per-file frames into one frame with a fresh 0..n-1 index
label_frames = [df1, df2, df3, df4, df5, df6]
df = pd.concat(label_frames, ignore_index=True)

## Checking the data for imbalances

- Class imbalances can be a serious problem for machine learning models.

If there are class imbalances, the model may not learn to predict the minority class well. This can be addressed using:
- oversampling
- Weighted classes
- SMOTE (Synthetic Minority Over-sampling Technique)

In [None]:
# Structural summary: (rows, columns) first, then dtypes and non-null counts
n_rows, n_cols = df.shape
print(f"Dataset shape: {(n_rows, n_cols)}")
df.info()

In [None]:
# Count nulls per column and report only the columns that actually contain some
missing_values = df.isna().sum()
columns_with_gaps = missing_values.loc[missing_values.gt(0)]
print("\nMissing values per column:")
print(columns_with_gaps)


In [None]:
# Check class distribution: absolute counts plus each class's share of the data.
# The share is what makes an imbalance easy to spot at a glance.
class_count = df['Label'].value_counts()
class_share_pct = df['Label'].value_counts(normalize=True).mul(100).round(2)

print("\nClass count:")
print(class_count)
print("\nClass share (%):")
print(class_share_pct)

In [None]:
# Randomly sample 80% of the rows for training (fixed seed for reproducibility);
# every row whose index was not sampled goes to the evaluation split.
TRAIN_FRACTION = 0.8
SPLIT_SEED = 42

df_train = df.sample(frac=TRAIN_FRACTION, random_state=SPLIT_SEED)
held_out_rows = ~df.index.isin(df_train.index)
df_eval = df.loc[held_out_rows]

## Label Mapping

In [None]:
# Give each distinct label an integer id in the order `unique()` returns them,
# then build the reverse lookup from id back to label.
distinct_labels = df['Label'].unique()
label_to_id = {name: index for index, name in enumerate(distinct_labels)}
print("\nLabel to ID mapping: ", end="")
print(label_to_id)

id_to_label = dict(zip(label_to_id.values(), label_to_id.keys()))
print("\nID to Label mapping: ", end="")
print(id_to_label)

## Tokenization

- Padding and Truncation are very crucial for the model to work properly.
- Padding is used to make all sequences the same length.
- Truncation is used to cut off sequences that are too long.
- With `truncation=True` and a `padding` strategy set, the tokenizer truncates and pads sequences to the requested `max_length` (or to the model's maximum length when `max_length` is not given).
- The tokenizer will also convert the text to input IDs and attention masks.

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Encode the first statement as a sanity check of what the tokenizer produces
sample_encoding_options = dict(truncation=True, padding='max_length', max_length=128)
sample_text = df['Statement'].iloc[0]
print("\nSample text:", sample_text, sep="\n")
print("\nTokenized sample text:")
sample_tokens = tokenizer(sample_text, **sample_encoding_options)
print(sample_tokens)

In [None]:
def _encode_split(frame):
    """Tokenize the statements of one split and attach their integer labels.

    Sequences are truncated to 128 tokens but deliberately NOT padded here:
    the DataCollatorWithPadding handed to the Trainer pads each batch when it
    is assembled, so padding every row to 128 up front would only waste
    memory and compute.

    Args:
        frame: DataFrame with a 'Statement' text column and a 'Label' column.

    Returns:
        The tokenizer output with an extra 'labels' entry holding one integer
        id per row, looked up in `label_to_id`.

    Raises:
        KeyError: if a label in `frame` is missing from `label_to_id`.
    """
    encoded = tokenizer(frame['Statement'].tolist(), truncation=True, max_length=128)
    encoded['labels'] = [label_to_id[label] for label in frame['Label'].tolist()]
    return encoded


df_tokenized_train_dict = _encode_split(df_train)
df_tokenized_eval_dict = _encode_split(df_eval)

# Show which fields were produced and the Python type of each field's first element
print(df_tokenized_train_dict.keys())
for key, value in df_tokenized_train_dict.items():
    print(f"{key}: {type(value[0])}")

In [None]:
# Wrap both tokenized dictionaries in Dataset objects for the Trainer
df_tokenized_train, df_tokenized_eval = (
    Dataset.from_dict(encoded_split)
    for encoded_split in (df_tokenized_train_dict, df_tokenized_eval_dict)
)

print(df_tokenized_train)

## Model
- The model is a standard BERT model that is pre-trained.

Choosing the right model is very crucial
- distilbert is used for systems that need low latency and high throughput.
- bert-base-uncased is used for systems that require high accuracy and can afford to run slower.
- bert-large-uncased is used for systems that require very high accuracy and can afford to run slower.

The performance difference between the models depends on the task and the dataset. But in general, the larger the model, the better the performance.
- But distilbert usually performs within a few percentage points of bert-base-uncased and is much faster.

In [None]:
model_name = "bert-base-uncased"

# Store the label names in the model config so that saved checkpoints and
# downstream predictions report real class names instead of generic LABEL_n.
# Labels are cast to str so the config stays JSON-serializable.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_to_id),
    id2label={index: str(label) for label, index in label_to_id.items()},
    label2id={str(label): index for label, index in label_to_id.items()},
)

print(model.config)

## Freezing layers

- Freezing layers is a technique used to prevent the model from updating the weights of certain layers during training.
- This is useful when you want to fine-tune a pre-trained model on a new task.
- Freezing layers can help to prevent overfitting and speed up training.

Generally a good approach would be:
- To freeze all the embedding layers and the first few layers of the model.

In [None]:
# List every parameter tensor by name to reveal the layer structure
for param_name, _ in model.named_parameters():
    print(param_name)

# The encoder's layer container gives the number of transformer blocks
num_layers = len(model.bert.encoder.layer)
print(f"\nNumber of layers in BERT encoder: {num_layers}")


In [None]:
num_layers_to_freeze = 6

encoder_layers = model.bert.encoder.layer

# Validate up front: an out-of-range count would otherwise raise IndexError
# mid-loop and leave the model only partially frozen.
if not 0 <= num_layers_to_freeze <= len(encoder_layers):
    raise ValueError(
        f"num_layers_to_freeze must be between 0 and {len(encoder_layers)}, "
        f"got {num_layers_to_freeze}"
    )

# Freeze embeddings
model.bert.embeddings.requires_grad_(False)
print("Embeddings frozen")

# Freeze the first N encoder layers
for i in range(num_layers_to_freeze):
    encoder_layers[i].requires_grad_(False)
    print(f"Layer {i} frozen")

# Report how much of the model will still receive gradient updates
total_params = sum(p.numel() for p in model.parameters())
frozen_params = sum(p.numel() for p in model.parameters() if not p.requires_grad)
trainable_params = total_params - frozen_params
print(f"Frozen parameters: {frozen_params:,} / {total_params:,} ({100 * frozen_params / total_params:.2f}%)")
print(f"Trainable parameters: {trainable_params:,} / {total_params:,} ({100 * trainable_params / total_params:.2f}%)")



## Extending BERT architecture (if adventurous)

```python
from torch import nn


class CustomBERTModel(nn.Module):
    """BERT encoder followed by dropout and a custom linear classification head.

    Args:
        pretrained_model_name: Name or path of the pre-trained encoder to load.
        num_labels: Number of output classes of the linear head.
    """

    def __init__(self, pretrained_model_name, num_labels):
        super().__init__()
        # Imported here so the snippet is self-contained
        from transformers import AutoModel

        # Load the bare encoder: AutoModelForSequenceClassification already ends in
        # its own classifier and returns logits, so it exposes no pooled output
        # for a custom head to consume.
        self.bert = AutoModel.from_pretrained(pretrained_model_name)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return the raw class logits for a batch of token ids and attention masks."""
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = self.dropout(output.pooler_output)  # Applying dropout
        logits = self.fc(pooled_output)  # Adding a fully connected layer
        return logits

# Initialize the custom model with one output per class in the dataset
custom_model = CustomBERTModel("bert-base-uncased", num_labels=len(label_to_id))

```

## Training the model

We need to define the training parameters for the model, such as:

Basic Configuration
- output_dir: directory to save the model checkpoints
- logging_dir: directory to save the logs
- logging_steps: number of steps to log the training progress

Training Schedule
- num_train_epochs: number of epochs to train the model
- learning_rate: learning rate for the optimizer
- weight_decay: Applying L2 regularization to the optimizer

Batch Size
- per_device_train_batch_size: batch size for training
- per_device_eval_batch_size: batch size for evaluation

Evaluation & Saving
- evaluation_strategy: strategy to evaluate the model
- save_strategy: strategy to save the model
- save_total_limit: maximum number of checkpoints to save
- load_best_model_at_end: whether to load the best model at the end of training

Performance Optimization
- fp16: whether to use mixed precision training (requires NVIDIA GPU)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

In [None]:
training_args = TrainingArguments(
    # Basic configuration
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=100,
    # Training schedule
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    # Batch size
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluation & saving: evaluate and checkpoint once per epoch
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    # Pick the "best" checkpoint by the F1 reported by compute_metrics
    # (higher is better) rather than by the default evaluation loss.
    metric_for_best_model="f1",
    greater_is_better=True,
    # Mixed precision only when a CUDA GPU is present
    fp16=torch.cuda.is_available(),
)

## Defining metric for evaluation

- Usually accuracy is used as the metric for evaluation.
- However, for imbalanced datasets, it is better to use F1 score, precision, and recall.

What are logits?
- Logits are the raw, unnormalized scores output by the model before applying the softmax function.
- They represent the model's confidence in each class.
- The softmax function is applied to the logits to convert them into probabilities.

In [None]:
def compute_metrics(eval_pred):
    """Compute the weighted F1 score for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class for each
            example is the index of the largest logit along the last axis.

    Returns:
        A dict with the single key ``"f1"``.
    """
    raw_scores, gold_labels = eval_pred
    predicted_classes = raw_scores.argmax(axis=-1)
    # average="weighted" could also be "macro", "micro" or "binary"
    return {"f1": f1_score(gold_labels, predicted_classes, average="weighted")}

## Data Collator

The DataCollatorWithPadding from Hugging Face Transformers automatically pads input sequences to the same length within a batch during training or evaluation.

This is important because:

- Transformer models (like BERT, RoBERTa) expect all inputs within a batch to be of the same length.
- But real-world text samples usually have variable lengths.
- Padding too early (globally) wastes memory; padding just-in-time per batch is more efficient.

In [None]:
# Collator that pads the sequences of each batch to a common length using the tokenizer
data_collator = DataCollatorWithPadding(tokenizer)

## Setting up the Trainer

The arguments for the Trainer class are crucial for training the model effectively. Here are some key arguments:
- model: The model to be trained.
- args: Training arguments that define the training configuration. (usually defined in TrainingArguments)
- train_dataset: The dataset to be used for training. The structure of the dataset should be a dictionary with 'input_ids', 'attention_mask', and 'labels'.
- eval_dataset: The dataset to be used for evaluation. It should have the same structure as the training dataset.
- compute_metrics: A function to compute metrics during evaluation. It should take the predictions and labels as input and return a dictionary of metrics.

Structure of the dataset:

```python
{
    'input_ids': [list of input IDs],
    'attention_mask': [list of attention masks],
    'labels': [list of labels]
}
```

This must be a `Dataset` object. Use the following code to build one from a dictionary of columns (for example, the tokenizer output derived from a pandas dataframe):

```python
from datasets import Dataset    
Dataset.from_dict(...)
```


In [None]:
## Check the structure of the tokenized dataset
# Printing the Dataset shows its summary (column names and row count) before it goes to the Trainer
print(df_tokenized_train)


In [None]:
# Gather everything the Trainer needs in one place, then build it
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=df_tokenized_train,
    eval_dataset=df_tokenized_eval,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

# Start fine-tuning; the training summary is shown as the cell output
trainer.train()

## Other Important Arguments for Trainer

#### Gradient Accumulation
- Gradient accumulation is a technique used to simulate a larger batch size by accumulating gradients over multiple steps before performing an optimization step.
- This is useful when the model is too large to fit into memory with a large batch size.

``` python
# Accumulate gradients over 4 steps before each optimizer step
# NOTE(review): set this before the Trainer is created / trained so it takes effect — confirm
training_args.gradient_accumulation_steps = 4
``` 

#### Early Stopping
- Early stopping is a technique used to stop training when the model's performance on the validation set stops improving.
- This helps to prevent overfitting and saves training time.
``` python
from transformers import EarlyStoppingCallback
# Track the best checkpoint against the "f1" metric, where higher is better
training_args.load_best_model_at_end = True
training_args.metric_for_best_model = "f1"
training_args.greater_is_better = True
# NOTE(review): with num_train_epochs=3 and one evaluation per epoch, a patience of 3
# looks like it can never trigger — consider a smaller patience or more epochs; confirm.
# NOTE(review): presumably the callback must be added before trainer.train() is called.
trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=3))
```




## Evaluating the model

In [None]:
# Evaluate with the Trainer's configured eval dataset and print the returned metrics
results = trainer.evaluate()
print(results)

## Displaying the results

In [None]:
# Predict on the evaluation split; `label_ids` holds the gold labels aligned with the predictions
predictions = trainer.predict(df_tokenized_eval)
predicted_labels = predictions.predictions.argmax(axis=-1)
true_labels = predictions.label_ids

# Fix the class order explicitly. Without `labels=`, scikit-learn only reports the
# classes that occur in the data, so a class missing from the evaluation split would
# shrink the confusion matrix and no longer match the display labels.
class_ids = sorted(id_to_label)
class_names = [str(id_to_label[class_id]) for class_id in class_ids]

# Classification report (per-class precision / recall / F1, shown with class names)
print(
    classification_report(
        true_labels,
        predicted_labels,
        labels=class_ids,
        target_names=class_names,
        zero_division=0,
    )
)

# Confusion matrix
cm = confusion_matrix(true_labels, predicted_labels, labels=class_ids)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot(cmap='Blues', xticks_rotation=45)
disp.ax_.set_title("Confusion matrix (evaluation split)");