In [2]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    BertTokenizer, BertForSequenceClassification,
    AdamW
)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from tqdm import tqdm

In [3]:
# Load the preprocessed training split and preview its first rows.
train_csv_path = '/content/drive/MyDrive/Fake_News_ShareTask/Task1/Preprocessing/processed_train.csv'
train_df = pd.read_csv(train_csv_path)
train_df.head()

Unnamed: 0,text,label
0,നലല അവതരണ സതയ പറതത വരടട,Fake
1,masha allah,Fake
2,അനവഷണ കഴയമപൾ യകക എതരയ കലപ ആകമ സവനത തനന പടചച അക...,Fake
3,illathentha avaru purath vidayittalland verenth,Fake
4,barana pakshathin matoru niyamamnalla moyanth ...,original


In [4]:
# Load the preprocessed development (validation) split and preview its first rows.
dev_csv_path = '/content/drive/MyDrive/Fake_News_ShareTask/Task1/Preprocessing/processed_dev.csv'
dev_df = pd.read_csv(dev_csv_path)
dev_df.head()

Unnamed: 0,text,label
0,full musilm verodamum,Fake
1,പകഷകള മഗങങള ലകതത സഖമയ ജവകകനന വവരവ വദയഭയസവ ഉണട ...,Fake
2,തടകകരൻ പതര കടടൻ പറഞഞപപ മററര ഒനന ഇലലതതവൻ പറയനന,original
3,കഭളമമറനന,original
4,തരവതര അലല കറണയ കകടട കലലകയണ,original


In [5]:
# Load the preprocessed test split and preview its first rows.
test_csv_path = '/content/drive/MyDrive/Fake_News_ShareTask/Task1/Preprocessing/processed_test.csv'
test_df = pd.read_csv(test_csv_path)
test_df.head()

Unnamed: 0,Id,text
0,Fake_01,ഉളള ലഗഡവൻ ഇപപള എനത കരണ
1,Fake_02,രജനഷ പറഞഞപല എനകകപപൾ തനനയത അങങനയണ ഇപപൾ തനനനനത ഇ...
2,Fake_03,ചടട വർതത വയകകനനത കരളതതലണ ഭരകകനന നർതത ഇനതയയലലലഇ...
3,Fake_04,shame for entir woman kerala
4,Fake_05,code janagh andhu wide busi cheythalum vijayik...


In [6]:
# Remove, in place, every row whose 'text' value is missing from each split.
for split_df in (train_df, dev_df, test_df):
    split_df.dropna(subset=['text'], inplace=True)

In [7]:
# String class names -> integer class ids used as training targets.
label_mapping = {'Fake': 0, 'original': 1}

# Also accept ids that are already encoded, so re-running this cell is a no-op
# instead of turning every label into NaN.
_label_lookup = {**label_mapping, **{class_id: class_id for class_id in label_mapping.values()}}

for split_name, split_df in (('train', train_df), ('dev', dev_df)):
    encoded = split_df['label'].map(_label_lookup)
    # Series.map yields NaN for any value missing from the dict; fail loudly
    # rather than letting NaN targets reach the training loop.
    if encoded.isna().any():
        unknown = sorted(split_df.loc[encoded.isna(), 'label'].astype(str).unique())
        raise ValueError(f"Unmapped labels in {split_name} split: {unknown}")
    split_df['label'] = encoded.astype('int64')

In [8]:
# Run configuration.
MAX_LEN = 128      # max_length handed to the tokenizer (inputs are padded/truncated to it)
BATCH_SIZE = 16    # batch size for every DataLoader
EPOCHS = 10        # number of passes over the training loader
SEED = 42          # seed for PyTorch's random number generators
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Seed PyTorch before the model is created and before the training loader
# shuffles, so classifier-head initialisation and batch order repeat across
# runs. NOTE(review): this does not by itself force deterministic CUDA kernels.
torch.manual_seed(SEED)

In [9]:
class FakeNewsDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Args:
        texts: Sequence of texts (list, pandas Series, array, ...). Items are
            accessed by position, whatever index the container carries.
        labels: Sequence of integer class ids aligned with ``texts``, or
            ``None`` for unlabeled data.
        tokenizer: Callable accepting ``(text, max_length=..., padding=...,
            truncation=..., return_tensors=...)`` and returning a mapping of
            tensors.
        max_len: Value forwarded to the tokenizer as ``max_length``.

    Raises:
        ValueError: If ``labels`` is given and its length differs from
            ``texts``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Materialise to plain lists so that __getitem__ is strictly positional.
        # A pandas Series with gaps in its index (e.g. after dropna) would
        # otherwise be looked up by index label and raise KeyError.
        self.texts = list(texts)
        self.labels = list(labels) if labels is not None else None
        if self.labels is not None and len(self.labels) != len(self.texts):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at position ``idx``.

        Returns:
            dict: One entry per tokenizer output with dimension 0 squeezed,
            plus a ``'labels'`` long tensor when labels were provided.
        """
        text = str(self.texts[idx])
        label = self.labels[idx] if self.labels is not None else None
        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        # squeeze(0) drops the leading dimension of each returned tensor so the
        # DataLoader can stack items into a batch itself.
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        if label is not None:
            item['labels'] = torch.tensor(label, dtype=torch.long)
        return item

In [10]:
# Load the multilingual BERT tokenizer and a 2-class sequence classifier,
# then place the classifier on the selected device.
MODEL_NAME = "bert-base-multilingual-cased"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2).to(DEVICE)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Wrap each split in a FakeNewsDataset (arguments: texts, labels, tokenizer, max_len).
train_dataset = FakeNewsDataset(
    train_df['text'].tolist(), train_df['label'].tolist(), tokenizer, MAX_LEN
)
dev_dataset = FakeNewsDataset(
    dev_df['text'].tolist(), dev_df['label'].tolist(), tokenizer, MAX_LEN
)
# The test split is built without labels.
test_dataset = FakeNewsDataset(test_df['text'].tolist(), None, tokenizer, MAX_LEN)

In [12]:
# Batch every split with the same batch size; only the training loader shuffles.
# Dev/test keep dataset order, which the later positional assignment of
# predictions to test_df relies on.
loader_kwargs = {'batch_size': BATCH_SIZE}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
dev_loader = DataLoader(dev_dataset, **loader_kwargs)
test_loader = DataLoader(test_dataset, **loader_kwargs)

In [13]:
# Use PyTorch's AdamW: the `transformers.AdamW` class is deprecated in favour of
# torch.optim.AdamW and is no longer shipped by recent transformers releases.
# eps and weight_decay are set explicitly to what are believed to be the
# deprecated class's defaults (1e-6 and 0.0), so the update rule stays close
# to the one used before — TODO confirm against the transformers docs.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)



In [14]:
# Linear learning-rate schedule with no warm-up, spanning every optimizer step
# of the run (one scheduler step per training batch).
from transformers import get_scheduler

total_training_steps = len(train_loader) * EPOCHS
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

In [15]:
def train_model(model, data_loader, optimizer, lr_scheduler):
    """Run one pass over ``data_loader`` in training mode.

    For every batch the gradients are reset, the model is called with the
    batch's labels, the resulting loss is back-propagated, and both the
    optimizer and the learning-rate scheduler are stepped.

    Returns:
        The sum of the per-batch loss values divided by the number of batches.
    """
    model.train()
    running_loss = 0
    for batch in tqdm(data_loader, desc="Training"):
        # Move only the tensors the model is called with onto the device.
        ids = batch['input_ids'].to(DEVICE)
        mask = batch['attention_mask'].to(DEVICE)
        targets = batch['labels'].to(DEVICE)

        optimizer.zero_grad()
        batch_loss = model(ids, attention_mask=mask, labels=targets).loss
        running_loss += batch_loss.item()
        batch_loss.backward()

        # Parameter update first, then advance the learning-rate schedule.
        optimizer.step()
        lr_scheduler.step()

    num_batches = len(data_loader)
    return running_loss / num_batches


In [16]:
def evaluate_model(model, data_loader):
    """Score the model on a labeled loader without tracking gradients.

    Predictions are the argmax over the logits' dimension 1.

    Returns:
        Tuple ``(accuracy, macro precision, macro recall, macro F1,
        classification report)``.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            ids = batch['input_ids'].to(DEVICE)
            mask = batch['attention_mask'].to(DEVICE)
            targets = batch['labels'].to(DEVICE)

            logits = model(ids, attention_mask=mask).logits
            batch_preds = logits.argmax(dim=1)

            # Bring results back to the CPU so scikit-learn can consume them.
            predicted.extend(batch_preds.cpu().numpy())
            expected.extend(targets.cpu().numpy())

    return (
        accuracy_score(expected, predicted),
        precision_score(expected, predicted, average='macro'),
        recall_score(expected, predicted, average='macro'),
        f1_score(expected, predicted, average='macro'),
        classification_report(expected, predicted),
    )

In [17]:
# Train for EPOCHS epochs, validating after each one. The weights of the epoch
# with the highest validation accuracy are snapshotted and restored at the end,
# so the cells that follow (test prediction) use the best model rather than
# whatever the last epoch produced.
best_val_acc = 0
best_state_dict = None  # CPU copy of the weights from the best epoch so far

for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch + 1}/{EPOCHS}")
    train_loss = train_model(model, train_loader, optimizer, lr_scheduler)
    print(f"Training Loss: {train_loss:.4f}")

    acc, prec, rec, f1, report = evaluate_model(model, dev_loader)
    print("\nValidation Metrics:")
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(report)
    if acc > best_val_acc:
        best_val_acc = acc
        # Clone onto the CPU: state_dict() returns references to the live
        # parameters, which later epochs would otherwise overwrite.
        best_state_dict = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }

# Roll the model back to its best validation checkpoint (if any epoch improved).
if best_state_dict is not None:
    model.load_state_dict(best_state_dict)

# The reported number is the best accuracy over all epochs, not the last one.
print(f"\nBest Validation Accuracy: {best_val_acc:.4f}")


Epoch 1/10


Training: 100%|██████████| 203/203 [01:10<00:00,  2.90it/s]


Training Loss: 0.6425


Evaluating: 100%|██████████| 51/51 [00:06<00:00,  8.28it/s]



Validation Metrics:
Accuracy: 0.7531
Precision: 0.7683
Recall: 0.7528
F1 Score: 0.7494
              precision    recall  f1-score   support

           0       0.83      0.63      0.72       406
           1       0.70      0.87      0.78       408

    accuracy                           0.75       814
   macro avg       0.77      0.75      0.75       814
weighted avg       0.77      0.75      0.75       814


Epoch 2/10


Training: 100%|██████████| 203/203 [01:15<00:00,  2.70it/s]


Training Loss: 0.4839


Evaluating: 100%|██████████| 51/51 [00:06<00:00,  8.10it/s]



Validation Metrics:
Accuracy: 0.7690
Precision: 0.7903
Recall: 0.7687
F1 Score: 0.7646
              precision    recall  f1-score   support

           0       0.87      0.63      0.73       406
           1       0.71      0.90      0.80       408

    accuracy                           0.77       814
   macro avg       0.79      0.77      0.76       814
weighted avg       0.79      0.77      0.76       814


Epoch 3/10


Training: 100%|██████████| 203/203 [01:14<00:00,  2.72it/s]


Training Loss: 0.3551


Evaluating: 100%|██████████| 51/51 [00:06<00:00,  8.21it/s]



Validation Metrics:
Accuracy: 0.7961
Precision: 0.8068
Recall: 0.7958
F1 Score: 0.7942
              precision    recall  f1-score   support

           0       0.86      0.70      0.77       406
           1       0.75      0.89      0.81       408

    accuracy                           0.80       814
   macro avg       0.81      0.80      0.79       814
weighted avg       0.81      0.80      0.79       814


Epoch 4/10


Training: 100%|██████████| 203/203 [01:14<00:00,  2.73it/s]


Training Loss: 0.2344


Evaluating: 100%|██████████| 51/51 [00:06<00:00,  8.06it/s]



Validation Metrics:
Accuracy: 0.8108
Precision: 0.8127
Recall: 0.8109
F1 Score: 0.8106
              precision    recall  f1-score   support

           0       0.79      0.85      0.82       406
           1       0.84      0.77      0.80       408

    accuracy                           0.81       814
   macro avg       0.81      0.81      0.81       814
weighted avg       0.81      0.81      0.81       814


Epoch 5/10


Training: 100%|██████████| 203/203 [01:14<00:00,  2.72it/s]


Training Loss: 0.1602


Evaluating: 100%|██████████| 51/51 [00:06<00:00,  8.04it/s]



Validation Metrics:
Accuracy: 0.8096
Precision: 0.8109
Recall: 0.8097
F1 Score: 0.8094
              precision    recall  f1-score   support

           0       0.79      0.84      0.82       406
           1       0.83      0.78      0.80       408

    accuracy                           0.81       814
   macro avg       0.81      0.81      0.81       814
weighted avg       0.81      0.81      0.81       814


Epoch 6/10


Training: 100%|██████████| 203/203 [01:14<00:00,  2.73it/s]


Training Loss: 0.1133


Evaluating: 100%|██████████| 51/51 [00:06<00:00,  8.21it/s]



Validation Metrics:
Accuracy: 0.8305
Precision: 0.8305
Recall: 0.8305
F1 Score: 0.8305
              precision    recall  f1-score   support

           0       0.83      0.83      0.83       406
           1       0.83      0.83      0.83       408

    accuracy                           0.83       814
   macro avg       0.83      0.83      0.83       814
weighted avg       0.83      0.83      0.83       814


Epoch 7/10


Training: 100%|██████████| 203/203 [01:14<00:00,  2.72it/s]


Training Loss: 0.0639


Evaluating: 100%|██████████| 51/51 [00:06<00:00,  8.05it/s]



Validation Metrics:
Accuracy: 0.8194
Precision: 0.8253
Recall: 0.8196
F1 Score: 0.8186
              precision    recall  f1-score   support

           0       0.78      0.89      0.83       406
           1       0.87      0.75      0.81       408

    accuracy                           0.82       814
   macro avg       0.83      0.82      0.82       814
weighted avg       0.83      0.82      0.82       814


Epoch 8/10


Training: 100%|██████████| 203/203 [01:14<00:00,  2.73it/s]


Training Loss: 0.0506


Evaluating: 100%|██████████| 51/51 [00:06<00:00,  8.10it/s]



Validation Metrics:
Accuracy: 0.8317
Precision: 0.8318
Recall: 0.8317
F1 Score: 0.8317
              precision    recall  f1-score   support

           0       0.84      0.82      0.83       406
           1       0.83      0.84      0.83       408

    accuracy                           0.83       814
   macro avg       0.83      0.83      0.83       814
weighted avg       0.83      0.83      0.83       814


Epoch 9/10


Training: 100%|██████████| 203/203 [01:14<00:00,  2.72it/s]


Training Loss: 0.0389


Evaluating: 100%|██████████| 51/51 [00:06<00:00,  8.16it/s]



Validation Metrics:
Accuracy: 0.8231
Precision: 0.8231
Recall: 0.8231
F1 Score: 0.8231
              precision    recall  f1-score   support

           0       0.82      0.82      0.82       406
           1       0.82      0.82      0.82       408

    accuracy                           0.82       814
   macro avg       0.82      0.82      0.82       814
weighted avg       0.82      0.82      0.82       814


Epoch 10/10


Training: 100%|██████████| 203/203 [01:14<00:00,  2.73it/s]


Training Loss: 0.0300


Evaluating: 100%|██████████| 51/51 [00:06<00:00,  8.02it/s]


Validation Metrics:
Accuracy: 0.8268
Precision: 0.8278
Recall: 0.8268
F1 Score: 0.8267
              precision    recall  f1-score   support

           0       0.81      0.85      0.83       406
           1       0.85      0.80      0.82       408

    accuracy                           0.83       814
   macro avg       0.83      0.83      0.83       814
weighted avg       0.83      0.83      0.83       814


Final Validation Accuracy: 0.8317





In [20]:
# Make predictions on the test set
def predict_test(model, data_loader):
    """Return the predicted class id for every item yielded by ``data_loader``.

    The model is put in eval mode and run without gradient tracking; each
    prediction is the argmax over the logits' dimension 1. Predictions are
    returned in the order the loader yields them.
    """
    model.eval()
    predicted_ids = []
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Predicting"):
            ids = batch['input_ids'].to(DEVICE)
            mask = batch['attention_mask'].to(DEVICE)

            logits = model(ids, attention_mask=mask).logits
            predicted_ids.extend(logits.argmax(dim=1).cpu().numpy())
    return predicted_ids

test_predictions = predict_test(model, test_loader)

Predicting: 100%|██████████| 64/64 [00:08<00:00,  7.87it/s]


In [22]:
# Turn the predicted class ids back into their string names and attach them
# to the test frame, row for row.
label_reverse_mapping = {0: 'Fake', 1: 'original'}
predicted_ids = pd.Series(test_predictions, index=test_df.index)
test_df['Predicted_Label'] = predicted_ids.map(label_reverse_mapping)

# Write the test frame, including the new column, to a UTF-8 CSV without the index.
output_path = '/content/drive/MyDrive/Fake_News_ShareTask/Task1/Training_Models/Novelty_Models/TL_mBERT.csv'
test_df.to_csv(output_path, index=False, encoding='utf-8')
print(f"\nTest predictions saved to {output_path}")


Test predictions saved to /content/drive/MyDrive/Fake_News_ShareTask/Task1/Training_Models/Novelty_Models/TL_mBERT.csv
