In [None]:
import os

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    XLNetTokenizer, XLNetForSequenceClassification,
    AdamW
)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from tqdm import tqdm

In [None]:
# Read the three pre-processed splits (train / dev / test) in one pass.
train_df, dev_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/Fake_News2/processed_train.csv',
        '/content/drive/MyDrive/Fake_News2/processed_dev.csv',
        '/content/drive/MyDrive/Fake_News2/processed_test.csv',
    )
)

In [None]:
# Remove rows whose 'text' value is missing from every split (each frame is
# modified in place, so the names keep pointing at the same objects).
for split_frame in (train_df, dev_df, test_df):
    split_frame.dropna(subset=['text'], inplace=True)

In [None]:
label_mapping = {'Fake': 0, 'original': 1}


def encode_labels(labels, mapping):
    """Convert a Series of string labels into integer class ids.

    Values that are already one of the target ids are kept as they are, so
    re-running this cell does not wipe the column. Any value that is neither a
    key nor a target id of ``mapping`` (including missing values) raises
    instead of silently becoming NaN.

    Args:
        labels: pandas Series holding the raw labels.
        mapping: dict from raw label to integer class id.

    Returns:
        An int64 Series with the same index as ``labels``.

    Raises:
        ValueError: if at least one value cannot be encoded.
    """
    known_ids = set(mapping.values())

    def _encode(value):
        if value in mapping:
            return mapping[value]
        if value in known_ids:
            # Already encoded by a previous run of this cell.
            return value
        return None

    encoded = labels.map(_encode)
    unmapped = encoded.isna()
    if unmapped.any():
        raise ValueError(
            f"Cannot encode label values {labels[unmapped].unique().tolist()}; "
            f"expected one of {list(mapping)}"
        )
    return encoded.astype('int64')


train_df['label'] = encode_labels(train_df['label'], label_mapping)
dev_df['label'] = encode_labels(dev_df['label'], label_mapping)

In [None]:
MAX_LEN = 128      # token budget per example; longer texts are truncated, shorter ones padded
BATCH_SIZE = 16
EPOCHS = 10
SEED = 42
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Seed PyTorch's global RNG so that batch shuffling, dropout and the random
# initialisation of new layers repeat across "Restart & Run All".
# (Some GPU kernels may still be non-deterministic.)
torch.manual_seed(SEED)

In [None]:
class FakeNewsDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: indexable collection of texts (each is passed through ``str``).
        labels: indexable collection of class ids aligned with ``texts``, or
            ``None`` when no labels are available.
        tokenizer: callable invoked with the text plus ``max_length``,
            ``padding``, ``truncation`` and ``return_tensors`` keyword
            arguments; expected to return a mapping of tensors that carry a
            leading batch dimension of size 1.
        max_len: length every example is padded / truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of examples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized example at ``idx`` as a dict of tensors.

        The dict holds every entry produced by the tokenizer with the batch
        dimension squeezed away, plus a ``'labels'`` long tensor when a label
        exists for this example.
        """
        raw_text = str(self.texts[idx])
        target = None if self.labels is None else self.labels[idx]

        encoded = self.tokenizer(
            raw_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        sample = {}
        for field, tensor in encoded.items():
            # Tokenizer output is (1, max_len); a DataLoader re-adds the batch dim.
            sample[field] = tensor.squeeze(0)

        if target is None:
            return sample
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

In [None]:
MODEL_NAME = "xlnet-base-cased"

# Tokenizer and the two-class sequence classifier come from the same checkpoint.
tokenizer = XLNetTokenizer.from_pretrained(MODEL_NAME)

# Load the classifier and move it straight onto the selected device.
model = XLNetForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2).to(DEVICE)

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def _make_dataset(frame, with_labels=True):
    """Wrap a split's 'text' (and optionally 'label') column in a FakeNewsDataset."""
    return FakeNewsDataset(
        texts=frame['text'].tolist(),
        labels=frame['label'].tolist() if with_labels else None,
        tokenizer=tokenizer,
        max_len=MAX_LEN,
    )


train_dataset = _make_dataset(train_df)
dev_dataset = _make_dataset(dev_df)
# The test split is built without labels.
test_dataset = _make_dataset(test_df, with_labels=False)

In [None]:
# DataLoaders: the training split is reshuffled every epoch; dev and test keep
# their original order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
dev_loader, test_loader = (
    DataLoader(split, batch_size=BATCH_SIZE)
    for split in (dev_dataset, test_dataset)
)

In [None]:
# Use PyTorch's own AdamW: the implementation shipped with `transformers` is
# deprecated in favour of it. `eps` and `weight_decay` are set explicitly to the
# values the transformers optimizer used by default, so training is unchanged
# (torch's defaults are eps=1e-8, weight_decay=0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)



In [None]:
from transformers import get_scheduler

# "linear" schedule with no warm-up, spanning every optimizer step of the run
# (one step per training batch, for EPOCHS epochs).
total_training_steps = len(train_loader) * EPOCHS
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

In [None]:
def train_model(model, data_loader, optimizer, lr_scheduler):
    """Train ``model`` for one pass over ``data_loader``.

    Each batch must provide 'input_ids', 'attention_mask' and 'labels'. The
    optimizer and the learning-rate scheduler are both stepped once per batch.

    Returns:
        The mean of the per-batch losses over the epoch.
    """
    model.train()
    running_loss = 0

    for batch in tqdm(data_loader, desc="Training"):
        ids, mask, targets = (
            batch[field].to(DEVICE)
            for field in ('input_ids', 'attention_mask', 'labels')
        )

        optimizer.zero_grad()
        # Passing labels makes the model compute the loss itself.
        batch_loss = model(ids, attention_mask=mask, labels=targets).loss
        running_loss += batch_loss.item()
        batch_loss.backward()

        optimizer.step()
        lr_scheduler.step()

    return running_loss / len(data_loader)


In [None]:
def evaluate_model(model, data_loader):
    """Score ``model`` on a labelled loader without computing gradients.

    Each batch must provide 'input_ids', 'attention_mask' and 'labels'. The
    predicted class is the argmax over the model's logits.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, report)`` where precision,
        recall and F1 are macro-averaged and ``report`` is the text produced by
        ``classification_report``.
    """
    model.eval()
    predicted, expected = [], []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            ids = batch['input_ids'].to(DEVICE)
            mask = batch['attention_mask'].to(DEVICE)
            targets = batch['labels'].to(DEVICE)

            logits = model(ids, attention_mask=mask).logits
            batch_preds = torch.argmax(logits, dim=1)

            # Metrics are computed by scikit-learn on the CPU.
            predicted.extend(batch_preds.cpu().numpy())
            expected.extend(targets.cpu().numpy())

    accuracy = accuracy_score(expected, predicted)
    precision, recall, f1 = (
        metric(expected, predicted, average='macro')
        for metric in (precision_score, recall_score, f1_score)
    )
    summary = classification_report(expected, predicted)

    return accuracy, precision, recall, f1, summary

In [None]:
best_val_acc = 0
# CPU snapshot of the weights from the best-scoring epoch. Without it the model
# left in memory would be the last epoch's, which is not necessarily the one
# whose accuracy is reported below and which is later used for test prediction.
best_state = None

for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch + 1}/{EPOCHS}")
    train_loss = train_model(model, train_loader, optimizer, lr_scheduler)
    print(f"Training Loss: {train_loss:.4f}")

    acc, prec, rec, f1, report = evaluate_model(model, dev_loader)
    print("\nValidation Metrics:")
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(report)
    if acc > best_val_acc:
        best_val_acc = acc
        # clone() so later optimizer steps cannot modify the snapshot.
        best_state = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }

# Roll the model back to the best epoch so the reported accuracy describes the
# weights that are actually kept.
if best_state is not None:
    model.load_state_dict(best_state)

print(f"\nFinal Validation Accuracy: {best_val_acc:.4f}")


Epoch 1/10


Training: 100%|██████████| 203/203 [01:25<00:00,  2.37it/s]


Training Loss: 0.6907


Evaluating: 100%|██████████| 51/51 [00:08<00:00,  5.85it/s]



Validation Metrics:
Accuracy: 0.5872
Precision: 0.6331
Recall: 0.5879
F1 Score: 0.5493
              precision    recall  f1-score   support

           0       0.55      0.88      0.68       406
           1       0.71      0.30      0.42       408

    accuracy                           0.59       814
   macro avg       0.63      0.59      0.55       814
weighted avg       0.63      0.59      0.55       814


Epoch 2/10


Training: 100%|██████████| 203/203 [01:26<00:00,  2.35it/s]


Training Loss: 0.6454


Evaluating: 100%|██████████| 51/51 [00:08<00:00,  5.91it/s]



Validation Metrics:
Accuracy: 0.6634
Precision: 0.6689
Recall: 0.6632
F1 Score: 0.6604
              precision    recall  f1-score   support

           0       0.70      0.57      0.63       406
           1       0.64      0.75      0.69       408

    accuracy                           0.66       814
   macro avg       0.67      0.66      0.66       814
weighted avg       0.67      0.66      0.66       814


Epoch 3/10


Training: 100%|██████████| 203/203 [01:26<00:00,  2.35it/s]


Training Loss: 0.6067


Evaluating: 100%|██████████| 51/51 [00:08<00:00,  5.90it/s]



Validation Metrics:
Accuracy: 0.6720
Precision: 0.6924
Recall: 0.6716
F1 Score: 0.6627
              precision    recall  f1-score   support

           0       0.75      0.51      0.61       406
           1       0.63      0.84      0.72       408

    accuracy                           0.67       814
   macro avg       0.69      0.67      0.66       814
weighted avg       0.69      0.67      0.66       814


Epoch 4/10


Training: 100%|██████████| 203/203 [01:26<00:00,  2.35it/s]


Training Loss: 0.5535


Evaluating: 100%|██████████| 51/51 [00:08<00:00,  5.87it/s]



Validation Metrics:
Accuracy: 0.6929
Precision: 0.6941
Recall: 0.6928
F1 Score: 0.6923
              precision    recall  f1-score   support

           0       0.71      0.65      0.68       406
           1       0.68      0.73      0.71       408

    accuracy                           0.69       814
   macro avg       0.69      0.69      0.69       814
weighted avg       0.69      0.69      0.69       814


Epoch 5/10


Training: 100%|██████████| 203/203 [01:26<00:00,  2.35it/s]


Training Loss: 0.5074


Evaluating: 100%|██████████| 51/51 [00:08<00:00,  5.89it/s]



Validation Metrics:
Accuracy: 0.7113
Precision: 0.7214
Recall: 0.7110
F1 Score: 0.7078
              precision    recall  f1-score   support

           0       0.77      0.60      0.68       406
           1       0.67      0.82      0.74       408

    accuracy                           0.71       814
   macro avg       0.72      0.71      0.71       814
weighted avg       0.72      0.71      0.71       814


Epoch 6/10


Training: 100%|██████████| 203/203 [01:26<00:00,  2.35it/s]


Training Loss: 0.4621


Evaluating: 100%|██████████| 51/51 [00:08<00:00,  5.88it/s]



Validation Metrics:
Accuracy: 0.7125
Precision: 0.7130
Recall: 0.7126
F1 Score: 0.7124
              precision    recall  f1-score   support

           0       0.70      0.73      0.72       406
           1       0.72      0.69      0.71       408

    accuracy                           0.71       814
   macro avg       0.71      0.71      0.71       814
weighted avg       0.71      0.71      0.71       814


Epoch 7/10


Training: 100%|██████████| 203/203 [01:26<00:00,  2.35it/s]


Training Loss: 0.4280


Evaluating: 100%|██████████| 51/51 [00:08<00:00,  5.93it/s]



Validation Metrics:
Accuracy: 0.7039
Precision: 0.7064
Recall: 0.7038
F1 Score: 0.7029
              precision    recall  f1-score   support

           0       0.73      0.65      0.69       406
           1       0.68      0.76      0.72       408

    accuracy                           0.70       814
   macro avg       0.71      0.70      0.70       814
weighted avg       0.71      0.70      0.70       814


Epoch 8/10


Training: 100%|██████████| 203/203 [01:26<00:00,  2.35it/s]


Training Loss: 0.3984


Evaluating: 100%|██████████| 51/51 [00:08<00:00,  5.89it/s]



Validation Metrics:
Accuracy: 0.6855
Precision: 0.6909
Recall: 0.6857
F1 Score: 0.6834
              precision    recall  f1-score   support

           0       0.66      0.77      0.71       406
           1       0.72      0.60      0.66       408

    accuracy                           0.69       814
   macro avg       0.69      0.69      0.68       814
weighted avg       0.69      0.69      0.68       814


Epoch 9/10


Training: 100%|██████████| 203/203 [01:26<00:00,  2.35it/s]


Training Loss: 0.3828


Evaluating: 100%|██████████| 51/51 [00:08<00:00,  5.87it/s]



Validation Metrics:
Accuracy: 0.7076
Precision: 0.7209
Recall: 0.7073
F1 Score: 0.7030
              precision    recall  f1-score   support

           0       0.77      0.58      0.67       406
           1       0.67      0.83      0.74       408

    accuracy                           0.71       814
   macro avg       0.72      0.71      0.70       814
weighted avg       0.72      0.71      0.70       814


Epoch 10/10


Training: 100%|██████████| 203/203 [01:26<00:00,  2.35it/s]


Training Loss: 0.3686


Evaluating: 100%|██████████| 51/51 [00:08<00:00,  5.89it/s]


Validation Metrics:
Accuracy: 0.7088
Precision: 0.7174
Recall: 0.7086
F1 Score: 0.7058
              precision    recall  f1-score   support

           0       0.76      0.61      0.68       406
           1       0.67      0.81      0.74       408

    accuracy                           0.71       814
   macro avg       0.72      0.71      0.71       814
weighted avg       0.72      0.71      0.71       814


Final Validation Accuracy: 0.7125





In [None]:
# Prediction function for the test set
def predict_test(model, data_loader):
    """Predict a class id for every example of an unlabelled loader.

    Each batch must provide 'input_ids' and 'attention_mask'. Runs in eval mode
    with gradients disabled.

    Returns:
        A flat list with one predicted class id (argmax of the logits) per
        example, in loader order.
    """
    model.eval()
    collected = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Predicting"):
            ids = batch['input_ids'].to(DEVICE)
            mask = batch['attention_mask'].to(DEVICE)

            logits = model(ids, attention_mask=mask).logits
            collected += list(logits.argmax(dim=1).cpu().numpy())

    return collected

In [None]:
# Predict and save the test results
test_predictions = predict_test(model, test_loader)

# Add predicted labels to the test dataframe
test_df['Predicted_Label'] = test_predictions

# Map numerical predictions back to their string representation
label_reverse_mapping = {0: 'Fake', 1: 'original'}
test_df['Predicted_Label'] = test_df['Predicted_Label'].map(label_reverse_mapping)

# Save the updated dataframe to a CSV file
output_path = '/content/drive/MyDrive/Fake_News2/XLNet_Model/TL_XLNet.csv'
# Create the target folder first: to_csv does not create missing directories,
# and failing here would lose the predictions of the whole training run.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
test_df.to_csv(output_path, index=False, encoding='utf-8')
print(f"\nTest predictions saved to {output_path}")


Predicting: 100%|██████████| 64/64 [00:10<00:00,  5.97it/s]


Test predictions saved to /content/drive/MyDrive/Fake_News2/XLNet_Model/TL_XLNet.csv



