<a href="https://colab.research.google.com/github/OmidGhadami95/Fake_Detection_BERT_Pruning/blob/main/Fake_News_Detection_Without_Pruning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import numpy as np
import pandas as pd
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
import torch.nn as nn
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import re
import pycaret
import transformers
from transformers import AutoModel, BertTokenizerFast
#from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import AutoModel, BertTokenizerFast
from tqdm import tqdm
# -*- coding: utf-8 -*-

# Select the compute device: use the GPU when CUDA is available, otherwise
# fall back to the CPU so the script still runs on machines without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


"""# Load Dataset"""

def clean_text(text):
    """Return ``text`` as a string with special characters stripped.

    The input is first converted with ``str()``; every character that is not
    a letter in a-z / A-Z, a digit 0-9 or a whitespace character is then
    removed.
    """
    as_string = str(text)
    stripped = re.sub(r'[^a-zA-Z0-9\s]', '', as_string)
    return stripped

# Load Dataset
# Both CSV files are read from the working directory; the code below relies
# on their 'title' and 'text' columns.
true_data = pd.read_csv('a1_True.csv')
fake_data = pd.read_csv('a2_Fake.csv')
# Generate labels True/Fake under new Target Column in 'true_data' and 'fake_data'
true_data['Target'] = ['True']*len(true_data)
fake_data['Target'] = ['Fake']*len(fake_data)
# Merge 'true_data' and 'fake_data', by random mixing into a single df called 'data'
data = pd.concat([true_data, fake_data]).sample(frac=1).reset_index(drop=True)

print(data.shape)
data.head()

# Check for null values in the 'text' column.
# This must happen BEFORE clean_text() is applied: clean_text() converts its
# input with str(), which turns a missing value into the literal string 'nan'
# and would make the null check (and the dropna below) a no-op.
null_text_rows = data['text'].isnull().sum()
print(f"Number of null values in 'text' column: {null_text_rows}")

# Drop rows where 'text' column is null; re-number the rows so the index
# stays contiguous after the removal.
data = data.dropna(subset=['text']).reset_index(drop=True)

# Verify the changes
print(f"Shape of data after dropping rows with null 'text' values: {data.shape}")

# Remove special characters from 'title' and 'text' columns
data['title'] = data['title'].apply(clean_text)
data['text'] = data['text'].apply(clean_text)

# Target column is made of string values True/Fake, let's change it to numbers 0/1 (Fake=1)
data['label'] = data['Target'].map({'True': 0, 'Fake': 1})

data.head()

# Checking if our data is well balanced: first slice is the number of Fake
# rows (label == 1), second slice is the remainder (True rows).
label_size = [data['label'].sum(),len(data['label'])-data['label'].sum()]
plt.pie(label_size,explode=[0.1,0.1],colors=['firebrick','navy'],startangle=90,shadow=True,labels=['Fake','True'],autopct='%1.1f%%')

"""# Train-Test Split"""

# Partition the data into train / validation / test subsets (70% / 15% / 15%).
# Step 1: hold out 30% of the rows as a temporary pool, stratified on 'Target'.
train_text, temp_text, train_labels, temp_labels = train_test_split(
    data['text'],
    data['label'],
    test_size=0.3,
    stratify=data['Target'],
    random_state=2018,
)
# Step 2: cut the temporary pool in half (validation / test), stratified on
# the pool's own labels.
val_text, test_text, val_labels, test_labels = train_test_split(
    temp_text,
    temp_labels,
    test_size=0.5,
    stratify=temp_labels,
    random_state=2018,
)

"""# BERT Fine-tuning"""

# Fetch the pre-trained 'bert-base-uncased' encoder and its matching fast
# tokenizer through HuggingFace Transformers.
bert = AutoModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Histogram of the whitespace-separated word count of every training text.
seq_len = [len(words) for words in map(str.split, train_text)]
pd.Series(seq_len).hist(color='firebrick', bins=40)
plt.ylabel('Number of texts')
plt.xlabel('Number of Words')

# Maximum sequence length used for padding/truncation. The value was chosen
# from the word-count histogram of the training texts plotted above (most
# texts were judged to fall under 250 words).
MAX_LENGTH = 250
MAX_LENGHT = MAX_LENGTH  # misspelled legacy name, kept for code that still refers to it


def _encode_texts(texts):
    """Tokenize a collection of texts into fixed-length BERT inputs.

    Args:
        texts: object exposing ``tolist()`` that yields a list of strings
            (the train/validation/test text Series produced by the split).

    Returns:
        The tokenizer output; its ``'input_ids'`` and ``'attention_mask'``
        entries are read below. Every sequence is truncated or padded to
        ``MAX_LENGTH`` tokens.
    """
    # padding='max_length' replaces the deprecated pad_to_max_length=True flag.
    return tokenizer(
        texts.tolist(),
        max_length=MAX_LENGTH,
        padding='max_length',
        truncation=True,
    )


# Tokenize and encode sequences in the train, validation and test sets
tokens_train = _encode_texts(train_text)
tokens_val = _encode_texts(val_text)
tokens_test = _encode_texts(test_text)

# Convert lists to tensors
train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(train_labels.tolist())

val_seq = torch.tensor(tokens_val['input_ids'])
val_mask = torch.tensor(tokens_val['attention_mask'])
val_y = torch.tensor(val_labels.tolist())

test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
test_y = torch.tensor(test_labels.tolist())

# Build the DataLoaders for the training and validation sets.
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 32  # number of samples per batch

# Training set: wrap the tensors and draw the samples in random order.
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Validation set: wrap the tensors and iterate them in their stored order.
val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)


# (Re)create the 'bert-base-uncased' tokenizer and encoder from the
# pre-trained checkpoint.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
bert = AutoModel.from_pretrained('bert-base-uncased')

# Classifier head stacked on top of a BERT encoder
class BERT_Arch(nn.Module):
    """Two-class classifier built around a BERT encoder.

    The encoder's ``pooler_output`` goes through a 768 -> 512 linear layer,
    ReLU, dropout (p=0.1) and a 512 -> 2 linear layer; the result is returned
    as log-probabilities (log-softmax over dim 1).
    """

    def __init__(self, bert):
        """Store the encoder ``bert`` and create the classification layers."""
        super().__init__()
        self.bert = bert
        self.dropout = nn.Dropout(0.1)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(768, 512)
        self.fc2 = nn.Linear(512, 2)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        """Return log-probabilities for the batch.

        ``sent_id`` is passed to the encoder positionally and ``mask`` as its
        ``attention_mask`` keyword argument.
        """
        encoder_out = self.bert(sent_id, attention_mask=mask)
        pooled = encoder_out['pooler_output']
        hidden = self.dropout(self.relu(self.fc1(pooled)))
        logits = self.fc2(hidden)
        return self.softmax(logits)

# Initialize the model
model = BERT_Arch(bert)

# Load the trained model weights.
# map_location='cpu' lets a checkpoint that was saved from a GPU be restored
# on a machine without CUDA; load_state_dict() then copies the values into
# the model's own parameters.
model.load_state_dict(torch.load('c3_new_model_weights.pt', map_location='cpu'))
model.eval()

# Tokenize and encode the validation data: every text is truncated or padded
# to MAX_LENGTH tokens (padding='max_length' replaces the deprecated
# pad_to_max_length=True flag).
MAX_LENGTH = 250
tokens_val = tokenizer(
    val_text.tolist(),
    max_length=MAX_LENGTH,
    padding='max_length',
    truncation=True,
)

# Convert to tensors
val_seq = torch.tensor(tokens_val['input_ids'])
val_mask = torch.tensor(tokens_val['attention_mask'])
val_y = torch.tensor(val_labels.tolist())

# Create DataLoader (sequential order, 32 samples per batch)
batch_size = 32
val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Evaluation function with progress tracking
def evaluate_model(model, dataloader):
    """Run ``model`` over ``dataloader`` and collect predictions and labels.

    Args:
        model: module called as ``model(sent_id, mask)``; the index of the
            largest value along dim 1 of its output is taken as the
            predicted class.
        dataloader: sized iterable yielding ``(sent_id, mask, labels)``
            batches of tensors.

    Returns:
        A ``(predictions, true_labels)`` tuple of flat Python lists, in the
        order the batches were produced.
    """
    model.eval()

    # Inputs are moved to the device of the model's first parameter so that a
    # model living on the GPU can be fed batches created on the CPU. For a
    # model on the CPU this is a no-op; a model without parameters leaves the
    # inputs untouched.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    predictions = []
    true_labels = []

    total_batches = len(dataloader)
    progress_bar = tqdm(dataloader, desc="Evaluating", total=total_batches)

    # Gradients are not needed for inference.
    with torch.no_grad():
        for sent_id, mask, labels in progress_bar:
            if target_device is not None:
                sent_id = sent_id.to(target_device)
                mask = mask.to(target_device)
            outputs = model(sent_id, mask)
            _, preds = torch.max(outputs, dim=1)
            predictions.extend(preds.cpu().tolist())
            true_labels.extend(labels.cpu().tolist())

            progress_bar.set_postfix({"Batch": f"{progress_bar.n}/{total_batches}"})

    return predictions, true_labels

# Run the model over the validation set.
print("Starting evaluation...")
predictions, true_labels = evaluate_model(model, val_dataloader)
print("Evaluation completed.")

# Textual per-class metrics (label 0 is reported as 'True', label 1 as 'Fake').
report = classification_report(true_labels, predictions, target_names=['True', 'Fake'])
print(report)

# Confusion matrix, drawn as an annotated heatmap and written to a PDF file.
cm = confusion_matrix(true_labels, predictions)

plt.figure(figsize=(10, 8))
sns.heatmap(cm, cmap='Blues', fmt='d', annot=True)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.savefig('confusion_matrix.pdf')
plt.close()

# The same metrics as a table (one row per report entry), drawn as a heatmap.
report_dict = classification_report(true_labels, predictions, output_dict=True, target_names=['True', 'Fake'])
df_report = pd.DataFrame(report_dict).T

plt.figure(figsize=(12, 8))
# The last row and the last column of the table are left out of the plot.
sns.heatmap(df_report.iloc[:-1, :-1].astype(float), cmap='YlGnBu', annot=True)
plt.title('Classification Report Heatmap')
plt.savefig('classification_report.pdf')
plt.close()

print("Evaluation completed. Check 'confusion_matrix.pdf' and 'classification_report.pdf' for visualizations.")
import torch
from transformers import AutoModel, BertTokenizerFast
import torch.nn as nn

# Create the 'bert-base-uncased' tokenizer and encoder from the pre-trained
# checkpoint.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
bert = AutoModel.from_pretrained('bert-base-uncased')