## BiLSTM - Fake News Detection
#### This Bidirectional Long Short-Term Memory (BiLSTM) model aims to determine, based on an article's title and content, whether it is fake news or misinformation.

#### The data was sourced from Kaggle, where it was prelabeled and compiled from multiple different news sources.

### 0. Imports

In [None]:
# Imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
from tqdm import tqdm
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from google.colab import drive

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


### 1. Load Data

In [None]:
# Loading the data
# Mount Google Drive (Colab) so the dataset CSV stored there can be read
drive.mount('/content/drive')
# Read the WELFake CSV into a DataFrame
data = pd.read_csv('/content/drive/MyDrive/CS4801/WELFake_Dataset.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Taking an initial look
# label: 0 = fake and 1 = real
# NOTE(review): the 0/1 meaning cannot be verified from the rows shown here — confirm against the dataset description
data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [None]:
# Remove the unnamed index column that was saved into the CSV
data.drop(columns='Unnamed: 0', inplace=True)
data.head()

Unnamed: 0,title,text,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,,Did they post their votes for Hillary already?,1
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [None]:
# Count the missing values in each column
data.isna().sum()

Unnamed: 0,0
title,558
text,39
label,0


In [None]:
# Sanity check: count rows where the title AND the text are both missing
# (such rows would leave the model nothing to work with)
data['both_na'] = data[['title', 'text']].isna().all(axis=1)
data['both_na'].sum()

0

In [None]:
# The helper column was only needed for the NA check, so remove it again
data.drop(columns='both_na', inplace=True)

### 2. Process Data

#### 2.1: Combine title and article

In [None]:
# Preprocess Data
data['label'] = data['label'].astype(float) # Need this to be a float for future use
# Since there are some NA values in both columns, we must fill them when creating the combined column.
# Join title and text with a space: plain concatenation glued the last word of the
# title onto the first word of the text, producing a bogus merged token at the seam.
# strip() removes the stray space left over when one of the two sides is empty.
data['content'] = (data['title'].fillna('') + ' ' + data['text'].fillna('')).str.strip()
data['content'].head()

Unnamed: 0,content
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...
1,Did they post their votes for Hillary already?
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...
3,"Bobby Jindal, raised Hindu, uses story of Chri..."
4,SATAN 2: Russia unvelis an image of its terrif...


Right now, the columns are combined regardless of whether one of them is empty.
Do we want to drop all rows with NAs initially instead? Or maybe drop them only if the text column is empty, as that significantly limits the words the model has to make a prediction with.

Maybe keep them


#### 2.2: Tokenize

In [None]:
# Tokenize and Vocabulary (Convert words to IDs)
# A set collapses duplicate tokens across all articles
all_words = {word for content in data['content'] for word in word_tokenize(content.lower())}
# Sort before numbering: the iteration order of a set of strings changes between
# Python processes (string hash randomization), so enumerating the raw set would
# give every run a different word -> ID mapping. Sorting makes the IDs reproducible.
vocab = {word: i + 1 for i, word in enumerate(sorted(all_words))}  # Reserve 0 for padding

In [None]:
# Inspect the raw token set and the word -> ID mapping
# NOTE(review): both objects contain every distinct token in the corpus, so this output can be very large
print(all_words)
print(vocab)

#### 2.3: Encode

In [None]:
# Encoding: Converts each article into a list of integers
def encode_text(text):
    return [vocab.get(word, 0) for word in word_tokenize(text.lower())]

# Encode every article; each row of the new column is a list of integer IDs
data['encoded'] = data['content'].map(encode_text)
data['encoded'].head()

Unnamed: 0,encoded
0,"[229350, 352673, 417710, 115200, 413817, 30128..."
1,"[373119, 281106, 76152, 146962, 373551, 18569,..."
2,"[183370, 11011, 91600, 316069, 392537, 284186,..."
3,"[186496, 371391, 322860, 311234, 400013, 32286..."
4,"[411031, 67384, 249366, 408421, 8390, 390875, ..."


#### 2.4: Pad (also limit the length of the text to be analyzed)

In [None]:
# Pad sequences to fixed length (adds 0s if they are less than the max length)
MAX_SEQ_LENGTH = 100  # Just chose 100 words for initial tests, this can be changed
# Truncate to MAX_SEQ_LENGTH tokens, then right-pad with 0s. When a sequence is already
# at or over the limit the pad count is <= 0, and a list times a non-positive int is [].
data['padded'] = data['encoded'].apply(
    lambda ids: ids[:MAX_SEQ_LENGTH] + [0] * (MAX_SEQ_LENGTH - len(ids))
)
data['padded'].head()

Unnamed: 0,padded
0,"[229350, 352673, 417710, 115200, 413817, 30128..."
1,"[373119, 281106, 76152, 146962, 373551, 18569,..."
2,"[183370, 11011, 91600, 316069, 392537, 284186,..."
3,"[186496, 371391, 322860, 311234, 400013, 32286..."
4,"[411031, 67384, 249366, 408421, 8390, 390875, ..."


In [None]:
# Just to see all the new columns together
# (content, encoded and padded now sit alongside title, text and label)
data.head()

Unnamed: 0,title,text,label,content,encoded,padded
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1.0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,"[229350, 352673, 417710, 115200, 413817, 30128...","[229350, 352673, 417710, 115200, 413817, 30128..."
1,,Did they post their votes for Hillary already?,1.0,Did they post their votes for Hillary already?,"[373119, 281106, 76152, 146962, 373551, 18569,...","[373119, 281106, 76152, 146962, 373551, 18569,..."
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1.0,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"[183370, 11011, 91600, 316069, 392537, 284186,...","[183370, 11011, 91600, 316069, 392537, 284186,..."
3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0.0,"Bobby Jindal, raised Hindu, uses story of Chri...","[186496, 371391, 322860, 311234, 400013, 32286...","[186496, 371391, 322860, 311234, 400013, 32286..."
4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1.0,SATAN 2: Russia unvelis an image of its terrif...,"[411031, 67384, 249366, 408421, 8390, 390875, ...","[411031, 67384, 249366, 408421, 8390, 390875, ..."


### 3. Split Data

In [None]:
# Train-test split
# Hold out 20% of the rows for testing; the fixed random_state keeps the split the same on every run
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
# Dataset Class: gives DataLoader indexed access to (token IDs, label) pairs
class WELfakeDataset(Dataset):
    """Wraps a DataFrame that has a 'padded' column (equal-length ID lists) and a 'label' column."""

    def __init__(self, data):
        padded_rows = data['padded'].tolist()

        self.data = data
        # All sequences are stacked into one LongTensor up front
        self.texts = torch.tensor(padded_rows, dtype=torch.long)
        self.labels = data['label'].values

    def __len__(self):
        # One label per row, so this equals the number of rows in the frame
        return len(self.labels)

    def __getitem__(self, idx):
        # The label becomes a 0-dim float32 tensor
        label = torch.tensor(self.labels[idx], dtype=torch.float32)
        return self.texts[idx], label

In [None]:
# Dataloaders
# Wrap each split in a Dataset, then batch it; only the training batches are shuffled
train_dataset = WELfakeDataset(train_data)
test_dataset = WELfakeDataset(test_data)
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

### 4. Define Model

In [None]:
# Define Basic Bidirectional LSTM Model
class BiLSTM(nn.Module):
    """Embedding -> bidirectional LSTM -> linear layer producing one logit per sequence."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        # Index 0 is the padding token
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=0,
        )
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            batch_first=True,
            bidirectional=True,
        )
        # Two directions, so the classifier sees 2 * hidden_dim features
        self.fc = nn.Linear(in_features=2 * hidden_dim, out_features=1)

    def forward(self, text):
        """Return one raw logit per row of `text` (no sigmoid is applied here)."""
        token_vectors = self.embedding(text)
        _, (final_hidden, _) = self.lstm(token_vectors)
        # The last two entries are the final states of the forward and backward directions
        forward_state = final_hidden[-2]
        backward_state = final_hidden[-1]
        features = torch.cat((forward_state, backward_state), dim=1)
        logits = self.fc(features)
        return logits.squeeze(1)


In [None]:
# Hyperparameters
EMBEDDING_DIM = 50      # size of each word vector
HIDDEN_DIM = 64         # LSTM hidden units per direction
LEARNING_RATE = 0.001
EPOCHS = 5

# Build the model, the loss function, and the optimizer
vocab_size = len(vocab) + 1  # +1 because ID 0 is reserved for padding
model = BiLSTM(
    vocab_size=vocab_size,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

### 5. Train

In [None]:
# Device configuration
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Training Loop
for epoch in range(EPOCHS):
    model.train()  # switch to training mode
    train_loss = 0  # running sum of the per-batch losses for this epoch
    for texts, labels in tqdm(train_loader):
        # Move the batch to the same device as the model
        texts, labels = texts.to(device), labels.to(device)

        optimizer.zero_grad()  # clear gradients left over from the previous batch
        predictions = model(texts)  # raw logits; the loss applies the sigmoid itself
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
    # Report the mean batch loss for the epoch
    print(f'Epoch {epoch+1}, Training Loss: {train_loss / len(train_loader)}')

# Evaluation
model.eval()  # switch to evaluation mode
all_preds = []
all_labels = []
with torch.no_grad():  # gradients are not needed for evaluation
    for texts, labels in test_loader:
        texts, labels = texts.to(device), labels.to(device)
        # Sigmoid turns the logits into probabilities; round() turns them into hard 0/1 predictions
        predictions = torch.sigmoid(model(texts)).round()
        # Bring the results back to the CPU as NumPy values for scikit-learn
        all_preds.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Fraction of test articles whose predicted label matches the true label
accuracy = accuracy_score(all_labels, all_preds)
print(f'Test Accuracy: {accuracy:.4f}')


100%|██████████| 902/902 [08:27<00:00,  1.78it/s]


Epoch 1, Training Loss: 0.23206055519107044


100%|██████████| 902/902 [08:22<00:00,  1.79it/s]


Epoch 2, Training Loss: 0.10193138393757249


100%|██████████| 902/902 [08:24<00:00,  1.79it/s]


Epoch 3, Training Loss: 0.05799823343194195


100%|██████████| 902/902 [08:26<00:00,  1.78it/s]


Epoch 4, Training Loss: 0.03828749321752172


100%|██████████| 902/902 [08:27<00:00,  1.78it/s]


Epoch 5, Training Loss: 0.02283794846314216
Test Accuracy: 0.9780


#### 6. Evaluate