# **Fake News Detection**

# **Project Description**



# **1. Imports**

In [None]:
import os
#os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
import re
import nltk
import optuna

import torch
import torch.nn as nn
import torch.optim as optim

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report

from ydata_profiling import ProfileReport
from wordcloud import WordCloud
from gensim.models import Word2Vec
from tqdm import tqdm


# Fetch the NLTK data this notebook relies on: the stop-word list, WordNet for
# the lemmatizer, and the Punkt tokenizer models required by word_tokenize.
# Newer NLTK releases package Punkt as 'punkt_tab', older ones as 'punkt';
# requesting an id that a release does not know only logs an error.
for resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(resource)

pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [None]:
wel_fake = pd.read_csv('data/WELFake_Dataset.csv', index_col = 0)
fake_news_net = pd.read_csv('data/FakeNewsNet.csv', index_col=False)
true = pd.read_csv('data/true.csv', index_col=False)
fake = pd.read_csv('data/fake.csv', index_col=False)

# **2. Data analysis**

In [None]:
def balance_plot(df, label, plot_name):
    """Draw a pie chart with the share of rows for each value of ``label``.

    Args:
        df: DataFrame containing the column ``label``.
        label: name of the column whose value counts are plotted.
        plot_name: title shown above the chart.
    """
    rows_per_class = df.groupby(label)[label].count()
    rows_per_class.plot(kind='pie', autopct='%1.1f%%', title=plot_name)

In [None]:
# English stop words as a set. The list is read through ``nltk.corpus`` rather
# than through the imported ``stopwords`` name because this assignment rebinds
# that name; going through the package keeps the cell safe to re-run.
stopwords = set(nltk.corpus.stopwords.words('english'))


def word_cloud_plot(df, groupby, agg_column):
    """Show a word cloud built from the texts of group ``0``.

    The rows of ``df`` are grouped by ``groupby``, the ``agg_column`` texts of
    each group are joined, and the entry ``[0]`` of the grouped result is
    rendered (label 0 for the integer-labelled datasets used in this notebook).

    Args:
        df: DataFrame holding the texts.
        groupby: column name to group on.
        agg_column: name of the text column to visualise.
    """
    # Join with a space so the last word of one text and the first word of the
    # next are not fused into a single token; missing values are skipped.
    texts_by_group = df.groupby(groupby)[agg_column].agg(
        lambda texts: ' '.join(texts.dropna().astype(str))
    )
    wc = WordCloud(width=800, height=800,
                   background_color='white',
                   stopwords=stopwords,
                   min_font_size=10).generate(texts_by_group[0])
    plt.figure(figsize=(10, 10))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    plt.show()

## **2.1 WEL Fake dataset**

In [None]:
wel_fake

In [None]:
#ProfileReport(wel_fake, title='WELFake').to_file('WELFake.html')

In [None]:
missing_values = wel_fake.isnull().sum()
missing_values

In [None]:
# Replace every missing value with a single space and renumber the rows.
# Once the gaps are filled there is nothing left to drop, so no dropna is needed.
wel_fake = wel_fake.fillna(" ").reset_index(drop=True)

In [None]:
plt.figure(figsize=(8, 6))
wel_fake['label'].value_counts().plot(kind='bar')
plt.xlabel('Label')
plt.ylabel('Count')
plt.title('Distribution of Label')
plt.show()

In [None]:
word_cloud_plot(wel_fake,'label', 'title')

## **2.2 Fake news net dataset**

In [None]:
fake_news_net

In [None]:
sns.countplot(x='real', data=fake_news_net)
plt.title('Distribution of Real and Fake News')
plt.xlabel('News Type')
plt.ylabel('Count')
plt.xticks([0, 1], ['Fake', 'Real'])
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.histplot(fake_news_net[fake_news_net['real'] == 1]['tweet_num'], bins=30, color='blue', label='Real')
sns.histplot(fake_news_net[fake_news_net['real'] == 0]['tweet_num'], bins=30, color='red', label='Fake')
plt.title('Distribution of Tweet Numbers for Real and Fake News')
plt.xlabel('Number of Tweets')
plt.ylabel('Frequency')
plt.legend()
plt.show()

In [None]:
# Top 10 source domains for real and fake news
plt.figure(figsize=(14, 6))

plt.subplot(1, 2, 1)
fake_news_net[fake_news_net['real'] == 1]['source_domain'].value_counts().head(10).plot(kind='barh', color='blue')
plt.title('Top 10 Source Domains for Real News')
plt.xlabel('Count')
plt.ylabel('Source Domain')

plt.subplot(1, 2, 2)
fake_news_net[fake_news_net['real'] == 0]['source_domain'].value_counts().head(10).plot(kind='barh', color='red')
plt.title('Top 10 Source Domains for Fake News')
plt.xlabel('Count')
plt.ylabel('Source Domain')

plt.tight_layout()
plt.show()

In [None]:
fake_news_net.isnull().sum()

In [None]:
#ProfileReport(fake_news_net, title='FakeNewsNet').to_file('FakeNewsNet.html')

In [None]:
fake_news_net = fake_news_net.dropna()

print('Shape after removing missing values:', fake_news_net.shape)

real_news = fake_news_net[fake_news_net['real'] == 1]
fake_news = fake_news_net[fake_news_net['real'] == 0]

real_news_downsampled = real_news.sample(len(fake_news), random_state=1)

fake_news_net_balanced = pd.concat([real_news_downsampled, fake_news])

print('Shape of balanced dataset:', fake_news_net_balanced.shape)

sns.countplot(x='real', data=fake_news_net_balanced)
plt.title('Distribution of Real and Fake News in the Balanced Dataset')
plt.xlabel('News Type')
plt.ylabel('Count')
plt.xticks([0, 1], ['Fake', 'Real'])
plt.show()

In [None]:
word_cloud_plot(fake_news_net,'real', 'title')

In [None]:
fake_news_net = fake_news_net_balanced

## **2.3 Fake and True dataset**

In [None]:
fake['label'] = 0

true['label'] = 1

In [None]:
fake_true = pd.concat([fake, true]).reset_index(drop=True)

In [None]:
#ProfileReport(fake_true, title='FakeTrue').to_file('FakeTrue.html')

In [None]:
fake_true

In [None]:
fake_true.groupby(['subject', 'label']).count()

In [None]:
fake_true.isnull().sum()

In [None]:
balance_plot(fake_true,'label', 'Fake True')

In [None]:
fake_true['text'].str.len().hist()

In [None]:
word_cloud_plot(fake_true,'label', 'text')

# **3. Data preprocessing**

In [None]:
lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
    """Lower-case, tokenize, remove stop words from and lemmatize ``text``.

    Args:
        text: raw input string.

    Returns:
        The remaining lemmas joined by single spaces, or the literal string
        "placeholder" when no token survives the filtering.
    """
    tokens = word_tokenize(text.lower())

    lemmas = []
    for token in tokens:
        if token in stopwords:
            continue
        lemmas.append(lemmatizer.lemmatize(token))

    cleaned = ' '.join(lemmas)
    # Never hand an empty document to the vectorizers downstream.
    return cleaned if cleaned != "" else "placeholder"

## **3.1 WEL Fake preprocessing**

In [None]:
wel_fake['text'] = wel_fake['text'].apply(preprocess_text)
wel_fake.head()

### **3.1.2 Bag of Words**

In [None]:
vectorizer = CountVectorizer(min_df=2, max_features=1000)

bow = vectorizer.fit_transform(wel_fake['text'])
feature_names = vectorizer.get_feature_names_out()
X_bow = pd.DataFrame(bow.toarray(), columns=feature_names)

X_bow.shape

### **Split Data**

In [None]:
y = wel_fake['label']
X_train_bow_wel, X_test_bow_wel, y_train_bow_wel, y_test_bow_wel = train_test_split(X_bow, y, test_size=0.2, random_state=42)
X_train_bow_wel, X_val_bow_wel, y_train_bow_wel, y_val_bow_wel = train_test_split(X_train_bow_wel, y_train_bow_wel, test_size=0.2, random_state=42)

In [None]:
y_train_bow_wel.value_counts(normalize=True) #imbalance check

### **3.1.3 Word 2 Vec**

In [None]:
sentences = wel_fake['text'].apply(lambda x: x.split())
model = Word2Vec(sentences, min_count=1)

print(model)

In [None]:
def w2v_train_test_prepatarion(data, w2v_model=None):
    """Represent each text as the mean of its word vectors.

    Args:
        data: iterable of whitespace-separated strings.
        w2v_model: trained Word2Vec-style model exposing ``wv`` (with
            ``key_to_index``, ``vector_size`` and item lookup). When omitted,
            the notebook-level ``model`` is used, as before.

    Returns:
        A list with one 1-D NumPy array of length ``vector_size`` per text.
        Words missing from the vocabulary contribute a zero vector; a text
        without any words maps to a zero vector.
    """
    if w2v_model is None:
        w2v_model = model  # most recently trained notebook-level Word2Vec
    word_vectors = w2v_model.wv
    # Take the dimensionality from the model instead of hard-coding 100.
    dim = word_vectors.vector_size
    unknown = np.zeros(dim)

    features = []
    for sentence in data:
        vectors = [
            word_vectors[word] if word in word_vectors.key_to_index else unknown
            for word in sentence.split()
        ]
        if vectors:
            features.append(np.mean(vectors, axis=0))
        else:
            # np.mean of an empty list is a scalar NaN, which would break the
            # fixed-width feature matrix expected downstream.
            features.append(unknown.copy())
    return features

In [None]:
X_w2v_wel = w2v_train_test_prepatarion(wel_fake['text'])

### **Split Data**

In [None]:
y = wel_fake['label']
X_train_w2v_wel, X_test_w2v_wel, y_train_w2v_wel, y_test_w2v_wel = train_test_split(X_w2v_wel, y, test_size=0.2, random_state=42)
X_train_w2v_wel, X_val_w2v_wel, y_train_w2v_wel, y_val_w2v_wel = train_test_split(X_train_w2v_wel, y_train_w2v_wel, test_size=0.2, random_state=42)

In [None]:
y_train_w2v_wel.value_counts(normalize=True) #imbalance check

## **3.2 Fake News Net preprocessing**

In [None]:
fake_news_net['title'] = fake_news_net['title'].apply(preprocess_text)
fake_news_net.head()

### 3.2.1 Bag of Words

In [None]:
vectorizer = CountVectorizer(min_df=2, max_features=1000)

bow = vectorizer.fit_transform(fake_news_net['title'])
feature_names = vectorizer.get_feature_names_out()
X_bow = pd.DataFrame(bow.toarray(), columns=feature_names)

X_bow.shape

In [None]:
y = fake_news_net['real']
X_train_bow_net, X_test_bow_net, y_train_bow_net, y_test_bow_net = train_test_split(X_bow, y, test_size=0.2, random_state=42)
X_train_bow_net, X_val_bow_net, y_train_bow_net, y_val_bow_net = train_test_split(X_train_bow_net, y_train_bow_net, test_size=0.2, random_state=42)

In [None]:
y_train_bow_net.value_counts(normalize=True) #imbalance check

### **3.2.2 Word 2 Vec**

In [None]:
sentences = fake_news_net['title'].apply(lambda x: x.split())
model = Word2Vec(sentences, min_count=1)

print(model)

In [None]:
X_w2v_net = w2v_train_test_prepatarion(fake_news_net['title'])

In [None]:
y = fake_news_net['real']
X_train_w2v_net, X_test_w2v_net, y_train_w2v_net, y_test_w2v_net = train_test_split(X_w2v_net, y, test_size=0.2, random_state=42)
X_train_w2v_net, X_val_w2v_net, y_train_w2v_net, y_val_w2v_net = train_test_split(X_train_w2v_net, y_train_w2v_net, test_size=0.2, random_state=42)

In [None]:
y_train_w2v_net.value_counts(normalize=True) #imbalance check

## **3.3 Fake and True preprocessing**

In [None]:
fake_true.duplicated().sum()

In [None]:
fake_true.drop_duplicates(inplace=True)

In [None]:
fake_true['text'] = fake_true['text'].apply(preprocess_text)
fake_true.head()

### **3.3.1 Bag of words**

In [None]:
vectorizer = CountVectorizer(min_df=2, max_features=1000)

bow = vectorizer.fit_transform(fake_true['text'])
feature_names = vectorizer.get_feature_names_out()
X_bow = pd.DataFrame(bow.toarray(), columns=feature_names)

X_bow.shape

In [None]:
y = fake_true['label']
X_train_bow_ft, X_test_bow_ft, y_train_bow_ft, y_test_bow_ft = train_test_split(X_bow, y, test_size=0.2, random_state=42)
X_train_bow_ft, X_val_bow_ft, y_train_bow_ft, y_val_bow_ft = train_test_split(X_train_bow_ft, y_train_bow_ft, test_size=0.2, random_state=42)

In [None]:
y_train_bow_ft.value_counts(normalize=True) #imbalance check

### **3.3.2 Word 2 Vec**

In [None]:
sentences = fake_true['text'].apply(lambda x: x.split())
model = Word2Vec(sentences, min_count=1)

print(model)

In [None]:
X_w2v_ft = w2v_train_test_prepatarion(fake_true['text'])

In [None]:
y = fake_true['label']
X_train_w2v_ft, X_test_w2v_ft,y_train_w2v_ft, y_test_w2v_ft = train_test_split(X_w2v_ft, y, test_size=0.2, random_state=42)
X_train_w2v_ft, X_val_w2v_ft,y_train_w2v_ft, y_val_w2v_ft = train_test_split(X_train_w2v_ft, y_train_w2v_ft, test_size=0.2, random_state=42)

In [None]:
y_train_w2v_ft.value_counts(normalize=True) #imbalance check

## 4. Models

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

In [None]:
torch.set_default_dtype(torch.float32)

In [None]:
class LogisticRegression(nn.Module):
    """Logistic regression: one linear layer followed by a sigmoid."""

    def __init__(self, input_dim, output_dim):
        """Create the linear layer mapping ``input_dim`` to ``output_dim``."""
        super().__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        """Return the sigmoid of the linear projection of ``x``."""
        logits = self.linear(x)
        return torch.sigmoid(logits)

In [None]:
def train(model, train_loader, valid_loader, criterion, optimizer, device, num_epochs):
    """Train ``model`` and print loss and validation metrics after every epoch.

    Args:
        model: network producing one probability per sample.
        train_loader: loader yielding ``(inputs, target)`` training batches.
        valid_loader: loader yielding ``(inputs, target)`` validation batches;
            its ``dataset`` length is used to average the metrics.
        criterion: loss called as ``criterion(predictions, target)``.
        optimizer: optimizer already bound to ``model``'s parameters.
        device: device the batches are moved to before the forward pass.
        num_epochs: number of passes over ``train_loader``.
    """
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        seen = 0
        progress = tqdm(train_loader, desc="Epoch %s: " % (epoch+1), total=len(train_loader))
        for inputs, target in progress:
            inputs = inputs.to(device)
            target = target.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            # Drop only the trailing singleton dimension: a bare squeeze() also
            # removes the batch dimension of a final batch holding one sample,
            # which then no longer matches the shape of ``target``.
            loss = criterion(outputs.squeeze(-1), target)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * inputs.size(0)
            seen += inputs.size(0)
        train_loss /= max(seen, 1)

        model.eval()
        valid_loss = 0.0
        correct = 0
        with torch.no_grad():
            for inputs, target in valid_loader:
                inputs = inputs.to(device)
                target = target.to(device)
                outputs = model(inputs)
                batch_loss = criterion(outputs.squeeze(-1), target)
                valid_loss += batch_loss.item() * inputs.size(0)
                pred = (outputs > 0.5).float()
                correct += (pred.squeeze(-1) == target).float().sum().item()

        n_valid = max(len(valid_loader.dataset), 1)  # avoid dividing by zero
        valid_loss /= n_valid
        valid_acc = correct / n_valid
        # "Loss" is the mean training loss of the epoch (it used to be the loss
        # of the last validation batch).
        print ('Epoch [{}/{}], Loss: {:.4f}, Validation Loss: {:.4f}, Validation Accuracy: {:.4f}'
               .format(epoch+1, num_epochs, train_loss, valid_loss, valid_acc))

In [None]:
def predict(model, data_loader, device=None):
    """Return the 0/1 predictions of ``model`` for every sample in ``data_loader``.

    Args:
        model: network producing one probability per sample.
        data_loader: iterable yielding ``(inputs, target)`` batches; the
            targets are ignored.
        device: device to run on. Defaults to the device holding the model's
            parameters (CPU for a model without parameters).

    Returns:
        NumPy float array with the thresholded (> 0.5) predictions in loader
        order; an empty array when the loader yields no batches.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    batches = []
    with torch.no_grad():
        for inputs, _ in data_loader:
            outputs = model(inputs.to(device))
            pred = (outputs > 0.5).float()
            # squeeze(-1) keeps the batch dimension even for a batch of one
            # sample; a bare squeeze() would produce a 0-d array that
            # np.concatenate refuses.
            batches.append(pred.squeeze(-1).cpu().numpy())

    if not batches:
        return np.empty(0, dtype=np.float32)
    return np.concatenate(batches)

In [None]:
class LoadDataset(Dataset):
    """Dataset that converts one sample at a time to float tensors.

    Created because of memory limits: rows are turned into tensors lazily in
    ``__getitem__`` instead of materialising the whole feature matrix as one
    tensor up front.

    Args:
        X: features. A pandas object indexed positionally through ``.iloc``
            by default, or any integer-indexable sequence of vectors when
            ``use_toarray`` is True.
        y: pandas Series of labels, indexed positionally.
        use_toarray: set to True when ``X`` is a plain sequence (for example
            a list of NumPy vectors) rather than a pandas object.
    """

    def __init__(self, X, y, use_toarray=False):
        self.X = X
        self.y = y
        self.use_toarray = use_toarray

    def __getitem__(self, index):
        """Return ``(features, label)`` for position ``index`` as float tensors."""
        row = self.X[index] if self.use_toarray else self.X.iloc[index]
        # Go through NumPy: handing a pandas Series straight to torch.tensor
        # makes torch walk it element by element via Series[int] lookups.
        X_dense = torch.tensor(np.asarray(row), dtype=torch.float32)
        y_dense = torch.tensor(self.y.iloc[index]).float()
        return X_dense, y_dense

    def __len__(self):
        """Number of samples, taken from the label Series."""
        return len(self.y)

In [None]:
lr=0.01
epochs_num=2
criterion = nn.BCELoss()

## 4.1 WELFake

### 4.1.1 Bag of Words

In [None]:
valid_loader = LoadDataset(X_val_bow_wel, y_val_bow_wel)
val_loader_wel= DataLoader(valid_loader, 
                           batch_size=16, 
                           shuffle=True)

train_loader = LoadDataset(X_train_bow_wel, y_train_bow_wel)
train_loader_wel= DataLoader(train_loader, 
                             batch_size=16, 
                             shuffle=True)

In [None]:
model_bow_wel = LogisticRegression(input_dim=X_train_bow_wel.shape[1], output_dim=1)
model_bow_wel.to(device)
optimizer = torch.optim.Adam(model_bow_wel.parameters(), lr=lr)

In [None]:
%%time
train(model_bow_wel, train_loader_wel, val_loader_wel, criterion, optimizer, device, epochs_num)

In [None]:
test_loader = LoadDataset(X_test_bow_wel, y_test_bow_wel)
test_loader_wel= DataLoader(test_loader, batch_size=32)

In [None]:
y_pred_wel = predict(model_bow_wel, test_loader_wel)

print(classification_report(y_test_bow_wel, y_pred_wel))

### 4.1.2 Word 2 vec

In [None]:
valid_loader = LoadDataset(X_val_w2v_wel, y_val_w2v_wel,use_toarray=True)
val_loader_w2v_wel= DataLoader(valid_loader, 
                           batch_size=32, 
                           shuffle=True)

train_loader = LoadDataset(X_train_w2v_wel, y_train_w2v_wel, use_toarray=True)
train_loader_w2v_wel= DataLoader(train_loader, 
                                 batch_size=32, 
                                 shuffle=True)

In [None]:
model_w2v_wel = LogisticRegression(input_dim=100, output_dim=1)
model_w2v_wel.to(device)
optimizer = torch.optim.Adam(model_w2v_wel.parameters(), lr=lr)

In [None]:
# Validate on the Word2Vec validation loader: model_w2v_wel takes 100-dimensional
# inputs, so the bag-of-words loader (val_loader_wel) does not fit it.
train(model_w2v_wel, train_loader_w2v_wel, val_loader_w2v_wel, criterion, optimizer, device, epochs_num)

In [None]:
test_loader = LoadDataset(X_test_w2v_wel, y_test_w2v_wel, use_toarray=True)
test_loader_w2v= DataLoader(test_loader, batch_size=32)

In [None]:
y_pred_w2v = predict(model_w2v_wel, test_loader_w2v)

print(classification_report(y_test_w2v_wel, y_pred_w2v))

## 4.2 Fake_news_net

### 4.2.1 Bag of words

In [None]:
valid_loader = LoadDataset(X_val_bow_net, y_val_bow_net)
val_loader_net= DataLoader(valid_loader, 
                           batch_size=32, 
                           shuffle=True)

train_loader = LoadDataset(X_train_bow_net, y_train_bow_net)
train_loader_net= DataLoader(train_loader, 
                             batch_size=32, 
                             shuffle=True)

In [None]:
model_bow_net = LogisticRegression(input_dim=X_train_bow_net.shape[1], output_dim=1)
model_bow_net.to(device)
optimizer = torch.optim.Adam(model_bow_net.parameters(), lr=lr)

In [None]:
%%time
train(model_bow_net, train_loader_net, val_loader_net, criterion, optimizer, device, epochs_num)

In [None]:
test_loader = LoadDataset(X_test_bow_net, y_test_bow_net)
test_loader_net= DataLoader(test_loader, 
                            batch_size=32, 
                            shuffle=False)

In [None]:
y_pred_net = predict(model_bow_net, test_loader_net)

print(classification_report(y_test_bow_net, y_pred_net))

### 4.2.2 Word 2 vec

In [None]:
valid_loader = LoadDataset(X_val_w2v_net, y_val_w2v_net,use_toarray=True)
val_loader_net= DataLoader(valid_loader, 
                           batch_size=32, 
                           shuffle=True)

train_loader = LoadDataset(X_train_w2v_net, y_train_w2v_net,use_toarray=True)
train_loader_net= DataLoader(train_loader, 
                             batch_size=32, 
                             shuffle=True)

In [None]:
model_w2v_net = LogisticRegression(input_dim=100, output_dim=1)
model_w2v_net.to(device)
optimizer = torch.optim.Adam(model_w2v_net.parameters(), lr=lr)

In [None]:
%%time
train(model_w2v_net, train_loader_net, val_loader_net, criterion, optimizer, device, epochs_num)

In [None]:
test_loader = LoadDataset(X_test_w2v_net, y_test_w2v_net, use_toarray=True)
test_loader_net= DataLoader(test_loader, 
                            batch_size=32, 
                            shuffle=False)

In [None]:
y_pred_net = predict(model_w2v_net, test_loader_net)

print(classification_report(y_test_w2v_net, y_pred_net))

## 4.3 Fake True

### 4.3.1 Bag of words

In [None]:
valid_loader = LoadDataset(X_val_bow_ft, y_val_bow_ft)
val_loader_ft= DataLoader(valid_loader, 
                          batch_size=32, 
                          shuffle=True)

train_loader = LoadDataset(X_train_bow_ft, y_train_bow_ft)
train_loader_ft= DataLoader(train_loader, 
                            batch_size=32, 
                            shuffle=True)

In [None]:
model_bow_ft = LogisticRegression(input_dim=X_train_bow_ft.shape[1], output_dim=1)
model_bow_ft.to(device)
optimizer = torch.optim.Adam(model_bow_ft.parameters(), lr=lr)

In [None]:
%%time
train(model_bow_ft, train_loader_ft, val_loader_ft, criterion, optimizer, device, epochs_num)

In [None]:
test_loader = LoadDataset(X_test_bow_ft, y_test_bow_ft)
test_loader_ft= DataLoader(test_loader, 
                            batch_size=32, 
                            shuffle=False)

In [None]:
y_pred_ft = predict(model_bow_ft, test_loader_ft)

print(classification_report(y_test_bow_ft, y_pred_ft))

### 4.3.2 Word 2 vec

In [None]:
valid_loader = LoadDataset(X_val_w2v_ft, y_val_w2v_ft,use_toarray=True)
val_loader_ft= DataLoader(valid_loader, 
                           batch_size=32, 
                           shuffle=True)

train_loader = LoadDataset(X_train_w2v_ft, y_train_w2v_ft,use_toarray=True)
train_loader_ft= DataLoader(train_loader, 
                             batch_size=32, 
                             shuffle=True)

In [None]:
model_w2v_ft = LogisticRegression(input_dim=100, output_dim=1)
model_w2v_ft.to(device)
optimizer = torch.optim.Adam(model_w2v_ft.parameters(), lr=lr)

In [None]:
%%time
train(model_w2v_ft, train_loader_ft, val_loader_ft, criterion, optimizer, device, epochs_num)

In [None]:
test_loader = LoadDataset(X_test_w2v_ft, y_test_w2v_ft, use_toarray=True)
test_loader_ft= DataLoader(test_loader, 
                            batch_size=32, 
                            shuffle=False)

In [None]:
y_pred_ft = predict(model_w2v_ft, test_loader_ft)

print(classification_report(y_test_w2v_ft, y_pred_ft))

## **5. Hyperparameter tuning**

In [None]:
def objective(trial):
    """Optuna objective: train a logistic regression, return validation accuracy.

    Reads the notebook-level globals ``X`` (input dimension), ``train_loader``,
    ``val_loader`` and ``device``.

    Args:
        trial: Optuna trial used to sample ``lr`` and ``weight_decay``, both on
            a log scale between 1e-3 and 1e-1.

    Returns:
        Fraction of correctly classified validation samples (0.0 when the
        validation loader yields no samples).
    """
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    lr = trial.suggest_float('lr', 1e-3, 1e-1, log=True)
    weight_decay = trial.suggest_float('weight_decay', 1e-3, 1e-1, log=True)

    model = LogisticRegression(input_dim=X, output_dim=1)
    model.to(device)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    model.train()
    for _ in range(20):
        for inputs, labels in train_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs.view(-1), labels)
            loss.backward()
            optimizer.step()

    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            # The original assigned to a misspelled name here, leaving the
            # inputs on the CPU while the model may live on the GPU.
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)
            predicted = (outputs > 0.5).float()
            total += labels.size(0)
            # view(-1) keeps a 1-D shape even for a batch with one sample.
            correct += (predicted.view(-1) == labels).float().sum().item()

    return correct / total if total else 0.0

In [None]:
def callback(study, trial):
    """Print the objective value of a finished trial.

    Args:
        study: the running study (unused).
        trial: finished trial whose ``value`` is printed.
    """
    prefix = "Trial finished with value: "
    print(prefix, trial.value)

### 5.1 WELFake

### 5.1.1 Bag of Words

In [None]:
# Point the globals read by objective() at the WELFake bag-of-words data.
X = X_train_bow_wel.shape[1]  # number of input features
train_loader = train_loader_wel
val_loader = val_loader_wel

In [None]:
%%time
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

print('Best hyperparameters:', study.best_params)

In [None]:
# Point the globals read by objective() at the WELFake Word2Vec data.
X = 100  # input dimension used for the Word2Vec models in this notebook
train_loader = train_loader_w2v_wel
val_loader = val_loader_w2v_wel

In [None]:
%%time
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10, callbacks=[callback])

print('Best hyperparameters:', study.best_params)

In [None]:
optuna.visualization.plot_optimization_history(study)