<a href="https://colab.research.google.com/github/soph-colo/cs4801-fake-news-detection/blob/cnn/CS4801_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Fake News Detection: CNN

In [20]:
# imports
import pandas as pd
import os
import csv
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
import spacy
import tensorflow as tf
from tensorflow.keras import layers, models
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Embedding


In [6]:
# Work from /content so the relative dataset path used when loading the CSV
# resolves against this directory.
os.chdir("/content")
print(os.getcwd())
# Bare last expression: the notebook renders the directory listing.
os.listdir(".")

/content


['.config', 'WELFake_Dataset.csv', 'sample_data']

In [8]:
# Load the WELFake CSV. The parser options are spelled out explicitly:
# comma-separated, double-quoted fields, UTF-8 text, and malformed rows raise
# instead of being skipped.
data = pd.read_csv(
    "WELFake_Dataset.csv",
    sep=',',
    quotechar='"',
    encoding='utf-8',
    on_bad_lines='error',
)
# Preview the first rows.
data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


### 1. Analyze Titles

#### Goal: Classify whether an article title is fake news/misinformation (`label` = 1) or not (`label` = 0)

### 1.1 Remove NA Titles

In [9]:
# Keep only the rows that actually have a title; rows with a missing title
# cannot be used by the title classifier.
data = data[data['title'].notna()]
data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
5,5,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1


### 1.2 Split Data

In [10]:
# Features are the raw title strings; targets are the 0/1 labels.
X = data.loc[:, 'title']
y = data.loc[:, 'label']

# Hold out 25% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=42, test_size=0.25)


### 1.3 Text Preprocessing

#### 1.3a Tokenization

In [15]:
# Build the vocabulary from the training titles only, capped via num_words=5000.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

# Encode both splits as lists of integer word indices using that vocabulary.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(titles) for titles in (X_train, X_test)
)

#### 1.3b Padding

In [19]:
# The longest training sequence sets the common length for every input.
max_length = len(max(X_train_seq, key=len))

# Pad (or truncate) both splits to exactly max_length tokens.
X_train_pad, X_test_pad = (
    pad_sequences(seqs, maxlen=max_length) for seqs in (X_train_seq, X_test_seq)
)

### 1.4 Create CNN

#### Use Keras.

In [21]:
# Title classifier: embedding -> 1-D convolution -> max-pooling -> flatten ->
# small dense layer -> single sigmoid unit for the binary fake/real output.
model = Sequential()
# Declare the input shape with an explicit Input layer. Embedding's
# `input_length` argument is deprecated in Keras 3 (it only emits a warning),
# whereas an Input layer is accepted by both Keras 2 and Keras 3 and lets the
# model create its weights up front.
model.add(layers.Input(shape=(max_length,), dtype="int32"))
# input_dim=5000 mirrors the Tokenizer(num_words=5000) vocabulary cap.
model.add(Embedding(input_dim=5000, output_dim=100))
model.add(Conv1D(filters=32, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))

# Binary cross-entropy pairs with the single sigmoid output unit.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



### 1.5 Train Model

In [22]:
# Train for 10 epochs in mini-batches of 32. The held-out split is passed as
# validation data, so per-epoch val_loss / val_accuracy are reported on it.
model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_test_pad, y_test),
)

Epoch 1/10
[1m1678/1678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 14ms/step - accuracy: 0.8045 - loss: 0.4006 - val_accuracy: 0.8925 - val_loss: 0.2611
Epoch 2/10
[1m1678/1678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 14ms/step - accuracy: 0.9226 - loss: 0.2010 - val_accuracy: 0.8970 - val_loss: 0.2573
Epoch 3/10
[1m1678/1678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 13ms/step - accuracy: 0.9506 - loss: 0.1392 - val_accuracy: 0.8937 - val_loss: 0.2770
Epoch 4/10
[1m1678/1678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 15ms/step - accuracy: 0.9730 - loss: 0.0847 - val_accuracy: 0.8894 - val_loss: 0.3218
Epoch 5/10
[1m1678/1678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 13ms/step - accuracy: 0.9846 - loss: 0.0484 - val_accuracy: 0.8883 - val_loss: 0.4069
Epoch 6/10
[1m1678/1678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 15ms/step - accuracy: 0.9925 - loss: 0.0254 - val_accuracy: 0.8842 - val_loss: 0.5270
Epoc

<keras.src.callbacks.history.History at 0x787b05a99210>

In [23]:
# evaluate() returns the compiled loss followed by the compiled metrics
# (here: accuracy), computed on the held-out split.
loss, accuracy = model.evaluate(x=X_test_pad, y=y_test)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

[1m560/560[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.8807 - loss: 0.8782
Test Loss: 0.8682183623313904, Test Accuracy: 0.8788979649543762


In [30]:
# Take the first 100 held-out titles (by position) as a qualitative sample.
new_texts = X_test.iloc[:100].tolist()

# Run them through the same tokenizer and padding used for training, then
# score them with the trained model.
new_seq = tokenizer.texts_to_sequences(new_texts)
new_pad = pad_sequences(new_seq, maxlen=max_length)
predictions = model.predict(x=new_pad)

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 


In [33]:
# Show each sampled title next to the thresholded prediction (> 0.5 means
# "Fake") and its true label (1 means "Fake").
sample_labels = y_test.iloc[:len(new_texts)]
for text, pred, true_label in zip(new_texts, predictions, sample_labels):
    print(f"Text: {text}")
    print(f"Prediction: {'Fake' if pred > 0.5 else 'Real'}")
    print(f"True Label: {'Fake' if true_label == 1 else 'Real'}\n\n")


Text: New York mayor criticized for proposed limits on legal aid to immigrants
Prediction: Real
True Label: Real


Text: FIRE THIS GUY! MUSLIM CNN HOST TWEETS Out Vile Response to President Trump’s Tweet After #LondonBridge Terror Attack
Prediction: Fake
True Label: Fake


Text: CNN HOST And Crybaby Hillary Surrogate Get BRUTAL SLAP DOWN When Dr. Gina Loudon Uses Facts Against Them [VIDEO]
Prediction: Fake
True Label: Fake


Text: Senate panel to hear from U.S. antitrust bosses
Prediction: Real
True Label: Real


Text: U.N. freedom of speech expert concerned about net neutrality
Prediction: Real
True Label: Real


Text: BREAKING: WikiLeaks Just Released Full ISIS Donor List With Names
Prediction: Fake
True Label: Fake


Text: Hillary Cancels All Events In 3 CRITICAL Swing States, THIS IS IT! SHE IS QUITTING THE RACE! • USA Newsflash
Prediction: Fake
True Label: Fake


Text: ENABLERS WHO LIVE IN GLASS HOUSES…Why Hillary Embracing Porn Star, Former Beauty Queen Will Backfire
Prediction: 