# Fake News Detection: Judging Tweet authenticity through comments

Yifei Xue

First step of experiment, import all packages and dataset

import pandas as pd
import numpy as np
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score,plot_roc_curve,RocCurveDisplay
from sklearn.ensemble import RandomForestClassifier
import xgboost 
import lightgbm
import matplotlib.pyplot as plt
import warnings 
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot
from keras.models import Sequential
from keras.layers import Dense,LSTM,Bidirectional,Dropout,Embedding
warnings.filterwarnings('ignore')

In [2]:
path = 'datasetforproject.xlsx'
dataset = pd.read_excel(path)
target = dataset['label']
label = target.apply(lambda x:1 if x==True else 0)

In [3]:
dataset.head()

Unnamed: 0,label,source,text
0,False,Charlie Hebdo became well known for publishing...,"Now 10 dead in a shooting there today RT \""@BB..."
1,False,Charlie Hebdo became well known for publishing...,@BBCDanielS @BBCWorld I'm guessing this is bei...
2,False,Charlie Hebdo became well known for publishing...,@BBCDanielS @BBCWorld why would you mention th...
3,False,Charlie Hebdo became well known for publishing...,@BBCDanielS @BBCWorld perps identified?
4,False,Charlie Hebdo became well known for publishing...,@BBCDanielS @BBCWorld who is charlie hebdo?


data clean, remove all parts that i dont need in experiment

In [4]:
def wordopt(text):
    text = text.lower()
    #
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

data split, we split the dataset to trainset and testset

In [5]:
data = dataset['text'].apply(wordopt)



corpus = []
dataset['text'].apply(lambda x: corpus.append(x))

Tfidf = TfidfVectorizer()
x_train,x_test,y_train,y_test = train_test_split(data,label,test_size=0.2,random_state=1)
x_train = Tfidf.fit_transform((x_train))
x_test = Tfidf.transform(x_test)

In [6]:
data.head()

0    now  dead in a shooting there today rt    bbcd...
1     bbcdaniels  bbcworld i m guessing this is bei...
2     bbcdaniels  bbcworld why would you mention th...
3               bbcdaniels  bbcworld perps identified 
4           bbcdaniels  bbcworld who is charlie hebdo 
Name: text, dtype: object

LogicRegression

In [7]:
LR = LogisticRegression()
LR.fit(x_train,y_train)
y_pred_lr = LR.predict(x_test)
acc_lr = accuracy_score(y_test,y_pred_lr)
print(f'lr accuracy is {np.round(acc_lr,4)}')

lr accuracy is 0.9191


Evalution of LogicRegression

In [8]:
from sklearn.metrics import confusion_matrix


confusion_matrix(y_test,y_pred_lr)

cfm = confusion_matrix(y_test,y_pred_lr)

print(cfm)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_lr))

[[53 10]
 [ 1 72]]
              precision    recall  f1-score   support

           0       0.98      0.84      0.91        63
           1       0.88      0.99      0.93        73

    accuracy                           0.92       136
   macro avg       0.93      0.91      0.92       136
weighted avg       0.93      0.92      0.92       136



SVC

In [9]:
svc = SVC()
param = {'kernel':['linear','rbf'],'C':[0.001,0.01,0.1,1,10,100]}
clf_svc = GridSearchCV(svc,param,cv=5,n_jobs=-1)
clf_svc.fit(x_train,y_train)
best_svc = clf_svc.best_estimator_
best_svc.fit(x_train,y_train)
y_pred_svc = best_svc.predict(x_test)
acc_svc = accuracy_score(y_test,y_pred_svc)
print(f'svc accuracy is {np.round(acc_svc,4)}')

svc accuracy is 0.9265


Evalution of SVC

In [10]:
from sklearn.metrics import confusion_matrix


confusion_matrix(y_test,y_pred_svc)

cfm = confusion_matrix(y_test,y_pred_svc)

print(cfm)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_svc))

[[58  5]
 [ 5 68]]
              precision    recall  f1-score   support

           0       0.92      0.92      0.92        63
           1       0.93      0.93      0.93        73

    accuracy                           0.93       136
   macro avg       0.93      0.93      0.93       136
weighted avg       0.93      0.93      0.93       136



Random Forest

In random forest, I tried differnet n_estimators

In [11]:
rf = RandomForestClassifier(random_state=1)
param_rf = {'n_estimators':[20,25,30,35,40,45,50]}
clf_rf = GridSearchCV(rf,param_rf,cv=5,n_jobs=-1)
clf_rf.fit(x_train,y_train)
best_rf = clf_rf.best_estimator_
best_rf.fit(x_train,y_train)
y_pred_rf = best_rf.predict(x_test)
acc_rf = accuracy_score(y_test,y_pred_rf)
print(f'rf accuracy is {np.round(acc_rf,4)}')

rf accuracy is 0.9044


In [12]:
rf = RandomForestClassifier(random_state=1)
param_rf = {'n_estimators':[100,120,130,140,150,160,170]}
clf_rf = GridSearchCV(rf,param_rf,cv=5,n_jobs=-1)
clf_rf.fit(x_train,y_train)
best_rf = clf_rf.best_estimator_
best_rf.fit(x_train,y_train)
y_pred_rf = best_rf.predict(x_test)
acc_rf = accuracy_score(y_test,y_pred_rf)
print(f'rf accuracy is {np.round(acc_rf,4)}')

rf accuracy is 0.9338


Evalution of SVC

In [13]:
confusion_matrix(y_test,y_pred_rf)

cfm = confusion_matrix(y_test,y_pred_rf)

print(cfm)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_rf))

[[58  5]
 [ 4 69]]
              precision    recall  f1-score   support

           0       0.94      0.92      0.93        63
           1       0.93      0.95      0.94        73

    accuracy                           0.93       136
   macro avg       0.93      0.93      0.93       136
weighted avg       0.93      0.93      0.93       136



XGBoost

In XGBoost, I tried different learning_rate and max_depth

In [14]:
xgb = xgboost.XGBClassifier(random_state=1)
xgb_param = {'n_estimators':[400,800,1000],'learning_rate':[0.01,0.1],'max_depth':[5,6]}
clf_xgb = GridSearchCV(xgb,xgb_param,cv=5,n_jobs=-1)
clf_xgb.fit(x_train,y_train)
best_xgb = clf_xgb.best_estimator_
print(clf_xgb.best_params_)
best_xgb.fit(x_train,y_train)
y_pred_xgb = best_xgb.predict(x_test)
acc_xgb = accuracy_score(y_test,y_pred_xgb)

print(f'xgb accuracy is {np.round(acc_xgb,4)}')

{'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 800}
xgb accuracy is 0.9338


In [15]:
confusion_matrix(y_test,y_pred_xgb)

cfm = confusion_matrix(y_test,y_pred_xgb)

print(cfm)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_xgb))

[[56  7]
 [ 2 71]]
              precision    recall  f1-score   support

           0       0.97      0.89      0.93        63
           1       0.91      0.97      0.94        73

    accuracy                           0.93       136
   macro avg       0.94      0.93      0.93       136
weighted avg       0.94      0.93      0.93       136



In [16]:
xgb = xgboost.XGBClassifier(random_state=1)
xgb_param = {'n_estimators':[400,800,1000],'learning_rate':[0.01,0.1],'max_depth':[3,4]}
clf_xgb = GridSearchCV(xgb,xgb_param,cv=5,n_jobs=-1)
clf_xgb.fit(x_train,y_train)
best_xgb = clf_xgb.best_estimator_
print(clf_xgb.best_params_)
best_xgb.fit(x_train,y_train)
y_pred_xgb = best_xgb.predict(x_test)
acc_xgb = accuracy_score(y_test,y_pred_xgb)

print(f'xgb accuracy is {np.round(acc_xgb,4)}')

{'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 1000}
xgb accuracy is 0.9265


In [17]:
xgb = xgboost.XGBClassifier(random_state=1)
xgb_param = {'n_estimators':[400,800,1000],'learning_rate':[0.1,0.3],'max_depth':[5,6]}
clf_xgb = GridSearchCV(xgb,xgb_param,cv=5,n_jobs=-1)
clf_xgb.fit(x_train,y_train)
best_xgb = clf_xgb.best_estimator_
print(clf_xgb.best_params_)
best_xgb.fit(x_train,y_train)
y_pred_xgb = best_xgb.predict(x_test)
acc_xgb = accuracy_score(y_test,y_pred_xgb)

print(f'xgb accuracy is {np.round(acc_xgb,4)}')

{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 400}
xgb accuracy is 0.8162


Lstm

In [18]:
import tensorflow as tf
tf.random.set_seed(2000)
voc_size = 4000
onehoe_repr = [one_hot(words,voc_size) for words in corpus]
sent_length = 400
embedded_docs = pad_sequences(onehoe_repr,padding='pre',maxlen=sent_length)

embedding_dim = 50

model = Sequential()
model.add(Embedding(voc_size,embedding_dim,input_length=sent_length))
model.add(Dropout(0.5))
model.add(LSTM(100))
model.add(Dropout(0.5))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

x_train,x_test,y_train,y_test = train_test_split(np.array(embedded_docs),np.array(label),\
                                                 test_size=0.2,random_state=1)
model.fit(x_train,y_train,validation_data=(x_test,y_test),epochs=20,batch_size=32)
acc_lstm = model.evaluate(x_test,y_test)[1]
print(acc_lstm)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
0.8970588445663452


BiLstm

In [19]:
tf.random.set_seed(2000)
model_bilstm=Sequential()
model_bilstm.add(Embedding(voc_size,embedding_dim,input_length=sent_length))
model.add(Dropout(0.5))
model_bilstm.add(Bidirectional(LSTM(100))) # Bidirectional LSTM layer
model_bilstm.add(Dropout(0.5))
model_bilstm.add(Dense(1,activation='sigmoid'))
model_bilstm.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

model_bilstm.fit(x_train,y_train,validation_data=(x_test,y_test),epochs=20,batch_size=32)
acc_bilstm = model_bilstm.evaluate(x_test,y_test)[1]
print(acc_bilstm)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
0.7941176295280457
