In [1]:
# Import dependencies 
import pandas as pd
import numpy as np
import re
import string
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [2]:
# Read in the csv data
fake_df = pd.read_csv('Data/Fake.csv')
true_df = pd.read_csv('Data/True.csv')

In [3]:
# Delete articles with blank(" ") Text
fake_df = fake_df.loc[fake_df["text"] != " "]
fake_df.shape

# Drop duplicate records
fake_df = fake_df.drop_duplicates(['text'],keep= 'last')

# Titles under Subject = 'News'
fake_df[fake_df['subject'] == 'News']['title']

# Rename Subjects as 'US News' and 'World News'
fake_df['subject'] = fake_df['subject'].replace(['News','politics','left-news','Government News','US_News'],'US News')
fake_df['subject'] = fake_df['subject'].replace(['Middle-east'],'World News')

# Drop the date column
fake_df = fake_df.drop(columns='date')

# Add label column and set value as 1
fake_df['label'] = 1

In [4]:
# Delete articles with blank(" ") Text
true_df = true_df.loc[true_df["text"] != " "]
true_df = true_df.loc[true_df["text"] != ""]

# Drop duplicate records
true_df = true_df.drop_duplicates(['text'],keep= 'last')

# Rename Subjects as 'US News' and 'World News'
true_df['subject'] = true_df['subject'].replace(['politicsNews'],'US News')
true_df['subject'] = true_df['subject'].replace(['worldnews'],'World News')

# Drop the date column
true_df = true_df.drop(columns='date')

# Add label column and set value as 0
true_df['label'] = 0

In [5]:
# # Merge the Dataframes
# dataframes = [fake_df, true_df]
# df = pd.concat(dataframes)

In [6]:
# Take sample from each dataframe and merge
fake_sp = fake_df.sample(frac = 0.10)
true_sp = true_df.sample(frac = 0.10)

# Merge dataframes
dfs = [fake_sp, true_sp]
df_sp = pd.concat(dfs)

df_sp.head()

Unnamed: 0,title,text,subject,label
637,Official GOP Twitter Account Went Fully Delus...,Donald Trump s entire political strategy boils...,US News,1
2207,WATCH: 2 GOP Reps. Are COMPLETELY Stumped Ove...,The House s plan to repeal and replace the ACA...,US News,1
14764,FOX News Just Announced Moderators For Next GO...,Will Megyn Kelly be working towards a gotcha ...,US News,1
16077,THE SHOCKING REASON THE STATE DEPT IS HIDING D...,The reason the State Department is hiding paym...,US News,1
7440,Cliven Bundy Denied Bail Because He’s A Viole...,"Cliven Bundy, the leader of the Moron Militia ...",US News,1


In [7]:
# Encode Subject column (US News = 0 and World News = 1)
le = LabelEncoder()
df_sp['subject'] = le.fit_transform(df_sp['subject']) 

#Check subject column
df_sp.subject

637      0
2207     0
14764    0
16077    0
7440     0
        ..
2301     0
7572     0
4649     0
14453    1
10615    0
Name: subject, Length: 3864, dtype: int64

In [8]:
# Remove Punctuation 
def wordpre(title):
    # title = re.sub(r"(\w+)'s", r'\1s', title) #removing ' between word and s and joing s with the word
    # title = re.sub(r"(\w+)'t", r'\1t', title) #removing ' between word and t and joing t with the word
    # title = re.sub(r'(\w+).(\w+)',r'\1', title) # removing . between words and joining words
    title = re.sub('\[.*?\]', '', title)
    title = re.sub("\\W"," ",title) # remove special chars
    title = re.sub('https?://\S+|www\.\S+', '', title)
    title = re.sub('<.*?>+', '', title)
    title = re.sub('[%s]' % re.escape(string.punctuation), '', title)
    title = re.sub('\n', '', title)
    title = re.sub('\w*\d\w*', '', title)
    
    return title

In [9]:
# ##  Applying the wordpre method to the dataset
# df['title_wordpre']= df['title'].apply(wordpre)
# df['text_wordpre']= df['text'].apply(wordpre)
# df.head()

In [10]:
##  Applying the wordpre method to the dataset
df_sp['title_wordpre']= df_sp['title'].apply(wordpre)
df_sp['text_wordpre']= df_sp['text'].apply(wordpre)
df_sp.head()

Unnamed: 0,title,text,subject,label,title_wordpre,text_wordpre
637,Official GOP Twitter Account Went Fully Delus...,Donald Trump s entire political strategy boils...,0,1,Official GOP Twitter Account Went Fully Delus...,Donald Trump s entire political strategy boils...
2207,WATCH: 2 GOP Reps. Are COMPLETELY Stumped Ove...,The House s plan to repeal and replace the ACA...,0,1,WATCH GOP Reps Are COMPLETELY Stumped Over...,The House s plan to repeal and replace the ACA...
14764,FOX News Just Announced Moderators For Next GO...,Will Megyn Kelly be working towards a gotcha ...,0,1,FOX News Just Announced Moderators For Next GO...,Will Megyn Kelly be working towards a gotcha ...
16077,THE SHOCKING REASON THE STATE DEPT IS HIDING D...,The reason the State Department is hiding paym...,0,1,THE SHOCKING REASON THE STATE DEPT IS HIDING D...,The reason the State Department is hiding paym...
7440,Cliven Bundy Denied Bail Because He’s A Viole...,"Cliven Bundy, the leader of the Moron Militia ...",0,1,Cliven Bundy Denied Bail Because He s A Viole...,Cliven Bundy the leader of the Moron Militia ...


In [11]:
# Initialize TfidVectorizer
# maximum document frequency of 0.7 (terms with a higher document frequency will be discarded).
tf_idf = TfidfVectorizer(stop_words = 'english', max_df = 0.7)

In [12]:
# Store title and text in variables
title = df_sp.title_wordpre
text = df_sp.text_wordpre

In [13]:
# Fit and transform title data using TfidVectorizer
tf_idf_title= tf_idf.fit_transform(title)

# Convert into dataframe 
tf_title_df = pd.DataFrame(tf_idf_title.toarray(), columns=tf_idf.get_feature_names())

# print(tf_title_df)

# # Show results
# print(tf_idf_title)

# # Check number of records
# len(tf_idf_title)
tf_title_df.head()

Unnamed: 0,abadi,abandoning,abbas,abbott,abc,abdicate,abducted,abe,abiding,ability,...,zimbabwe,zimbabwean,zimbabweans,zimmerman,zinke,zombie,zone,zor,zuckerberg,zuma
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Fit and transform text data using TfidVectorizer
tf_idf_text= tf_idf.fit_transform(text)

# Convert into dataframe 
tf_text_df = pd.DataFrame(tf_idf_text.toarray(), columns=tf_idf.get_feature_names())

# print(tf_text_df)

# # Show results
# print(tf_idf_text)

# # Check number of records
# len(tf_idf_text)

tf_text_df.head()

Unnamed: 0,aa,aaa,aaarf,aachen,aai,aaliyah,aamer,aapl,aaron,aaroncovfefe,...,zuweid,zvizdic,zwak,zweli,zwillich,zwkhcksycy,zwolinski,zych,zynga,zzjjpdaivn
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Merge original df + tfidf title + tfidf text 
df_sp = df_sp.reset_index()
df_sp1 = pd.concat([df_sp,tf_title_df,tf_text_df], axis=1)
df_sp1

Unnamed: 0,index,title,text,subject,label,title_wordpre,text_wordpre,abadi,abandoning,abbas,...,zuweid,zvizdic,zwak,zweli,zwillich,zwkhcksycy,zwolinski,zych,zynga,zzjjpdaivn
0,637,Official GOP Twitter Account Went Fully Delus...,Donald Trump s entire political strategy boils...,0,1,Official GOP Twitter Account Went Fully Delus...,Donald Trump s entire political strategy boils...,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2207,WATCH: 2 GOP Reps. Are COMPLETELY Stumped Ove...,The House s plan to repeal and replace the ACA...,0,1,WATCH GOP Reps Are COMPLETELY Stumped Over...,The House s plan to repeal and replace the ACA...,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,14764,FOX News Just Announced Moderators For Next GO...,Will Megyn Kelly be working towards a gotcha ...,0,1,FOX News Just Announced Moderators For Next GO...,Will Megyn Kelly be working towards a gotcha ...,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,16077,THE SHOCKING REASON THE STATE DEPT IS HIDING D...,The reason the State Department is hiding paym...,0,1,THE SHOCKING REASON THE STATE DEPT IS HIDING D...,The reason the State Department is hiding paym...,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7440,Cliven Bundy Denied Bail Because He’s A Viole...,"Cliven Bundy, the leader of the Moron Militia ...",0,1,Cliven Bundy Denied Bail Because He s A Viole...,Cliven Bundy the leader of the Moron Militia ...,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3859,2301,Factbox: Long history of U.S. leakers to media...,(Reuters) - While one focus of the leak crackd...,0,0,Factbox Long history of U S leakers to media...,Reuters While one focus of the leak crackd...,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3860,7572,"U.S. bolsters cyber defense for election, few ...",WASHINGTON (Reuters) - Federal and state autho...,0,0,U S bolsters cyber defense for election few ...,WASHINGTON Reuters Federal and state autho...,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3861,4649,Trump to sign order sweeping away Obama-era cl...,WASHINGTON (Reuters) - U.S. President Donald T...,0,0,Trump to sign order sweeping away Obama era cl...,WASHINGTON Reuters U S President Donald T...,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3862,14453,"Party set to sack Mugabe, Zimbabweans celebrat...",HARARE (Reuters) - Zimbabwe s ruling party wil...,1,0,Party set to sack Mugabe Zimbabweans celebrat...,HARARE Reuters Zimbabwe s ruling party wil...,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Drop columns that will not be used as features 
df_sp1 = df_sp1.drop(columns=['index','title','text','title_wordpre','text_wordpre'])
df_sp1.head()

Unnamed: 0,subject,label,abadi,abandoning,abbas,abbott,abc,abdicate,abducted,abe,...,zuweid,zvizdic,zwak,zweli,zwillich,zwkhcksycy,zwolinski,zych,zynga,zzjjpdaivn
0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# RENAME DUPLICATE LABELS COLUMN
cols = []
count = 1
for column in df_sp1.columns:
    if column == 'label':
        cols.append(f'label_{count}')
        count+=1
        continue
    cols.append(column)
df_sp1.columns = cols

df_sp1

Unnamed: 0,subject,label_1,abadi,abandoning,abbas,abbott,abc,abdicate,abducted,abe,...,zuweid,zvizdic,zwak,zweli,zwillich,zwkhcksycy,zwolinski,zych,zynga,zzjjpdaivn
0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3859,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3860,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3861,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3862,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Scale data

# Initiate StandardScaler
scaler = StandardScaler()

# Create dataframe for columns that will be scaled 
df_nolabel = df_sp1.drop(columns= ['label_1'])

# Transform
df_scaled = pd.DataFrame(scaler.fit_transform(df_nolabel),columns=df_nolabel.columns)

df_scaled

Unnamed: 0,subject,abadi,abandoning,abbas,abbott,abc,abdicate,abducted,abe,abiding,...,zuweid,zvizdic,zwak,zweli,zwillich,zwkhcksycy,zwolinski,zych,zynga,zzjjpdaivn
0,-0.621641,-0.016089,-0.016089,-0.022558,-0.016089,-0.041997,-0.016089,-0.016089,-0.039018,-0.016089,...,-0.021953,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089
1,-0.621641,-0.016089,-0.016089,-0.022558,-0.016089,-0.041997,-0.016089,-0.016089,-0.039018,-0.016089,...,-0.021953,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089
2,-0.621641,-0.016089,-0.016089,-0.022558,-0.016089,-0.041997,-0.016089,-0.016089,-0.039018,-0.016089,...,-0.021953,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089
3,-0.621641,-0.016089,-0.016089,-0.022558,-0.016089,-0.041997,-0.016089,-0.016089,-0.039018,-0.016089,...,-0.021953,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089
4,-0.621641,-0.016089,-0.016089,-0.022558,-0.016089,-0.041997,-0.016089,-0.016089,-0.039018,-0.016089,...,-0.021953,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3859,-0.621641,-0.016089,-0.016089,-0.022558,-0.016089,-0.041997,-0.016089,-0.016089,-0.039018,-0.016089,...,-0.021953,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089
3860,-0.621641,-0.016089,-0.016089,-0.022558,-0.016089,-0.041997,-0.016089,-0.016089,-0.039018,-0.016089,...,-0.021953,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089
3861,-0.621641,-0.016089,-0.016089,-0.022558,-0.016089,-0.041997,-0.016089,-0.016089,-0.039018,-0.016089,...,-0.021953,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089
3862,1.608647,-0.016089,-0.016089,-0.022558,-0.016089,-0.041997,-0.016089,-0.016089,-0.039018,-0.016089,...,-0.021953,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089


In [19]:
# Add label back to dataframe
df_scaled ['label'] = df_sp1.label_1
df_scaled.head()

Unnamed: 0,subject,abadi,abandoning,abbas,abbott,abc,abdicate,abducted,abe,abiding,...,zvizdic,zwak,zweli,zwillich,zwkhcksycy,zwolinski,zych,zynga,zzjjpdaivn,label
0,-0.621641,-0.016089,-0.016089,-0.022558,-0.016089,-0.041997,-0.016089,-0.016089,-0.039018,-0.016089,...,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,1
1,-0.621641,-0.016089,-0.016089,-0.022558,-0.016089,-0.041997,-0.016089,-0.016089,-0.039018,-0.016089,...,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,1
2,-0.621641,-0.016089,-0.016089,-0.022558,-0.016089,-0.041997,-0.016089,-0.016089,-0.039018,-0.016089,...,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,1
3,-0.621641,-0.016089,-0.016089,-0.022558,-0.016089,-0.041997,-0.016089,-0.016089,-0.039018,-0.016089,...,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,1
4,-0.621641,-0.016089,-0.016089,-0.022558,-0.016089,-0.041997,-0.016089,-0.016089,-0.039018,-0.016089,...,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,-0.016089,1


In [20]:
# Assign variables (features and target y and X)
y = df_scaled.label
X = df_scaled.drop(columns=['label'])

In [21]:
# Split data into training and testing sets 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, test_size=0.20)

In [22]:
# MODEL 1: Naive Bayes/GaussianNB()

#Import library
from sklearn.naive_bayes import GaussianNB

# Initiate model
model = GaussianNB()

# Train model
model.fit(X_train, y_train)

# Create model predictions
y_pred = model.predict(X_test)

# Validate Model(1): Accuracy Score 
accuracy = accuracy_score(y_test, y_pred)*100
print(accuracy)

# Validate Model(2): Confusion Matrix
print(confusion_matrix(y_test, y_pred))

# Validate Model(3): Classification Report
print(classification_report(y_test, y_pred))

83.18240620957309
[[342  80]
 [ 50 301]]
              precision    recall  f1-score   support

           0       0.87      0.81      0.84       422
           1       0.79      0.86      0.82       351

    accuracy                           0.83       773
   macro avg       0.83      0.83      0.83       773
weighted avg       0.84      0.83      0.83       773



In [23]:
# MODEL 2: SVM

#Import library 
from sklearn.svm import SVC

# Initiate model
svm_model = SVC(kernel='linear')

# Train model
svm_model.fit(X_train, y_train)

# Create model predictions
y_pred = svm_model.predict(X_test)

# Validate Model(1): Accuracy Score 
accuracy = accuracy_score(y_test, y_pred)*100
print(accuracy)

# Validate Model(2): Confusion Matrix
print(confusion_matrix(y_test, y_pred))

# Validate Model(3): Classification Report
print(classification_report(y_test, y_pred))

96.248382923674
[[416   6]
 [ 23 328]]
              precision    recall  f1-score   support

           0       0.95      0.99      0.97       422
           1       0.98      0.93      0.96       351

    accuracy                           0.96       773
   macro avg       0.96      0.96      0.96       773
weighted avg       0.96      0.96      0.96       773

