In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tasli\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df=pd.read_csv("news.csv")

In [3]:
df.sample(5)

Unnamed: 0.1,Unnamed: 0,title,text,label
563,188,House yanks first spending bill off the floor,Notable names include Ray Washburne (Commerce)...,REAL
5147,6070,Are you taking your iodine?,"Wed, 26 Oct 2016 08:04 UTC © periodictable.com...",FAKE
4700,8778,Crooked Hillary Risks Having ‘Blue Dress Momen...,Crooked Hillary Risks Having ‘Blue Dress Mom...,FAKE
6120,1637,Takeaways from the Republican debate,(CNN) As the first primaries creep ever closer...,REAL
1763,1640,Sanders draws early support for White House bi...,Democratic presidential candidate and Vermont ...,REAL


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6335 entries, 0 to 6334
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  6335 non-null   int64 
 1   title       6335 non-null   object
 2   text        6335 non-null   object
 3   label       6335 non-null   object
dtypes: int64(1), object(3)
memory usage: 198.1+ KB


In [5]:
df.drop('Unnamed: 0',axis=1,inplace=True)

In [6]:
df.tail(4)

Unnamed: 0,title,text,label
6331,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL
6334,Jeb Bush Is Suddenly Attacking Trump. Here's W...,Jeb Bush Is Suddenly Attacking Trump. Here's W...,REAL


In [7]:
df.describe()

Unnamed: 0,title,text,label
count,6335,6335,6335
unique,6256,6060,2
top,OnPolitics | 's politics blog,"Killing Obama administration rules, dismantlin...",REAL
freq,5,58,3171


In [8]:
df['label'].value_counts()

REAL    3171
FAKE    3164
Name: label, dtype: int64

In [9]:
len(df['text'].unique())

6060

# Remove duplicate row

In [10]:
df.drop_duplicates(["text","title"])

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...
6330,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [11]:
df['text']=df['text'].astype(str)

# Remove white space

In [12]:
df['text']=df['text'].str.strip()

In [13]:
df.head()

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


# Convert lower case

In [14]:
df['text']=df['text'].str.lower()

# Remove punctuation

In [15]:
funct=string.punctuation
def remove_to_functuation(text):
    return text.translate(str.maketrans('','',funct))
df['text']=df['text'].apply(lambda text: remove_to_functuation(text))
df.head()

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,daniel greenfield a shillman journalism fellow...,FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,google pinterest digg linkedin reddit stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,us secretary of state john f kerry said monday...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,— kaydee king kaydeeking november 9 2016 the l...,FAKE
4,The Battle of New York: Why This Primary Matters,its primary day in new york and frontrunners h...,REAL


# Remove stopwords

In [16]:
stop_word=set(stopwords.words('english'))
def remove_stopword(text):
    return " ".join([word for word in str(text).split() if word not in stop_word])
df['text']=df['text'].apply(lambda text : remove_stopword(text))
df.head()

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,daniel greenfield shillman journalism fellow f...,FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,google pinterest digg linkedin reddit stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,us secretary state john f kerry said monday st...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,— kaydee king kaydeeking november 9 2016 lesso...,FAKE
4,The Battle of New York: Why This Primary Matters,primary day new york frontrunners hillary clin...,REAL


# Remove Frequent words

In [17]:
from collections import Counter
cnt=Counter()
for text in df['text'].values:
    for word in text.split():
        cnt[word]+=1
        
cnt.most_common(10)

[('said', 21136),
 ('trump', 18123),
 ('clinton', 14286),
 ('would', 12624),
 ('us', 12351),
 ('people', 11004),
 ('one', 10961),
 ('new', 9179),
 ('—', 8737),
 ('also', 8207)]

In [18]:

fre_word=set([w for (w,e) in cnt.most_common(10)])
def remove_fre_word(text):
    return " ".join([word for word in str(text).split() if word not in fre_word])

df['text']=df['text'].apply(lambda text : remove_fre_word(text))
df.head()



Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,daniel greenfield shillman journalism fellow f...,FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,google pinterest digg linkedin reddit stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,secretary state john f kerry monday stop paris...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,kaydee king kaydeeking november 9 2016 lesson ...,FAKE
4,The Battle of New York: Why This Primary Matters,primary day york frontrunners hillary donald l...,REAL


# Remove Rare words

In [19]:

r_word=10
rare_word=set([w for (w,wd) in cnt.most_common()[:-r_word-1:-1]])

def remove_rare_word(text):
    return " ".join([word for word in str(text).split() if word not in rare_word])

df['text']=df['text'].apply(lambda text : remove_rare_word(text))
df.head()


Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,daniel greenfield shillman journalism fellow f...,FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,google pinterest digg linkedin reddit stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,secretary state john f kerry monday stop paris...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,kaydee king kaydeeking november 9 2016 lesson ...,FAKE
4,The Battle of New York: Why This Primary Matters,primary day york frontrunners hillary donald l...,REAL


# Stemming

In [20]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def stemming(text):
    return " ".join([stemmer.stem(word) for word in text.split()])
df['text']=df['text'].apply(lambda text : stemming(text))
df.head()

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,daniel greenfield shillman journal fellow free...,FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,googl pinterest digg linkedin reddit stumbleup...,FAKE
2,Kerry to go to Paris in gesture of sympathy,secretari state john f kerri monday stop pari ...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,kayde king kaydeek novemb 9 2016 lesson tonigh...,FAKE
4,The Battle of New York: Why This Primary Matters,primari day york frontrunn hillari donald lead...,REAL


# Lemmatization

In [21]:
from nltk.stem import WordNetLemmatizer

lemmati=WordNetLemmatizer()

def lemmatize(text):
    return " ".join([lemmati.lemmatize(word) for word in text.split()])
df['text']=df['text'].apply(lambda text : lemmatize(text))
df.head()

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,daniel greenfield shillman journal fellow free...,FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,googl pinterest digg linkedin reddit stumbleup...,FAKE
2,Kerry to go to Paris in gesture of sympathy,secretari state john f kerri monday stop pari ...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,kayde king kaydeek novemb 9 2016 lesson tonigh...,FAKE
4,The Battle of New York: Why This Primary Matters,primari day york frontrunn hillari donald lead...,REAL


# Remove number

In [22]:
num=r"[0-9]"

import re
def remove_number(text):
    text=re.sub(num, r" ", text)
    return " ".join(text.split())
df['text']=df['text'].apply(lambda text : remove_number(text))
df.head()

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,daniel greenfield shillman journal fellow free...,FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,googl pinterest digg linkedin reddit stumbleup...,FAKE
2,Kerry to go to Paris in gesture of sympathy,secretari state john f kerri monday stop pari ...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,kayde king kaydeek novemb lesson tonight dem l...,FAKE
4,The Battle of New York: Why This Primary Matters,primari day york frontrunn hillari donald lead...,REAL


In [23]:
df['label']=df['label'].map({'FAKE':0,'REAL':1})

# Model

In [24]:
vectorizer=TfidfVectorizer()
x=vectorizer.fit_transform(df['text'].values)
y=df['label']

In [25]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [32]:
models=[]
models.append(("PassiveAggressiveClassifier",PassiveAggressiveClassifier()))
models.append(("DecisionTreeClassifier",DecisionTreeClassifier()))
models.append(("RandomForestClassifier",RandomForestClassifier()))
models.append(("KNeighborsClassifier",KNeighborsClassifier()))
models.append(("LogisticRegression",LogisticRegression()))
models.append(("AdaBoostClassifier",AdaBoostClassifier()))                                                           
models.append(("SGDClassifier",SGDClassifier()))
models.append(("SVM",SVC()))

In [30]:
for model_name, model in models:
    model.fit(x_train,y_train)
    prd=model.predict(x_test)
    acc=accuracy_score(y_test,prd)
    print(model_name,"Accuracy : ",acc)
    print()
    print("Confusion Matrix")
    print(confusion_matrix(y_test,prd))
    print()
    print(classification_report(y_test,prd))
    

PassiveAggressiveClassifier Accuracy :  0.9329123914759274

Confusion Matrix
[[589  39]
 [ 46 593]]

              precision    recall  f1-score   support

           0       0.93      0.94      0.93       628
           1       0.94      0.93      0.93       639

    accuracy                           0.93      1267
   macro avg       0.93      0.93      0.93      1267
weighted avg       0.93      0.93      0.93      1267

DecisionTreeClassifier Accuracy :  0.7971586424625099

Confusion Matrix
[[504 124]
 [133 506]]

              precision    recall  f1-score   support

           0       0.79      0.80      0.80       628
           1       0.80      0.79      0.80       639

    accuracy                           0.80      1267
   macro avg       0.80      0.80      0.80      1267
weighted avg       0.80      0.80      0.80      1267

RandomForestClassifier Accuracy :  0.9076558800315706

Confusion Matrix
[[569  59]
 [ 58 581]]

              precision    recall  f1-score   support

# Crossed Validation

In [31]:
for model_name,model in models:
    kfold=KFold(n_splits=10,random_state=42)
    accuracy=cross_val_score(model,x_train,y_train,cv=kfold,scoring='accuracy')
    print(model_name,":",accuracy.mean())



PassiveAggressiveClassifier : 0.931136032306601
DecisionTreeClassifier : 0.8072171418325265




RandomForestClassifier : 0.8896972035768023




KNeighborsClassifier : 0.5789223596915904




LogisticRegression : 0.9094280858494906




AdaBoostClassifier : 0.8683884120338969




SGDClassifier : 0.9285703705436147
SVM : 0.92639801669902
