# Fake News Classification

The idea is to develop a machine learning program to identify whether an article might be fake news or not.

Dataset link: https://www.kaggle.com/c/fake-news/data

In [1]:
# importing libraries
import numpy as np
import pandas as pd
import re
import string

In [2]:
# reading the train dataset
train=pd.read_csv("train.csv")

In [3]:
# 5 values from the top
train.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
# description of data
train.describe

<bound method NDFrame.describe of           id                                              title  \
0          0  House Dem Aide: We Didn’t Even See Comey’s Let...   
1          1  FLYNN: Hillary Clinton, Big Woman on Campus - ...   
2          2                  Why the Truth Might Get You Fired   
3          3  15 Civilians Killed In Single US Airstrike Hav...   
4          4  Iranian woman jailed for fictional unpublished...   
...      ...                                                ...   
20795  20795  Rapper T.I.: Trump a ’Poster Child For White S...   
20796  20796  N.F.L. Playoffs: Schedule, Matchups and Odds -...   
20797  20797  Macy’s Is Said to Receive Takeover Approach by...   
20798  20798  NATO, Russia To Hold Parallel Exercises In Bal...   
20799  20799                          What Keeps the F-35 Alive   

                                          author  \
0                                  Darrell Lucus   
1                                Daniel J. Flynn   
2    

In [5]:
# information about the data
train.info

<bound method DataFrame.info of           id                                              title  \
0          0  House Dem Aide: We Didn’t Even See Comey’s Let...   
1          1  FLYNN: Hillary Clinton, Big Woman on Campus - ...   
2          2                  Why the Truth Might Get You Fired   
3          3  15 Civilians Killed In Single US Airstrike Hav...   
4          4  Iranian woman jailed for fictional unpublished...   
...      ...                                                ...   
20795  20795  Rapper T.I.: Trump a ’Poster Child For White S...   
20796  20796  N.F.L. Playoffs: Schedule, Matchups and Odds -...   
20797  20797  Macy’s Is Said to Receive Takeover Approach by...   
20798  20798  NATO, Russia To Hold Parallel Exercises In Bal...   
20799  20799                          What Keeps the F-35 Alive   

                                          author  \
0                                  Darrell Lucus   
1                                Daniel J. Flynn   
2      

In [6]:
# reading the test dataset
test=pd.read_csv("test.csv")

In [7]:
# 5 elements from the top
test.head()

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


In [8]:
# description of test dataset
test.describe

<bound method NDFrame.describe of          id                                              title  \
0     20800  Specter of Trump Loosens Tongues, if Not Purse...   
1     20801  Russian warships ready to strike terrorists ne...   
2     20802  #NoDAPL: Native American Leaders Vow to Stay A...   
3     20803  Tim Tebow Will Attempt Another Comeback, This ...   
4     20804                    Keiser Report: Meme Wars (E995)   
...     ...                                                ...   
5195  25995  The Bangladeshi Traffic Jam That Never Ends - ...   
5196  25996  John Kasich Signs One Abortion Bill in Ohio bu...   
5197  25997  California Today: What, Exactly, Is in Your Su...   
5198  25998  300 US Marines To Be Deployed To Russian Borde...   
5199  25999  Awkward Sex, Onscreen and Off - The New York T...   

                       author  \
0            David Streitfeld   
1                         NaN   
2               Common Dreams   
3               Daniel Victor   
4     Tr

## Text Preprocessing

In [9]:
# filling the empty cells
train=train.fillna(' ')
test=test.fillna(' ')

In [10]:
train['final']=train['title']+" "+train['author']+" "+train['text']
test['final']=test['title']+' '+test['author']+test['text']

In [11]:
# final shape of the data
print(train.shape)
print(test.shape)

(20800, 6)
(5200, 5)


In [12]:
# removing the html tags
def clean_html(text):
    clean=re.compile('<.*?>')
    cleantext=re.sub(clean,'',text)
    return cleantext
    
# first round of cleaning
def clean_text1(text):
    text=text.lower()
    text=re.sub('\[.*?\]','',text)
    text=re.sub('[%s]'%re.escape(string.punctuation),'',text)
    text=re.sub('\w*\d\w*','',text)
    return text

# second round of cleaning
def clean_text2(text):
    text=re.sub('[''"",,,]','',text)
    text=re.sub('\n','',text)
    return text
    
cleaned_html=lambda x:clean_html(x)
cleaned1=lambda x:clean_text1(x)
cleaned2=lambda x:clean_text2(x)

train['final']=pd.DataFrame(train.final.apply(cleaned_html))
train['final']=pd.DataFrame(train.final.apply(cleaned1))
train['final']=pd.DataFrame(train.final.apply(cleaned2))

In [13]:
# applying the same on test data
test['final']=pd.DataFrame(test.final.apply(cleaned_html))
test['final']=pd.DataFrame(test.final.apply(cleaned1))
test['final']=pd.DataFrame(test.final.apply(cleaned2))

In [14]:
# 5 values from the top
train.head()

Unnamed: 0,id,title,author,text,label,final
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,house dem aide we didn’t even see comey’s lett...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,flynn hillary clinton big woman on campus bre...
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,why the truth might get you fired consortiumne...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,civilians killed in single us airstrike have ...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,iranian woman jailed for fictional unpublished...


In [15]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
#tfidf
transformer = TfidfTransformer(smooth_idf=False)
count_vectorizer = CountVectorizer(ngram_range=(1, 2))
counts = count_vectorizer.fit_transform(train['final'].values)
tfidf = transformer.fit_transform(counts)

In [16]:
y=train['label']
from sklearn.model_selection import train_test_split as tts
xtrain,xtest,ytrain,ytest=tts(tfidf,y,test_size=0.2,random_state=42)

## Making the Model

In [21]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier, StackingClassifier
from sklearn.tree import DecisionTreeClassifier

In [19]:
from sklearn.model_selection import GridSearchCV
p1={'n_neighbors':[1,2,3,4,5,6,7,8,9,10,11,12,13]}
knn=KNeighborsClassifier()
m1=GridSearchCV(knn,p1,n_jobs=-1)
m1.fit(xtrain,ytrain)
print("The model score on Test data is: ",m1.score(xtest,ytest))

The model score on Test data is:  0.8052884615384616


In [23]:
dt=DecisionTreeClassifier()
p2={'max_depth':[3,4,5]}
m2=GridSearchCV(dt,p2,n_jobs=-1)
m2.fit(xtrain,ytrain)
print("The Decision Tree models score on test data is: ",m2.score(xtest,ytest))

The Decision Tree models score on test data is:  0.9598557692307692


In [None]:
rn=RandomForestClassifier()
p3={'n_estimators':[200,300,400,500,600,700,800,900,1000]}
m3=GridSearchCV(rn,p3,n_jobs=-1)
m3.fit(xtrain,ytrain)
print("The Random Forest model's score on the test data is: ",m3.score(xtest,ytest))

In [None]:
ada=AdaBoostClassifier()
p4={}