In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.model_selection import train_test_split
import string,nltk
from nltk.corpus import stopwords

from nltk.stem import WordNetLemmatizer

In [10]:
train=pd.read_csv("../datasets/fake_news/train.csv")
test=pd.read_csv("../datasets/fake_news/test.csv")

In [11]:
print(train.shape, test.shape)

(20800, 5) (5200, 4)


## Data Analysis

In [12]:
train.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [13]:
test.head()

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


In [15]:
# It shows the distribution of features and labels.
train.describe()

Unnamed: 0,id,label
count,20800.0,20800.0
mean,10399.5,0.500625
std,6004.587135,0.500012
min,0.0,0.0
25%,5199.75,0.0
50%,10399.5,1.0
75%,15599.25,1.0
max,20799.0,1.0


In [19]:
# This function shows the information about the data:
# Whether some Columns have missing data 
# Data type of different columns
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


In [20]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5200 entries, 0 to 5199
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      5200 non-null   int64 
 1   title   5078 non-null   object
 2   author  4697 non-null   object
 3   text    5193 non-null   object
dtypes: int64(1), object(3)
memory usage: 162.6+ KB


## Imputation

In [21]:
# How much data is missing ?
train.isnull().sum()
# Show count of 'NaN' in each column

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [23]:
test.isnull().sum()

id          0
title     122
author    503
text        7
dtype: int64

#### Dealing with Missing Data 
1. Replace the missing value with space ' ' in textual data
2. Remove the rows or columns containing missing data
    - Only valid when few rows have missing data
        - Here in data if text is missing then logically there is no sense to process that data point
        - As rows with missing text is only 39 we can remove those rows having missing data.
        - **Show How results vary when we remove these rows and when we keep these rows and replace missing values with space**
    - Dropping columns is stupid

In [24]:
train=train.fillna(' ')
test=test.fillna(' ')

train['check']=train['author']+train['title']+train['text']
test['check']=test['author']+test['title']+test['text']


In [26]:
train.head()

Unnamed: 0,id,title,author,text,label,check
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,Darrell LucusHouse Dem Aide: We Didn’t Even Se...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"Daniel J. FlynnFLYNN: Hillary Clinton, Big Wom..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Consortiumnews.comWhy the Truth Might Get You ...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,Jessica Purkiss15 Civilians Killed In Single U...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Howard PortnoyIranian woman jailed for fiction...


In [39]:
# Author , Title and Text are combined without any space in between, this can create ambiguity for machine
# Try adding space in between when combining these fields, see how result changes.

print('Title:', train.iloc[0,1])
print('Author:', train.iloc[0,2])
print('Text:', train.iloc[0,3][:200],"\n")
print('Combined:', train.iloc[0,5][:200])

Title: House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It
Author: Darrell Lucus
Text: House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It By Darrell Lucus on October 30, 2016 Subscribe Jason Chaffetz on the stump in American Fork, Utah ( image courtesy Mic 

Combined: Darrell LucusHouse Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted ItHouse Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It By Darrell Lucus on Octo


In [40]:
train.drop('author',axis=1,inplace=True)
train.drop('title',axis=1,inplace=True)
train.drop('text',axis=1,inplace=True)
#train


In [41]:
stop=stopwords.words('english')
stop=stop+list(string.punctuation)
lemm=WordNetLemmatizer()
#nltk.download('punkt')
from nltk.tokenize import sent_tokenize,word_tokenize

In [42]:
for index, row in train.iterrows():
    clean=''
    sentence=row['check']
    words=word_tokenize(sentence)
    words=[w for w in words if not w in stop]
    
    for w in words:
        w=str(lemm.lemmatize(w))
        clean=clean+' ' +w.lower()
    train.loc[index,'check']=clean

KeyboardInterrupt: 

In [7]:


from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,2),max_features=1000)
x = vectorizer.fit_transform(train['check'])
#print(vectorizer.get_feature_names())



In [8]:
y=train['label']
test_data=vectorizer.transform(test['check'])
x_train,x_test,y_train,y_test=train_test_split(x,y)
#x_train.shape

In [9]:
from sklearn.linear_model import LogisticRegression
log=LogisticRegression()
log.fit(x_train,y_train)
print('Logistic regression on training data')
print(log.score(x_train,y_train))
print('Logistic regression on testing data')
print(log.score(x_test,y_test))

Logistic regression on training data
0.9558333333333333
Logistic regression on testing data
0.9496153846153846


In [10]:
from sklearn.svm import SVC
svc=SVC()
svc.fit(x_train,y_train)
print('SVM on training data')
print(svc.score(x_train,y_train))
print('SVM on testing data')
print(svc.score(x_test,y_test))


SVM on training data
0.9939743589743589
SVM on testing data
0.9625


In [None]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()
rf.fit(x_train,y_train)
print('Random Forest on training data')
print(rf.score(x_train,y_train))
print('Random Forest on testing data')
print(rf.score(x_test,y_test))
