In [1]:
# importing necessary libraries
import nltk
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import accuracy_score,classification_report
import re

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /Users/apple/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/apple/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
eng_stopwords = stopwords.words('english')

In [4]:
eng_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [5]:
# dataset
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [6]:
train.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [7]:
# taking title and author combination for further process as length of text is long.
train = train.drop(['id','text'],axis=1)

In [8]:
# combining text and author
train['content'] = train['title'] + ' ' + train['author']

In [9]:
train.head()

Unnamed: 0,title,author,label,content
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,1,House Dem Aide: We Didn’t Even See Comey’s Let...
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,0,"FLYNN: Hillary Clinton, Big Woman on Campus - ..."
2,Why the Truth Might Get You Fired,Consortiumnews.com,1,Why the Truth Might Get You Fired Consortiumne...
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,1,15 Civilians Killed In Single US Airstrike Hav...
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,1,Iranian woman jailed for fictional unpublished...


In [10]:
# shape of data
train.shape

(20800, 4)

In [11]:
train.isnull().sum()

title       558
author     1957
label         0
content    2515
dtype: int64

In [12]:
# removing null data
train = train.dropna()

In [13]:
# removing author and title 
train = train.drop(['title','author'],axis=1)

In [14]:
# removing stop words and lemmatize content
def clean_data(corpus):
    """This function removes stopwords and perform lemmatization on corpus"""
    lem = WordNetLemmatizer()
    data = [lem.lemmatize(i) for i in re.sub('[^a-zA-z]',' ',corpus).lower().split() if i not in eng_stopwords]
    data = ' '.join(data)
    return data

In [15]:
clean_data(train.content[3])

'civilian killed single u airstrike identified jessica purkiss'

In [16]:
train.content[3]

'15 Civilians Killed In Single US Airstrike Have Been Identified Jessica Purkiss'

In [17]:
# applying clean data to content columnn of dataset
train['content'] = train['content'].apply(clean_data)

In [18]:
train.head()

Unnamed: 0,label,content
0,1,house dem aide even see comey letter jason cha...
1,0,flynn hillary clinton big woman campus breitba...
2,1,truth might get fired consortiumnews com
3,1,civilian killed single u airstrike identified ...
4,1,iranian woman jailed fictional unpublished sto...


In [19]:
# converting dataset in to x and y
X = train.content.values
y = train.label.values

In [20]:
y

array([1, 0, 1, ..., 0, 1, 1])

In [21]:
# vectorizing X
vec = TfidfVectorizer()
vec.fit(X)

X = vec.transform(X)

In [22]:
print(X)

  (0, 18060)	0.38808713060107397
  (0, 15521)	0.2427798867980732
  (0, 10318)	0.33618589141274713
  (0, 9990)	0.27337483704855015
  (0, 9042)	0.22874292487537523
  (0, 8187)	0.20453018622955177
  (0, 5867)	0.25643142541787856
  (0, 4390)	0.27817412922908
  (0, 4137)	0.33618589141274713
  (0, 3319)	0.22974162845391216
  (0, 2785)	0.3489068640346291
  (0, 346)	0.27903321744530385
  (1, 19285)	0.26704535576553445
  (1, 7984)	0.19642085261079475
  (1, 6551)	0.725092869732774
  (1, 4102)	0.2681462253773103
  (1, 3147)	0.19726215439605485
  (1, 2499)	0.37267404784322355
  (1, 2150)	0.15378866660711596
  (1, 1709)	0.3016526168172225
  (2, 17984)	0.4001053549271917
  (2, 11056)	0.4729119847705221
  (2, 7070)	0.3425106239251714
  (2, 6420)	0.46586160270447174
  (2, 3540)	0.4377950327255126
  :	:
  (18282, 15151)	0.24955740848212765
  (18282, 14170)	0.3137810409498304
  (18282, 13925)	0.2432639155112406
  (18282, 11781)	0.07569019301654026
  (18282, 11022)	0.1691358729487994
  (18282, 10955)	0.2

In [23]:
# splitting data into train test
# stratify make sure training data has almost equal number of each class labels
X_train,X_test,Y_train,Y_test = train_test_split(X,y,test_size=0.1,stratify=y,random_state=123)

In [24]:
# training RandomForest Classifier model
rf = RandomForestClassifier()
rf.fit(X_train,Y_train)

In [25]:
y_pred = rf.predict(X_test)

In [26]:
# test accuracy
accuracy_score(Y_test,y_pred)

0.9912520503007107

In [27]:
# training accuracy
y_train_pred = rf.predict(X_train)
accuracy_score(Y_train,y_train_pred)

1.0

In [28]:
## Predict on testing data
test.head()

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


# we need to follow same data processing here

In [30]:
# removing id, text and combining title and author, also removing title and author

test['content'] = test['title'] + ' ' + test['author']
test = test.drop(['id','text','title','author'],axis=1)

In [31]:
test.head()

Unnamed: 0,content
0,"Specter of Trump Loosens Tongues, if Not Purse..."
1,
2,#NoDAPL: Native American Leaders Vow to Stay A...
3,"Tim Tebow Will Attempt Another Comeback, This ..."
4,Keiser Report: Meme Wars (E995) Truth Broadcas...


In [32]:
test.shape

(5200, 1)

In [33]:
test.isnull().sum()

content    625
dtype: int64

In [34]:
# removing null values
test = test.dropna()

In [37]:
test.head()

Unnamed: 0,content
0,"Specter of Trump Loosens Tongues, if Not Purse..."
2,#NoDAPL: Native American Leaders Vow to Stay A...
3,"Tim Tebow Will Attempt Another Comeback, This ..."
4,Keiser Report: Meme Wars (E995) Truth Broadcas...
6,Pelosi Calls for FBI Investigation to Find Out...


In [38]:
# aaplying cleaning function to X_test
test['content'] = test['content'].apply(clean_data)

In [39]:
X_test = test.content.values

In [40]:
X_test

array(['specter trump loosens tongue purse string silicon valley new york time david streitfeld',
       'nodapl native american leader vow stay winter file lawsuit police common dream',
       'tim tebow attempt another comeback time baseball new york time daniel victor',
       ...,
       'john kasich sign one abortion bill ohio veto restrictive measure new york time sheryl gay stolberg',
       'california today exactly sushi new york time mike mcphate',
       'awkward sex onscreen new york time teddy wayne'], dtype=object)

In [41]:
# vectorizing X_test
# using same vectorizer used in training
X_test = vec.transform(X_test)

In [42]:
X_test_pred = rf.predict(X_test)

In [43]:
X_test_pred # 1 stand for fake news and 0 for True

array([0, 1, 0, ..., 0, 0, 0])