In [1]:
#mport libraries
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [36]:
# loading the dataset to a pandas DataFrame
news_dataset = pd.read_csv('fake-news/train.csv')

In [37]:
# print the first 5 rows of the dataframe
news_dataset.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [38]:
news_dataset['title'][0]

'House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It'

In [4]:
# counting the number of missing values in the dataset
news_dataset.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [5]:
# replacing the null values with empty string
news_dataset = news_dataset.fillna('')

In [6]:
# merging the author name and news title
news_dataset['content'] = news_dataset['author']+' '+news_dataset['title']

In [8]:
#streaming Function
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
def stre(dataset,col):
    corpus = []
    for i in range(0, len(dataset)):
        review = re.sub('[^a-zA-Z0-9]',' ', str(dataset[col][i]))
        review = review.lower()
        review = review.split()

        review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
        review = ' '.join(review)
        corpus.append(review)
    return corpus

In [9]:
#streaming apply on data
y=stre(news_dataset,"content")
news_dataset['content']=y

In [10]:
news_dataset

Unnamed: 0,id,title,author,text,label,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,darrel lucu hous dem aid even see comey letter...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,daniel j flynn flynn hillari clinton big woman...
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,consortiumnew com truth might get fire
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,jessica purkiss 15 civilian kill singl us airs...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,howard portnoy iranian woman jail fiction unpu...
...,...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0,jerom hudson rapper trump poster child white s...
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0,benjamin hoffman n f l playoff schedul matchup...
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0,michael j de la merc rachel abram maci said re...
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1,alex ansari nato russia hold parallel exercis ...


In [11]:
print(news_dataset['content'])

0        darrel lucu hous dem aid even see comey letter...
1        daniel j flynn flynn hillari clinton big woman...
2                   consortiumnew com truth might get fire
3        jessica purkiss 15 civilian kill singl us airs...
4        howard portnoy iranian woman jail fiction unpu...
                               ...                        
20795    jerom hudson rapper trump poster child white s...
20796    benjamin hoffman n f l playoff schedul matchup...
20797    michael j de la merc rachel abram maci said re...
20798    alex ansari nato russia hold parallel exercis ...
20799                         david swanson keep f 35 aliv
Name: content, Length: 20800, dtype: object


In [12]:
# separating the data & label
X = news_dataset['content']
Y = news_dataset['label']

In [13]:
#data cleaning function
import string
def remove_sp(news):
    news = news.lower()
    news=re.sub('\[.*?\]',' ',news)
    news = re.sub('@[^\s]+','',news)
    news = re.sub('http[^\s]+','',news)
    news=re.sub('\n',' ',news)
    news=re.sub('\w*\d\w*',' ',news)
    news=re.sub('[%s]'%re.escape(string.punctuation)," ",news)
    return news

In [14]:
#clean data
news_dataset = news_dataset.fillna('')
x=X.apply(remove_sp)

In [15]:
#Convert to dataframe
x=pd.DataFrame(x)
x

Unnamed: 0,content
0,darrel lucu hous dem aid even see comey letter...
1,daniel j flynn flynn hillari clinton big woman...
2,consortiumnew com truth might get fire
3,jessica purkiss civilian kill singl us airst...
4,howard portnoy iranian woman jail fiction unpu...
...,...
20795,jerom hudson rapper trump poster child white s...
20796,benjamin hoffman n f l playoff schedul matchup...
20797,michael j de la merc rachel abram maci said re...
20798,alex ansari nato russia hold parallel exercis ...


In [16]:
#Apply and save weights of TfidfVectorizer
import joblib
tf=TfidfVectorizer()
tf_x=tf.fit_transform(x['content']).toarray()
joblib.dump(tf, 'tfidf_vectorizer.pkl')
tf_x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [18]:

#convert to pandas data frame
tf_x=pd.DataFrame(tf_x)
tf_x


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17061,17062,17063,17064,17065,17066,17067,17068,17069,17070
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20795,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20796,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20797,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20798,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
#check data balanceing
y=pd.DataFrame(Y)
y.value_counts()

label
1        10413
0        10387
Name: count, dtype: int64

In [31]:
#apply train test split
x_train,x_test,y_train,y_test=train_test_split(tf_x,y,test_size=0.2)

In [28]:
#call model
ld=LogisticRegression(random_state=3)

In [29]:
#fit model on data
model=ld.fit(x_train,y_train)

  y = column_or_1d(y, warn=True)


In [30]:
joblib.dump(model,'nlppr.joblib')


['nlppr.joblib']

In [33]:
#Accuracy on validation data
pre=model.predict(x_test)
accuracy_score(y_test, pre)

0.984375

In [34]:
#cros validation score
from sklearn.model_selection import cross_val_score

#Your dataset is randomly divided into 5 roughly equal parts or "folds."

scores = cross_val_score(model,x_train,y_train, cv=5)
scores.mean()

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.9739182692307692

# Test Data

In [141]:
# loading the dataset to a pandas DataFrame
test1 = pd.read_csv('fake-news/test.csv')
test = test1.fillna('')

In [None]:
#Combine Auther & title feature
test['content'] = test['author']+' '+test['title']
y=stre(test,"content")
test['content']=y

In [None]:
# separating the data & label
X = test['content']

In [130]:
#clean test data
x=X.apply(remove_sp)

In [None]:
#Apply vectorizer
trs=joblib.load('tfidf_vectorizer.pkl')
tf_x=trs.transform(x).toarray()

In [132]:
#Convert to pandas dataframe
tf_x=pd.DataFrame(tf_x)
tf_x

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17061,17062,17063,17064,17065,17066,17067,17068,17069,17070
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5198,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [158]:
#prediction
pre=model.predict(tf_x)


In [159]:
#convert to pandas dataframe
pre=pd.DataFrame(pre,columns=(['target']))
pre

Unnamed: 0,target
0,0
1,1
2,1
3,0
4,1
...,...
5195,0
5196,0
5197,0
5198,1


In [160]:
#Concate both columns
c=pd.concat ([test1,pre],axis=1)

In [161]:
c

Unnamed: 0,id,title,author,text,target
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning...",0
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...,1
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...,1
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different...",0
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...,1
...,...,...,...,...,...
5195,25995,The Bangladeshi Traffic Jam That Never Ends - ...,Jody Rosen,Of all the dysfunctions that plague the world’...,0
5196,25996,John Kasich Signs One Abortion Bill in Ohio bu...,Sheryl Gay Stolberg,WASHINGTON — Gov. John Kasich of Ohio on Tu...,0
5197,25997,"California Today: What, Exactly, Is in Your Su...",Mike McPhate,Good morning. (Want to get California Today by...,0
5198,25998,300 US Marines To Be Deployed To Russian Borde...,,« Previous - Next » 300 US Marines To Be Deplo...,1
