In [1]:
# import the dependencies
# re - Python's regular expression module, used for searching and replacing text patterns in a document or paragraph
# nltk - natural language toolkit
# Stemming reduces a word to its base (stem) form, mainly by stripping suffixes
# TfidfVectorizer converts text into numerical feature vectors
# stopwords are words that add little value to the text, e.g. articles ("a", "the") and words like "where", "at", etc.


import re
from functools import lru_cache

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# Download NLTK's 'stopwords' corpus, which stopwords.words('english') reads

import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\VICTUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Display NLTK's English stopwords: common words that add little meaning to a sentence

stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [4]:
# Data preprocessing: load the training CSV into a DataFrame and check its (rows, columns) shape.
# NOTE(review): the path points at a train.csv file inside a folder that is also named train.csv - confirm this layout.

news_data = pd.read_csv('./train.csv/train.csv')
news_data.shape

(20800, 5)

In [5]:
# Preview the first five rows of the dataset
news_data.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [6]:
# Counting how many rows carry each label value (checks that the two classes are balanced)
news_data['label'].value_counts()

label
1    10413
0    10387
Name: count, dtype: int64

In [None]:
# counting how many times each distinct title occurs
# news_data['title'].value_counts()

In [7]:
# Counting the number of missing values in each column of the dataset
news_data.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [8]:
# Replacing the null values with empty strings, so the author and title columns can be joined as plain text

news_data = news_data.fillna('')


In [9]:
# Confirm that all the missing data has been replaced with empty strings (every count should be 0)
news_data.isnull().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [10]:
# Merging the author and title: these are the fields used in the analysis. The article text is not used because it is too large.
# We create a new column called content by joining author and title, separated by a single space (' ').

news_data['content'] = news_data['author'] +' '+ news_data['title']
news_data['content']

0        Darrell Lucus House Dem Aide: We Didn’t Even S...
1        Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo...
2        Consortiumnews.com Why the Truth Might Get You...
3        Jessica Purkiss 15 Civilians Killed In Single ...
4        Howard Portnoy Iranian woman jailed for fictio...
                               ...                        
20795    Jerome Hudson Rapper T.I.: Trump a ’Poster Chi...
20796    Benjamin Hoffman N.F.L. Playoffs: Schedule, Ma...
20797    Michael J. de la Merced and Rachel Abrams Macy...
20798    Alex Ansary NATO, Russia To Hold Parallel Exer...
20799              David Swanson What Keeps the F-35 Alive
Name: content, Length: 20800, dtype: object

In [11]:
# Check the dimensions again after adding the content column
news_data.shape

(20800, 6)

In [12]:
# Preview the first rows, now including the content column
news_data.head()

Unnamed: 0,id,title,author,text,label,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,Darrell Lucus House Dem Aide: We Didn’t Even S...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Consortiumnews.com Why the Truth Might Get You...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,Jessica Purkiss 15 Civilians Killed In Single ...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Howard Portnoy Iranian woman jailed for fictio...


In [13]:
# Creating the inputs and outputs (labels):
# X keeps every column except label, y is the label column on its own.
# `columns=` already identifies the axis, so no separate `axis` argument is needed.

X = news_data.drop(columns='label')
y = news_data['label']
X, y

(          id                                              title  \
 0          0  House Dem Aide: We Didn’t Even See Comey’s Let...   
 1          1  FLYNN: Hillary Clinton, Big Woman on Campus - ...   
 2          2                  Why the Truth Might Get You Fired   
 3          3  15 Civilians Killed In Single US Airstrike Hav...   
 4          4  Iranian woman jailed for fictional unpublished...   
 ...      ...                                                ...   
 20795  20795  Rapper T.I.: Trump a ’Poster Child For White S...   
 20796  20796  N.F.L. Playoffs: Schedule, Matchups and Odds -...   
 20797  20797  Macy’s Is Said to Receive Takeover Approach by...   
 20798  20798  NATO, Russia To Hold Parallel Exercises In Bal...   
 20799  20799                          What Keeps the F-35 Alive   
 
                                           author  \
 0                                  Darrell Lucus   
 1                                Daniel J. Flynn   
 2                     

In [14]:
# Dimensions of the feature frame (every column except label)
X.shape

(20800, 5)

In [15]:
# Stemming procedure - reducing words to their base (stem) form by stripping suffixes
# example: acting, acted, acts --> act
# Creating the Porter stemmer instance that the stemming function uses

port_stem = PorterStemmer()




In [20]:
# Writing a function for the stemming
# content is the text value passed into the function
# The first line uses re.sub
# re is the regular expression module; re.sub('[^a-zA-Z]', ' ', content) replaces every character that is not a letter (a-z or A-Z) with a space.
# It removes all other characters, such as punctuation and numbers, so only letters remain.
# The operation is applied to the content that was passed in.
# The .lower() method converts all the letters to lowercase
# The .split() method breaks the text into a list of words (splitting on whitespace)

@lru_cache(maxsize=1)
def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, built on first use and then cached."""
    return frozenset(stopwords.words('english'))


def stemming(content):
    """Normalise a piece of text into a space-separated string of word stems.

    The text is processed in this order:
      1. every character that is not an ASCII letter (a-z, A-Z) is replaced
         by a space, so digits, punctuation and accented letters are dropped;
      2. the result is lowercased and split on whitespace into words;
      3. English stopwords are discarded;
      4. each remaining word is stemmed with ``port_stem``;
      5. the stems are joined back together with single spaces.

    Parameters
    ----------
    content : str
        Raw text to normalise.

    Returns
    -------
    str
        The stems separated by single spaces; an empty string when no word
        survives the filtering.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', content)
    words = letters_only.lower().split()
    # Fetch the stopwords once per call (cached across calls) instead of
    # rebuilding the list and scanning it linearly for every single word.
    stop_words = _english_stopwords()
    stems = [port_stem.stem(word) for word in words if word not in stop_words]
    return ' '.join(stems)


In [21]:
# Applying the stemming function to every row of the content column.
# The column is overwritten with the stemmed text, so the original author + title strings are no longer kept in it.

news_data['content'] = news_data['content'].apply(stemming)


In [26]:
# Inspect the stemmed content column
news_data['content']

0        darrel lucu hou dem aid even see comey letter ...
1        daniel j flynn flynn hillari clinton big woman...
2                   consortiumnew com truth might get fire
3        jessica purkiss civilian kill singl us airstri...
4        howard portnoy iranian woman jail fiction unpu...
                               ...                        
20795    jerom hudson rapper trump poster child white s...
20796    benjamin hoffman n f l playoff schedul matchup...
20797    michael j de la merc rachel abram maci said re...
20798    alex ansari nato russia hold parallel exerci b...
20799                            david swanson keep f aliv
Name: content, Length: 20800, dtype: object

In [44]:
# As we only want to work with the content column in making predictions, we reassign X to the content column.
# .values turns both the column and the labels into NumPy arrays.

X = news_data['content'].values
y = news_data['label'].values
X,y

(array(['darrel lucu hou dem aid even see comey letter jason chaffetz tweet',
        'daniel j flynn flynn hillari clinton big woman campu breitbart',
        'consortiumnew com truth might get fire', ...,
        'michael j de la merc rachel abram maci said receiv takeov approach hudson bay new york time',
        'alex ansari nato russia hold parallel exerci balkan',
        'david swanson keep f aliv'], dtype=object),
 array([1, 0, 1, ..., 0, 1, 1], dtype=int64))

In [45]:
# Converting the stemmed text into numbers so that the model can work with it:
# TF-IDF weights each word by how often it occurs in a document and how rare it is across all documents,
# turning every document into a numerical feature vector.
# fit_transform learns the vocabulary and weights, then transforms the text, in a single step.
# NOTE(review): the vectorizer is fitted on the whole dataset before the train/test split, so the
# test rows influence the learned features - consider fitting on the training texts only.

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [48]:
# Inspect the TF-IDF result; it is a sparse matrix (X.toarray() would give a dense copy if one were needed)
X


<20800x16984 sparse matrix of type '<class 'numpy.float64'>'
	with 210584 stored elements in Compressed Sparse Row format>

In [50]:
# Splitting the data into training and testing data:
# test_size=0.2 holds out 20% of the rows for testing, stratify=y keeps the label proportions the same
# in both parts, and random_state=1 makes the split reproducible.

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, stratify=y, random_state=1)
X.shape, X_test.shape, X_train.shape

((20800, 16984), (4160, 16984), (16640, 16984))

In [51]:
# Creating our model, a logistic regression classifier with its default settings

model = LogisticRegression()


In [52]:
# Training our model on the training features (X_train) and their labels (y_train)

model.fit(X_train, y_train)


In [53]:
# Finding the accuracy score on the training data:
# the model's predictions for X_train are compared against the true labels y_train.
# accuracy_score expects (y_true, y_pred), so the true labels are passed first.

X_train_predict = model.predict(X_train)
X_accuracy = accuracy_score(y_train, X_train_predict)
X_accuracy


0.9868990384615385

In [54]:
# The accuracy score on the test data matters most: it is measured on rows the model was not trained on.
# accuracy_score expects (y_true, y_pred), so the true labels are passed first.

X_test_prediction = model.predict(X_test)
X_test_accuracy = accuracy_score(y_test, X_test_prediction)
X_test_accuracy

0.9764423076923077

In [68]:
# Building the predictive system: classify a single sample taken from the test set

X_predict = X_test[3]

prediction = model.predict(X_predict)

# The messages treat label 0 as real news and any other label as fake
if prediction[0] == 0:
    print('news is real')
else:
    print('news is fake')


news is fake


In [69]:
# True label of test sample 3, for comparison with the model's prediction
y_test[3]

1