Importing the Dataset :


In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Printing the stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

Loading the dataset :

In [4]:
news_dataset = pd.read_csv('/content/train.csv',error_bad_lines=False,engine='python')

Skipping line 13535: field larger than field limit (131072)
Skipping line 19783: field larger than field limit (131072)
Skipping line 13541: Expected 5 fields in line 13541, saw 6
Skipping line 19782: Expected 5 fields in line 19782, saw 7
Skipping line 19783: Expected 5 fields in line 19783, saw 10
Skipping line 19785: Expected 5 fields in line 19785, saw 14
Skipping line 19787: Expected 5 fields in line 19787, saw 8
Skipping line 19789: Expected 5 fields in line 19789, saw 7
Skipping line 19790: Expected 5 fields in line 19790, saw 15
Skipping line 19791: Expected 5 fields in line 19791, saw 9
Skipping line 19792: Expected 5 fields in line 19792, saw 9
Skipping line 19793: Expected 5 fields in line 19793, saw 12
Skipping line 19794: Expected 5 fields in line 19794, saw 15
Skipping line 19795: Expected 5 fields in line 19795, saw 8
Skipping line 19796: Expected 5 fields in line 19796, saw 20
Skipping line 19797: Expected 5 fields in line 19797, saw 8
Skipping line 19798: Expected 5 fi

In [5]:
news_dataset.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [6]:
news_dataset.shape

(20822, 5)

In [7]:
# Counting the number of missing values in the dataset
news_dataset.isna().sum()

id           0
title      565
author    1975
text        59
label       22
dtype: int64

In [8]:
# Replacing this missing values with empty strings
news_dataset = news_dataset.fillna("")

In [9]:
news_dataset.isna() # Now all the missing values have been replaced

Unnamed: 0,id,title,author,text,label
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
...,...,...,...,...,...
20817,False,False,False,False,False
20818,False,False,False,False,False
20819,False,False,False,False,False
20820,False,False,False,False,False


Now we need to do data preprocessing so that the macihine can understand the text. That is we will have to make this text data into meaningful numbers for the machine to get trained.

In [10]:
# Merging the author and title as content
news_dataset['content'] = news_dataset['author'] + " " + news_dataset['title'] # we will use content to train the model

In [11]:
print(news_dataset['content'])

0        Darrell Lucus House Dem Aide: We Didn’t Even S...
1        Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo...
2        Consortiumnews.com Why the Truth Might Get You...
3        Jessica Purkiss 15 Civilians Killed In Single ...
4        Howard Portnoy Iranian woman jailed for fictio...
                               ...                        
20817    Jerome Hudson Rapper T.I.: Trump a ’Poster Chi...
20818    Benjamin Hoffman N.F.L. Playoffs: Schedule, Ma...
20819    Michael J. de la Merced and Rachel Abrams Macy...
20820    Alex Ansary NATO, Russia To Hold Parallel Exer...
20821              David Swanson What Keeps the F-35 Alive
Name: content, Length: 20822, dtype: object


In [12]:
# Stemming the data to get the root words
port_stem = PorterStemmer()

In [13]:
def stemming(content): # This functions does the textual data preprocessing
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [14]:
news_dataset['content'] = news_dataset['content'].apply(stemming) # Applying stemming on content column

In [15]:
print(news_dataset['content']) # This is the stemmed data

0        darrel lucu hous dem aid even see comey letter...
1        daniel j flynn flynn hillari clinton big woman...
2                   consortiumnew com truth might get fire
3        jessica purkiss civilian kill singl us airstri...
4        howard portnoy iranian woman jail fiction unpu...
                               ...                        
20817    jerom hudson rapper trump poster child white s...
20818    benjamin hoffman n f l playoff schedul matchup...
20819    michael j de la merc rachel abram maci said re...
20820    alex ansari nato russia hold parallel exercis ...
20821                            david swanson keep f aliv
Name: content, Length: 20822, dtype: object


In [16]:
# Separating the data and label
X = news_dataset['content'].values
Y = news_dataset['label'].values

In [17]:
print(X) # data --> features

['darrel lucu hous dem aid even see comey letter jason chaffetz tweet'
 'daniel j flynn flynn hillari clinton big woman campu breitbart'
 'consortiumnew com truth might get fire' ...
 'michael j de la merc rachel abram maci said receiv takeov approach hudson bay new york time'
 'alex ansari nato russia hold parallel exercis balkan'
 'david swanson keep f aliv']


In [18]:
print(Y) # labels (1-->fake news & 0-->real news)

['1' '0' '1' ... '0' '1' '1']


In [19]:
# Converting the textual data into numerical data for the machine to train itself
vectorizer = TfidfVectorizer()
vectorizer.fit(X)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [20]:
X = vectorizer.transform(X)

In [21]:
print(X) # All the textual data is converted into numerical data

  (0, 15695)	0.2848972510323515
  (0, 13478)	0.2560149172431211
  (0, 8912)	0.36364344795259246
  (0, 8633)	0.29217179802324905
  (0, 7695)	0.24789859828097843
  (0, 7008)	0.21878792690522433
  (0, 4975)	0.2333632865672138
  (0, 3794)	0.2705797813233836
  (0, 3602)	0.35994096489813754
  (0, 2960)	0.24689141014750257
  (0, 2484)	0.3676990592743697
  (0, 267)	0.27014778053672167
  (1, 16807)	0.30071934786077886
  (1, 6819)	0.1904866118084521
  (1, 5506)	0.7143145691090199
  (1, 3570)	0.2637458518155929
  (1, 2814)	0.19096625123288233
  (1, 2224)	0.3827200188088641
  (1, 1894)	0.15524631265972175
  (1, 1497)	0.2939921887587299
  (2, 15620)	0.4154491828226571
  (2, 9623)	0.4935010343775579
  (2, 5971)	0.3474726097377811
  (2, 5392)	0.3866575731579265
  (2, 3104)	0.4609666076640373
  :	:
  (20819, 13126)	0.24825540550304984
  (20819, 12348)	0.2726336911024517
  (20819, 12142)	0.2477854180074755
  (20819, 10310)	0.08040873089640542
  (20819, 9591)	0.17456730094721287
  (20819, 9521)	0.295416

Train-test splitting

In [24]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state=2)

Training the model ----> Logistic regression

In [25]:
model = LogisticRegression()

In [26]:
model.fit(X_train,Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

Evaluating the model 

In [27]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction,Y_train)

In [28]:
print("The accuracy on training data = ",training_data_accuracy)

The accuracy on training data =  0.9924356126553401


In [29]:
# accuracy on testing data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction,Y_test)

In [30]:
print("The accuracy on test data = ",test_data_accuracy)

The accuracy on test data =  0.9815126050420168


MAKING A PREDICTION SYSTEM FOR FAKE NEWS DETECTION :

In [31]:
X_new = X_test[10] ## stemmed data for analysis
prediction = model.predict(X_new)
if prediction[0]==0:
  print("The news is true")
else:
  print("The news is fake")

The news is fake


In [32]:
print(Y_test[10])

1
