#FAKE NEWS DETECTION

In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
#printing stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

Data preprocessing

In [5]:
news_dataset = pd.read_csv('/content/train.csv')

In [6]:
news_dataset.shape

(10240, 2)

In [7]:
news_dataset.head()


Unnamed: 0,Statement,Label
0,Says the Annies List political group supports ...,False
1,When did the decline of coal start? It started...,True
2,"Hillary Clinton agrees with John McCain ""by vo...",True
3,Health care reform legislation is likely to ma...,False
4,The economic turnaround started at the end of ...,True


In [8]:
#counting the missing values in dataset
news_dataset.isnull().sum()

Statement    0
Label        0
dtype: int64

In [9]:
#replacing the null values with empty string
news_dataset = news_dataset.fillna('')

In [12]:
print(news_dataset.columns)

Index(['Statement', 'Label'], dtype='object')


In [14]:
# Check column types
print(news_dataset.dtypes)

# Convert columns to string if needed
news_dataset['Statement'] = news_dataset['Statement'].astype(str)
news_dataset['Label'] = news_dataset['Label'].astype(str)

# Concatenate 'Statement' and 'Label' into 'content'
news_dataset['content'] = news_dataset['Statement'] + ' ' + news_dataset['Label']


Statement    object
Label          bool
dtype: object


In [15]:
# merging the author name and news title
news_dataset['content'] = news_dataset['Statement']+' '+news_dataset['Label']

In [16]:
print(news_dataset['content'])

0        Says the Annies List political group supports ...
1        When did the decline of coal start? It started...
2        Hillary Clinton agrees with John McCain "by vo...
3        Health care reform legislation is likely to ma...
4        The economic turnaround started at the end of ...
                               ...                        
10235    There are a larger number of shark attacks in ...
10236    Democrats have now become the party of the [At...
10237    Says an alternative to Social Security that op...
10238    On lifting the U.S. Cuban embargo and allowing...
10239    The Department of Veterans Affairs has a manua...
Name: content, Length: 10240, dtype: object


In [18]:
# separating the data and label
X = news_dataset.drop(columns = 'Label', axis=1)
Y = news_dataset['Label']
print(X)
print(Y)

                                               Statement  \
0      Says the Annies List political group supports ...   
1      When did the decline of coal start? It started...   
2      Hillary Clinton agrees with John McCain "by vo...   
3      Health care reform legislation is likely to ma...   
4      The economic turnaround started at the end of ...   
...                                                  ...   
10235  There are a larger number of shark attacks in ...   
10236  Democrats have now become the party of the [At...   
10237  Says an alternative to Social Security that op...   
10238  On lifting the U.S. Cuban embargo and allowing...   
10239  The Department of Veterans Affairs has a manua...   

                                                 content  
0      Says the Annies List political group supports ...  
1      When did the decline of coal start? It started...  
2      Hillary Clinton agrees with John McCain "by vo...  
3      Health care reform legislation is li

Stemming - reducing the word to its root word
actor, actress,acting ---> act

In [19]:
port_stem = PorterStemmer()

In [20]:
def stemming(content):
  stemmed_content = re.sub('[^a-zA-Z]',' ', content) # re- regular expression [''] removes all things excepta-zA-Z
  stemmed_content = stemmed_content.lower() #converting all to lowercase
  stemmed_content = stemmed_content.split()
  stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
  stemmed_content = ' '.join(stemmed_content)
  return stemmed_content

In [21]:
news_dataset['content'] = news_dataset['content'].apply(stemming)

In [22]:
print(news_dataset['content'])

0        say anni list polit group support third trimes...
1        declin coal start start natur ga took start be...
2        hillari clinton agre john mccain vote give geo...
3        health care reform legisl like mandat free sex...
4                    econom turnaround start end term true
                               ...                        
10235    larger number shark attack florida case voter ...
10236    democrat becom parti atlanta metro area black ...
10237    say altern social secur oper galveston counti ...
10238          lift u cuban embargo allow travel cuba fals
10239    depart veteran affair manual tell veteran stuf...
Name: content, Length: 10240, dtype: object


In [24]:
# separate the data and label
X=  news_dataset['content'].values
Y = news_dataset['Label'].values

In [25]:
print(X)

['say anni list polit group support third trimest abort demand fals'
 'declin coal start start natur ga took start begin presid georg w bush administr true'
 'hillari clinton agre john mccain vote give georg bush benefit doubt iran true'
 ...
 'say altern social secur oper galveston counti texa meant particip retir whole lot money social secur true'
 'lift u cuban embargo allow travel cuba fals'
 'depart veteran affair manual tell veteran stuff like realli valu commun know encourag commit suicid fals']


In [26]:
print(Y)

['False' 'True' 'True' ... 'True' 'False' 'False']


In [27]:
Y.shape

(10240,)

In [28]:
# converting textual data to numerical data using Tfidf-Term frequency Inverse document frequency
vectorizer = TfidfVectorizer()
vectorizer.fit(X)

X = vectorizer.transform(X)


In [29]:
print(X)

  (0, 7033)	0.43669429453116043
  (0, 6863)	0.27573188695020806
  (0, 6638)	0.2111421076017704
  (0, 5914)	0.11613396175353505
  (0, 5159)	0.2919258415476546
  (0, 3945)	0.32505944329339903
  (0, 2909)	0.2927031218105689
  (0, 2379)	0.08712067620074737
  (0, 1739)	0.34600876654225243
  (0, 282)	0.45607620906624996
  (0, 20)	0.2516443712673485
  (1, 7048)	0.06105009973572113
  (1, 6936)	0.20245915095836528
  (1, 6446)	0.6775992654692296
  (1, 5263)	0.14802341475633177
  (1, 4496)	0.2751380197455926
  (1, 2750)	0.22312934515115332
  (1, 2686)	0.2218300342009326
  (1, 1691)	0.27348844539196465
  (1, 1258)	0.2751380197455926
  (1, 913)	0.19670651815489937
  (1, 602)	0.26751369652476037
  (1, 90)	0.19692991443356575
  (2, 7354)	0.19135236143981704
  (2, 7048)	0.07895430049691148
  :	:
  (10237, 2697)	0.3272515779961992
  (10237, 1492)	0.19012567998492264
  (10237, 217)	0.29725808948777377
  (10238, 7002)	0.39581079290367843
  (10238, 3919)	0.4176643077034324
  (10238, 2379)	0.09823038809044

Splitting dataset to taining and  test data

In [30]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify=Y, random_state=2)

Training the model using Logistic Regression

In [31]:
model = LogisticRegression()

In [32]:
model.fit(X_train, Y_train)

Accuracy score

In [33]:
# accuracy score on training data
X_train_prediction = model.predict(X_train) #model=Logistic Regression
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [34]:
print('Accuracy score of the training data: ', training_data_accuracy)

Accuracy score of the training data:  0.9996337890625


In [35]:
# accuracy score on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [36]:
print('Accuracy for test data: ', test_data_accuracy)

Accuracy for test data:  0.998046875


Making predictive System
1 : FAKE news
0 : REAL news

In [37]:
X_new = X_test[8]
prediction = model.predict(X_new)
print(prediction)

if(prediction[0]==0):
  print('the news is REAL')
else:
  print('The news is FAKE')

['False']
The news is FAKE


In [38]:
print(Y_test[8])


False
