In [5]:
#Importing Libs
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rcParams
from textblob import TextBlob
from plotly import tools
import plotly.graph_objs as go
from plotly.offline import iplot

plt.rcParams['figure.figsize'] = [10,5]
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline = False, world_readable = True)

import nltk
import re
import string
from nltk.corpus import stopwords
from wordcloud import WordCloud,STOPWORDS
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

from sklearn import metrics
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, confusion_matrix, accuracy_score
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [6]:
train_data = pd.read_csv('./train_fake_news.csv')
test_data = pd.read_csv('./test_fake_news.csv')
train_data.head(10)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
5,5,Jackie Mason: Hollywood Would Love Trump if He...,Daniel Nussbaum,"In these trying times, Jackie Mason is the Voi...",0
6,6,Life: Life Of Luxury: Elton John’s 6 Favorite ...,,Ever wonder how Britain’s most iconic pop pian...,1
7,7,Benoît Hamon Wins French Socialist Party’s Pre...,Alissa J. Rubin,"PARIS — France chose an idealistic, traditi...",0
8,8,Excerpts From a Draft Script for Donald Trump’...,,Donald J. Trump is scheduled to make a highly ...,0
9,9,"A Back-Channel Plan for Ukraine and Russia, Co...",Megan Twohey and Scott Shane,A week before Michael T. Flynn resigned as nat...,0


In [22]:
print('The shape of the training data is (row, column) : '+str(train_data.shape))
print(train_data.info())
print(train_data.head())
print('\n---------------------------------------\n')

The shape of the training data is (row, column) : (20800, 6)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       20800 non-null  int64 
 1   title    20800 non-null  object
 2   author   20800 non-null  object
 3   text     20800 non-null  object
 4   label    20800 non-null  int64 
 5   content  20800 non-null  object
dtypes: int64(2), object(4)
memory usage: 975.1+ KB
None
   id                                              title              author  \
0   0  House Dem Aide: We Didn’t Even See Comey’s Let...       Darrell Lucus   
1   1  FLYNN: Hillary Clinton, Big Woman on Campus - ...     Daniel J. Flynn   
2   2                  Why the Truth Might Get You Fired  Consortiumnews.com   
3   3  15 Civilians Killed In Single US Airstrike Hav...     Jessica Purkiss   
4   4  Iranian woman jailed for fictional unpublished...      Howard Portnoy  

In [23]:
print('The shape of the testing data is (row, column) : '+str(test_data.shape))
print(test_data.info())
print(test_data.head())
print('\n---------------------------------------\n')

The shape of the testing data is (row, column) : (5200, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5200 entries, 0 to 5199
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       5200 non-null   int64 
 1   title    5200 non-null   object
 2   author   5200 non-null   object
 3   text     5200 non-null   object
 4   content  5200 non-null   object
dtypes: int64(1), object(4)
memory usage: 203.2+ KB
None
      id                                              title  \
0  20800  Specter of Trump Loosens Tongues, if Not Purse...   
1  20801  Russian warships ready to strike terrorists ne...   
2  20802  #NoDAPL: Native American Leaders Vow to Stay A...   
3  20803  Tim Tebow Will Attempt Another Comeback, This ...   
4  20804                    Keiser Report: Meme Wars (E995)   

                    author                                               text  \
0         David Streitfeld  PALO ALTO, Calif.  —   After ye

In [9]:
#refer to this code for implementing the classification process
#https://www.kaggle.com/shahidkhan1/fake-news-detection-using-lstm-90-accuracy

In [10]:
#Checking the data shape of Fake News Dataset
train_data.shape
test_data.shape

(5200, 4)

In [11]:
#Checking if there is a Null value in the dataset and 
#replacing the null value with empty string to avoid error in tfidf function
print('Are there null data in the training data? \n',train_data.isnull().sum())

Are there null data in the training data? 
 id           0
title      558
author    1957
text        39
label        0
dtype: int64


In [12]:
print('Replace null data with empty string, you can also just delete the null value to repair the data')
train_data = train_data.fillna('')
print('Now the data is clean')
train_data.isnull().sum()

Replace null data with empty string, you can also just delete the null value to repair the data
Now the data is clean


id        0
title     0
author    0
text      0
label     0
dtype: int64

In [13]:
#Do the same thing to test_data to prepare it before using it as testing data
print('Are there null data in the testing data? \n',test_data.isnull().sum())

Are there null data in the testing data? 
 id          0
title     122
author    503
text        7
dtype: int64


In [14]:
print('Replace null data with empty string, you can also just delete the null value to repair the data')
test_data = test_data.fillna('')
print('Now the data is clean')
test_data.isnull().sum()

Replace null data with empty string, you can also just delete the null value to repair the data
Now the data is clean


id        0
title     0
author    0
text      0
dtype: int64

In [15]:
#Using the title and author to make predictions
train_data['content'] = train_data['author']+'  '+train_data['title']
print('Checking the concatenate Content columns at training data')
print(train_data['content'],'\n')

test_data['content'] = test_data['author']+' '+test_data['title']
print('Checking the concatenate Content columns at testing data')
print(test_data['content'])

Checking the concatenate Content columns at training data
0        Darrell Lucus  House Dem Aide: We Didn’t Even ...
1        Daniel J. Flynn  FLYNN: Hillary Clinton, Big W...
2        Consortiumnews.com  Why the Truth Might Get Yo...
3        Jessica Purkiss  15 Civilians Killed In Single...
4        Howard Portnoy  Iranian woman jailed for ficti...
                               ...                        
20795    Jerome Hudson  Rapper T.I.: Trump a ’Poster Ch...
20796    Benjamin Hoffman  N.F.L. Playoffs: Schedule, M...
20797    Michael J. de la Merced and Rachel Abrams  Mac...
20798    Alex Ansary  NATO, Russia To Hold Parallel Exe...
20799             David Swanson  What Keeps the F-35 Alive
Name: content, Length: 20800, dtype: object 

Checking the concatenate Content columns at testing data
0       David Streitfeld Specter of Trump Loosens Tong...
1        Russian warships ready to strike terrorists n...
2       Common Dreams #NoDAPL: Native American Leaders...
3       Daniel V

In [16]:
#split the data and the label
X_train = train_data.drop(columns = 'label',axis = 1)
Y_train = train_data['label']
print(X_train)

          id                                              title  \
0          0  House Dem Aide: We Didn’t Even See Comey’s Let...   
1          1  FLYNN: Hillary Clinton, Big Woman on Campus - ...   
2          2                  Why the Truth Might Get You Fired   
3          3  15 Civilians Killed In Single US Airstrike Hav...   
4          4  Iranian woman jailed for fictional unpublished...   
...      ...                                                ...   
20795  20795  Rapper T.I.: Trump a ’Poster Child For White S...   
20796  20796  N.F.L. Playoffs: Schedule, Matchups and Odds -...   
20797  20797  Macy’s Is Said to Receive Takeover Approach by...   
20798  20798  NATO, Russia To Hold Parallel Exercises In Bal...   
20799  20799                          What Keeps the F-35 Alive   

                                          author  \
0                                  Darrell Lucus   
1                                Daniel J. Flynn   
2                             Consortiu

In [17]:
portstem = PorterStemmer()
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [portstem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [18]:
train_data['content'] = train_data['content'].apply(stemming)
test_data['content'] = test_data['content'].apply(stemming)
print(train_data['content'])
print(test_data['content'])

0        darrel lucu hous dem aid even see comey letter...
1        daniel j flynn flynn hillari clinton big woman...
2                   consortiumnew com truth might get fire
3        jessica purkiss civilian kill singl us airstri...
4        howard portnoy iranian woman jail fiction unpu...
                               ...                        
20795    jerom hudson rapper trump poster child white s...
20796    benjamin hoffman n f l playoff schedul matchup...
20797    michael j de la merc rachel abram maci said re...
20798    alex ansari nato russia hold parallel exercis ...
20799                            david swanson keep f aliv
Name: content, Length: 20800, dtype: object
0       david streitfeld specter trump loosen tongu pu...
1       russian warship readi strike terrorist near al...
2       common dream nodapl nativ american leader vow ...
3       daniel victor tim tebow attempt anoth comeback...
4        truth broadcast network keiser report meme war e
                 

In [19]:
X_train = train_data['content'].values
X_test = test_data['content'].values
Y = train_data['label'].values
print(X_train)
print(X_test)
#print(Y)

['darrel lucu hous dem aid even see comey letter jason chaffetz tweet'
 'daniel j flynn flynn hillari clinton big woman campu breitbart'
 'consortiumnew com truth might get fire' ...
 'michael j de la merc rachel abram maci said receiv takeov approach hudson bay new york time'
 'alex ansari nato russia hold parallel exercis balkan'
 'david swanson keep f aliv']
['david streitfeld specter trump loosen tongu purs string silicon valley new york time'
 'russian warship readi strike terrorist near aleppo'
 'common dream nodapl nativ american leader vow stay winter file lawsuit polic'
 ... 'mike mcphate california today exactli sushi new york time'
 'us marin deploy russian border norway'
 'teddi wayn awkward sex onscreen new york time']


In [20]:
vectorizer = TfidfVectorizer()
vectorizer.fit(X_train)
vectorizer.fit(X_test)
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

In [24]:
X_train, X_test, Y_train, Y_test = train_test_split(X_train,train_data.label, test_size = 0.3, stratify = Y, random_state = 2)

In [25]:
model = LogisticRegression()
model.fit(X_train, Y_train)

LogisticRegression()

In [26]:
X_train_pred = model.predict(X_train)
training_data_acc = accuracy_score(X_train_pred, Y_train)
print(X_train_pred)
print(Y_train)
print('Accuracy score of training data = ', training_data_acc)

[1 1 1 ... 0 1 0]
14719    1
11282    1
14210    1
19999    1
8622     0
        ..
16733    1
15857    1
17264    0
19253    1
15526    0
Name: label, Length: 14560, dtype: int64
Accuracy score of training data =  0.9842032967032966


In [4]:
X_train_pred = model.predict(X_test)
testing_data_acc = accuracy_score(X_train_pred, Y_test)
print('Accuracy score of testing data : ',testing_data_acc)

NameError: name 'model' is not defined