In [36]:
# Importing the necessary packages
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, f1_score

In [2]:
# loading the dataset
data = pd.read_csv('E:/Projects/Fake_news/train.csv')

In [3]:
data.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
data.shape

(20800, 5)

In [5]:
# checking for missing values
data.isna().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [6]:
# since the data is so large and also its a textual data, so we can replace the null values with empty string
data.fillna('', inplace=True)

In [7]:
data['label'].value_counts() # Around half the data is fake

1    10413
0    10387
Name: label, dtype: int64

In [8]:
data.columns

Index(['id', 'title', 'author', 'text', 'label'], dtype='object')

In [9]:
# here we are using only the title and author column to conduct the analysis

In [10]:
data['content'] = data['author'] + ' ' + data['title']

In [11]:
# stemming
def stemming(content):
    stem = PorterStemmer()
    stemmed_content = re.sub('[^a-zA-Z]', ' ', content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [13]:
data['content'] = data['content'].apply(stemming)

In [14]:
# seperating the data and label
X = data['content'].values
y = data['label'].values

In [15]:
vec = TfidfVectorizer()
vec.fit(X)
X = vec.transform(X)

# Logistic Regression Model

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=14)
log = LogisticRegression()
log_model = log.fit(X_train, y_train)
log_pred_train = log_model.predict(X_train)
log_pred_test = log_model.predict(X_test)
print(f'The Training accuracy of the Logistic regression model is {accuracy_score(log_pred_train, y_train)}')
print(f'The Testing accuracy of the Logistic regression model is {accuracy_score(log_pred_test, y_test)}')

The Training accuracy of the Logistic regression model is 0.9869591346153846
The Testing accuracy of the Logistic regression model is 0.9831730769230769


# Decision Tree Model 

In [31]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=350)
dtree = DecisionTreeClassifier(criterion='entropy', max_depth=5)
dtree_model = dtree.fit(X_train1, y_train1)
dtree_pred_train = dtree_model.predict(X_train1)
dtree_pred_test = dtree_model.predict(X_test1)
print(f'The Training accuracy of the Decision tree model is {accuracy_score(dtree_pred_train, y_train1)}')
print(f'The Testing accuracy of the Decision tree model is {accuracy_score(dtree_pred_test, y_test1)}')

The Training accuracy of the Decision tree model is 0.9357572115384616
The Testing accuracy of the Decision tree model is 0.9454326923076923


# Random Forest

In [34]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=23)
rf = RandomForestClassifier(n_estimators=50,class_weight=None, max_depth=None, min_samples_leaf=2, min_samples_split=10)
rf_model = rf.fit(X_train2, y_train2)
rf_pred_train = rf_model.predict(X_train2)
rf_pred_test = rf_model.predict(X_test2)
print(f'The Training accuracy of the Random forest model is {accuracy_score(rf_pred_train, y_train2)}')
print(f'The Testing accuracy of the Random forest model is {accuracy_score(rf_pred_test, y_test2)}')

The Training accuracy of the Random forest model is 0.9957932692307693
The Testing accuracy of the Random forest model is 0.9908653846153846


In [40]:
results = pd.DataFrame(columns=['Algorithm_name', 'Training_accuracy', 'Testing_accuracy', 'Precision_score', 'F1-score'])
results = results.append({
    'Algorithm_name' : 'Logistic_Regression',
    'Training_accuracy' : accuracy_score(y_train, log_pred_train),
    'Testing_accuracy' : accuracy_score(y_test, log_pred_test),
    'Precision_score' : precision_score(y_test, log_pred_test),
    'F1-score': f1_score(y_test, log_pred_test)}, ignore_index=True)

results = results.append({
    'Algorithm_name' : 'Decision_Tree',
    'Training_accuracy' : accuracy_score(y_train1, dtree_pred_train),
    'Testing_accuracy' : accuracy_score(y_test1, dtree_pred_test),
    'Precision_score' : precision_score(y_test1, dtree_pred_test),
    'F1-score': f1_score(y_test1, dtree_pred_test)}, ignore_index=True)

results = results.append({
    'Algorithm_name' : 'Random_Forest',
    'Training_accuracy' : accuracy_score(y_train2, rf_pred_train),
    'Testing_accuracy' : accuracy_score(y_test2, rf_pred_test),
    'Precision_score' : precision_score(y_test2, rf_pred_test),
    'F1-score': f1_score(y_test2, rf_pred_test)}, ignore_index=True)


  results = results.append({
  results = results.append({
  results = results.append({


In [41]:
results

Unnamed: 0,Algorithm_name,Training_accuracy,Testing_accuracy,Precision_score,F1-score
0,Logistic_Regression,0.986959,0.983173,0.975152,0.983452
1,Decision_Tree,0.935757,0.945433,0.909483,0.948954
2,Random_Forest,0.995793,0.990865,0.988889,0.990803


In [None]:
# Considering all the above metrics, all of our model is performing well. The choice depend on our 
# specific requirements. Here I am considering based on the Testing accuracy, precision_score and 
# F1-score with that I choose Random Forest as my Ideal model