In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df=pd.read_csv("data.csv")

In [3]:
df.head()

Unnamed: 0,URLs,Headline,Body,Label
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1


In [4]:
df.shape

(4009, 4)

In [5]:
df.describe()

Unnamed: 0,Label
count,4009.0
mean,0.466949
std,0.498969
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [6]:
df.isnull().sum()

URLs         0
Headline     0
Body        21
Label        0
dtype: int64

In [7]:
df.columns

Index(['URLs', 'Headline', 'Body', 'Label'], dtype='object')

# Data Preprocessing

In [8]:
#Removing null value
df['Body'] = df['Body'].fillna('') 

In [9]:
df.isnull().sum()

URLs        0
Headline    0
Body        0
Label       0
dtype: int64

In [10]:
#Adding a new column
df['news']=df['Headline']+df['Body']

In [11]:
df.head()

Unnamed: 0,URLs,Headline,Body,Label,news
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1,Four ways Bob Corker skewered Donald TrumpImag...
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1,Linklater's war veteran comedy speaks to moder...
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1,Trump’s Fight With Corker Jeopardizes His Legi...
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1,Egypt's Cheiron wins tie-up with Pemex for Mex...
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1,Jason Aldean opens 'SNL' with Vegas tributeCou...


In [12]:
df.columns

Index(['URLs', 'Headline', 'Body', 'Label', 'news'], dtype='object')

In [13]:
#Drop features that are not needed
df=df.drop(['URLs', 'Headline', 'Body'],axis=1)

In [14]:
df.columns

Index(['Label', 'news'], dtype='object')

# Text Processing

In [15]:
# NLP libraries to clean the text data
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re

In [16]:
#Remove symbols(',','-',...etc)
#Remove stop words
#Stemming

In [17]:
ps = PorterStemmer()
def wordopt(text):
    text = re.sub('[^a-zA-Z]', ' ',text)
    text = text.lower()
    text = text.split()
    text = [ps.stem(word) for word in text if not word in stopwords.words('english')]
    text = ' '.join(text)
    return text

In [18]:
df['news'] = df['news'].apply(wordopt) #Applying the text processing techniques onto every row data

In [19]:
#Splitting DataSet
X=df['news']
y=df['Label']
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25)

In [20]:
# Vectorization technique TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
vectz = TfidfVectorizer()
xv_train = vectz.fit_transform(X_train)
xv_test = vectz.transform(X_test)

# Model Fitting

In [21]:
# Model libraries
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

#Accuracy measuring library
from sklearn.metrics import accuracy_score

In [22]:
#1. Logistic Regression - used because this model is best suited for binary classification
LR_model = LogisticRegression()

#Fitting training set to the model
LR_model.fit(xv_train,y_train)

#Predicting the test set results based on the model
# accuracy on training data
lr_y_pred = LR_model.predict(xv_train)
score = accuracy_score(y_train,lr_y_pred)
print('Training Accuracy of LR model is ', score)

# accuracy on testing data
lr_y_pred_test = LR_model.predict(xv_test)
score = accuracy_score(y_test,lr_y_pred_test)
print('Testing Accuracy of LR model is ', score)

Training Accuracy of LR model is  0.9906852960745176
Testing Accuracy of LR model is  0.9700897308075773


In [23]:
#2. Support Vector Machine(SVM) 
svm_model = SVC(kernel='linear')

#Fitting training set to the model
svm_model.fit(xv_train,y_train)

#Predicting the test set results based on the model
# accuracy on training data
train_svm_y_pred = svm_model.predict(xv_train)
score = accuracy_score(y_train,train_svm_y_pred)
print('Training Accuracy of SVM model is ', score)

# accuracy on testing data
test_svm_y_pred = svm_model.predict(xv_test)
score = accuracy_score(y_test,test_svm_y_pred)
print('Testing Accuracy of SVM model is ', score)

Training Accuracy of SVM model is  0.999001996007984
Testing Accuracy of SVM model is  0.9780658025922233


In [24]:
#3. Random Forest Classifier 
RFC_model = RandomForestClassifier(random_state=0)

#Fitting training set to the model
RFC_model.fit(xv_train, y_train)

#Predicting the test set results based on the model
# accuracy on training data
train_RFC_y_pred = RFC_model.predict(xv_train)
score = accuracy_score(y_train,train_RFC_y_pred)
print('Training Accuracy of RFC model is ', score)

# accuracy on testing data
test_RFC_y_pred = RFC_model.predict(xv_test)
score = accuracy_score(y_test,test_RFC_y_pred)
print('Testing Accuracy of RFC model is ', score)

Training Accuracy of RFC model is  1.0
Testing Accuracy of RFC model is  0.9690927218344965


# Model Testing

In [25]:
# As SVM is able to provide best results - SVM will be used to check the news liability

def fake_news_det(news):
    input_data = {"text":[news]}
    new_def_test = pd.DataFrame(input_data)
    new_def_test["text"] = new_def_test["text"].apply(wordopt) 
    new_x_test = new_def_test["text"]
    #print(new_x_test)
    vectorized_input_data = vectz.transform(new_x_test)
    prediction = svm_model.predict(vectorized_input_data)
    
    if prediction == 1:
        print("Not a Fake News")
    else:
        print("Fake News")

In [26]:
fake_news_det('U.S. Secretary of State John F. Kerry said Monday that he will stop in Paris later this week, amid criticism that no top American officials attended Sundayâ€™s unity march against terrorism.')

Not a Fake News


In [27]:
fake_news_det("JetNation FanDuel League; Week 4 of readers think this story is Fact. Add your two cents.(Before Its News)Our FanDuel league is back again this week. Here are the details:$900 in total prize money. $250 to the winner. $10 to enter.Remember this is a one week league, pick your lineup against the salary cap and next week if you want to play again you can pick a completely different lineup if you want.Click this link to enter — http://fanduel.com/JetNation You can discuss this with other NY Jets fans on the Jet Nation message board. Or visit of on Facebook.Source: http://www.jetnation.com/2017/09/27/jetnation-fanduel-league-week-4/")


Fake News
