In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Step 1: Define the objective of the Problem Statement
# Create a classification model which can classify the News

# Step 2: Data Gathering
# https://www.kaggle.com/c/fake-news/data

In [3]:
# Import lib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Import KNN and accuracy_score from sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [4]:
# Read the Dataset
df = pd.read_csv("/content/drive/My Drive/DS Worksop/DS Workshop/Fake News Dataset.csv")
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


# Step 4: Exploratory Data Analysis


In [5]:
# Step 3: Data Preparation
df = df[['title','label']]
df.head()

Unnamed: 0,title,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",0
2,Why the Truth Might Get You Fired,1
3,15 Civilians Killed In Single US Airstrike Hav...,1
4,Iranian woman jailed for fictional unpublished...,1


In [6]:
df['label'].value_counts()

1    10413
0    10387
Name: label, dtype: int64

In [7]:
# 1: unreliable
# 0: reliable

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   20242 non-null  object
 1   label   20800 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 325.1+ KB


In [9]:
df.dropna(inplace=True)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20242 entries, 0 to 20799
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   20242 non-null  object
 1   label   20242 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 474.4+ KB


In [11]:
df.head()

Unnamed: 0,title,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",0
2,Why the Truth Might Get You Fired,1
3,15 Civilians Killed In Single US Airstrike Hav...,1
4,Iranian woman jailed for fictional unpublished...,1


In [12]:
df.reset_index(inplace= True)

In [13]:
import re
corpus =[]
for i in range (0, len(df)):
  text = re.sub('[^a-zA-Z]',' ',df['title'][i])

  text = text.lower()

  corpus.append(text)

In [14]:
corpus

['house dem aide  we didn t even see comey s letter until jason chaffetz tweeted it',
 'flynn  hillary clinton  big woman on campus   breitbart',
 'why the truth might get you fired',
 '   civilians killed in single us airstrike have been identified',
 'iranian woman jailed for fictional unpublished story about woman stoned to death for adultery',
 'jackie mason  hollywood would love trump if he bombed north korea over lack of trans bathrooms  exclusive video    breitbart',
 'life  life of luxury  elton john s   favorite shark pictures to stare at during long  transcontinental flights',
 'beno t hamon wins french socialist party s presidential nomination   the new york times',
 'excerpts from a draft script for donald trump s q ampa with a black church s pastor   the new york times',
 'a back channel plan for ukraine and russia  courtesy of trump associates   the new york times',
 'obama s organizing for action partners with soros linked  indivisible  to disrupt trump s agenda',
 'bbc 

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=5000)

In [16]:
X = cv.fit_transform(corpus).toarray()

y = df['label']

In [17]:
len(X[0])

5000

In [18]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3)

In [19]:
print("X_train = ",X_train.shape)
print("X_test = ",X_test.shape)

print("y_train = ",y_train.shape)
print("y_test = ",y_test.shape)

X_train =  (14169, 5000)
X_test =  (6073, 5000)
y_train =  (14169,)
y_test =  (6073,)


In [20]:
model = KNeighborsClassifier(n_neighbors=3)
print("Model Created")

model.fit(X_train,y_train)
print("Model Training Completed")

y_pred = model.predict(X_test)

acc = accuracy_score(y_test , y_pred)
print("Accuracy :- ",acc*100)
print("Error rate = ",1-acc)

Model Created
Model Training Completed
Accuracy :-  82.16696854931665
Error rate =  0.17833031450683356


In [21]:
raw_news = df['title'][1]
raw_news

'FLYNN: Hillary Clinton, Big Woman on Campus - Breitbart'

In [22]:
news =re.sub('[^a-zA-Z]',' ',raw_news)
news = news.lower()
news

'flynn  hillary clinton  big woman on campus   breitbart'

In [23]:
news_vec = cv.transform([news])
news_vec

<1x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [24]:
pred = model.predict(news_vec)[0]

In [25]:
if(pred==1):
  print("Fake news")
elif(pred == 0):
  print("Real News")

Fake news
