In [1]:
#IMPORT ALL THE NEEDED LIBRARIES
import pandas as pd
import numpy as np
import itertools
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import matplotlib.pyplot as plt

In [2]:
#LOAD THE DATASET
# index_col is left at its default (None), so pandas assigns a fresh 0..n-1 row index.
df = pd.read_csv("news.csv")
df

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [3]:
#DROP THE UNWANTED COLUMNS
# "Unnamed: 0" is presumably a row id carried over from the CSV; it is not used below.
dataset = df.drop(columns=["Unnamed: 0"])

In [4]:
# Show the frame after the drop to confirm that only title, text and label remain.
dataset

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...
6330,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [5]:
#ASSIGN SEPARATE VARIABLES FOR THE INDEPENDENT (TEXT) AND DEPENDENT (LABEL) COLUMNS
x, y = dataset["text"], dataset["label"]

In [6]:
#SPLIT TRAIN AND TEST SET
# Hold out 33% of the rows for testing; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=53,
)

In [7]:
# Number of documents in the training split.
x_train.shape

(4244,)

In [8]:
# Number of documents in the held-out test split.
x_test.shape

(2091,)

In [9]:
#BUILD BAG-OF-WORDS COUNT FEATURES FROM THE TEXT
# NOTE: CountVectorizer produces raw term counts, not TF-IDF weights
# (TfidfVectorizer is imported above but is not used here).
count_vector=CountVectorizer(stop_words='english')
# Learn the vocabulary from the training split only, then encode that split.
count_train=count_vector.fit_transform(x_train)
# Printing a sparse matrix lists its stored (row, column) entries and their counts.
print(count_train)
# Encode the test split with the already-fitted vocabulary (transform only, no refit).
count_test=count_vector.transform(x_test)

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 1119820 stored elements and shape (4244, 56922)>
  Coords	Values
  (1, 42470)	1
  (1, 12105)	1
  (1, 54177)	1
  (1, 50628)	1
  (1, 15924)	2
  (1, 44520)	2
  (1, 51896)	2
  (1, 35783)	4
  (1, 35256)	1
  (1, 21881)	1
  (1, 42534)	1
  (1, 8399)	1
  (1, 29531)	2
  (1, 15927)	2
  (1, 25686)	1
  (1, 49203)	2
  (1, 16814)	1
  (1, 36087)	1
  (1, 21568)	1
  (1, 25684)	1
  (1, 38823)	1
  (1, 47506)	1
  (1, 36831)	1
  (2, 16972)	1
  (2, 762)	1
  :	:
  (4243, 41435)	1
  (4243, 53607)	1
  (4243, 659)	1
  (4243, 38834)	1
  (4243, 19003)	1
  (4243, 11415)	1
  (4243, 7545)	1
  (4243, 22426)	1
  (4243, 54007)	1
  (4243, 7113)	1
  (4243, 4932)	1
  (4243, 39497)	1
  (4243, 50053)	1
  (4243, 38849)	1
  (4243, 20702)	1
  (4243, 42139)	1
  (4243, 17247)	1
  (4243, 50052)	1
  (4243, 55228)	1
  (4243, 29255)	1
  (4243, 49435)	1
  (4243, 11257)	1
  (4243, 52945)	1
  (4243, 20905)	1
  (4243, 7962)	1


In [10]:
#TOTAL COUNT OF THE FEATURE NAMES
# One feature name per vocabulary term learned by the vectorizer.
feature_names = count_vector.get_feature_names_out()
len(feature_names)

56922

In [11]:
#DISPLAYED IN ARRAY TYPE
# Densify only the first few rows for the preview. Calling .toarray() on the
# whole document-term matrix materialises every zero cell as well
# (rows x vocabulary size), which is vastly larger than the sparse storage
# and can exhaust memory just to print a truncated view.
print(count_train[:5].toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [12]:
#MODEL CREATION AND PREDICTION
# fit() returns the estimator itself, so the classifier is built and trained in one expression.
clf = MultinomialNB().fit(count_train, y_train)
pred = clf.predict(count_test)

# Fraction of test documents whose predicted label matches the true label.
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)

# Confusion matrix with a fixed label order: FAKE first, then REAL.
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])

accuracy:   0.893


In [13]:
#CLASSIFICATION REPORT OF THE MODEL CREATED
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the test-set predictions.
report = classification_report(y_true=y_test, y_pred=pred)

In [14]:
# Use print() so the newlines inside the report string render as a table.
print(report)

              precision    recall  f1-score   support

        FAKE       0.92      0.86      0.89      1008
        REAL       0.88      0.93      0.90      1083

    accuracy                           0.89      2091
   macro avg       0.90      0.89      0.89      2091
weighted avg       0.89      0.89      0.89      2091



In [15]:
#SECOND ROW OF THE DATASET (INDEX LABEL 1; THE FIRST ROW HAS LABEL 0)
# .loc selects row label and column in one step instead of chained indexing.
dataset.loc[1, "text"]

'Google Pinterest Digg Linkedin Reddit Stumbleupon Print Delicious Pocket Tumblr \nThere are two fundamental truths in this world: Paul Ryan desperately wants to be president. And Paul Ryan will never be president. Today proved it. \nIn a particularly staggering example of political cowardice, Paul Ryan re-re-re-reversed course and announced that he was back on the Trump Train after all. This was an aboutface from where he was a few weeks ago. He had previously declared he would not be supporting or defending Trump after a tape was made public in which Trump bragged about assaulting women. Suddenly, Ryan was appearing at a pro-Trump rally and boldly declaring that he already sent in his vote to make him President of the United States. It was a surreal moment. The figurehead of the Republican Party dosed himself in gasoline, got up on a stage on a chilly afternoon in Wisconsin, and lit a match. . @SpeakerRyan says he voted for @realDonaldTrump : “Republicans, it is time to come home” ht

In [16]:
#TO SAVE THE BEST MODEL FOR THE DEPLOYMENT PHASE
import pickle

In [17]:
# Path of the file the trained classifier is pickled to.
filename="finalized_model_MultinomialNB.sav"

In [18]:
# Write the trained classifier to disk. The with-block guarantees the file
# handle is flushed and closed, even if pickling raises.
# NOTE: only the classifier is saved here; the fitted CountVectorizer is also
# needed to turn new raw text into features at prediction time.
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)

In [19]:
# Read the model back in binary READ mode. Opening with 'wb' would truncate
# the saved file to zero bytes and pickle.load would fail because the handle
# is not readable.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open("finalized_model_MultinomialNB.sav", 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [20]:
# Sanity-check the reloaded model on the first training row; the one-row
# slice keeps the input two-dimensional, as predict() expects.
result = loaded_model.predict(count_train[:1])

In [21]:
# Predicted label for the row passed to the reloaded model.
result

array(['FAKE'], dtype='<U4')