In [1]:
#Import the Libraries
import pandas as pd
import numpy as np
import itertools
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import matplotlib.pyplot as plt

In [2]:
#Refer the below link for Hashing Vectorizer
#https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.HashingVectorizer.html
#from sklearn.feature_extraction.text import HashingVectorizer
#vectorizer = HashingVectorizer(n_features=2**4)
#X = vectorizer.fit_transform

In [3]:
df=pd.read_csv('news.csv', index_col=None)

In [4]:
df

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [5]:
dataset=df.drop("Unnamed: 0",axis=1)
#Here axis=1 need to delete the entire column 

In [6]:
dataset

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...
6330,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [7]:
y=dataset["label"]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(dataset["text"], y, test_size=0.33, random_state=53)

In [9]:
hash_vectorizer = HashingVectorizer(stop_words='english')
#Stop_words meaning for removing the repeated words(i.e - is, was, the, of, in,) continuous words need to remove from the dataset
hash_train = hash_vectorizer.fit_transform(X_train)
#Here .fit method, it create only model
# suppose .fit_transform method we given here it will ceate model + whatever we given dataset inside the model (X_train) , that will create model + predict)
print(hash_train)
hash_test = hash_vectorizer.transform(X_test)

  (1, 26381)	-0.1336306209562122
  (1, 46353)	0.1336306209562122
  (1, 76282)	-0.1336306209562122
  (1, 124604)	-0.1336306209562122
  (1, 271872)	0.2672612419124244
  (1, 354766)	0.1336306209562122
  (1, 355578)	-0.1336306209562122
  (1, 380136)	-0.2672612419124244
  (1, 399927)	-0.1336306209562122
  (1, 413315)	-0.2672612419124244
  (1, 421751)	-0.2672612419124244
  (1, 452780)	-0.1336306209562122
  (1, 506429)	-0.5345224838248488
  (1, 612563)	-0.1336306209562122
  (1, 615897)	0.1336306209562122
  (1, 626851)	0.1336306209562122
  (1, 639862)	0.1336306209562122
  (1, 691517)	-0.1336306209562122
  (1, 740856)	-0.1336306209562122
  (1, 777362)	-0.2672612419124244
  (1, 798576)	-0.1336306209562122
  (1, 907820)	0.2672612419124244
  (1, 1039472)	-0.1336306209562122
  (2, 14361)	-0.3333333333333333
  (2, 81229)	0.3333333333333333
  :	:
  (4243, 924171)	0.08478501284163323
  (4243, 934801)	0.028261670947211076
  (4243, 935153)	0.028261670947211076
  (4243, 935275)	0.028261670947211076
  (42

In [10]:
print(hash_train.toarray())

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.04490133 0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [12]:
clf = RandomForestClassifier()

clf.fit(hash_train, y_train)
pred = clf.predict(hash_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)

cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])

accuracy:   0.882


In [13]:
from sklearn.metrics import classification_report

report=classification_report(y_test, pred)

In [14]:
print(report)

              precision    recall  f1-score   support

        FAKE       0.86      0.91      0.88      1008
        REAL       0.91      0.86      0.88      1083

    accuracy                           0.88      2091
   macro avg       0.88      0.88      0.88      2091
weighted avg       0.88      0.88      0.88      2091



In [15]:
dataset["text"][0]



In [16]:
dataset["text"][2]

'U.S. Secretary of State John F. Kerry said Monday that he will stop in Paris later this week, amid criticism that no top American officials attended Sunday’s unity march against terrorism.\n\nKerry said he expects to arrive in Paris Thursday evening, as he heads home after a week abroad. He said he will fly to France at the conclusion of a series of meetings scheduled for Thursday in Sofia, Bulgaria. He plans to meet the next day with Foreign Minister Laurent Fabius and President Francois Hollande, then return to Washington.\n\nThe visit by Kerry, who has family and childhood ties to the country and speaks fluent French, could address some of the criticism that the United States snubbed France in its darkest hour in many years.\n\nThe French press on Monday was filled with questions about why neither President Obama nor Kerry attended Sunday’s march, as about 40 leaders of other nations did. Obama was said to have stayed away because his own security needs can be taxing on a country, 

In [17]:
hash_train[[0]]

<1x1048576 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [18]:
hash_vectorizer = HashingVectorizer(stop_words='english')
hash_train = hash_vectorizer.fit_transform(X_train)

In [19]:
X_train[[0]]

0    Daniel Greenfield, a Shillman Journalism Fello...
Name: text, dtype: object

In [21]:
clf.predict(hash_train[[0]])

array(['FAKE'], dtype=object)

In [22]:
clf.predict(hash_train[[2]])

array(['REAL'], dtype=object)