In [1]:
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
import pickle

In [2]:
df=pd.read_csv("news.csv")
x=df['text']
y=df['label']

In [3]:
df.shape
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [4]:
df.shape

(6335, 4)

In [5]:
df.isnull().any()

Unnamed: 0    False
title         False
text          False
label         False
dtype: bool

In [6]:
lb=df.label
lb.head()

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object

In [7]:
traindata_x,testdata_x,traindata_y,testdata_y=train_test_split(x, y, test_size=0.2)

In [8]:
vector_frequency=TfidfVectorizer(stop_words='english', max_df=0.7)


tfidf_train=vector_frequency.fit_transform(traindata_x) 
tfidf_test=vector_frequency.transform(testdata_x)

In [9]:
train_frequency=vector_frequency.fit_transform(traindata_x)

In [10]:
print(testdata_x)
print("#################################***********************####################*************************")
print(train_frequency)
print("#################################***********************####################**************************")

6086    “What difference, at this point, does it make?...
1450    The Richest Man.., Considered, Invested In One...
853     Of course, Republicans and conservatives find ...
4131    And they are looking for someone who could app...
535     We do not like trampolining, say hedgehogs 10-...
                              ...                        
1003    Donald Trump and Paul Ryan might not have buil...
1998    Rep. Mark Meadows (R-N.C.) has been able to co...
1535    Email \nI was in London last weekend to view a...
5892    Tensions between the Republican Party and its ...
5790    In this News Brief, Joe Joseph is joined by go...
Name: text, Length: 1267, dtype: object
#################################***********************####################*************************
  (0, 21202)	0.08020017282924655
  (0, 9458)	0.044231888762311704
  (0, 42435)	0.0393100106565535
  (0, 6755)	0.06903418946393479
  (0, 28339)	0.06592105015551511
  (0, 50922)	0.06451886741472987
  (0, 12817)	0.076119847

In [11]:
test_frequency=vector_frequency.transform(testdata_x)
print(test_frequency)

  (0, 60031)	0.04438247217850116
  (0, 59987)	0.11150163398383134
  (0, 59837)	0.0350011574990807
  (0, 59833)	0.09308667395325672
  (0, 59675)	0.07786445363117558
  (0, 58883)	0.02964528110349921
  (0, 58821)	0.04901749546501128
  (0, 57524)	0.09761776191323937
  (0, 57218)	0.12991762953786087
  (0, 55969)	0.06236797420924839
  (0, 55957)	0.047537686612494225
  (0, 55802)	0.04133557229807167
  (0, 54898)	0.039533280982518286
  (0, 54820)	0.031154978213730904
  (0, 54614)	0.024691023937362045
  (0, 54557)	0.11972815577746432
  (0, 54044)	0.11454797461251848
  (0, 54042)	0.0526308531631707
  (0, 51578)	0.10666497141695236
  (0, 51054)	0.07703604837409123
  (0, 51015)	0.05000820649380884
  (0, 50922)	0.05110500881828277
  (0, 48658)	0.0697295228083746
  (0, 48625)	0.04848822738163661
  (0, 48406)	0.07878521430103338
  :	:
  (1266, 15081)	0.08993494169398755
  (1266, 14496)	0.05520893005251998
  (1266, 14212)	0.15147209761865912
  (1266, 13106)	0.0940249674373157
  (1266, 12866)	0.1096042

In [12]:
pa=PassiveAggressiveClassifier(max_iter=50)
pa.fit(train_frequency,traindata_y)

y_pred=pa.predict(tfidf_test)
score=accuracy_score(testdata_y,y_pred)
print(traindata_y)
print(f'Accuracy: {round(score*100,2)}%')

3595    REAL
3500    REAL
1203    REAL
373     REAL
3816    REAL
        ... 
1194    REAL
6216    REAL
5809    FAKE
4426    REAL
3078    FAKE
Name: label, Length: 5068, dtype: object
Accuracy: 94.63%


In [13]:
pipeline = Pipeline([('tfidf', TfidfVectorizer(stop_words = 'english')),
                    ('nbmodel', MultinomialNB())])

In [14]:
pipeline.fit(traindata_x,traindata_y)

Pipeline(steps=[('tfidf', TfidfVectorizer(stop_words='english')),
                ('nbmodel', MultinomialNB())])

In [15]:
score = pipeline.score(testdata_x,testdata_y)
print('Accuracy', score)

Accuracy 0.8310970797158642


In [16]:
pred = pipeline.predict(testdata_x)

In [17]:
print(classification_report(testdata_y, pred))

              precision    recall  f1-score   support

        FAKE       0.98      0.69      0.81       647
        REAL       0.75      0.98      0.85       620

    accuracy                           0.83      1267
   macro avg       0.86      0.83      0.83      1267
weighted avg       0.87      0.83      0.83      1267



In [18]:
confusion_matrix(testdata_y,pred)

array([[444, 203],
       [ 11, 609]], dtype=int64)

In [20]:
with open('model.pk1', 'wb') as handle:
    pickle.dump(pipeline, handle, protocol=pickle.HIGHEST_PROTOCOL)