In [1]:
import pandas as pd

In [2]:
# Read the news dataset from the working directory and preview its first five rows.
dataframe = pd.read_csv(filepath_or_buffer='news.csv')
dataframe.head(n=5)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [3]:
# Model input is the 'text' column; the target is the 'label' column
# (FAKE / REAL in the preview of the frame).
x, y = dataframe['text'], dataframe['label']

In [4]:
# Display the 'text' Series selected from the frame.
x

0       Daniel Greenfield, a Shillman Journalism Fello...
1       Google Pinterest Digg Linkedin Reddit Stumbleu...
2       U.S. Secretary of State John F. Kerry said Mon...
3       — Kaydee King (@KaydeeKing) November 9, 2016 T...
4       It's primary day in New York and front-runners...
                              ...                        
6330    The State Department told the Republican Natio...
6331    The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...
6332     Anti-Trump Protesters Are Tools of the Oligar...
6333    ADDIS ABABA, Ethiopia —President Obama convene...
6334    Jeb Bush Is Suddenly Attacking Trump. Here's W...
Name: text, Length: 6335, dtype: object

In [5]:
# Display the 'label' Series selected from the frame.
y

0       FAKE
1       FAKE
2       REAL
3       FAKE
4       REAL
        ... 
6330    REAL
6331    FAKE
6332    FAKE
6333    REAL
6334    REAL
Name: label, Length: 6335, dtype: object

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)
x_train

2402    Christian Whiton is a former deputy special en...
1922    Super Tuesday Brings Harsh Light And Heartache...
3475    Prev post Page 1 of 4 Next \nNurses are among ...
6197    The deadly hostage situation at a luxury hotel...
4748    Our new country: Women and minorities hit hard...
                              ...                        
4931    Twenty-two of the 37 corporations nominated fo...
3264    As pieces of luggage, human remains, wreckage ...
1653    0 Add Comment \nIN THE immediate aftermath of ...
2607    Palestine Palestinians check the flat of Amjad...
2732    For the second week in a row, there was a temp...
Name: text, Length: 5068, dtype: object

In [8]:
# Display the training labels produced by the split.
y_train

2402    REAL
1922    REAL
3475    FAKE
6197    REAL
4748    FAKE
        ... 
4931    REAL
3264    REAL
1653    FAKE
2607    FAKE
2732    REAL
Name: label, Length: 5068, dtype: object

In [9]:
# Learn the TF-IDF vocabulary on the training texts only, then reuse the
# fitted vectorizer to transform the held-out texts.
tfvect = TfidfVectorizer(
    stop_words='english',  # drop scikit-learn's built-in English stop words
    max_df=0.7,            # ignore terms that occur in more than 70% of documents
)
tfid_x_train = tfvect.fit_transform(x_train)
tfid_x_test = tfvect.transform(x_test)

In [10]:
# Print the TF-IDF training matrix; it is shown as (row, column) weight entries.
print(tfid_x_train)

  (0, 5434)	0.04558113027328849
  (0, 22112)	0.03101556231290688
  (0, 41980)	0.041224221139183156
  (0, 41399)	0.08707226417897605
  (0, 32656)	0.05826505252086948
  (0, 40634)	0.08300352400740964
  (0, 59168)	0.03304164565844389
  (0, 17072)	0.015280054882096362
  (0, 21122)	0.03467645674134631
  (0, 36500)	0.036461647252457555
  (0, 21811)	0.021815222286389203
  (0, 55510)	0.037920004934453785
  (0, 35473)	0.025244871570555313
  (0, 8579)	0.03467645674134631
  (0, 34917)	0.05447353166186797
  (0, 51186)	0.027977956004355133
  (0, 17007)	0.020253495378206955
  (0, 13362)	0.023133406428959745
  (0, 28054)	0.026346675461032608
  (0, 51479)	0.04368398576036871
  (0, 19911)	0.03226033674797147
  (0, 12812)	0.021077575874407357
  (0, 5104)	0.0369438299052932
  (0, 22586)	0.03830998824285092
  (0, 23706)	0.03360195028163217
  :	:
  (5067, 25947)	0.0690536695761875
  (5067, 20495)	0.060909276073445075
  (5067, 46270)	0.05755520504375683
  (5067, 56317)	0.0733883544907381
  (5067, 59168)	0.0

In [11]:
# Train a passive-aggressive linear classifier on the TF-IDF features.
# random_state is pinned so that a re-run gives the same fitted model and
# therefore the same accuracy / confusion matrix downstream; without it the
# result changes from run to run.
classifier = PassiveAggressiveClassifier(max_iter=50, random_state=0)
classifier.fit(tfid_x_train, y_train)

In [12]:
# Predict labels for the held-out split and report accuracy as a percentage
# rounded to two decimals.
y_pred = classifier.predict(tfid_x_test)
score = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {round(score*100, 2)}%')

Accuracy: 93.61%


In [13]:
# Confusion matrix with rows = true labels and columns = predicted labels,
# both in the order given by `labels` (FAKE first, then REAL).
cf = confusion_matrix(
    y_true=y_test,
    y_pred=y_pred,
    labels=['FAKE', 'REAL'],
)
print(cf)

[[573  42]
 [ 39 613]]


In [14]:
def fake_news_det(news):
    """Print the predicted label for a single piece of news text.

    The text is wrapped in a one-element list, vectorized with the
    notebook-level ``tfvect`` and classified with the notebook-level
    ``classifier``. The prediction array is printed; nothing is returned.

    Args:
        news: The text to classify.
    """
    features = tfvect.transform([news])
    print(classifier.predict(features))

In [15]:
# Spot-check the in-memory classifier on a sample headline.
fake_news_det("Mike Pence Drapes Shawl Over Immodest Lady Justice Statue - The Onion - America's Finest News Source")

['FAKE']


In [16]:
# Spot-check the in-memory classifier on a second sample headline.
fake_news_det("What's in that Iran bill that Obama doesn't like?")

['REAL']


In [17]:
import pickle
# Persist the trained classifier. The context manager guarantees the file is
# flushed and closed even if pickling raises, instead of leaving the handle
# open until garbage collection.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(classifier, model_file)

In [18]:
# Reload the classifier written to model.pkl. The file is opened in a context
# manager so the handle is closed as soon as loading finishes.
# NOTE(security): pickle.load can execute arbitrary code; only load pickle
# files that were produced by a trusted source such as this notebook.
with open('model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [19]:
def fake_news_det1(news):
    """Print the predicted label for a single piece of news text, using the reloaded model.

    Same flow as classifying with the in-memory classifier, except that the
    prediction comes from the notebook-level ``loaded_model`` (the object
    read back from the pickle file). The text is vectorized with the
    notebook-level ``tfvect``. The prediction array is printed; nothing is
    returned.

    Args:
        news: The text to classify.
    """
    features = tfvect.transform([news])
    print(loaded_model.predict(features))

In [20]:
# Spot-check the model reloaded from disk on a sample headline.
fake_news_det1("What's in that Iran bill that Obama doesn't like?")

['REAL']


In [21]:
# Persist the fitted TF-IDF vectorizer so new text can be transformed with the
# same vocabulary the classifier was trained on. The context manager closes
# the file handle deterministically.
filename = 'vectorizer.pkl'
with open(filename, 'wb') as vectorizer_file:
    pickle.dump(tfvect, vectorizer_file)