In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, classification_report

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fake-news-classification/WELFake_Dataset.csv


In [2]:
df = pd.read_csv("/kaggle/input/fake-news-classification/WELFake_Dataset.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [3]:
df.isnull().sum()

Unnamed: 0      0
title         558
text           39
label           0
dtype: int64

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72134 entries, 0 to 72133
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  72134 non-null  int64 
 1   title       71576 non-null  object
 2   text        72095 non-null  object
 3   label       72134 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 2.2+ MB


In [19]:
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71537 entries, 0 to 72133
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  71537 non-null  int64 
 1   title       71537 non-null  object
 2   text        71537 non-null  object
 3   label       71537 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 2.7+ MB


In [20]:
from sklearn.model_selection import train_test_split
x = df["text"]
y = df["label"]

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state =0)

In [23]:
tfidf = TfidfVectorizer(stop_words = "english", max_df = 0.7)
tfidf_xtrain = tfidf.fit_transform(x_train)
tfidf_xtest = tfidf.transform(x_test)

In [24]:
model = PassiveAggressiveClassifier(max_iter= 40)
model.fit(tfidf_xtrain,y_train)

PassiveAggressiveClassifier(max_iter=40)

In [27]:
y_pred = model.predict(tfidf_xtest)
acc = accuracy_score(y_test,y_pred)
print(f"Accuracy is: {np.round(acc*100,3)}%")

Accuracy is: 95.946%


In [28]:
clf = classification_report(y_test,y_pred)
print(clf)

              precision    recall  f1-score   support

           0       0.96      0.95      0.96      6963
           1       0.96      0.96      0.96      7345

    accuracy                           0.96     14308
   macro avg       0.96      0.96      0.96     14308
weighted avg       0.96      0.96      0.96     14308



In [29]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test,y_pred)
print(cm)

[[6641  322]
 [ 258 7087]]


In [53]:
def fake_news_detector(news):
    text = [news]
    vect = tfidf.transform(text)
    pred = model.predict(vect)
    if pred == 0:
        print("Fake News")
    else:
        print("Real News")


In [49]:
x_test.values[0]

"The views expressed herein are the views of the author exclusively and not necessarily the views of VNN, VNN authors, affiliates, advertisers, sponsors, partners, technicians or the Veterans Today Network and its assigns. Notices Posted by VNN on November 4, 2016, With 0 Reads, Filed under Civil Liberties , Corruption , Election 2016 , Foreign Lobbies , Foreign Policy , Government , Legislation , Police State , Politics . You can follow any responses to this entry through the RSS 2.0 . You can leave a response or trackback to this entry FaceBook Comments \nYou must be logged in to post a comment Login WHAT'S HOT"

In [64]:
fake_news_detector("Go to Article President Barack Obama has been campaigning hard for the woman who is supposedly going to extend his legacy four more years. The only problem with stumping for Hillary Clinton, however, is sheâ€™s not exactly a candidate easy to get too enthused about")

Real News


In [56]:
fake_news_detector("""The views expressed herein are the views of the author exclusively and not necessarily the views of VNN, VNN authors, affiliates, advertisers, sponsors, partners, technicians or the Veterans Today Network and its assigns. Notices Posted by VNN on November 4, 2016, With 0 Reads, Filed under Civil Liberties , Corruption , Election 2016 , Foreign Lobbies , Foreign Policy , Government , Legislation , Police State , Politics . You can follow any responses to this entry through the RSS 2.0 . You can leave a response or trackback to this entry FaceBook 
Comments \nYou must be logged in to post a comment Login WHAT'S HOT""")

Real News


In [59]:
import pickle
pickle.dump(model,open("classifier.pkl", "wb"))