In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
import pickle

In [2]:
# Load the labelled articles. The article body ('text') is the model input
# and the 'label' column (FAKE / REAL) is the prediction target.
dataset = pd.read_csv('Dataset.csv')
x, y = dataset['text'], dataset['label']

In [3]:
# Preview the first rows to sanity-check the columns that were loaded.
dataset.head()

Unnamed: 0,title,text,label
0,"One More Nazi Resigned From Trumpâ€™s White, ...",Just days after Donald Trump advisor and avowe...,FAKE
1,Trans People Stranded and Alone in Ukraine Fol...,Trans people in Ukraine tell VICE World News ...,REAL
2,The Russia Investigationâ€™s Special Counsel ...,"First, he impaneled a grand jury. Then the gra...",FAKE
3,Trump Wants The Supreme Court To Punish Lawye...,This is yet another egregious abuse of power.I...,FAKE
4,Country Guitarist Who Survived Vegas Shooting...,"Caleb Keeter, a lifelong proponent of the Seco...",FAKE


In [4]:
# (number of rows, number of columns) of the loaded frame.
dataset.shape

(1569, 3)

In [5]:
# Per-column flag: True when the column holds at least one missing value.
dataset.isnull().any()

title    False
text     False
label    False
dtype: bool

In [6]:
# Hold out 20% of the articles for evaluation.
# A fixed random_state makes the split -- and every metric computed from it in
# the cells below -- repeatable across Restart & Run All. Without it each run
# scores the models on a different test set, so results cannot be compared.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

In [7]:
# Learn the TF-IDF vocabulary and IDF weights from the training texts only,
# then reuse that fitted vectorizer on the test texts so no test-set
# statistics leak into the features.
# max_df=0.7 ignores terms that occur in more than 70% of the documents.
tfidf_vectorizer = TfidfVectorizer(max_df=0.7, stop_words='english')
tfidf_train = tfidf_vectorizer.fit_transform(x_train)
tfidf_test = tfidf_vectorizer.transform(x_test)

In [8]:
# Passive-Aggressive linear classifier trained on the TF-IDF features.
# The estimator shuffles the training data between epochs by default, so a
# fixed random_state is required for the fitted model (and its accuracy) to be
# repeatable; 0 matches the seed used for the random forest in this notebook.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=0)
pac.fit(tfidf_train, y_train)

# Accuracy on the held-out split, reported as a percentage.
y_pred = pac.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(score*100, 2)}%')
# NOTE(review): a perfect or near-perfect held-out score is unusual for text
# classification -- check for duplicated articles across train/test or a
# label-revealing artefact in the text before trusting it. TODO confirm.

Accuracy: 100.0%


In [9]:
# Logistic regression baseline on the TF-IDF features. fit() returns the
# estimator itself, so it can be bound directly; the bare name on the last
# line keeps the fitted estimator as the cell's displayed output.
LR = LogisticRegression().fit(tfidf_train, y_train)
LR

LogisticRegression()

In [10]:
# Predicted labels for the held-out articles (logistic regression).
pred_lr = LR.predict(X=tfidf_test)

In [11]:
# Held-out accuracy of the logistic regression model, computed from the
# predictions already stored in pred_lr (same value LR.score would return).
accuracy_score(y_test, pred_lr)

1.0

In [12]:
# Per-class precision / recall / F1 for the logistic regression predictions.
print(classification_report(y_true=y_test, y_pred=pred_lr))

              precision    recall  f1-score   support

        FAKE       1.00      1.00      1.00       141
        REAL       1.00      1.00      1.00       173

    accuracy                           1.00       314
   macro avg       1.00      1.00      1.00       314
weighted avg       1.00      1.00      1.00       314



In [13]:
# Decision tree baseline on the TF-IDF features.
# Candidate features are randomly permuted at each split, so ties between
# equally good splits are broken differently from run to run unless
# random_state is fixed; 0 matches the seed used for the random forest in
# this notebook.
DT = DecisionTreeClassifier(random_state=0)
DT.fit(tfidf_train, y_train)

DecisionTreeClassifier()

In [14]:
# Predicted labels for the held-out articles (decision tree).
pred_dt = DT.predict(X=tfidf_test)

In [15]:
# Held-out accuracy of the decision tree, computed from the predictions
# already stored in pred_dt (same value DT.score would return).
accuracy_score(y_test, pred_dt)

0.9872611464968153

In [16]:
# Per-class precision / recall / F1 for the decision tree predictions.
print(classification_report(y_true=y_test, y_pred=pred_dt))

              precision    recall  f1-score   support

        FAKE       0.99      0.98      0.99       141
        REAL       0.98      0.99      0.99       173

    accuracy                           0.99       314
   macro avg       0.99      0.99      0.99       314
weighted avg       0.99      0.99      0.99       314



In [17]:
# Random forest on the TF-IDF features, seeded so the ensemble is repeatable.
# fit() returns the estimator itself; the bare name on the last line keeps
# the fitted estimator as the cell's displayed output.
RFC = RandomForestClassifier(random_state=0).fit(tfidf_train, y_train)
RFC

RandomForestClassifier(random_state=0)

In [18]:
# Predicted labels for the held-out articles (random forest).
pred_rfc = RFC.predict(X=tfidf_test)

In [19]:
# Held-out accuracy of the random forest, computed from the predictions
# already stored in pred_rfc (same value RFC.score would return).
accuracy_score(y_test, pred_rfc)

1.0

In [20]:
# End-to-end text classifier: raw article strings go in, labels come out.
# Bundling the vectorizer with the Naive Bayes model means a single object
# carries both the vocabulary and the classifier.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('nbmodel', MultinomialNB()),
])

In [21]:
# Fit the vectorizer and the Naive Bayes model together on the raw training
# texts (the pipeline does its own TF-IDF transform, so x_train is passed
# as plain strings rather than the precomputed tfidf_train matrix).
pipeline.fit(x_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer(stop_words='english')),
                ('nbmodel', MultinomialNB())])

In [22]:
# Mean accuracy of the fitted pipeline on the held-out raw texts.
score = pipeline.score(X=x_test, y=y_test)
print('accuracy', score)

accuracy 0.9585987261146497


In [23]:
# Predicted labels for the held-out articles (TF-IDF + Naive Bayes pipeline).
pred = pipeline.predict(X=x_test)

In [24]:
# Per-class precision / recall / F1 for the pipeline's predictions.
print(classification_report(y_true=y_test, y_pred=pred))

              precision    recall  f1-score   support

        FAKE       0.92      1.00      0.96       141
        REAL       1.00      0.92      0.96       173

    accuracy                           0.96       314
   macro avg       0.96      0.96      0.96       314
weighted avg       0.96      0.96      0.96       314



In [25]:
# Confusion matrix for the pipeline: rows are the true labels and columns the
# predicted labels, in sorted label order.
print(confusion_matrix(y_true=y_test, y_pred=pred))

[[141   0]
 [ 13 160]]


In [26]:
# Persist the fitted pipeline (vectorizer + classifier) as a single artefact.
# Only unpickle this file from a trusted source: loading a pickle can run
# arbitrary code.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(pipeline, model_file, protocol=pickle.HIGHEST_PROTOCOL)