## Use Kaggle Database of Fake any True News to generate and test ML models.

In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df_fake = pd.read_csv("Fake.csv")
df_true = pd.read_csv("True.csv")

In [3]:
df_fake['label'] = 0 #for fake
df_true['label'] = 1 #for true

In [4]:
df_merge = pd.concat([df_fake, df_true])

In [5]:
df1 = pd.read_csv("fake_or_real_news.csv")

In [6]:
df1 = df1.drop(['Unnamed: 0','title'], axis=1)

In [7]:
df1['label'] = df1['label'].apply(lambda x: 0 if x == 'FAKE' else 1)

In [8]:
df1

Unnamed: 0,text,label
0,"Daniel Greenfield, a Shillman Journalism Fello...",0
1,Google Pinterest Digg Linkedin Reddit Stumbleu...,0
2,U.S. Secretary of State John F. Kerry said Mon...,1
3,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",0
4,It's primary day in New York and front-runners...,1
...,...,...
6330,The State Department told the Republican Natio...,1
6331,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,0
6332,Anti-Trump Protesters Are Tools of the Oligar...,0
6333,"ADDIS ABABA, Ethiopia —President Obama convene...",1


In [9]:
df_merge

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0
...,...,...,...,...,...
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",1
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",1
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",1
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",1


In [10]:
df = df_merge.drop(['title','subject', 'date'], axis=1)

In [11]:
df = pd.concat([df, df1])

In [12]:
def clean_words(text):
    text = re.sub('[^a-zA-Z ]', '', text)
    return text

In [13]:
df['text'] = df['text'].apply(clean_words)

In [14]:
df

Unnamed: 0,text,label
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,On Friday it was revealed that former Milwauke...,0
3,On Christmas day Donald Trump announced that h...,0
4,Pope Francis used his annual Christmas Day mes...,0
...,...,...
6330,The State Department told the Republican Natio...,1
6331,The P in PBS Should Stand for Plutocratic or P...,0
6332,AntiTrump Protesters Are Tools of the Oligarc...,0
6333,ADDIS ABABA Ethiopia President Obama convened ...,1


In [15]:
df.isnull().sum()

text     0
label    0
dtype: int64

In [16]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], train_size=0.25)

In [17]:
vec = TfidfVectorizer()
vec_train = vec.fit_transform(X_train)
vec_test = vec.transform(X_test)

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import pickle

In [19]:
lr = LogisticRegression()
lr.fit(vec_train, y_train)
pred = lr.predict(vec_test)
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.96      0.97      0.96     20043
           1       0.96      0.95      0.96     18382

    accuracy                           0.96     38425
   macro avg       0.96      0.96      0.96     38425
weighted avg       0.96      0.96      0.96     38425



In [20]:
dt = DecisionTreeClassifier()
dt.fit(vec_train, y_train)
pred = dt.predict(vec_test)
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.95      0.96      0.95     20043
           1       0.95      0.94      0.95     18382

    accuracy                           0.95     38425
   macro avg       0.95      0.95      0.95     38425
weighted avg       0.95      0.95      0.95     38425



In [21]:
pickle.dump(lr, open("Final_model.sav", 'wb')) # using Logistic Regression as final model
pickle.dump(vec, open("vectorizer.pk", 'wb'))