In [1]:
import pandas as pd
import numpy as np


In [2]:
true = pd.read_csv('../data/True.csv')
fake = pd.read_csv('../data/Fake.csv')

In [3]:
true.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [4]:
fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [5]:
true.shape, fake.shape

((21417, 4), (23481, 4))

In [6]:
true['label'] = 1
fake['label'] = 0

In [7]:
true.head()

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


In [8]:
news = pd.concat([true, fake], axis=0)  

In [9]:
#combining text_processed and title for creating full news article with headline
news['final_news'] = news['title'] + " " + news['text']
news.head(2)

Unnamed: 0,title,text,subject,date,label,final_news
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1,"As U.S. budget fight looms, Republicans flip t..."
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1,U.S. military to accept transgender recruits o...


In [10]:
news.isnull().sum()

title         0
text          0
subject       0
date          0
label         0
final_news    0
dtype: int64

In [11]:
news = news.drop(['title', 'subject', 'date','text'], axis=1)

In [12]:
news.head()

Unnamed: 0,label,final_news
0,1,"As U.S. budget fight looms, Republicans flip t..."
1,1,U.S. military to accept transgender recruits o...
2,1,Senior U.S. Republican senator: 'Let Mr. Muell...
3,1,FBI Russia probe helped by Australian diplomat...
4,1,Trump wants Postal Service to charge 'much mor...


In [13]:
news = news.sample(frac=1)  #Resuffling the data

In [14]:
news.head()

Unnamed: 0,label,final_news
18048,1,"Ghana fuel site blast kills at least 7, injure..."
14454,1,Egyptian activists detained by court for prote...
18383,0,SMUG CNN ANCHORS Say They Won’t Release Identi...
12626,0,WOW! Hillary’s VP Pick Tim Kaine Gets Only 30 ...
17884,0,WHOA! DID OPRAH JUST THROW Her Hat In The Ring...


In [15]:
news.reset_index(inplace=True)

In [16]:
news.drop(['index'], axis=1, inplace=True)

In [17]:
news.head()

Unnamed: 0,label,final_news
0,1,"Ghana fuel site blast kills at least 7, injure..."
1,1,Egyptian activists detained by court for prote...
2,0,SMUG CNN ANCHORS Say They Won’t Release Identi...
3,0,WOW! Hillary’s VP Pick Tim Kaine Gets Only 30 ...
4,0,WHOA! DID OPRAH JUST THROW Her Hat In The Ring...


In [18]:
news.rename(columns={'final_news': 'text'}, inplace=True)

In [19]:
import re

def wordopt(text):
    #convert into lowercase
    text = text.lower()

    #remove URLs
    text = re.sub(r'https?://\S+|www\.\S+','', text)

    #remove HTML tags
    text = re.sub(r'<.*?>','',text)

    #remove punctuation
    text = re.sub(r'[^\w\s]','',text)

    #remove digits
    text = re.sub(r'\d','',text)

    #remove newline characters
    text = re.sub(r'\n','',text)

    return text

In [20]:
news['text'] = news['text'].apply(wordopt) 

In [21]:
news['text']

0        ghana fuel site blast kills at least  injures ...
1        egyptian activists detained by court for prote...
2        smug cnn anchors say they wont release identit...
3        wow hillarys vp pick tim kaine gets only  peop...
4        whoa did oprah just throw her hat in the ring ...
                               ...                        
44893    you wont believe this watch donna brazile defe...
44894    turkeys erdogan says will not succumb to us bl...
44895    several wounded after blast hits bus in turkey...
44896    trump clinton blast each other on character cl...
44897     russia probes new leader disqualified himself...
Name: text, Length: 44898, dtype: object

In [22]:
X = news['text']
y = news['label']

In [23]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [24]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((31428,), (13470,), (31428,), (13470,))

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

xv_train = vectorizer.fit_transform(X_train)
xv_test = vectorizer.transform(X_test)

In [26]:
import pickle

# Save vectorizer
with open("../model/vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)


In [27]:
xv_train

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 6570608 stored elements and shape (31428, 178782)>

In [28]:
xv_train.shape, xv_test.shape

((31428, 178782), (13470, 178782))

In [29]:
from sklearn.linear_model import LogisticRegression

model_logistic = LogisticRegression()
model_logistic.fit(xv_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [30]:
y_pred_logistic = model_logistic.predict(xv_test)

In [31]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

report = classification_report(y_test, y_pred_logistic)
print(report)

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      7018
           1       0.99      0.99      0.99      6452

    accuracy                           0.99     13470
   macro avg       0.99      0.99      0.99     13470
weighted avg       0.99      0.99      0.99     13470



In [32]:
import joblib

# Save model
joblib.dump(model_logistic, "../model/logistic_regression_model.pkl")

['../model/logistic_regression_model.pkl']

In [33]:
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier()
dtc.fit(xv_train, y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [34]:
report_dtc = classification_report(y_test, dtc.predict(xv_test))
print(report_dtc)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7018
           1       0.99      1.00      1.00      6452

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



In [35]:
import joblib

# Save model
joblib.dump(dtc, "../model/decision_tree_classifier_model.pkl")

['../model/decision_tree_classifier_model.pkl']

In [36]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(xv_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [37]:
report_rfc = classification_report(y_test, rfc.predict(xv_test))
print(report_rfc)

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      7018
           1       0.99      0.99      0.99      6452

    accuracy                           0.99     13470
   macro avg       0.99      0.99      0.99     13470
weighted avg       0.99      0.99      0.99     13470



In [39]:
def output_lable(n):
    if n == 0:
        return "Fake News"
    elif n == 1:
        return "Real News"

In [40]:
def manual_testing(news):
    testing_news = {"text":[news]}
    new_def_test = pd.DataFrame(testing_news)
    new_def_test['text'] = new_def_test['text'].apply(wordopt)
    new_x_test = new_def_test['text']
    new_xv_test = vectorizer.transform(new_x_test)

    pred_LR = model_logistic.predict(new_xv_test)

    pred_DTC = dtc.predict(new_xv_test)

    pred_RFC = rfc.predict(new_xv_test)

    return print("\n\nLogistic Regression Prediction: {} \nDecision Tree Classifier Prediction: {} \nRandom Forest Classifier Prediction: {} ".format(output_lable(pred_LR[0]), output_lable(pred_DTC[0]), output_lable(pred_RFC[0])))

In [41]:
news_article = str(input())

In [42]:
manual_testing(news_article)



Logistic Regression Prediction: Fake News 
Decision Tree Classifier Prediction: Fake News 
Random Forest Classifier Prediction: Fake News 
