In [1]:
import pandas as pd
import numpy as np
import os
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.svm import LinearSVC
from sklearn.metrics import *
import re

In [2]:
fake_news = pd.read_csv(os.path.join("News_dataset", "Fake.csv"))
real_news = pd.read_csv(os.path.join("News_dataset", "True.csv"))
fake_news.shape, real_news.shape

((23481, 4), (21417, 4))

In [3]:
fake_news.isnull().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [4]:
real_news.isnull().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [5]:
fake_news['label'] = 0
real_news['label'] = 1

In [6]:
dataset = pd.concat([fake_news, real_news], axis=0)
dataset.shape

(44898, 5)

In [7]:
dataset.columns

Index(['title', 'text', 'subject', 'date', 'label'], dtype='object')

In [8]:
dataset['text'] = dataset['title'] + ' ' + dataset['text']

In [9]:
dataset

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump Sends Out Embarrassing New Year’...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,Drunk Bragging Trump Staffer Started Russian ...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,Sheriff David Clarke Becomes An Internet Joke...,News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,Trump Is So Obsessed He Even Has Obama’s Name...,News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis Just Called Out Donald Trump Dur...,News,"December 25, 2017",0
...,...,...,...,...,...
21412,'Fully committed' NATO backs new U.S. approach...,'Fully committed' NATO backs new U.S. approach...,worldnews,"August 22, 2017",1
21413,LexisNexis withdrew two products from Chinese ...,LexisNexis withdrew two products from Chinese ...,worldnews,"August 22, 2017",1
21414,Minsk cultural hub becomes haven from authorities,Minsk cultural hub becomes haven from authorit...,worldnews,"August 22, 2017",1
21415,Vatican upbeat on possibility of Pope Francis ...,Vatican upbeat on possibility of Pope Francis ...,worldnews,"August 22, 2017",1


In [10]:
dataset = dataset.drop(['subject', 'date', 'title'], axis = 1)

In [11]:
dataset

Unnamed: 0,text,label
0,Donald Trump Sends Out Embarrassing New Year’...,0
1,Drunk Bragging Trump Staffer Started Russian ...,0
2,Sheriff David Clarke Becomes An Internet Joke...,0
3,Trump Is So Obsessed He Even Has Obama’s Name...,0
4,Pope Francis Just Called Out Donald Trump Dur...,0
...,...,...
21412,'Fully committed' NATO backs new U.S. approach...,1
21413,LexisNexis withdrew two products from Chinese ...,1
21414,Minsk cultural hub becomes haven from authorit...,1
21415,Vatican upbeat on possibility of Pope Francis ...,1


In [12]:
import nltk 
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Ankit
[nltk_data]     Dey\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
# This are the words that are already in there root word and we do not require to convert them.
stopwords = stopwords.words('english') 

In [14]:
# Stemming refers to getting the root word from the actual word.
port_stemmer = PorterStemmer()
import time

count = 0;

def pre_process_text(text):
    global count
    
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()
    text = text.split()
    stemmed_words = [port_stemmer.stem(word) for word in text if word not in stopwords]
    text = ' '.join(stemmed_words)

    print(count, end = ' ')
    count += 1
    
    return text

# pre_process_text("hello 12 ; 3 guys my name is ankit")

In [15]:
dataset['text'] = dataset['text'].apply(pre_process_text)



In [16]:
dataset

Unnamed: 0,text,label
0,donald trump send embarrass new year eve messa...,0
1,drunk brag trump staffer start russian collus ...,0
2,sheriff david clark becom internet joke threat...,0
3,trump obsess even obama name code websit imag ...,0
4,pope franci call donald trump christma speech ...,0
...,...,...
21412,fulli commit nato back new u approach afghanis...,1
21413,lexisnexi withdrew two product chines market l...,1
21414,minsk cultur hub becom author minsk reuter sha...,1
21415,vatican upbeat possibl pope franci visit russi...,1


In [17]:
x_train, x_test, y_train, y_test = train_test_split(dataset['text'], dataset['label'], test_size=0.2, random_state=2)

In [18]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((35918,), (8980,), (35918,), (8980,))

In [19]:
tfd = TfidfVectorizer()
x_train_vectorized = tfd.fit_transform(x_train)
x_test_vectorized = tfd.transform(x_test)

In [20]:
x_train_vectorized.shape, x_test_vectorized.shape

((35918, 81568), (8980, 81568))

In [21]:
model_lsvc = LinearSVC()
model_lsvc.fit(x_train_vectorized, y_train)

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


In [22]:
model_lsvc.score(x_test_vectorized, y_test)

0.9947661469933184

In [23]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

model_lr = LogisticRegression()
model_lr.fit(x_train_vectorized, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [24]:
model_lr.score(x_test_vectorized, y_test)

0.9861915367483296

In [25]:
model_dtc = DecisionTreeClassifier()
model_dtc.fit(x_train_vectorized, y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [26]:
model_dtc.score(x_test_vectorized, y_test)

0.9964365256124722

In [27]:
from sklearn.ensemble import GradientBoostingClassifier

model_gb = GradientBoostingClassifier(random_state=0)
model_gb.fit(x_train_vectorized, y_train)

0,1,2
,loss,'log_loss'
,learning_rate,0.1
,n_estimators,100
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


In [28]:
model_gb.score(x_test_vectorized, y_test)

0.9951002227171493

In [29]:
from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier(random_state=0)
model_rf.fit(x_train_vectorized, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [30]:
model_rf.score(x_test_vectorized, y_test)

0.9946547884187082

In [31]:
from PIL import Image
import pytesseract
import requests

pytesseract.pytesseract.tesseract_cmd = r"Tesseract-OCR\tesseract.exe"

# Load image from a URL.
url = "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQWbQ-e7P8B1cS_96Emg2odA0pHe3lAD1LeNA&s"
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")

extracted_text = pytesseract.image_to_string(image)
print(extracted_text)

It was the best of
times, it was the worst
of times, it was the age
of wisdom, it was the
age of foolishness...



In [32]:
extracted_text = pre_process_text(extracted_text)
extracted_text = tfd.transform([extracted_text])

model_lr.predict(extracted_text)

44898 

array([0])

In [33]:
# Saving the models, after testing and performance evaluation.
import os, pickle

os.mkdir('models')
os.chdir('models')
with open('model_LinearSVC_0.1.0.bin', 'wb') as file:
    pickle.dump(model_lsvc, file)   

with open('model_LogisticRegression_0.1.0.bin', 'wb') as file:
    pickle.dump(model_lr, file)   

with open('model_DecisionTree_0.1.0.bin', 'wb') as file:
    pickle.dump(model_dtc, file)   

with open('model_GradientBoost_0.1.0.bin', 'wb') as file:
    pickle.dump(model_gb, file)   

with open('model_RandomForrest_0.1.0.bin', 'wb') as file:
    pickle.dump(model_rf, file)   

os.chdir('..')

In [34]:
# Loading and Testing our models.
with open('models/model_DecisionTree_0.1.0.bin', 'rb') as file:
    curr_model = pickle.load(file)

    print(curr_model.score(x_test_vectorized, y_test))
    print(classification_report(y_test, curr_model.predict(x_test_vectorized)))

0.9964365256124722
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4704
           1       1.00      1.00      1.00      4276

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980



In [35]:
# Saved the text_vectorizer model, because it's already trained on our whole news text dataset, and is ready to generate a resultant sparse_matrix for
# a new block of  text that is provided to it, keeping the dimension of the sparse_matrix same (None, 81568) which is required by our models. 
with open('models/text_vectorizer_model_0.1.0.bin', 'wb') as file:
    pickle.dump(tfd, file)