In [1]:
import pandas as pd
import numpy as np
import re
import string
from scipy.sparse import hstack
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer

**Loading the Dataset - True and False**

In [2]:
data_true=pd.read_csv("True.csv")
data_fake=pd.read_csv("Fake.csv")

**Understanding the Dataset using pandas operations - head, isnull, info**

In [3]:
data_true.head(10)

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"
5,"White House, Congress prepare for talks on spe...","WEST PALM BEACH, Fla./WASHINGTON (Reuters) - T...",politicsNews,"December 29, 2017"
6,"Trump says Russia probe will be fair, but time...","WEST PALM BEACH, Fla (Reuters) - President Don...",politicsNews,"December 29, 2017"
7,Factbox: Trump on Twitter (Dec 29) - Approval ...,The following statements were posted to the ve...,politicsNews,"December 29, 2017"
8,Trump on Twitter (Dec 28) - Global Warming,The following statements were posted to the ve...,politicsNews,"December 29, 2017"
9,Alabama official to certify Senator-elect Jone...,WASHINGTON (Reuters) - Alabama Secretary of St...,politicsNews,"December 28, 2017"


In [4]:
data_fake.head(10)

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"
5,Racist Alabama Cops Brutalize Black Boy While...,The number of cases of cops brutalizing and ki...,News,"December 25, 2017"
6,"Fresh Off The Golf Course, Trump Lashes Out A...",Donald Trump spent a good portion of his day a...,News,"December 23, 2017"
7,Trump Said Some INSANELY Racist Stuff Inside ...,In the wake of yet another court decision that...,News,"December 23, 2017"
8,Former CIA Director Slams Trump Over UN Bully...,Many people have raised the alarm regarding th...,News,"December 22, 2017"
9,WATCH: Brand-New Pro-Trump Ad Features So Muc...,Just when you might have thought we d get a br...,News,"December 21, 2017"


In [5]:
print("true : ",data_true.shape)
print("fake : ",data_fake.shape)

true :  (21417, 4)
fake :  (23481, 4)


In [6]:
data_true.isnull().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [7]:
data_fake.isnull().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [8]:
data_true.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    21417 non-null  object
 1   text     21417 non-null  object
 2   subject  21417 non-null  object
 3   date     21417 non-null  object
dtypes: object(4)
memory usage: 669.4+ KB


**Setting the table of true news to be 1 and Fake news to be 0 and concatinating the two datasets**

In [9]:
data_true['type']=1

In [10]:
data_fake['type']=0

In [11]:
data=pd.concat([data_true,data_fake],axis=0)

In [12]:
data

Unnamed: 0,title,text,subject,date,type
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1
...,...,...,...,...,...
23476,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016",0
23477,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016",0
23478,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016",0
23479,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016",0


**Shuffling the Dataset**

In [13]:
data1,data2=train_test_split(data, test_size=0.1, stratify=data['type'], random_state=42)

In [14]:
data1.shape


(40408, 5)

In [15]:
data2.shape

(4490, 5)

In [16]:
data=pd.concat([data1,data2],axis=0)

In [17]:
data


Unnamed: 0,title,text,subject,date,type
4625,U.S. Senate backs Montenegro's membership in NATO,WASHINGTON (Reuters) - The U.S. Senate on Tues...,politicsNews,"March 28, 2017",1
11028,Clinton's 'girl power' push wins over women in...,"DES MOINES, Iowa (Reuters) - If Hillary Clinto...",politicsNews,"January 31, 2016",1
12727,CATHOLIC BISHOP OUTRAGED Over Hillary’s ANTI-C...,Hillary s been using churches across America a...,politics,"Oct 15, 2016",0
14965,"About 30 killed when train derails, catches fi...",KINSHASA (Reuters) - About 30 people were kill...,worldnews,"November 13, 2017",1
483,The UNBELIEVABLE Reason Trump Pardoned Arpaio...,If Donald Trump hadn t proven himself to be th...,News,"August 28, 2017",0
...,...,...,...,...,...
13850,Merkel points to grand coalition with Social D...,BERLIN (Reuters) - Chancellor Angela Merkel on...,worldnews,"November 25, 2017",1
2967,Pres. Obama Just Saved Chelsea Manning On His...,Presidential commutations and pardons are not ...,News,"January 17, 2017",0
16244,BREAKING: HOUSE INTEL LAUNCHES Investigation I...,House Intelligence Committee Chairman Devin Nu...,Government News,"Feb 15, 2017",0
2612,WATCH: Colbert Asks Tapper How CNN Staff Feel...,Donald Trump is no fan of CNN. He has repeated...,News,"February 9, 2017",0


**Checking for class imbalance**

In [18]:
data['type'].value_counts()

type
0    23481
1    21417
Name: count, dtype: int64

**Dropping less significant columns - subject, date**

In [19]:
data.drop(['subject', 'date'], inplace=True, axis=1)

**Using python re function to remove unwanted structures from the text**

In [20]:
def transform_text(text):
    text=text.lower()
    text=re.sub(r'https?://\S+|www\./S+','',text)
    text=re.sub(r'\[.*?\]','',text)
    text=re.sub(r'<.*?>','',text)
    text=re.sub(r'\\W'," ",text)
    text=re.sub(r'\w*\d\w*','',text)
    text=re.sub(r'[%s]' % re.escape(string.punctuation),'',text)
    return text
    

In [21]:
data['text']=data['text'].apply(transform_text)
data['title']=data['title'].apply(transform_text)

**Converting text to vectors using TF-IDF vectorization**

In [22]:
vectorizer=TfidfVectorizer()
title=vectorizer.fit_transform(data['title'])
text=vectorizer.fit_transform(data['text'])


In [23]:
combined_features=hstack([title,text])

**Initializing models - Logistic Regression, LDA, KNeighborsClassifier, DecisionTreeClassifier, GaussianNB**

In [24]:
models=[]
models.append(('LR', LogisticRegression()))
models.append(('LDR', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('DTree', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
               

**Performing dimensionality reduction using TurncatedSVD**

In [25]:
from sklearn.decomposition import TruncatedSVD
pca=TruncatedSVD(n_components=100)
dat=pca.fit_transform(combined_features)

**Using stratified cross validation to run the models and the accuracy is calculated**

In [26]:
names=[]
results=[]
for name,model in models:
    skf=StratifiedKFold(n_splits=10)
    score=cross_val_score(model, dat, data['type'], cv=skf, scoring='accuracy')
    results.append(score)
    names.append(name)
    msg="%s : %f" % (name, np.mean(score))
    print(msg)

LR : 0.979420
LDR : 0.977660
KNN : 0.939463
DTree : 0.944496
NB : 0.773575
