In [1]:
# importing libraries
import pandas as pd
import re
import numpy as np
import spacy
import contractions
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import wordnet
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
#loading datasets
df1 = pd.read_csv('./data/news_a1.csv')
print(df1.shape)
df1.head()

(8620, 7)


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,title,text,subject,date,label
0,4218,15432,NEWSFLASH FOR OUR IMPERIAL PRESIDENT: STATES C...,As Barack Hussein Obama tours around the count...,politics,"Jul 23, 2015",FAKE
1,3828,19686,Factbox: Reactions to speech by Myanmar's Suu ...,NAYPYITAW (Reuters) - Myanmar leader Aung San ...,worldnews,"September 19, 2017",REAL
2,9288,10721,"RADICAL, INTOLERANT Students Held College Admi...",Video from inside Evergreen State College cont...,politics,"Jun 2, 2017",FAKE
3,11577,20031,Japan's Suga: government strongly protests lat...,TOKYO (Reuters) - North Korea fired a ballisti...,worldnews,"September 14, 2017",REAL
4,8869,10577,CNN’s Jim Acosta Goes Bonkers Waving His Hands...,Watch Jim Acosta wave his hands around and hav...,politics,"Jun 19, 2017",FAKE


In [3]:
df2 = pd.read_csv('./data/news_a2.csv')
print(df2.shape)
df2.head()

(8620, 7)


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,title,text,subject,date,label
0,22406,4545,Trump officials defend immigration arrests at ...,LOS ANGELES (Reuters) - Federal agents have ar...,politicsNews,"March 31, 2017",REAL
1,4061,20039,LIBERAL SMACK DOWN OF THE DAY: Watch What Happ...,The Left is not able to get away with shaming ...,left-news,"Sep 2, 2016",FAKE
2,3456,6739,Factbox: Trump fills top jobs for his administ...,(Reuters) - U.S. President-elect Donald Trump ...,politicsNews,"December 13, 2016",REAL
3,4956,18869,"Iraq Kurdish vote may benefit Syrian Kurds, sa...",BEIRUT (Reuters) - The Iraqi Kurdish vote for ...,worldnews,"September 27, 2017",REAL
4,7787,17092,Tillerson to visit Pakistan as well as India: ...,WASHINGTON (Reuters) - U.S. Secretary of State...,worldnews,"October 18, 2017",REAL


In [4]:
df3 = pd.read_csv('./data/news2.csv')
print(df3.shape)
df3.head()

(8980, 6)


Unnamed: 0.1,Unnamed: 0,title,text,subject,date,label
0,22216,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",US_News,"February 13, 2017",FAKE
1,4436,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"April 5, 2017",REAL
2,1526,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,politicsNews,"September 27, 2017",REAL
3,1377,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",News,"May 22, 2017",FAKE
4,8995,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",politicsNews,"June 24, 2016",REAL


Data preparation and cleaning

In [5]:
# joining the datasets 
df = pd.concat([df1, df2, df3])
print(df.shape)
df.head()

(26220, 7)


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,title,text,subject,date,label
0,4218.0,15432,NEWSFLASH FOR OUR IMPERIAL PRESIDENT: STATES C...,As Barack Hussein Obama tours around the count...,politics,"Jul 23, 2015",FAKE
1,3828.0,19686,Factbox: Reactions to speech by Myanmar's Suu ...,NAYPYITAW (Reuters) - Myanmar leader Aung San ...,worldnews,"September 19, 2017",REAL
2,9288.0,10721,"RADICAL, INTOLERANT Students Held College Admi...",Video from inside Evergreen State College cont...,politics,"Jun 2, 2017",FAKE
3,11577.0,20031,Japan's Suga: government strongly protests lat...,TOKYO (Reuters) - North Korea fired a ballisti...,worldnews,"September 14, 2017",REAL
4,8869.0,10577,CNN’s Jim Acosta Goes Bonkers Waving His Hands...,Watch Jim Acosta wave his hands around and hav...,politics,"Jun 19, 2017",FAKE


In [6]:
# checking for missing values
df.isnull().sum()

Unnamed: 0.1    8980
Unnamed: 0         0
title              0
text               0
subject            0
date               0
label              0
dtype: int64

In [7]:
# dropping duplicates
df.drop_duplicates(subset ="title", keep = 'first', inplace = True)
df.shape

(24051, 7)

In [8]:
# checking imbalanced data
df['label'].value_counts()

REAL    12303
FAKE    11748
Name: label, dtype: int64

Text Cleaning

In [9]:
# normalizing the documents
def normalize_document(doc):
    # remove special characters\whitespaces
    pattern = r'[^a-zA-Z0-9\s]'
    doc = re.sub(pattern, '', doc)
    doc = doc.strip()
    doc = contractions.fix(doc)
    doc = doc.lower()
    return doc

In [10]:
# removing stopwords 
nlp = spacy.load('en_core_web_sm')
stopwords = nlp.Defaults.stop_words
def remove_stop(doc):
    #tokenize words
    word_tokens = word_tokenize(doc)
    #removing stopwords
    filtered_tokens = [token for token in word_tokens if token not in stopwords]
    return filtered_tokens

In [11]:
# function for converting tags
def pos_tag_wordnet(tagged_tokens):
    tag_map = {'j': wordnet.ADJ, 'v': wordnet.VERB, 'n': wordnet.NOUN, 'r': wordnet.ADV}
    new_tagged_tokens = [(word, tag_map.get(tag[0].lower(), wordnet.NOUN))
                            for word, tag in tagged_tokens]
    return new_tagged_tokens

In [12]:
# lematizing words
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
def lemmatize(doc):
    #POS tagging
    tagged_tokens = nltk.pos_tag(doc)
    # converting the tags
    wordnet_tokens = pos_tag_wordnet(tagged_tokens)
    #lemmatizing
    filtered_doc = ' '.join(wnl.lemmatize(word, tag) for word, tag in wordnet_tokens)
    return filtered_doc

In [13]:
# cleaning title
df['clean_title'] = df['title'].apply(lambda x: normalize_document(x)) # normalize text
df['clean_title'] = df['clean_title'].apply(lambda x: remove_stop(x)) # remove stopwords
df['clean_title'] = df['clean_title'].apply(lambda x: lemmatize(x)) # lemmatize text
df.head()


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,title,text,subject,date,label,clean_title
0,4218.0,15432,NEWSFLASH FOR OUR IMPERIAL PRESIDENT: STATES C...,As Barack Hussein Obama tours around the count...,politics,"Jul 23, 2015",FAKE,newsflash imperial president state refuse iran...
1,3828.0,19686,Factbox: Reactions to speech by Myanmar's Suu ...,NAYPYITAW (Reuters) - Myanmar leader Aung San ...,worldnews,"September 19, 2017",REAL,factbox reaction speech myanmar suu kyi violen...
2,9288.0,10721,"RADICAL, INTOLERANT Students Held College Admi...",Video from inside Evergreen State College cont...,politics,"Jun 2, 2017",FAKE,radical intolerant student hold college admini...
3,11577.0,20031,Japan's Suga: government strongly protests lat...,TOKYO (Reuters) - North Korea fired a ballisti...,worldnews,"September 14, 2017",REAL,japan suga government strongly protest late n ...
4,8869.0,10577,CNN’s Jim Acosta Goes Bonkers Waving His Hands...,Watch Jim Acosta wave his hands around and hav...,politics,"Jun 19, 2017",FAKE,cnns jim acosta go bonkers wave hand camera se...


In [14]:
# cleaning text
df['clean_text'] = df['text'].apply(lambda x: normalize_document(x)) # normalize text
df['clean_text'] = df['clean_text'].apply(lambda x: remove_stop(x)) # remove stopwords
df['clean_text'] = df['clean_text'].apply(lambda x: lemmatize(x)) # lemmatize text
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,title,text,subject,date,label,clean_title,clean_text
0,4218.0,15432,NEWSFLASH FOR OUR IMPERIAL PRESIDENT: STATES C...,As Barack Hussein Obama tours around the count...,politics,"Jul 23, 2015",FAKE,newsflash imperial president state refuse iran...,barack hussein obama tour country try convince...
1,3828.0,19686,Factbox: Reactions to speech by Myanmar's Suu ...,NAYPYITAW (Reuters) - Myanmar leader Aung San ...,worldnews,"September 19, 2017",REAL,factbox reaction speech myanmar suu kyi violen...,naypyitaw reuters myanmar leader aung san suu ...
2,9288.0,10721,"RADICAL, INTOLERANT Students Held College Admi...",Video from inside Evergreen State College cont...,politics,"Jun 2, 2017",FAKE,radical intolerant student hold college admini...,video inside evergreen state college continue ...
3,11577.0,20031,Japan's Suga: government strongly protests lat...,TOKYO (Reuters) - North Korea fired a ballisti...,worldnews,"September 14, 2017",REAL,japan suga government strongly protest late n ...,tokyo reuters north korea fire ballistic missi...
4,8869.0,10577,CNN’s Jim Acosta Goes Bonkers Waving His Hands...,Watch Jim Acosta wave his hands around and hav...,politics,"Jun 19, 2017",FAKE,cnns jim acosta go bonkers wave hand camera se...,watch jim acosta wave hand temper tantrum air ...


In [15]:
# turning the label to int
df['label'] = (df.label == 'FAKE').astype(int)
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,title,text,subject,date,label,clean_title,clean_text
0,4218.0,15432,NEWSFLASH FOR OUR IMPERIAL PRESIDENT: STATES C...,As Barack Hussein Obama tours around the count...,politics,"Jul 23, 2015",1,newsflash imperial president state refuse iran...,barack hussein obama tour country try convince...
1,3828.0,19686,Factbox: Reactions to speech by Myanmar's Suu ...,NAYPYITAW (Reuters) - Myanmar leader Aung San ...,worldnews,"September 19, 2017",0,factbox reaction speech myanmar suu kyi violen...,naypyitaw reuters myanmar leader aung san suu ...
2,9288.0,10721,"RADICAL, INTOLERANT Students Held College Admi...",Video from inside Evergreen State College cont...,politics,"Jun 2, 2017",1,radical intolerant student hold college admini...,video inside evergreen state college continue ...
3,11577.0,20031,Japan's Suga: government strongly protests lat...,TOKYO (Reuters) - North Korea fired a ballisti...,worldnews,"September 14, 2017",0,japan suga government strongly protest late n ...,tokyo reuters north korea fire ballistic missi...
4,8869.0,10577,CNN’s Jim Acosta Goes Bonkers Waving His Hands...,Watch Jim Acosta wave his hands around and hav...,politics,"Jun 19, 2017",1,cnns jim acosta go bonkers wave hand camera se...,watch jim acosta wave hand temper tantrum air ...


Splitting data into train and validation set

In [16]:
#split to get test set
df_train, df_val = train_test_split(df, test_size=0.2, random_state=1)

In [17]:
#creating the label
y_train = df_train.label.values
y_val = df_val.label.values

In [18]:
print(df_train.shape), print(y_train.shape), print(df_val.shape), print(y_val.shape)

(19240, 9)
(19240,)
(4811, 9)
(4811,)


(None, None, None, None)

Encoding text data & feature engineering

In [19]:
#creating a function for encoding text data
def transform_text(data):
    #transforming with count vectorizer
    vectorizer = CountVectorizer()
    vectorizer.fit(df_train['clean_text'])
    cv = vectorizer.transform(data.clean_text)
    return cv

In [20]:
#transforming train data
train_data = transform_text(df_train)
print(train_data.shape)

(19240, 140861)


In [21]:
#transforming validation data
val_data = transform_text(df_val)
print(val_data.shape)

(4811, 140861)


Model training and testing

1. Logistic Regression

In [22]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter=500, solver='lbfgs')
lr.fit(train_data, y_train) #fitting  model

In [23]:
#predicting 
y_pred = lr.predict(val_data)#predicting validation values
train_pred = lr.predict(train_data)#predicting train values

In [24]:
# evaluation
print('train results')
print(classification_report(y_train,train_pred))
print(f'accuracy - {accuracy_score(y_train,train_pred)}')
print(' ')
print('Validation results')
print(classification_report(y_val,y_pred))
print(f'accuracy - {accuracy_score(y_val,y_pred)}')

train results
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9872
           1       1.00      1.00      1.00      9368

    accuracy                           1.00     19240
   macro avg       1.00      1.00      1.00     19240
weighted avg       1.00      1.00      1.00     19240

accuracy - 1.0
 
Validation results
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      2431
           1       1.00      0.99      0.99      2380

    accuracy                           0.99      4811
   macro avg       0.99      0.99      0.99      4811
weighted avg       0.99      0.99      0.99      4811

accuracy - 0.9943878611515278


2. Decision Trees

In [25]:
from sklearn.tree import DecisionTreeClassifier
#training
dt = DecisionTreeClassifier(min_samples_leaf=1)
dt.fit(train_data, y_train)

In [26]:
# predicting
y_pred = dt.predict(val_data)
train_pred = dt.predict(train_data)

In [27]:
# evaluation
print('train results')
print(classification_report(y_train,train_pred))
print(f'accuracy - {accuracy_score(y_train,train_pred)}')
print(' ')
print('Validation results')
print(classification_report(y_val,y_pred))
print(f'accuracy - {accuracy_score(y_val,y_pred)}')

train results
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9872
           1       1.00      1.00      1.00      9368

    accuracy                           1.00     19240
   macro avg       1.00      1.00      1.00     19240
weighted avg       1.00      1.00      1.00     19240

accuracy - 1.0
 
Validation results
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      2431
           1       1.00      0.99      0.99      2380

    accuracy                           0.99      4811
   macro avg       0.99      0.99      0.99      4811
weighted avg       0.99      0.99      0.99      4811

accuracy - 0.9941800041571399


3. Random Forest

In [28]:
# training
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=50, max_depth=1500, random_state=1)
rf.fit(train_data, y_train)

In [29]:
# predicting
y_pred = rf.predict(val_data)
train_pred = rf.predict(train_data)

In [30]:
# evaluation
print('train results')
print(classification_report(y_train,train_pred))
print(f'accuracy - {accuracy_score(y_train,train_pred)}')
print(' ')
print('Validation results')
print(classification_report(y_val,y_pred))
print(f'accuracy - {accuracy_score(y_val,y_pred)}')

train results
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9872
           1       1.00      1.00      1.00      9368

    accuracy                           1.00     19240
   macro avg       1.00      1.00      1.00     19240
weighted avg       1.00      1.00      1.00     19240

accuracy - 1.0
 
Validation results
              precision    recall  f1-score   support

           0       0.95      0.99      0.97      2431
           1       0.99      0.95      0.97      2380

    accuracy                           0.97      4811
   macro avg       0.97      0.97      0.97      4811
weighted avg       0.97      0.97      0.97      4811

accuracy - 0.9711078777800873
