In [2]:
# Importing all the important libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import  CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [3]:
# Read the fake-news and true-news tables from their CSV files (fake first,
# then true), binding each result to its own name.
fake_data, true_data = map(pd.read_csv, ("data/fake.csv", "data/true.csv"))

In [5]:
# Report the .shape tuple of each raw frame, one line per frame.
for caption, frame in (("Fake Data Shape: ", fake_data), ("True Data Shape: ", true_data)):
    print(caption, frame.shape)

Fake Data Shape:  (23481, 4)
True Data Shape:  (21417, 4)


In [6]:
# Data Cleaning

# Tag every row of each frame with a constant class label in a new
# "target" column.
for labelled_frame, class_label in ((fake_data, 'fake'), (true_data, 'true')):
    labelled_frame["target"] = class_label

In [8]:
# Concatenation of dataframes: stack the fake rows on top of the true rows
# and renumber the index from 0 so that row labels are not duplicated.
data = pd.concat([fake_data, true_data], ignore_index=True)
print("Concatinated Data Shape: ", data.shape)

Concatinated Data Shape:  (44898, 5)


In [9]:
# Shuffling the data
# The rows are permuted and the index renumbered from 0. A fixed seed makes
# the permutation identical on every Restart & Run All, so the previews and
# any later results derived from row order are reproducible.
from sklearn.utils import shuffle

SHUFFLE_SEED = 42  # arbitrary; any fixed integer gives a repeatable order
data = shuffle(data, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [10]:
# Preview the first five rows of the frame.
data.head(n=5)

Unnamed: 0,title,text,subject,date,target
0,WATCH: BLACK CONSERVATIVE DESTROYS Mexican Fla...,***LANGUAGE WARNING*** Why is it acceptable fo...,left-news,"Jun 4, 2016",fake
1,MUST WATCH: Barack Obama After January 20th…Th...,January 20th can't come soon enough. We need t...,politics,"Dec 21, 2016",fake
2,Ellen Pompeo Just Shut A Racist Twitter Troll...,After Ellen Pompeo viewed yet other shooting o...,News,"July 21, 2016",fake
3,"Sanders may debate Trump, not Clinton, before ...",WASHINGTON (Reuters) - An unconventional debat...,politicsNews,"May 26, 2016",true
4,CA: STATE LEGISLATORS Want Traffic Fines To Be...,"Punishing rich, hard-working, white Americans ...",left-news,"May 19, 2017",fake


In [11]:
# Removing the date column
# drop() is used without inplace=True so `data` is rebound to a new frame,
# and errors="ignore" lets this cell be re-run after the column is already
# gone instead of raising KeyError.
data = data.drop(columns=["date"], errors="ignore")
data.head()

Unnamed: 0,title,text,subject,target
0,WATCH: BLACK CONSERVATIVE DESTROYS Mexican Fla...,***LANGUAGE WARNING*** Why is it acceptable fo...,left-news,fake
1,MUST WATCH: Barack Obama After January 20th…Th...,January 20th can't come soon enough. We need t...,politics,fake
2,Ellen Pompeo Just Shut A Racist Twitter Troll...,After Ellen Pompeo viewed yet other shooting o...,News,fake
3,"Sanders may debate Trump, not Clinton, before ...",WASHINGTON (Reuters) - An unconventional debat...,politicsNews,true
4,CA: STATE LEGISLATORS Want Traffic Fines To Be...,"Punishing rich, hard-working, white Americans ...",left-news,fake


In [12]:
# Removing the title column
# drop() is used without inplace=True so `data` is rebound to a new frame,
# and errors="ignore" lets this cell be re-run after the column is already
# gone instead of raising KeyError.
data = data.drop(columns=["title"], errors="ignore")
data.head()

Unnamed: 0,text,subject,target
0,***LANGUAGE WARNING*** Why is it acceptable fo...,left-news,fake
1,January 20th can't come soon enough. We need t...,politics,fake
2,After Ellen Pompeo viewed yet other shooting o...,News,fake
3,WASHINGTON (Reuters) - An unconventional debat...,politicsNews,true
4,"Punishing rich, hard-working, white Americans ...",left-news,fake


In [13]:
# Converting to lower case
# The vectorised .str accessor replaces the per-row lambda; unlike calling
# x.lower() on each value, it passes missing values through as NaN instead
# of raising AttributeError.
data['text'] = data['text'].str.lower()
data.head()

Unnamed: 0,text,subject,target
0,***language warning*** why is it acceptable fo...,left-news,fake
1,january 20th can't come soon enough. we need t...,politics,fake
2,after ellen pompeo viewed yet other shooting o...,News,fake
3,washington (reuters) - an unconventional debat...,politicsNews,true
4,"punishing rich, hard-working, white americans ...",left-news,fake


In [15]:
# Removing the punctuation

import string
# Translation table mapping every character in string.punctuation to None.
# Built once here so it is not recreated for each document.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def punctuation_removal(text):
    """Return ``text`` with all ASCII punctuation characters deleted.

    Characters listed in ``string.punctuation`` are removed outright, not
    replaced by a space, so "hard-working" becomes "hardworking". Characters
    outside that set (for example curly quotes or "…") are left untouched.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        A copy of ``text`` without ASCII punctuation.
    """
    return text.translate(_PUNCTUATION_TABLE)

# Run punctuation_removal over each value of the text column and store the
# cleaned strings back in place of the originals.
data['text'] = data['text'].map(punctuation_removal)

In [17]:
# Inspect the leading rows to confirm the text column has been cleaned.
data.head(n=5)

Unnamed: 0,text,subject,target
0,language warning why is it acceptable for peop...,left-news,fake
1,january 20th cant come soon enough we need thi...,politics,fake
2,after ellen pompeo viewed yet other shooting o...,News,fake
3,washington reuters an unconventional debate b...,politicsNews,true
4,punishing rich hardworking white americans one...,left-news,fake
