In [1]:
#!/usr/bin/env python3

import pandas as pd
import numpy as np
import nltk
import sys
import matplotlib.pyplot as plt
import seaborn as sns
import re
from wordcloud import WordCloud





# Loading fake and real news files

In [2]:
true_df = pd.read_csv(r"dataset/true.csv")
fake_df = pd.read_csv(r"dataset/fake.csv")


Partie Laurent

# Cleaning the datasets

In [3]:
# Adding class Information
true_df["class"] = 1
fake_df["class"] = 0

Since there might be some empty text in those datasets (maybe the news is only made of a video ?)

In [4]:
# First we are getting indexes of empty news 
empty_fake_index = [index for index,text in enumerate(fake_df.text.values) if str(text).strip() == '']

#Checking some rows 
fake_df.iloc[empty_fake_index].tail()

Unnamed: 0,title,text,subject,date,class
21816,BALTIMORE BURNS: MARYLAND GOVERNOR BRINGS IN N...,,left-news,"Apr 27, 2015",0
21826,FULL VIDEO: THE BLOCKBUSTER INVESTIGATION INTO...,,left-news,"Apr 25, 2015",0
21827,(VIDEO) HILLARY CLINTON: RELIGIOUS BELIEFS MUS...,,left-news,"Apr 25, 2015",0
21857,(VIDEO)ICE PROTECTING OBAMA: WON’T RELEASE NAM...,,left-news,"Apr 14, 2015",0
21873,(VIDEO) HYSTERICAL SNL TAKE ON HILLARY’S ANNOU...,,left-news,"Apr 12, 2015",0


To solve this and since it seems kinda pointless to separate them, we are merging the title and the text of the news

In [5]:
fake_df['text'] = fake_df['title'] + " " + fake_df['text']
true_df['text'] = true_df['title'] + " " + true_df['text']

fake_df = fake_df.drop(['title'], axis = 1)
true_df = true_df.drop(['title'], axis = 1)

We think the subject could have been an interesting addition but since they are different across both datasets they might cause an overfitting if the model is able to predict a fake news solely by having the subject be "News" for exemple

In [6]:
fake_df = fake_df.drop(['subject'], axis = 1)
true_df = true_df.drop(['subject'], axis = 1)

There might be some duplicated news so we are dropping them

In [7]:
true_df = true_df.drop_duplicates(keep=False)
fake_df = fake_df.drop_duplicates(keep=False)


Combining both datasets

In [8]:
data = pd.concat([true_df, fake_df], ignore_index=True, sort=False)

data.tail(15)

Unnamed: 0,text,date,class
33793,UPDATE: 40% OF VICTIM’S SKULL IS MISSING…No Ne...,"Apr 11, 2015",0
33794,‘MOTHER OF THE YEAR’ Drives11 And Armed 15 Yr....,"Apr 10, 2015",0
33795,IT’S TIME TO STOP THE LIES! ARE YOU SICK AND T...,"Apr 9, 2015",0
33796,[Video] BURGER KING MANAGER CURSES OUT AND THR...,"Apr 7, 2015",0
33797,SHOCKING: UNIV OF HAWAII RECRUITS GIRLS AS YOU...,"Apr 7, 2015",0
33798,BREAKING: [Video] COLORADO BAKER WHO REFUSED T...,"Apr 7, 2015",0
33799,(VIDEO) PATRIOTS DEMAND REMOVAL OF COMMUNIST F...,"Apr 6, 2015",0
33800,BUSTED: [VIDEO] MAN ATTEMPTS TO TAPE “GOTCHA” ...,"Apr 5, 2015",0
33801,[VIDEO] 16 YR OLD ARRESTED For Violent Gang Be...,"Apr 4, 2015",0
33802,“Non-violence hasn’t worked”…Reverend Sam Most...,"Apr 1, 2015",0


# Preprocessing text
   ## First preprocessing
   

In [9]:
def first_preprocessing(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]','',text) # remove punctuation
    text = re.sub('(\s+)', ' ', text) # removing extra whitespaces
    text = re.sub('\[.*?\]', '', text) # remove square brackets
    text = re.sub('\w*\d\w*', '', text) # remove words containing numbers
    text = re.sub(r'http\S+', '', text) # remove http
    text = re.sub('\n', '', text) # remove new lines
    return text

In [15]:
preprocessing_data_1 = data.copy()
preprocessing_data_1['text'] = preprocessing_data_1.text.apply(lambda text : first_preprocessing(text))
preprocessing_data_1.tail()

Unnamed: 0,text,date,class
33803,the white house and the theatrics of gun contr...,"January 7, 2016",0
33804,activists or terrorists how media controls and...,"January 7, 2016",0
33805,boiler room no surrender no retreat heads will...,"January 6, 2016",0
33806,federal showdown looms in oregon after blm abu...,"January 4, 2016",0
33807,a troubled king chicagos rahm emanuel desperat...,"January 2, 2016",0


Removing stopwords

In [13]:
def remove_stopwords(text):
    stop_words = set(nltk.corpus.stopwords.words('english'))
    tokens = nltk.word_tokenize(text)
    filtered_sentence = []
    for w in tokens:
        if w not in stop_words:
            filtered_sentence.append(w)

    return " ".join(filtered_sentence)


In [16]:
preprocessing_data_1['text'] = preprocessing_data_1.text.apply(lambda text : remove_stopwords(text))
preprocessing_data_1.tail()

Unnamed: 0,text,date,class
33803,white house theatrics gun control century wire...,"January 7, 2016",0
33804,activists terrorists media controls dictates n...,"January 7, 2016",0
33805,boiler room surrender retreat heads roll ep tu...,"January 6, 2016",0
33806,federal showdown looms oregon blm abuse local ...,"January 4, 2016",0
33807,troubled king chicagos rahm emanuel desperate ...,"January 2, 2016",0


Lemmatization

In [17]:
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatization(text):
    tokens = nltk.word_tokenize(text)
    text = [lemmatizer.lemmatize(w) for w in tokens]
    return " ".join(text)

In [18]:
preprocessing_data_1['text'] = preprocessing_data_1.text.apply(lambda text : lemmatization(text))
preprocessing_data_1.tail()

Unnamed: 0,text,date,class
33803,white house theatrics gun control century wire...,"January 7, 2016",0
33804,activist terrorist medium control dictate narr...,"January 7, 2016",0
33805,boiler room surrender retreat head roll ep tun...,"January 6, 2016",0
33806,federal showdown loom oregon blm abuse local r...,"January 4, 2016",0
33807,troubled king chicago rahm emanuel desperate s...,"January 2, 2016",0


Partie Marcel