Processing all datasets requires a lot of time, that's why we will work with samples of datasets 

In [1]:
import pandas as pd
fake_news = pd.read_csv("../Datasets/Fake.csv")
fake_news = fake_news.sample(1000)
true_news = pd.read_csv("../Datasets/True.csv")
true_news = true_news.sample(1000)

# Corpus tokenization, stemming and stop words removal

Divide the different texts into individual words. Remove which are common words (a, the, not, etc) that bring close to no contribution to the semantic meaning of a text. Stemming reduces a word to it’s root.

In [2]:
import numpy as np
from nltk.corpus import words, stopwords 
from nltk import pos_tag
stop_words = set(stopwords.words('english')) 
english_words = set(words.words())

from nltk.stem import PorterStemmer
porter = PorterStemmer()


import re


def filter_text(text):
    text = re.sub('@[^\s]+','', text)
    text = re.split("\W+", text.lower())
    text = [word for word in text if word]
    filtered_text = []
    for word, pos in pos_tag(text):
        if len(word) > 1: #skip random letters
            if word not in stop_words and word in english_words:
                if pos.startswith('NN'): 
                    #word_root = porter.stem(word)
                    filtered_text.append(word)
    return filtered_text

def pre_proc_dataset(news_set):
    token_sets = []
    for index, article_text in news_set['text'].items():         
        tokenize_text = filter_text(article_text)
        token_sets.append(" ".join(tokenize_text))
    news_set["tokenize_text"] = token_sets
    return news_set


fake_news = pre_proc_dataset(fake_news)
for i, article in fake_news['tokenize_text'].items():
    if article is np.nan or article.isspace() or not article:
        fake_news.drop(i, axis=0, inplace=True)
fake_news.to_csv("../Datasets/FakeTransformed.csv", mode="w")

true_news = pre_proc_dataset(true_news)
for i, article in true_news['tokenize_text'].items():
    if article is np.nan or article.isspace() or not article:
        true_news.drop(i, axis=0, inplace=True)
true_news.to_csv("../Datasets/TrueTransformed.csv", mode="w")

In [3]:
true_news

Unnamed: 0,title,text,subject,date,tokenize_text
5230,Ex-President Bush says hopeful despite 'pretty...,(Reuters) - Former U.S. president George W. Bu...,politicsNews,"February 28, 2017",president bush climate trump presidency optimi...
5012,Trump's revised travel ban dealt first court s...,(Reuters) - A federal judge in Wisconsin dealt...,politicsNews,"March 11, 2017",judge blow president ban enforcement policy en...
7126,"Trump outlines plans for first day in office, ...",NEW YORK/WASHINGTON (Reuters) - U.S. President...,politicsNews,"November 21, 2016",york president trump day office trade accord w...
6470,Trump's EPA pick resigns from Rule of Law Defe...,(Reuters) - U.S. President-elect Donald Trump’...,politicsNews,"January 7, 2017",president trump pick protection agency chairma...
13529,Bosnian Croat war crimes convict dies after ta...,THE HAGUE (Reuters) - A former Bosnian Croat m...,worldnews,"November 29, 2017",commander poison war courtroom appeal year pri...
...,...,...,...,...,...
7854,Clinton opposition to Asia trade pact 'close c...,NEW YORK (Reuters) - Hillary Clinton’s campaig...,politicsNews,"October 11, 2016",york campaign balance candidate trade pact pre...
14296,Ukraine expels Belarus diplomat over Minsk spy...,KIEV (Reuters) - Ukraine said on Tuesday it ha...,worldnews,"November 21, 2017",diplomat tit move defense ministry spy ring st...
17303,U.S. condemns Venezuelan elections as neither ...,WASHINGTON (Reuters) - The United States on Mo...,worldnews,"October 16, 2017",weekend power democracy oil nation lack yester...
21155,Exclusive: Displaced Rohingya in camps face ai...,"SITTWE, Myanmar (Reuters) - Around 120,000 dis...",worldnews,"September 1, 2017",people state food aid government people north ...


In [4]:
fake_news

Unnamed: 0,title,text,subject,date,tokenize_text
10183,"WATCH: CNN DOES SEGMENT From Bunker In Hawaii,...",CNN host Erin Burnett started out her segment ...,politics,"Aug 10, 2017",host segment threat attack audience danger nat...
4168,Trump FURIOUS After New Poll DEVASTATES His C...,"NBC News third post-leaked audio, post-second ...",News,"October 16, 2016",news post post debate poll something lead trum...
5946,Elizabeth Warren Just Proved Why She’s Trump’...,If there s anyone who has clearly had enough o...,News,"June 9, 2016",anyone enough trump bullying senator warren ti...
11543,MICHIGAN City With First MUSLIM-MAJORITY City ...,"It s good to know that illegal aliens, potenti...",politics,"Feb 28, 2017",place law enforcement mi resident mi community...
8865,Racist Moron Gets Fired For This 20 Second Vi...,It s not often that the stupidity of a racist ...,News,"January 9, 2016",stupidity racist slur minority disgusting cras...
...,...,...,...,...,...
15911,FORMER FBI ASST DIRECTOR LETS IT RIP! Comey’s ...,James Kallstrom is the former Assistant Direct...,Government News,"Nov 9, 2017",assistant director fan director times intellig...
19785,ELECTION WHISTLEBLOWER: DOJ In Cahoots With De...,J. Christian Adams: Dead people are voting and...,left-news,"Oct 18, 2016",people something administration anything voter...
15379,THE FIX IS IN: JUDGE QUICKLY BLOCKS ABORTION V...,"Judge William H. Orrick, III joins Obama in hi...",politics,"Aug 1, 2015",judge desire parenthood abortion order release...
20445,SCREAMING LEFTISTS Interrupt Trump Speech…Crow...,Screaming leftists interrupted Donald Trump s ...,left-news,"Jun 10, 2016",trump speech today faith freedom conference tr...


## Concat into one dataset

Concatination fake and true news into one dataset with an additional column "Fake/True"

In [5]:
fake_news=pd.read_csv("../Datasets/FakeTransformed.csv")
true_news=pd.read_csv("../Datasets/TrueTransformed.csv")
fake_news['fake/true'] = 'fake'
true_news['fake/true'] = 'true'
all_news = pd.concat([true_news, fake_news])
all_news.to_csv("../Datasets/AllNews.csv", mode="w")
all_news

Unnamed: 0.1,Unnamed: 0,title,text,subject,date,tokenize_text,fake/true
0,5230,Ex-President Bush says hopeful despite 'pretty...,(Reuters) - Former U.S. president George W. Bu...,politicsNews,"February 28, 2017",president bush climate trump presidency optimi...,true
1,5012,Trump's revised travel ban dealt first court s...,(Reuters) - A federal judge in Wisconsin dealt...,politicsNews,"March 11, 2017",judge blow president ban enforcement policy en...,true
2,7126,"Trump outlines plans for first day in office, ...",NEW YORK/WASHINGTON (Reuters) - U.S. President...,politicsNews,"November 21, 2016",york president trump day office trade accord w...,true
3,6470,Trump's EPA pick resigns from Rule of Law Defe...,(Reuters) - U.S. President-elect Donald Trump’...,politicsNews,"January 7, 2017",president trump pick protection agency chairma...,true
4,13529,Bosnian Croat war crimes convict dies after ta...,THE HAGUE (Reuters) - A former Bosnian Croat m...,worldnews,"November 29, 2017",commander poison war courtroom appeal year pri...,true
...,...,...,...,...,...,...,...
962,15911,FORMER FBI ASST DIRECTOR LETS IT RIP! Comey’s ...,James Kallstrom is the former Assistant Direct...,Government News,"Nov 9, 2017",assistant director fan director times intellig...,fake
963,19785,ELECTION WHISTLEBLOWER: DOJ In Cahoots With De...,J. Christian Adams: Dead people are voting and...,left-news,"Oct 18, 2016",people something administration anything voter...,fake
964,15379,THE FIX IS IN: JUDGE QUICKLY BLOCKS ABORTION V...,"Judge William H. Orrick, III joins Obama in hi...",politics,"Aug 1, 2015",judge desire parenthood abortion order release...,fake
965,20445,SCREAMING LEFTISTS Interrupt Trump Speech…Crow...,Screaming leftists interrupted Donald Trump s ...,left-news,"Jun 10, 2016",trump speech today faith freedom conference tr...,fake
