In [1]:
import nltk
import pandas as pd
from tqdm import tqdm
# Register tqdm with pandas so that Series/DataFrame objects gain the
# `progress_apply` method used by the preprocessing cells below.
tqdm.pandas()

<hr style="border:2px solid gray">

### Dataset 

Importing the dataset

<hr style="border:2px solid gray">

In [2]:
# Load the raw CSV and shape it in a single chain:
#   * the unnamed serial-number column becomes the index (named "index")
#   * the "title" column is discarded because it is not used afterwards
data = (
    pd.read_csv('./datasets/WELFake_Dataset.csv')
    .rename(columns={"Unnamed: 0": "index"})
    .set_index("index")
    .drop(columns=["title"])
)

display(data)

Unnamed: 0_level_0,text,label
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,No comment is expected from Barack Obama Membe...,1
1,Did they post their votes for Hillary already?,1
2,"Now, most of the demonstrators gathered last ...",1
3,A dozen politically active pastors came here f...,0
4,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
...,...,...
72129,WASHINGTON (Reuters) - Hackers believed to be ...,0
72130,"You know, because in fantasyland Republicans n...",1
72131,Migrants Refuse To Leave Train At Refugee Camp...,0
72132,MEXICO CITY (Reuters) - Donald Trump’s combati...,0


<hr style="border:2px solid gray">

## Preprocessing

We will be performing the following steps to preprocess the dataset

### 1. Tokenization

> **Tokenization** divides a large piece of continuous text into distinct units, called tokens.

### 2. Stemming

> Stemming converts different forms of a word to a core root word by removing suffixes.

### 3. Stopwords removal

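> Stop words are words that carry little lexical meaning. Stop words are removed in this step. Special characters are also meant to be removed, but note that the stop-word filter below leaves punctuation tokens such as `,` and `?` in place.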

<hr style="border:2px solid gray">

In [3]:
# tokenization

# Recent NLTK releases load the "punkt_tab" resource for word_tokenize, while
# older ones use "punkt"; fetch both so the cell runs on a fresh kernel
# regardless of the installed NLTK version.
nltk.download("punkt")
nltk.download("punkt_tab")
from nltk.tokenize import word_tokenize

# Any missing article body is read by pandas as NaN, and str(NaN) is the
# literal string "nan", which would then be tokenized as a real word.
# Replace missing values with an empty string before casting to str.
data["text"] = data["text"].fillna("").apply(str)

# storing pre-processed text into 'pp_text' column (one list of tokens per row)
data["pp_text"] = data["text"].progress_apply(word_tokenize)
display(data.head())


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/imunnangi1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
100%|██████████████████████████████████████████████| 72134/72134 [04:12<00:00, 285.28it/s]


Unnamed: 0_level_0,text,label,pp_text
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,No comment is expected from Barack Obama Membe...,1,"[No, comment, is, expected, from, Barack, Obam..."
1,Did they post their votes for Hillary already?,1,"[Did, they, post, their, votes, for, Hillary, ..."
2,"Now, most of the demonstrators gathered last ...",1,"[Now, ,, most, of, the, demonstrators, gathere..."
3,A dozen politically active pastors came here f...,0,"[A, dozen, politically, active, pastors, came,..."
4,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,"[The, RS-28, Sarmat, missile, ,, dubbed, Satan..."


In [4]:
# stemming

from functools import lru_cache
from nltk.stem.snowball import SnowballStemmer

# NOTE: this is a Snowball stemmer, not NLTK's PorterStemmer; the variable
# name `porter` is kept so that any other cell referring to it keeps working.
porter = SnowballStemmer("english")

# The corpus repeats the same tokens many times. Memoize the per-token stem so
# every distinct token is stemmed only once; the result for a given token is
# unchanged, only the repeated work is avoided.
_cached_stem = lru_cache(maxsize=None)(porter.stem)

def stem_text(text):
    """Stem every token of one document.

    Parameters
    ----------
    text : iterable of str
        The tokens of a single document.

    Returns
    -------
    list of str
        The stem of each input token, in the original order.
    """
    return [_cached_stem(word) for word in text]

data['pp_text'] = data['pp_text'].progress_apply(stem_text)
display(data.head())

100%|██████████████████████████████████████████████| 72134/72134 [08:21<00:00, 143.78it/s]


Unnamed: 0_level_0,text,label,pp_text
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,No comment is expected from Barack Obama Membe...,1,"[no, comment, is, expect, from, barack, obama,..."
1,Did they post their votes for Hillary already?,1,"[did, they, post, their, vote, for, hillari, a..."
2,"Now, most of the demonstrators gathered last ...",1,"[now, ,, most, of, the, demonstr, gather, last..."
3,A dozen politically active pastors came here f...,0,"[a, dozen, polit, activ, pastor, came, here, f..."
4,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,"[the, rs-28, sarmat, missil, ,, dub, satan, 2,..."


In [5]:
nltk.download('stopwords')
from nltk.corpus import stopwords

# Kept as a list under its original name for any other cell that uses it.
eng_sw = stopwords.words("english")

# `pp_text` has already been stemmed by the previous cell, so tokens such as
# "becaus" or "veri" no longer equal the raw stop words "because" / "very".
# Match against both the raw stop words and their stems (computed with the
# same `porter` stemmer that produced `pp_text`). A frozenset also gives
# constant-time membership tests instead of scanning a list for every token.
# Trade-off: a few ordinary words whose stem coincides with a stemmed stop
# word are filtered out as well.
_stop_tokens = frozenset(eng_sw) | frozenset(porter.stem(word) for word in eng_sw)

def stop_text(text):
    """Remove English stop words from one document's tokens.

    Parameters
    ----------
    text : iterable of str
        The (stemmed, lower-cased) tokens of a single document.

    Returns
    -------
    list of str
        The tokens that are neither a stop word nor the stem of one, in the
        original order. Punctuation tokens are not removed here.
    """
    return [word for word in text if word not in _stop_tokens]

data['pp_text'] = data['pp_text'].progress_apply(stop_text)
display(data.head())

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/imunnangi1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
100%|██████████████████████████████████████████████| 72134/72134 [01:32<00:00, 778.15it/s]


Unnamed: 0_level_0,text,label,pp_text
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,No comment is expected from Barack Obama Membe...,1,"[comment, expect, barack, obama, member, #, fy..."
1,Did they post their votes for Hillary already?,1,"[post, vote, hillari, alreadi, ?]"
2,"Now, most of the demonstrators gathered last ...",1,"[,, demonstr, gather, last, night, exercis, co..."
3,A dozen politically active pastors came here f...,0,"[dozen, polit, activ, pastor, came, privat, di..."
4,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,"[rs-28, sarmat, missil, ,, dub, satan, 2, ,, r..."


In [36]:
# Print NLTK's English stop-word list for reference.
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '