In [33]:
import re
import pandas as pd
import numpy as np
import nltk
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

## Forming the dataframe

In [5]:
# Read the JSON export and expand the entries of its `data` column into a
# flat DataFrame (one row per entry).
df = pd.DataFrame(
    pd.read_json('../src/utility/final.json')['data'].tolist()
)
df.head()

Unnamed: 0,id,text,label,withheld
0,850490912954351616,Alex Brosas another idiot #ALDUBKSGoesToUS ht...,abusive,
1,848791766853668864,"RT @ItIzBiz: as Nancy Reagan would say, 'just ...",abusive,
2,850010509969465344,RT @MailOnline: The Nazi death gas so horrific...,normal,
3,850433664890544128,I hate er chase because if the Bitch that work...,hateful,
4,849282894682050564,But he still with the shits so he started smok...,abusive,


In [6]:
# Discard the `id` and `withheld` columns; the cells shown below only use
# the tweet text and its label.
df = df.drop(columns=['id', 'withheld'])
df.head()

Unnamed: 0,text,label
0,Alex Brosas another idiot #ALDUBKSGoesToUS ht...,abusive
1,"RT @ItIzBiz: as Nancy Reagan would say, 'just ...",abusive
2,RT @MailOnline: The Nazi death gas so horrific...,normal
3,I hate er chase because if the Bitch that work...,hateful
4,But he still with the shits so he started smok...,abusive


In [7]:
# Fit the encoder on the string class labels, then map every label to its
# integer code. `lb` is kept so the codes can be mapped back to names later.
lb = LabelEncoder().fit(df['label'])
label = lb.transform(df['label'])
label

array([0, 0, 2, ..., 3, 2, 3])

In [8]:
# Remove the string-valued `label` column from the frame.
df = df.drop(columns=['label'])
df.head()

Unnamed: 0,text
0,Alex Brosas another idiot #ALDUBKSGoesToUS ht...
1,"RT @ItIzBiz: as Nancy Reagan would say, 'just ..."
2,RT @MailOnline: The Nazi death gas so horrific...
3,I hate er chase because if the Bitch that work...
4,But he still with the shits so he started smok...


In [9]:
# Store the integer-encoded labels under the `label` column.
df = df.assign(label=label)
df.head()

Unnamed: 0,text,label
0,Alex Brosas another idiot #ALDUBKSGoesToUS ht...,0
1,"RT @ItIzBiz: as Nancy Reagan would say, 'just ...",0
2,RT @MailOnline: The Nazi death gas so horrific...,2
3,I hate er chase because if the Bitch that work...,1
4,But he still with the shits so he started smok...,0


## Preprocessing the data

In [68]:
def cleanTweet(text):
    """Normalise a raw tweet into lowercase text made of letters and spaces.

    Steps, in order: remove @mentions, remove the '#' symbol (the hashtag
    word itself is kept), remove the retweet marker 'RT', remove http(s)
    links, replace every remaining character that is not an ASCII letter or
    a space with a space, and lowercase the result.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text. Spaces left behind by removed characters are not
        collapsed or stripped.
    """
    # Handles may contain underscores; include '_' so '@foo_bar' is removed
    # whole instead of leaving 'bar' behind.
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)  # Removing @mentions
    text = re.sub(r'#', '', text)  # Removing the '#' symbol
    # \b restricts the match to a standalone 'RT' token, so words that merely
    # end in 'RT' (e.g. 'START now') are left intact.
    text = re.sub(r'\bRT\s+', '', text)  # Removing RT
    text = re.sub(r'https?:\/\/\S+', '', text)  # Removing hyperlinks
    text = re.sub(r'[^a-zA-Z ]', ' ', text)  # Removing all the punctuations and numbers
    return text.lower()
# Clean every tweet and store the result back in the `text` column.
df['text'] = df['text'].map(cleanTweet)
df.head()

Unnamed: 0,text,label,clean_tweet
0,alex brosas another idiot aldubksgoestous,0,a l e x b r o s a a ...
1,as nancy reagan would say just say fucking...,0,n a n c i r e a g a n ...
2,the nazi death gas so horrific even hitler f...,2,n a z i d e a t h g ...
3,i hate er chase because if the bitch that work...,1,h a t e e r c h a s ...
4,but he still with the shits so he started smok...,0,s t i l l s h i t s ...


In [76]:
# Removing the stop words and tokenizing the sentences
stop_words = set(stopwords.words('english'))
def removeStopWords(text):
    """Tokenize ``text`` and return the list of tokens not in ``stop_words``."""
    return [token for token in word_tokenize(text) if token not in stop_words]
# One token list per tweet, with stop words removed.
tokenized_tweet = df['text'].map(removeStopWords)
tokenized_tweet.head()

0      [alex, brosas, another, idiot, aldubksgoestous]
1    [nancy, reagan, would, say, say, fucking, some...
2    [nazi, death, gas, horrific, even, hitler, fea...
3     [hate, er, chase, bitch, works, literally, evil]
4    [still, shits, started, smoking, drinking, bad...
Name: text, dtype: object

In [77]:
# Stemming
stemmer = PorterStemmer()
def stemTweet(text):
    """Return a new list with ``stemmer.stem`` applied to each token in ``text``."""
    return list(map(stemmer.stem, text))
# Replace every token list with its stemmed counterpart.
tokenized_tweet = tokenized_tweet.map(stemTweet)
tokenized_tweet.head()

0            [alex, brosa, anoth, idiot, aldubksgoest]
1    [nanci, reagan, would, say, say, fuck, someth,...
2    [nazi, death, ga, horrif, even, hitler, fear, ...
3          [hate, er, chase, bitch, work, liter, evil]
4    [still, shit, start, smoke, drink, bad, combo,...
Name: text, dtype: object

In [78]:
# Re-join each tweet's stemmed tokens into one space-separated string.
# Iterating the Series directly walks its values in row order, so the result
# does not depend on the index labels being exactly 0..n-1.
tweet = [' '.join(str(token) for token in tokens) for tokens in tokenized_tweet]
df['clean_tweet'] = tweet
df.head()

Unnamed: 0,text,label,clean_tweet
0,alex brosas another idiot aldubksgoestous,0,alex brosa anoth idiot aldubksgoest
1,as nancy reagan would say just say fucking...,0,nanci reagan would say say fuck someth like
2,the nazi death gas so horrific even hitler f...,2,nazi death ga horrif even hitler fear use
3,i hate er chase because if the bitch that work...,1,hate er chase bitch work liter evil
4,but he still with the shits so he started smok...,0,still shit start smoke drink bad combo probabl...
