**Import Libraries**

In [1]:
try:
    import emoji
except ImportError:
    !pip install emoji
    import emoji

In [2]:
import pandas as pd
import numpy as np
import random
import string
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\algin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\algin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Folder prefix that the `store_local` export cell prepends to the output CSV names.
# NOTE(review): absolute, machine-specific Windows path — adjust before enabling the export.
local_path = "C:\\Greenwich\\MSc Project\\project_code\\"
# Switch for the export cell: the cleaned frames are written to disk only when this is True.
store_local = False

**Load Data**

In [4]:
# Read the balanced train and test CSVs straight from the project's GitHub repository
# (branch `1-data-processing-and-eda`); `?raw=true` asks for the raw file instead of the HTML blob page.
full_train_data = pd.read_csv("https://github.com/Voldegin/hate_speech_detection/blob/1-data-processing-and-eda/data/balanced/balanced_train_data.csv?raw=true")
test_data = pd.read_csv("https://github.com/Voldegin/hate_speech_detection/blob/1-data-processing-and-eda/data/balanced/balanced_test_data.csv?raw=true")

In [5]:
full_train_data.head()

Unnamed: 0,tweet_text,cyberbullying_type,is_cyberbullying
0,"In other words #katandandre, your food was cra...",not_cyberbullying,0
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying,0
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying,0
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying,0
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying,0


In [6]:
test_data.head()

Unnamed: 0,tweet_text,actual_value,is_cyberbullying
0,"""\n\nI decided on """"breached"""" as a term to de...",[],0
1,Support Adding information regarding the 2007 ...,[],0
2,Stick to topic \n\nI would suggest cleaning up...,[],0
3,20 moments where e.g. or i.e. is used in an ar...,[],0
4,"""\n\n ADHD \n\nIt was a fight there for a whil...",[],0


**Data Preprocessing**

In [7]:
def show_random_tweet(data):
    """Return the first-column value of one uniformly chosen row of `data`.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to sample from; the value is read positionally from column 0.

    Returns
    -------
    object
        The cell at (random row, column 0).

    Raises
    ------
    IndexError
        If `data` has no rows.
    """
    n_rows = len(data)
    if n_rows == 0:
        raise IndexError("cannot pick a random tweet from an empty DataFrame")
    # randrange excludes its upper bound; randint(0, n_rows) could return
    # n_rows itself, which is one past the last valid .iloc position.
    return data.iloc[random.randrange(n_rows), 0]

In [8]:
show_random_tweet(full_train_data)

'Laying down watching school daze'

In [9]:
full_train_data.loc[1,'tweet_text']

'Why is #aussietv so white? #MKR #theblock #ImACelebrityAU #today #sunrise #studio10 #Neighbours #WonderlandTen #etc'

In [10]:
full_train_data.loc[4,'tweet_text']

'@RudhoeEnglish This is an ISIS account pretending to be a Kurdish account.  Like Islam, it is all lies.'

In [11]:
full_train_data.loc[10,'tweet_text']

'@Jord_Is_Dead http://t.co/UsQInYW5Gn'

**These samples show that the hashtags, mentions and links contained in the tweets can be removed as part of pre-processing**

In [12]:
def get_emoji_regexp():
    """Compile one regex that matches any emoji listed in `emoji.EMOJI_DATA`.

    Returns
    -------
    re.Pattern
        A single capturing group whose alternatives are the escaped emoji strings.
    """
    # Longest entries go first so that a multi-character emoji is tried
    # before any shorter emoji that is a prefix of it.
    longest_first = sorted(emoji.EMOJI_DATA, key=len, reverse=True)
    alternatives = [re.escape(symbol) for symbol in longest_first]
    return re.compile(u'(' + u'|'.join(alternatives) + u')')

In [13]:
# Every ASCII punctuation character; all of them are stripped from the text.
banned_list = string.punctuation
# Escape before wrapping in [...]: unescaped, the `\]` pair inside
# string.punctuation is read as an escaped bracket, so the backslash itself
# would never be part of the character class.
punctuation_reg_exp = "[" + re.escape(banned_list) + "]"
# Build the emoji pattern once here so clean_text can reuse it on every call.
emoji_reg_exp = get_emoji_regexp()

def stemmer(text):
    """Tokenize `text` with NLTK and return its Porter stems joined by single spaces."""
    tokens = nltk.word_tokenize(text)
    porter = PorterStemmer()
    stems = []
    for token in tokens:
        stems.append(porter.stem(token))
    return ' '.join(stems)

def clean_text(text):
    """Normalise one raw tweet/comment into a string of stemmed tokens.

    Steps, in order: flatten line breaks and lower-case; drop @mentions and
    http(s) links; drop English stop words; drop the run of hashtags that
    ends the text; strip ASCII punctuation; collapse repeated whitespace;
    drop emojis; Porter-stem what is left.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (for example the float NaN pandas
        uses for an empty CSV cell) is treated as empty text.

    Returns
    -------
    str
        The cleaned text; the empty string for non-string input.
    """
    # NaN / None have no string methods; treat them as empty text instead
    # of crashing the whole `.apply`.
    if not isinstance(text, str):
        return ""

    text = text.replace('\r', '').replace('\n', ' ').lower()
    # Remove @mentions and http(s):// links, each up to the next whitespace.
    text = re.sub(r"(?:\@|https?\://)\S+", "", text)

    # Stop words are matched on whitespace-separated tokens before the
    # punctuation is stripped, so a stop word with punctuation attached
    # (e.g. "up,") is not removed here.
    text = ' '.join(word for word in text.split() if word not in stop_words)

    # Cut the hashtags that close the text; hashtags in the middle of a
    # sentence stay and only lose their '#' in the punctuation step below.
    # Raw string: in a plain literal `\b` is a backspace character, which
    # silently disabled the look-ahead that exempts a literal "#hashtag".
    text = " ".join(word.strip() for word in re.split(r'#(?!(?:hashtag)\b)[\w-]+(?=(?:\s+#[\w-]+)*\s*$)', text))

    text = re.sub(punctuation_reg_exp, "", text)

    # Raw string avoids the invalid "\s" string escape.
    text = re.sub(r"\s\s+", " ", text)

    text = re.sub(emoji_reg_exp, "", text)

    return stemmer(text)

In [14]:
banned_list

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [15]:
print(full_train_data.loc[1,'tweet_text'])
clean_text(full_train_data.loc[1,'tweet_text'])

Why is #aussietv so white? #MKR #theblock #ImACelebrityAU #today #sunrise #studio10 #Neighbours #WonderlandTen #etc


'aussietv white'

In [16]:
print(full_train_data.loc[4,'tweet_text'])
clean_text(full_train_data.loc[4,'tweet_text'])

@RudhoeEnglish This is an ISIS account pretending to be a Kurdish account.  Like Islam, it is all lies.


'isi account pretend kurdish account like islam lie'

In [17]:
print(full_train_data.loc[10,'tweet_text'])
clean_text(full_train_data.loc[10,'tweet_text'])

@Jord_Is_Dead http://t.co/UsQInYW5Gn


''

**Clean Datasets**

In [18]:
# Add a `cleaned` column holding clean_text(tweet_text) for every training row.
full_train_data['cleaned'] = full_train_data['tweet_text'].apply(clean_text)

In [19]:
full_train_data

Unnamed: 0,tweet_text,cyberbullying_type,is_cyberbullying,cleaned
0,"In other words #katandandre, your food was cra...",not_cyberbullying,0,word katandandr food crapilici
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying,0,aussietv white
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying,0,classi whore red velvet cupcak
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying,0,meh p thank head up concern anoth angri dude t...
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying,0,isi account pretend kurdish account like islam...
...,...,...,...,...
77687,I'll just do this ==\n\nI'll flush out the art...,not_cyberbullying,0,ill ill flush articl scrub tuesday im done guy...
77688,"""\nAre you a sock puppet trying to answer my q...",not_cyberbullying,0,sock puppet tri answer question think so proba...
77689,"Church of England \n\nIn 2013, Church of Engla...",not_cyberbullying,0,church england 2013 church england end ban gay...
77690,"""\n\nMuch better, thanks. Now why can't I do ...",not_cyberbullying,0,much better thank cant that doom


In [20]:
# Same cleaning for the test set, so both frames carry a comparable `cleaned` column.
test_data['cleaned'] = test_data['tweet_text'].apply(clean_text)

**Remove Duplicates**

In [21]:
full_train_data['cleaned'].duplicated().sum()

3018

In [22]:
test_data['cleaned'].duplicated().sum()

5

In [23]:
# Keep a single row per distinct cleaned text in each frame.
cleaned_train_data = full_train_data.drop_duplicates(subset="cleaned")
cleaned_test_data = test_data.drop_duplicates(subset="cleaned")

In [24]:
# Discard rows whose text was cleaned down to the empty string.
train_has_text = cleaned_train_data['cleaned'] != ''
test_has_text = cleaned_test_data['cleaned'] != ''
cleaned_train_data = cleaned_train_data[train_has_text]
cleaned_test_data = cleaned_test_data[test_has_text]

In [25]:
cleaned_train_data['cleaned'].duplicated().sum()

0

In [26]:
cleaned_test_data['cleaned'].duplicated().sum()

0

In [27]:
# Optional export: write both cleaned frames as CSV files (without the index) under local_path.
if store_local:
    exports = (
        (cleaned_train_data, "cleaned_train_data.csv"),
        (cleaned_test_data, "cleaned_test_data.csv"),
    )
    for frame, file_name in exports:
        frame.to_csv(local_path + file_name, index=False)