In [26]:
import pandas as pd
import json
import numpy as np
import re
from nltk.corpus import stopwords
# English stop-word list from NLTK; clean_text() filters tokens against it.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be available
# locally (e.g. via nltk.download) - confirm for fresh environments.
stop = stopwords.words("english")

<h3>Function: Cleaning the corpus</h3>

In [27]:
def clean_text(text, stop_words=None):
    """Normalise one raw comment into a space-separated string of kept tokens.

    Steps, in order: lower-case the text; replace tabs, newlines and every
    character that is neither a word character nor whitespace with a space;
    drop tokens that consist only of digits; drop stop words.

    Parameters
    ----------
    text : str or None
        Raw comment. ``None`` and float NaN (the value pandas uses for a
        missing cell) are treated as an empty comment instead of raising.
    stop_words : iterable of str, optional
        Tokens to discard. When omitted, the notebook-level ``stop`` list
        is used, which matches the previous behaviour.

    Returns
    -------
    str
        The remaining tokens joined by single spaces ("" if nothing is left).
    """
    # A missing value has no .lower(); treat it as an empty comment.
    # (NaN is the only float that is not equal to itself.)
    if text is None or (isinstance(text, float) and text != text):
        return ""

    if stop_words is None:
        stop_words = stop
    # Hash-based membership instead of scanning a list for every token.
    stop_words = frozenset(stop_words)

    # To lower case
    text = text.lower()

    # Replace tabs and new lines with spaces
    text = text.replace("\t", " ").replace("\n", " ")

    # Remove punctuation (\w keeps letters, digits and the underscore)
    text = re.sub(r"[^\w\s]", " ", text)

    # Remove stand-alone numbers only; digits inside a token ("abc123") stay
    text = re.sub(r"\b\d+\b", " ", text)

    # Remove stopwords. split() without arguments already collapses any run
    # of whitespace, so no separate "squeeze spaces" pass is needed.
    tokens = [token for token in text.split() if token not in stop_words]

    # Stemming is intentionally not applied.
    return " ".join(tokens)

<h3>Import the datasets</h3>

In [28]:
# Location of the input CSV (absolute path on the author's machine).
DATASET_CSV = "/Users/mazichang/Desktop/Cyberviolence/Project/HateSpeechDatasetBalanced.csv"

# Read the file, then rename its columns to fixed names used by later cells.
comments = pd.read_csv(DATASET_CSV)
comments.columns = ["content", "label"]

# Rows containing NaN are kept: the drop below is commented out.
# comments.dropna(inplace=True)

<h3>Check the datasets</h3>

In [29]:
# Report the frame's dimensions, then let the notebook render the frame itself.
# A bare `comments.head()` in the middle of the cell was dropped: only the
# last expression of a cell is displayed, so its result was discarded.
print(comments.shape)
comments

(726119, 2)


Unnamed: 0,content,label
0,denial of normal the con be asked to comment o...,1
1,just by being able to tweet this insufferable ...,1
2,that is retarded you too cute to be single tha...,1
3,thought of a real badass mongol style declarat...,1
4,afro american basho,1
...,...,...
726114,i mute this telecasting and played kanye west ...,1
726115,but hell yeah he s not a bachelor but looooooo...,1
726116,great video musician but s not my musician lol...,1
726117,not great pop video yeah he s not a pedophile ...,1


In [30]:
from nltk.tokenize import word_tokenize

<h3>Calculate the average number of words per comment</h3>

In [31]:
# Total number of tokens over all comments, counted with word_tokenize.
words = sum(len(word_tokenize(text)) for text in comments['content'])

# Mean number of tokens per comment.
print(words / len(comments))

36.42867629135169


<h3>Eliminate outliers (comments with 10 or fewer words, or 60 or more words)</h3>

In [32]:
# Word-count bounds. A comment is kept only when its token count is strictly
# greater than MIN_WORDS and strictly less than MAX_WORDS.
MIN_WORDS = 10
MAX_WORDS = 60

# Tokenise every comment once and work on the whole column, instead of
# walking the frame row by row with iterrows().
token_counts = comments['content'].map(lambda text: len(word_tokenize(text)))
out_of_range = (token_counts <= MIN_WORDS) | (token_counts >= MAX_WORDS)

# Index labels of the rows to discard (same name and content as before).
invalid_record = token_counts[out_of_range].index.tolist()

# Reassign rather than drop in place; surviving rows keep their original
# index labels.
comments = comments.drop(invalid_record)
comments

Unnamed: 0,content,label
0,denial of normal the con be asked to comment o...,1
1,just by being able to tweet this insufferable ...,1
2,that is retarded you too cute to be single tha...,1
3,thought of a real badass mongol style declarat...,1
7,the dead what a slut still warm when she tweet...,1
...,...,...
726094,oh haha real funny give maine chills ms twilig...,1
726115,but hell yeah he s not a bachelor but looooooo...,1
726116,great video musician but s not my musician lol...,1
726117,not great pop video yeah he s not a pedophile ...,1


<h3>Sample 20000 comments at a 4:1 (non-hate:hate) ratio</h3>

In [33]:
# One seed shared by both draws and by the final shuffle.
RANDOM_STATE = 6

# Draw 16000 rows with label 0 and 4000 rows with label 1.
comments_0 = comments[comments['label'] == 0].sample(n=16000, random_state=RANDOM_STATE)
comments_1 = comments[comments['label'] == 1].sample(n=4000, random_state=RANDOM_STATE)

# Stack the two samples, shuffle the rows, and renumber the index from 0.
comments = (
    pd.concat([comments_0, comments_1], ignore_index=True)
    .sample(frac=1, random_state=RANDOM_STATE)
    .reset_index(drop=True)
)

print(comments.shape)
comments

(20000, 2)


Unnamed: 0,content,label
0,the future the female average yr old oh um wom...,1
1,possible tatar relation it seems to me there b...,0
2,somehow i doubt that statistic you just made u...,0
3,you guys need to stop with the gay martins arg...,0
4,whatever need to get hold of everyone and plan...,0
...,...,...
19995,why plenty of people have taken a stance in fa...,0
19996,bitches be mad they mane nigga getting snatche...,0
19997,done and for this is would even a name be know...,0
19998,chicago s a killing field from decades of inep...,0


<h3>Save file for pre-trained model</h3>

In [34]:
# Write the sampled comments to CSV before clean_text is applied to them;
# the index is not written.
pretrained_csv = "CommentsForPretrainedGPT.csv"
comments.to_csv(pretrained_csv, index=False)

<h3>Clean the corpus</h3>

In [35]:
# Run clean_text over the whole 'content' column in a single pass, replacing
# the per-row .loc assignment inside an iterrows() loop. The resulting column
# values are the same; only the iteration strategy changes.
comments['content'] = comments['content'].map(clean_text)
comments

Unnamed: 0,content,label
0,future female average yr old oh um women knock...,1
1,possible tatar relation seems minor connection...,0
2,somehow doubt statistic made head,0
3,guys need stop gay martins argent godaddy ads ...,0
4,whatever need get hold everyone plan something...,0
...,...,...
19995,plenty people taken stance favor common sense ...,0
19996,bitches mad mane nigga getting snatched whore ...,0
19997,done would even name known,0
19998,chicago killing field decades inept corrupt li...,0


<h3>Save cleaned dataset to file</h3>

In [36]:
# Write the cleaned comments to CSV; the index is not written.
cleaned_csv = "CommentsCleaned.csv"
comments.to_csv(cleaned_csv, index=False)