In [1]:
from gensim.models import FastText
import pandas as pd
import numpy as np
import itertools
import html
import pickle
import regex as re

from sklearn.metrics import confusion_matrix
from nltk.tokenize import TweetTokenizer
from collections import Counter
from emoji import UNICODE_EMOJI
import emoji
import matplotlib.pyplot as plt
%matplotlib inline



In [2]:
def load_data(csv, p1, p2):
    """
    Load the labelled tweet CSV and merge it with two pickled tweet lists.

    Parameters
    ----------
    csv : path of a CSV file that has ``tweet`` and ``classification``
        columns; its first column is used as the index.
    p1 : path of a pickle containing a list of additional hateful tweets.
        Every entry receives label 0.
    p2 : path of a pickle containing a list of additional clean tweets.
        Every entry receives label 2.

    Returns
    -------
    (texts, labels, cnt) : the de-duplicated list of texts, the matching list
        of labels (0|1|2), and a ``Counter`` over those labels.

    Prints the sizes of the two additional lists and the final label counter.
    """
    df = pd.read_csv(csv, index_col=0, encoding='utf-8')

    # Pull both columns out in one step instead of iterating row by row.
    texts = df["tweet"].tolist()
    labels = df["classification"].tolist()

    # NOTE(review): unpickling executes arbitrary code from the file; only
    # point p1/p2 at files from a trusted source.
    # Context managers make sure the file handles are closed again.
    with open(p1, "rb") as hateful_file:
        more_hateful_tweets = pickle.load(hateful_file)
    with open(p2, "rb") as clean_file:
        more_clean_tweets = pickle.load(clean_file)

    hateful_labels = [0] * len(more_hateful_tweets)
    clean_labels = [2] * len(more_clean_tweets)
    print('Additional Hateful:', len(more_hateful_tweets), 'Additional Clean:', len(more_clean_tweets))

    texts = texts + more_hateful_tweets + more_clean_tweets
    labels = labels + hateful_labels + clean_labels

    # First occurrence of each text wins, together with its label.
    texts, labels = drop_duplicates(texts, labels)
    cnt = Counter(labels)
    print(cnt)
    return texts, labels, cnt

In [3]:
def drop_duplicates(texts, labels):
    """
    Remove duplicate texts, keeping the first occurrence and its label.

    Parameters
    ----------
    texts : list of hashable items (strings in this notebook).
    labels : list holding one label per entry of ``texts``.

    Returns
    -------
    (single_texts, single_labels) : two lists of equal length, in the
        original order, containing each distinct text once.
    """
    # A set gives constant-time membership checks; scanning the result list
    # for every text made the original quadratic in the corpus size.
    seen = set()
    single_texts = []
    single_labels = []

    for i, text in enumerate(texts):
        if text in seen:
            continue
        seen.add(text)
        single_texts.append(text)
        single_labels.append(labels[i])
    return single_texts, single_labels

In [4]:
def convert_html_emojis(corpus):
    """Return a new list with HTML character references in each text unescaped."""
    return [html.unescape(raw_text) for raw_text in corpus]

In [5]:
def tokenize_texts(corpus, stopword_path = None):
    """
    Tokenize a list of texts into lists of cleaned tokens.

    Each text is HTML-unescaped, tokenized with ``TweetTokenizer``
    (lower-cased, repeated characters reduced, @handles stripped), and then
    every token has digits and the listed punctuation characters removed.
    Tokens that end up empty are dropped. A token found in ``UNICODE_EMOJI``
    is replaced by the words of its ``emoji.demojize`` name, and ``"i'd"`` is
    expanded to ``i would``.

    Parameters
    ----------
    corpus : list of strings.
    stopword_path : currently unused; kept so existing callers keep working.

    Returns
    -------
    A list with one list of tokens per input text.

    Prints the first ten unescaped texts.
    """
    corpus = convert_html_emojis(corpus)
    print(corpus[:10])
    tknzr = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)

    # Compiled once rather than on every token.
    strip_chars = re.compile(r'([1234567890!"#$%&()*+,./:@;?[\]^`{|}_~\t\n])')

    token_doc_list = []
    for text in corpus:
        doc_tokens = []
        for raw_token in tknzr.tokenize(text):
            token = strip_chars.sub('', raw_token)
            if token == '':
                continue

            # NOTE(review): the layout of `UNICODE_EMOJI` appears to differ
            # between releases of the `emoji` package; confirm that this
            # membership test still matches with the installed version.
            if token in UNICODE_EMOJI:
                # Drop the first and last character of the demojized name
                # and split the rest on underscores into separate words.
                emoji_name = emoji.demojize(token)[1:-1]
                doc_tokens.extend(re.split('_', emoji_name))
            elif token == "i'd":
                doc_tokens.extend(['i', 'would'])
            else:
                doc_tokens.append(token)

        token_doc_list.append(doc_tokens)

    return token_doc_list

In [6]:
# Load the CSV corpus together with the two pickled tweet lists.
texts, labels, cnt = load_data(
    'offensive_language_crowdflower.csv',
    "tweets_hate_icwsm18_extended.p",
    "tweets_clean_zeerakW_extended.p",
)

Additional Hateful: 9802 Additional Clean: 3017
Counter({1: 19190, 0: 11185, 2: 7180})


In [7]:
# Preview the first ten raw texts returned by load_data.
texts[:10]

["!!! RT @mayasolovely: As a woman you shouldn't complain about cleaning up your house. &amp; as a man you should always take the trash out...",
 '!!!!! RT @mleew17: boy dats cold...tyga dwn bad for cuffin dat hoe in the 1st place!!',
 '!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby4life: You ever fuck a bitch and she start to cry? You be confused as shit',
 '!!!!!!!!! RT @C_G_Anderson: @viva_based she look like a tranny',
 '!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you hear about me might be true or it might be faker than the bitch who told it to ya &#57361;',
 '!!!!!!!!!!!!!!!!!!"@T_Madison_x: The shit just blows me..claim you so faithful and down for somebody but still fucking with hoes! &#128514;&#128514;&#128514;"',
 '!!!!!!"@__BrighterDays: I can not just sit up and HATE on another bitch .. I got too much shit going on!"',
 "!!!!&#8220;@selfiequeenbri: cause I'm tired of you big bitches coming for us skinny girls!!&#8221;",
 '" &amp; you might not get ya bitch back &amp; thats t

In [8]:
# NOTE(review): maxlen is not referenced by any cell visible here — confirm it is still needed.
maxlen = 100
# One list of cleaned tokens per text; tokenize_texts also prints the first ten unescaped texts.
token_doc_list = tokenize_texts(texts)

["!!! RT @mayasolovely: As a woman you shouldn't complain about cleaning up your house. & as a man you should always take the trash out...", '!!!!! RT @mleew17: boy dats cold...tyga dwn bad for cuffin dat hoe in the 1st place!!', '!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby4life: You ever fuck a bitch and she start to cry? You be confused as shit', '!!!!!!!!! RT @C_G_Anderson: @viva_based she look like a tranny', '!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you hear about me might be true or it might be faker than the bitch who told it to ya \ue011', '!!!!!!!!!!!!!!!!!!"@T_Madison_x: The shit just blows me..claim you so faithful and down for somebody but still fucking with hoes! 😂😂😂"', '!!!!!!"@__BrighterDays: I can not just sit up and HATE on another bitch .. I got too much shit going on!"', "!!!!“@selfiequeenbri: cause I'm tired of you big bitches coming for us skinny girls!!”", '" & you might not get ya bitch back & thats that "', '" @rhythmixx_ :hobbies include: fighting Mariam"\

In [9]:
# Preview the token lists of the first ten texts.
token_doc_list[:10]

[['rt',
  'as',
  'a',
  'woman',
  'you',
  "shouldn't",
  'complain',
  'about',
  'cleaning',
  'up',
  'your',
  'house',
  'as',
  'a',
  'man',
  'you',
  'should',
  'always',
  'take',
  'the',
  'trash',
  'out'],
 ['rt',
  'boy',
  'dats',
  'cold',
  'tyga',
  'dwn',
  'bad',
  'for',
  'cuffin',
  'dat',
  'hoe',
  'in',
  'the',
  'st',
  'place'],
 ['rt',
  'dawg',
  'rt',
  'you',
  'ever',
  'fuck',
  'a',
  'bitch',
  'and',
  'she',
  'start',
  'to',
  'cry',
  'you',
  'be',
  'confused',
  'as',
  'shit'],
 ['rt', 'she', 'look', 'like', 'a', 'tranny'],
 ['rt',
  'the',
  'shit',
  'you',
  'hear',
  'about',
  'me',
  'might',
  'be',
  'true',
  'or',
  'it',
  'might',
  'be',
  'faker',
  'than',
  'the',
  'bitch',
  'who',
  'told',
  'it',
  'to',
  'ya',
  '\ue011'],
 ['the',
  'shit',
  'just',
  'blows',
  'me',
  'claim',
  'you',
  'so',
  'faithful',
  'and',
  'down',
  'for',
  'somebody',
  'but',
  'still',
  'fucking',
  'with',
  'hoes',
  'face',

In [10]:
# Re-join every token list into a single space-separated string.
new_texts = [" ".join(doc_tokens) for doc_tokens in token_doc_list]

In [11]:
# Preview the first twenty re-joined texts.
new_texts[:20]

["rt as a woman you shouldn't complain about cleaning up your house as a man you should always take the trash out",
 'rt boy dats cold tyga dwn bad for cuffin dat hoe in the st place',
 'rt dawg rt you ever fuck a bitch and she start to cry you be confused as shit',
 'rt she look like a tranny',
 'rt the shit you hear about me might be true or it might be faker than the bitch who told it to ya \ue011',
 'the shit just blows me claim you so faithful and down for somebody but still fucking with hoes face with tears of joy face with tears of joy face with tears of joy',
 'i can not just sit up and hate on another bitch i got too much shit going on',
 "“ cause i'm tired of you big bitches coming for us skinny girls ”",
 'you might not get ya bitch back thats that',
 'hobbies include fighting mariam bitch',
 'keeks is a bitch she curves everyone lol i walked into a conversation like this smh',
 'murda gang bitch its gang land',
 'so hoes that smoke are losers yea go on ig',
 'bad bitches is

In [12]:
# Start from an empty frame that has the two columns 'label' and 'text'.
df = pd.DataFrame(columns=['label', 'text'])

In [13]:
# Row count of the frame.
len(df)

0

In [14]:
# Build the frame in a single call. Appending one row at a time copies the
# whole frame on every iteration, relies on DataFrame.append (removed in
# pandas 2.0), and would add the rows a second time if the cell were re-run.
# zip() pairs each processed text with the label at the same position.
df = pd.DataFrame(list(zip(labels, new_texts)), columns=['label', 'text'])

In [15]:
#df.to_csv('hatespeech_preprocessed_full.tsv', sep='\t')

In [15]:
# Set the pandas max_colwidth display option to 150 characters.
pd.set_option('max_colwidth', 150)

In [16]:
# Order the rows by their label value.
df_sorted = df.sort_values(by=['label'])

In [17]:
# One frame per label value (0, 1, 2), selected with a boolean mask on 'label'.
df_0, df_1, df_2 = (
    df_sorted[df_sorted['label'] == class_id] for class_id in (0, 1, 2)
)

In [19]:
# Inspect the first 100 rows of the label-0 frame.
df_0[:100]

Unnamed: 0,label,text
33389,0,bruh honestly no one cares ur a little scrawny arab indian cunt anyone can put you into the ground so shut ya fucking mouth
26571,0,imp your retard strength
26570,0,probably because ur a blind cunt
26569,0,nowadays questioning any group of people is considered hate speech when did society become so sensitive istandwithhatespeech
26568,0,you are basically saying my cunt fav dug your grave yas
26567,0,free software foundation does not cover copywriting its upto an author whether it will be copyrighted or not you faggot
26566,0,truth istandwithhatespeech httpstcoHoXRxtly
26565,0,rt istandwithhatespeech because the legal definition of hate speech depends on what offends the regime in power erdo …
26564,0,fuck off you cunt
26563,0,the emoji makes you hundred points percent more retarded


In [19]:
# Shuffle the rows of each class frame (frac=1 keeps every row).
# A fixed seed makes the shuffle, and therefore the later train/test/dev
# slices, reproducible when the notebook is run from a fresh kernel.
SHUFFLE_SEED = 42
df_0 = df_0.sample(frac=1, random_state=SHUFFLE_SEED)
df_1 = df_1.sample(frac=1, random_state=SHUFFLE_SEED)
df_2 = df_2.sample(frac=1, random_state=SHUFFLE_SEED)

In [None]:
# Inspect the first six rows of the label-2 frame.
df_2[:6]

In [None]:
# Label counter returned by load_data.
cnt

In [20]:
# Number of rows in each of the three class frames.
print(len(df_0), len(df_1), len(df_2))

11185 19190 7180


In [21]:
# Per-class slice boundaries: rows [0, 4000) go to train, [4000, 5000) to
# test and [5000, 6000) to dev, taken from each of the three class frames.
TRAIN_END, TEST_END, DEV_END = 4000, 5000, 6000
class_frames = [df_0, df_1, df_2]

# pd.concat stacks the slices in the same order as the former
# DataFrame.append calls; DataFrame.append was removed in pandas 2.0.
train_df = pd.concat([frame[:TRAIN_END] for frame in class_frames])
test_df = pd.concat([frame[TRAIN_END:TEST_END] for frame in class_frames])
dev_df = pd.concat([frame[TEST_END:DEV_END] for frame in class_frames])

In [22]:
# Row count of the training split.
len(train_df)

12000

In [23]:
# Row count of the test split.
len(test_df)

3000

In [24]:
# Row count of the dev split.
len(dev_df)

3000

In [25]:
# Shuffle the rows inside each split (frac=1 keeps every row) so the classes
# are no longer stored in contiguous runs. The fixed seed makes the written
# files reproducible when the notebook is run from a fresh kernel.
SPLIT_SHUFFLE_SEED = 42
train_df = train_df.sample(frac=1, random_state=SPLIT_SHUFFLE_SEED)
test_df = test_df.sample(frac=1, random_state=SPLIT_SHUFFLE_SEED)
dev_df = dev_df.sample(frac=1, random_state=SPLIT_SHUFFLE_SEED)

In [26]:
# Write each split to its own tab-separated file.
_split_outputs = (
    ('hatespeech_final_train.tsv', train_df),
    ('hatespeech_final_test.tsv', test_df),
    ('hatespeech_final_dev.tsv', dev_df),
)
for _out_path, _split_frame in _split_outputs:
    _split_frame.to_csv(_out_path, sep='\t')

In [27]:
# Inspect the first twenty rows of the shuffled training split.
train_df[:20]

Unnamed: 0,label,text
31330,0,wow classy language you cunt nazi
10327,1,i get gwop now that bitch remember me
30293,0,u guys are fucking mentally retarded wasting my time plz rechall for this shit lmao
23466,2,you're black your name looks hungarian you marry a hunky
35879,2,where did i call you stupid show me the tweet
8544,1,crazy ass rt i been acting so mean ratchet and ghetto i crack myself up
27936,0,everything about and everything you do with the queens is retarded go king go is a retardest chant
25418,0,tan is short for tania not the other way around you retard
2379,1,i've seen a number of bitches on here say there's no such thing as loose pussy lol ok
6847,1,yeah i would want to shoot the son of a bitch myself lol


In [None]:

# Convert the training frame's values into a plain list of row lists.
lines = train_df.values.tolist()


In [None]:
# Preview the first five rows.
lines[:5]

In [None]:
# Walk over the first five rows, taking element 0 as the label and
# element 1 as the text, both converted to str.
for i, line in enumerate(lines[:5]):
    label, text_a = str(line[0]), str(line[1])

In [None]:
# Show the first row on its own.
lines[0]