# Hate Speech Detector - EN - Features extraction for Conv. & LSTM model

Based on [this notebook](https://github.com/t-davidson/hate-speech-and-offensive-language/blob/master/classifier/final_classifier.ipynb).

In [1]:
import os
import pandas as pd
import csv
import numpy as np
import pickle
from klepto.archives import dir_archive
import sys
import nltk
import string
import re
import fasttext

from tqdm.notebook import tqdm
%matplotlib inline

In [2]:
# Target architecture ('conv' or 'lstm'); it also names the saved model and
# feature files further down in the notebook.
MODEL = 'lstm'

# fastText embedding size: 6*20 for the conv model, 200 for anything else.
if MODEL == 'conv':
    dim = 6 * 20
else:
    dim = 200

## Davidson et al. data loading

### Classes pre:
    0 - hate speech
    1 - offensive language
    2 - neither

In [3]:
# Load the Davidson et al. tweets with binarised labels, using a pickle cache.
# The cache must be opened in binary mode ('wb'/'rb'): text mode makes pickle
# fail on Python 3 and can corrupt the stream on platforms that translate
# newlines.
if not os.path.exists('hsd/DavidsonEtAl/perfect_data.pkl'):
    tweets, labels = [], []
    with open('hsd/DavidsonEtAl/labeled_data.csv', 'r') as f:
        for d in tqdm(list(csv.reader(f))[1:]):  # [1:] skips the header row
            tweets.append(d[6])  # tweet
            # class column: '2' (neither) -> 0, every other class -> 1
            labels.append(0 if d[5] == '2' else 1)
    # The cache file is only created once tweets and labels are complete.
    with open('hsd/DavidsonEtAl/perfect_data.pkl', 'wb') as f:
        pickle.dump((tweets, labels), f)
else:
    # NOTE(review): pickle.load can execute arbitrary code; only load a cache
    # file that was produced by this notebook.
    with open('hsd/DavidsonEtAl/perfect_data.pkl', 'rb') as f:
        tweets, labels = pickle.load(f)

### Classes post:
    0 - neither (no hate)
    1 - hate speech or offensive language

In [4]:
# Sanity check: there should be exactly one label per tweet.
n_tweets, n_labels = len(tweets), len(labels)
print('Tweets: {}'.format(n_tweets))
print('Labels: {}'.format(n_labels))

Tweets: 24783
Labels: 24783


In [5]:
# Preview the first five (tweet, binary label) pairs.
list(zip(tweets[:5], labels[:5]))

[("!!! RT @mayasolovely: As a woman you shouldn't complain about cleaning up your house. &amp; as a man you should always take the trash out...",
  0),
 ('!!!!! RT @mleew17: boy dats cold...tyga dwn bad for cuffin dat hoe in the 1st place!!',
  1),
 ('!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby4life: You ever fuck a bitch and she start to cry? You be confused as shit',
  1),
 ('!!!!!!!!! RT @C_G_Anderson: @viva_based she look like a tranny', 1),
 ('!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you hear about me might be true or it might be faker than the bitch who told it to ya &#57361;',
  1)]

## Features extraction

In [6]:
def preprocess(text_string):
    """
    Normalise a raw tweet before tokenization.

    Applies, in this order:
    1) collapses every run of whitespace into a single space
    2) removes urls
    3) removes mentions (@user)

    Hashtags are left untouched. Urls and mentions are deleted rather than
    replaced by a placeholder, so the spaces that surrounded a removed token
    stay in place and the result may contain consecutive spaces.

    Args:
        text_string: the raw tweet text.

    Returns:
        The cleaned text.
    """
    # Raw strings so the regex escapes are not treated as (invalid) Python
    # string escapes.
    space_pattern = r'\s+'
    giant_url_regex = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = r'@[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, '', parsed_text)
    parsed_text = re.sub(mention_regex, '', parsed_text)
    return parsed_text

def basic_tokenize(tweet):
    """Lower-case a tweet and split it into tokens.

    Every run of characters other than ASCII letters and the punctuation
    marks . , ! ? acts as a separator, so digits, apostrophes and other
    symbols are dropped. Kept punctuation stays attached to its word.

    Args:
        tweet: the (already preprocessed) tweet text.

    Returns:
        A list of token strings; empty for input without any kept character.
    """
    # The quantifier must be '+', not '*': a pattern that can match the empty
    # string makes re.split cut between every character on Python >= 3.7.
    tweet = " ".join(re.split("[^a-zA-Z.,!?]+", tweet.lower())).strip()
    return tweet.split()

def get_pos_string(tweet):
    """Return the tweet's part-of-speech tags as one space-separated string.

    The tweet is cleaned with `preprocess`, tokenized with `basic_tokenize`
    and tagged with `nltk.pos_tag`; only the tags are kept, in token order.
    """
    tagged_tokens = nltk.pos_tag(basic_tokenize(preprocess(tweet)))
    return ' '.join(pair[1] for pair in tagged_tokens)

def pad_words(words, length):
    """Force a token list to exactly `length` items.

    Longer lists are cut after `length` items; shorter ones are filled up at
    the end with the placeholder token 'EMPTY'.
    """
    missing = length - len(words)
    if missing <= 0:
        return words[:length]
    return words + ['EMPTY'] * missing

def adjust_words(words, length):
    """Squeeze or pad a token list to exactly `length` items.

    Unlike `pad_words`, nothing is cut off: when there are at least `length`
    words they are grouped into `length` contiguous chunks of nearly equal
    size (the first `len(words) % length` chunks hold one extra word) and each
    chunk is joined with spaces, so an output token may contain several words.
    Shorter lists are filled up at the end with the placeholder 'EMPTY'.

    Args:
        words: list of token strings.
        length: number of output tokens.

    Returns:
        A list of `length` strings.
    """
    if len(words) >= length:
        q, r = divmod(len(words), length)
        # `range` instead of `xrange`, which does not exist on Python 3.
        return [' '.join(words[i * q + min(i, r):(i + 1) * q + min(i + 1, r)]) for i in range(length)]
    else:
        additional = length - len(words)
        return words + ['EMPTY']*additional

### Median sentence length

In [7]:
def median_sentences_length(data):
    """Compute median token counts over a collection of tweets.

    Two lengths are measured per tweet: the number of space-separated pieces
    of the preprocessed text, and the number of space-separated PoS tags
    returned by `get_pos_string`.

    Returns:
        Three ints `(all, wordtoken, pos)`: the median over both kinds of
        length pooled together, over the text lengths only, and over the PoS
        lengths only.
    """
    wt_lengths, pos_lengths = [], []
    for tweet in tqdm(data):
        wt_lengths.append(len(preprocess(tweet).split(' ')))
        pos_lengths.append(len(get_pos_string(tweet).split(' ')))

    # The median ignores element order, so concatenating the two series gives
    # the same pooled value as interleaving them.
    pooled_lengths = wt_lengths + pos_lengths
    medians = [np.median(lengths) for lengths in (pooled_lengths, wt_lengths, pos_lengths)]
    return int(medians[0]), int(medians[1]), int(medians[2])

In [8]:
# The median lengths are used below as the padding/truncation size of the
# word-token and PoS feature matrices.
median_lengths = median_sentences_length(tweets)
opt_length, opt_wt_length, opt_pos_length = median_lengths

print('Optimal all length: {}'.format(opt_length))
print('Optimal sentence length: {}'.format(opt_wt_length))
print('Optimal pos sentence length: {}'.format(opt_pos_length))

HBox(children=(IntProgress(value=0, max=24783), HTML(value=u'')))


Optimal all length: 13
Optimal sentence length: 13
Optimal pos sentence length: 12


### Supervised fastText wordtokens training

In [9]:
# Write the fastText training file once: one "__label__<class> <text>" line
# per tweet, using the preprocessed text.
if not os.path.exists('hsd/DavidsonEtAl/fasttext.ft'):
    with open('hsd/DavidsonEtAl/fasttext.ft', 'a') as f:
        for tweet, label in zip(tweets, labels):
            f.write('__label__{} {}\n'.format(label, preprocess(tweet)))

# Train and save the supervised fastText model, or load it when a saved model
# for this MODEL already exists.
ft_model_path = 'hsd/DavidsonEtAl/fasttext_{}.bin'.format(MODEL)
if not os.path.exists(ft_model_path):
    ft_model = fasttext.train_supervised('hsd/DavidsonEtAl/fasttext.ft',
                                         lr=0.5, epoch=50, wordNgrams=3, dim=dim)
    ft_model.save_model(ft_model_path)
else:
    ft_model = fasttext.load_model(ft_model_path)




### Wordtoken features

In [10]:
def get_wordtoken_fts(data, length, adjust=False):
    """Turn tweets into fixed-length sequences of word embeddings.

    Each tweet is preprocessed and split on single spaces, brought to exactly
    `length` tokens, and every token is looked up in the global `ft_model`.

    Args:
        data: iterable of raw tweet strings.
        length: number of tokens kept per tweet.
        adjust: when True tokens are merged with `adjust_words`; otherwise
            the list is truncated/padded with `pad_words`.

    Returns:
        A list with one entry per tweet, each a list of `length` vectors.
    """
    fit_to_length = adjust_words if adjust else pad_words

    progress = tqdm(data)
    progress.set_postfix_str('Wordtokens features extraction: tokenization.')
    tokenized = [preprocess(tweet).split(' ') for tweet in progress]

    sentences_words = [fit_to_length(words, length) for words in tokenized]

    progress = tqdm(sentences_words)
    progress.set_postfix_str('Wordtokens features extraction: vectorization.')
    return [[ft_model[word] for word in words] for words in progress]

In [11]:
# Embed every tweet's word tokens, padded/truncated to the median token count.
wordtoken_features = get_wordtoken_fts(tweets, opt_wt_length)

HBox(children=(IntProgress(value=0, max=24783), HTML(value=u'')))




HBox(children=(IntProgress(value=0, max=24783), HTML(value=u'')))




In [12]:
# Inspect the first tweet: a list holding one embedding array per token slot.
wordtoken_features[0]

[array([-5.88701107e-03, -6.35682931e-03,  7.21235992e-03,  8.30636546e-03,
        -1.07052913e-02,  1.56841371e-02, -1.02461418e-02,  4.82069608e-03,
        -9.95255262e-03, -2.51853317e-02,  9.42495372e-03, -8.52091890e-03,
        -1.31215565e-02,  3.65492213e-03,  3.26350611e-03, -1.00015728e-02,
        -8.01088009e-03,  3.74478608e-04, -1.00427959e-02,  2.71318690e-03,
         4.00448637e-03, -1.39096081e-02, -2.62372359e-03,  7.23903626e-03,
        -5.24997385e-03,  4.91737016e-03, -7.59983808e-03,  8.80085584e-03,
        -2.84798932e-03, -2.30059102e-02, -8.35987681e-04,  9.71683708e-04,
         8.59464798e-03,  1.08168065e-03,  1.28186755e-02, -2.76839267e-03,
        -6.89407298e-03,  1.42194265e-02, -1.13243498e-02,  6.08905870e-03,
         4.56750439e-03, -5.50246937e-03, -8.02710280e-03,  7.10748415e-03,
        -5.91650791e-03,  4.13374044e-03, -5.18505787e-03,  9.81136807e-04,
         6.94503717e-04, -1.59346238e-02,  1.49715291e-02, -9.63942241e-03,
         1.1

### Supervised fastText pos training

In [13]:
# Write the PoS training file once: one "__label__<class> <pos tags>" line per
# tweet, where the tags come from get_pos_string.
if not os.path.exists('hsd/DavidsonEtAl/fasttext_pos.ft'):
    with open('hsd/DavidsonEtAl/fasttext_pos.ft', 'a') as f:
        for tweet, label in zip(tweets, labels):
            f.write('__label__{} {}\n'.format(label, get_pos_string(tweet)))

# Train and save the supervised fastText PoS model, or load it when a saved
# model for this MODEL already exists.
ft_pos_model_path = 'hsd/DavidsonEtAl/fasttext_pos_{}.bin'.format(MODEL)
if not os.path.exists(ft_pos_model_path):
    ft_pos_model = fasttext.train_supervised('hsd/DavidsonEtAl/fasttext_pos.ft',
                                             lr=0.5, epoch=50, wordNgrams=3, dim=dim)
    ft_pos_model.save_model(ft_pos_model_path)
else:
    ft_pos_model = fasttext.load_model(ft_pos_model_path)




### Part of speech (PoS) features

In [14]:
def get_pos_fts(data, length, adjust=False):
    """Turn tweets into fixed-length sequences of PoS-tag embeddings.

    Each tweet is converted to its PoS tag string with `get_pos_string`, the
    tags are split on single spaces and brought to exactly `length` items, and
    every tag is looked up in the global `ft_pos_model`.

    Args:
        data: iterable of raw tweet strings.
        length: number of tags kept per tweet.
        adjust: when True tags are merged with `adjust_words`; otherwise the
            list is truncated/padded with `pad_words`.

    Returns:
        A list with one entry per tweet, each a list of `length` vectors.
    """
    fit_to_length = adjust_words if adjust else pad_words

    # Get POS tags for tweets and save as a string
    progress = tqdm(data)
    progress.set_postfix_str('PoS features extraction: tokenization.')
    pos_sentences = [get_pos_string(tweet) for tweet in progress]

    pos_tags = [fit_to_length(pos_string.split(' '), length) for pos_string in pos_sentences]

    progress = tqdm(pos_tags)
    progress.set_postfix_str('PoS features extraction: vectorization.')
    return [[ft_pos_model[tag] for tag in tags] for tags in progress]

In [15]:
# Embed every tweet's PoS tags, padded/truncated to the median tag count.
pos_features = get_pos_fts(tweets, opt_pos_length)

HBox(children=(IntProgress(value=0, max=24783), HTML(value=u'')))




HBox(children=(IntProgress(value=0, max=24783), HTML(value=u'')))




In [16]:
# Inspect the first tweet: a list holding one embedding array per PoS slot.
pos_features[0]

[array([-0.04133609,  0.00908666, -0.09434811,  0.05086514, -0.02806712,
        -0.04504529,  0.16762328, -0.03059581,  0.09255946, -0.14315666,
         0.03398997, -0.00289617, -0.00329385,  0.10632594,  0.01252081,
         0.03013656,  0.01539553, -0.00982873,  0.08559413, -0.04518993,
        -0.03659159,  0.02406482, -0.0420054 , -0.1627055 ,  0.08733887,
        -0.23019423,  0.21657053, -0.20851907, -0.1380061 ,  0.14944491,
        -0.0258622 , -0.03285831, -0.11333527, -0.23119052, -0.19008468,
         0.06052198,  0.09640078, -0.08128897,  0.059801  ,  0.0597319 ,
         0.01948337,  0.13435997,  0.11074282,  0.11328635,  0.06170149,
         0.13687232, -0.03548734, -0.07128105, -0.28651482,  0.07859058,
        -0.11782619,  0.23164546, -0.00638825,  0.09298854, -0.06635772,
        -0.0613602 , -0.03253764,  0.09136543,  0.10448191,  0.14441657,
        -0.08110224,  0.13090526,  0.1396035 ,  0.15411678, -0.15639092,
         0.04049598, -0.00945679,  0.02626096,  0.1

In [17]:
# Shape check: (tweets, word-token slots, embedding size).
np.array(wordtoken_features).shape

(24783, 13, 200)

In [18]:
# Shape check: (tweets, PoS slots, embedding size).
np.array(pos_features).shape

(24783, 12, 200)

### All features

In [19]:
# Join both feature sets along axis 1: for each tweet the word-token rows come
# first, followed by the PoS rows.
features = np.concatenate([wordtoken_features, pos_features], axis=1)

In [20]:
# Axis 1 should now hold the word-token slots plus the PoS slots.
features.shape

(24783, 25, 200)

In [21]:
# Combined feature matrix of the first tweet.
features[0]

array([[-0.00588701, -0.00635683,  0.00721236, ..., -0.00499792,
         0.01916229,  0.01112006],
       [-0.01280384, -0.01386348,  0.03446244, ..., -0.00385264,
         0.03250621,  0.03051583],
       [-0.00197909, -0.00947993,  0.02308215, ..., -0.00857643,
         0.03753267,  0.03181882],
       ...,
       [-0.02771328,  0.04237166, -0.07771615, ...,  0.08031926,
        -0.12693454, -0.09340674],
       [ 0.01038093,  0.01979677, -0.2951925 , ..., -0.06210161,
        -0.02076587, -0.10863646],
       [-0.04137733, -0.12971084,  0.0035724 , ..., -0.06720112,
        -0.01565731, -0.03894983]], dtype=float32)

## Save features & labels

In [22]:
# Persist the combined features, the labels and the number of word-token rows
# (axis 1 of the word-token features) in a klepto directory archive.
saved_items = {
    'features': features,
    'labels': labels,
    'wt_num': np.array(wordtoken_features).shape[1],
}
archive = dir_archive('hsd/DavidsonEtAl/X_y_{}'.format(MODEL), saved_items, serialized=True)
archive.dump()
del archive

# Hate Speech Detector - EN - Features extraction for EN test data

In [None]:
# Extract features for the EN test tweets in BATCHES chunks, saving one klepto
# archive per chunk.
BATCHES = 50

# These mirror the median lengths printed earlier in this notebook (13 and 12).
TOKENS_LENGTH = 13
POS_LENGTH = 12

if os.path.exists('tests/tweets_en.csv'):
    with open('tests/tweets_en.csv', 'r') as f:
        # [1:] skips the header row; the tweet text is the first column.
        raw_tweets = [d[0] for d in tqdm(list(csv.reader(f))[1:])]

    # Split into BATCHES contiguous slices of nearly equal size; the first r
    # slices receive one extra tweet. `range` replaces the Python-2-only `xrange`.
    q, r = divmod(len(raw_tweets), BATCHES)
    tweets_batches = [raw_tweets[i * q + min(i, r):(i + 1) * q + min(i + 1, r)] for i in range(BATCHES)]

    for batch, tweets_batch in enumerate(tweets_batches):
        print('Batch {}/{}'.format(batch+1, BATCHES))
        if not tweets_batch:
            # Happens when there are fewer tweets than batches; concatenating
            # empty feature lists along axis 1 would raise.
            print('Empty batch, skipped.')
            continue

        wt_features = get_wordtoken_fts(tweets_batch, length=TOKENS_LENGTH)
        p_features = get_pos_fts(tweets_batch, length=POS_LENGTH)

        all_features = np.concatenate([wt_features, p_features], axis=1)
        print('Done! Extracted dimensions: {}'.format(all_features.shape))

        # Zero-pad the batch index to three digits so archive names sort in order.
        batch_str = '{:03d}'.format(batch)
        archive = dir_archive('tests/en_{}/X_{}'.format(MODEL, batch_str), {'features': all_features,
                                                                            'wt_num': np.array(wt_features).shape[1]}, serialized=True)
        archive.dump()
        del archive
    print('All done!')
else:
    print('tests/tweets_en.csv not found, nothing extracted.')