# Hate Speech Detector - PL - Features extraction for SVM & Dense model

Based on [this notebook](https://github.com/t-davidson/hate-speech-and-offensive-language/blob/master/classifier/final_classifier.ipynb).

In [1]:
import os
import csv
import pandas as pd
import numpy as np
import pickle
from klepto.archives import dir_archive

import sys
import nltk
import string
import re
import fasttext
from polyglot.text import Text
import syllables as sylla

import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
%matplotlib inline

In [2]:
# Target model; it decides the fastText embedding size used below:
# 10 dimensions for the SVM, 200 for anything else.
MODEL = 'dense'
if MODEL == 'svm':
    dim = 10
else:
    dim = 200

## Poleval 2019 data loading

### Classes pre:
    0 - non-harmful
    1 - cyberbullying
    2 - hate speech

In [3]:
# Load the cached (tweets, labels) pair, or build it from the raw PolEval 2019
# text files on the first run and cache it for later runs.
if not os.path.exists('hsd/Poleval2019/perfect_data.pkl'):
    with open('hsd/Poleval2019/train_texts.txt', 'r') as f:
        tweets = f.readlines()
    with open('hsd/Poleval2019/test_texts.txt', 'r') as f:
        tweets.extend(f.readlines())

    with open('hsd/Poleval2019/train_labels.txt', 'r') as f:
        labels = f.readlines()
    with open('hsd/Poleval2019/test_labels.txt', 'r') as f:
        labels.extend(f.readlines())

    def chcl(c):
        """Collapse a raw label line to binary: '0' stays 0, anything else becomes 1."""
        # strip() keeps the comparison independent of the line terminator, so a
        # '0' followed by '\n', '\r\n' or nothing at all is still class 0.
        return 0 if c.strip() == '0' else 1

    # Relabel before the cache file is opened: a failure here must not leave an
    # empty pickle behind that later runs would try to load.
    labels = list(map(chcl, labels))

    # Pickle streams are bytes, so the cache is written and read in binary mode.
    with open('hsd/Poleval2019/perfect_data.pkl', 'wb') as f:
        pickle.dump((tweets, labels), f)
else:
    with open('hsd/Poleval2019/perfect_data.pkl', 'rb') as f:
        tweets, labels = pickle.load(f)

### Classes post:
    0 - no hate
    1 - hate speech

In [4]:
# Show the first five (tweet, label) pairs.
list(zip(tweets[:5], labels[:5]))

[('Dla mnie faworytem do tytu\xc5\x82u b\xc4\x99dzie Cracovia. Zobaczymy, czy typ si\xc4\x99 sprawdzi.\r\n',
  0),
 ('@anonymized_account @anonymized_account Brawo ty Daria kibic ma by\xc4\x87 na dobre i z\xc5\x82e\r\n',
  0),
 ('@anonymized_account @anonymized_account Super, polski premier sk\xc5\x82ada kwiaty na grobach kolaborant\xc3\xb3w. Ale doczekali\xc5\x9bmy czas\xc3\xb3w.\r\n',
  0),
 ('@anonymized_account @anonymized_account Musi. Innej drogi nie mamy.\r\n',
  0),
 ('Odrzut natychmiastowy, kwa\xc5\x9bna mina, mam problem\r\n', 0)]

## Features extraction

In [5]:
def preprocess(text_string):
    """
    Strip Twitter-specific noise from a text string.

    Applied in this order:
    1) every run of whitespace is collapsed into a single space
    2) urls are removed
    3) mentions (@name) are removed
    4) hashtags (#tag) are removed

    Matches are replaced with an empty string (no placeholder is
    inserted), so the spaces that surrounded a removed item are kept.

    Returns the cleaned string.
    """
    # Raw strings: the patterns contain backslash escapes meant for the regex
    # engine, not for the Python string parser.
    space_pattern = r'\s+'
    giant_url_regex = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = r'@[\w\-]+'
    hashtag_regex = r'#[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, '', parsed_text)
    parsed_text = re.sub(mention_regex, '', parsed_text)
    parsed_text = re.sub(hashtag_regex, '', parsed_text)
    return parsed_text

def pos(text):
    """
    Analyse `text` with morfeusz2 and return one entry per interpretation.

    Each entry is element 2 of the interpretation tuple returned by
    `Morfeusz.analyse` (presumably the morphosyntactic tag; confirm against
    the morfeusz2 binding in use).
    """
    # Imported lazily so the notebook still runs when morfeusz2 is missing
    # and this function is never called.
    import morfeusz2
    morf = morfeusz2.Morfeusz()

    # The original analysed an undefined name `line`; the parameter is `text`.
    analysis = morf.analyse(text)

    return [interp[2] for i, j, interp in analysis]

def adjust_words(words, length):
    """
    Fit a token list to exactly `length` items.

    When there are at least `length` tokens they are split into `length`
    contiguous groups of near-equal size (the first `len(words) % length`
    groups get one extra token) and each group is joined with a space, so an
    output item may hold more than one word. This differs from `pad_words`,
    which truncates. Shorter lists are padded on the right with 'EMPTY'.
    """
    if len(words) >= length:
        q, r = divmod(len(words), length)
        # range() instead of the Python-2-only xrange(); same values on both.
        return [' '.join(words[i * q + min(i, r):(i + 1) * q + min(i + 1, r)]) for i in range(length)]
    else:
        additional = length - len(words)
        return words + ['EMPTY']*additional

def pad_words(words, length):
    """
    Force a token list to `length` items: longer lists are cut to the first
    `length` tokens, shorter ones are filled up on the right with 'PUSTY'.
    """
    missing = length - len(words)
    if missing > 0:
        return words + ['PUSTY'] * missing
    return words[:length]

def count_twitter_objs(text_string):
    """
    Count the urls, mentions and hashtags in a text string.

    The text is normalised first:
    1) lots of whitespace is collapsed into one instance
    2) urls are replaced with URLHERE
    3) mentions are replaced with MENTIONHERE
    4) hashtags are replaced with HASHTAGHERE

    The placeholders are then counted, which gives standardized counts
    without caring about the specific people or tags mentioned.

    Returns a tuple (url_count, mention_count, hashtag_count).
    """
    # Raw strings: the patterns contain backslash escapes meant for the regex
    # engine, not for the Python string parser.
    space_pattern = r'\s+'
    giant_url_regex = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = r'@[\w\-]+'
    hashtag_regex = r'#[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
    parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
    parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)
    return(parsed_text.count('URLHERE'),parsed_text.count('MENTIONHERE'),parsed_text.count('HASHTAGHERE'))

### Supervised fastText wordtokens training

In [6]:
# Write the training set once, one "__label__<label> <cleaned tweet>" line per tweet.
if not os.path.exists('hsd/Poleval2019/fasttext.ft'):
    with open('hsd/Poleval2019/fasttext.ft', 'a') as f:
        for t, l in zip(tweets, labels):
            f.write('__label__{} {}\n'.format(l, preprocess(t)))

# Reuse the saved model for this MODEL setting; train and save it when missing.
ft_model_path = 'hsd/Poleval2019/fasttext_{}.bin'.format(MODEL)
if not os.path.exists(ft_model_path):
    ft_model = fasttext.train_supervised('hsd/Poleval2019/fasttext.ft',
                                         lr=0.5, epoch=50, wordNgrams=3, dim=dim)
    ft_model.save_model(ft_model_path)
else:
    ft_model = fasttext.load_model(ft_model_path)




### Wordtoken features

In [7]:
def get_wordtoken_fts(data, length=None):
    """
    Turn every text in `data` into one flat list of fastText values.

    Each text is cleaned with `preprocess` and split on single spaces. With a
    truthy `length` the tokens are regrouped/padded by `adjust_words`;
    otherwise they are cut/padded by `pad_words` to the median token count of
    `data`. The `ft_model` vectors of the resulting tokens are concatenated
    in order.

    Returns a list with one flat vector (a list) per input text.
    """
    progress = tqdm(data)
    progress.set_postfix_str('Wordtokens features extraction: tokenization.')
    tokenized = [preprocess(item).split(' ') for item in progress]

    if not length:
        opt_length = int(np.median([len(tokens) for tokens in tokenized]))
        print('Optimal median length: {}'.format(opt_length))
        tokenized = [pad_words(tokens, opt_length) for tokens in tokenized]
    else:
        tokenized = [adjust_words(tokens, length) for tokens in tokenized]
        print('Required length: {}'.format(length))

    progress = tqdm(tokenized)
    progress.set_postfix_str('Wordtokens features extraction: vectorization.')
    return [[value for token in tokens for value in ft_model[token]]
            for tokens in progress]

In [8]:
# Word-token features for every tweet; no length is passed, so tweets are
# cut/padded to the median token count.
wordtoken_features = get_wordtoken_fts(tweets)

HBox(children=(IntProgress(value=0, max=11041), HTML(value=u'')))


Optimal median length: 13


HBox(children=(IntProgress(value=0, max=11041), HTML(value=u'')))




In [9]:
# Inspect the flat word-token vector of the first tweet.
wordtoken_features[0]

[-0.015512926,
 -0.0026072965,
 -0.06923173,
 -0.033796422,
 0.095874764,
 -0.0026384613,
 -0.031722605,
 -0.012471147,
 0.1740663,
 0.14314674,
 0.02551314,
 0.06822865,
 0.038870316,
 0.050582696,
 -0.06462112,
 0.021364849,
 -0.017295238,
 0.04671716,
 0.048430618,
 -0.020492287,
 0.043160345,
 0.111538,
 0.05919135,
 -0.123705864,
 0.004580482,
 -0.022507176,
 0.07859944,
 -0.03323759,
 -0.03462369,
 0.16817991,
 -0.04833367,
 0.0096064815,
 -0.13695371,
 -0.09531137,
 -0.12481745,
 -0.027561303,
 0.08612766,
 -0.06865149,
 -0.032448597,
 -0.043504033,
 0.0346101,
 0.11603275,
 0.107247874,
 0.046616975,
 0.02244389,
 -0.06148051,
 -0.023481231,
 0.012245244,
 -0.060820606,
 0.062400904,
 -0.13959157,
 0.15588847,
 -0.10509294,
 0.06942574,
 -0.0137870535,
 -0.099674284,
 0.010179095,
 -0.030308815,
 -0.016198218,
 0.0971291,
 -0.03430514,
 0.11718732,
 0.056701344,
 0.0893687,
 -0.01929071,
 0.06670288,
 -0.08351892,
 0.09160725,
 0.055442102,
 0.010097044,
 -0.15358204,
 -0.05286

### Supervised fastText PoS training

In [10]:
# If morfeusz2 is not installed, save the preprocessed tweets so the PoS
# strings can be produced by an outer source and loaded back later.
sentences = [preprocess(t) for t in tweets]
# Pickle streams are bytes, so the file is opened in binary mode.
with open('hsd/Poleval2019/preprocessed.pkl', 'wb') as f:
    pickle.dump(sentences, f)

In [11]:
if not os.path.exists('hsd/Poleval2019/fasttext_pos.ft'):
    # Only if morfeusz2 is installed, the training file could be written directly:
    #     with open('hsd/Poleval2019/fasttext_pos.ft', 'a') as f:
    #         for t, l in list(zip(tweets, labels)):
    #             f.write('__label__{} {}\n'.format(l, pos(t)))
    # Otherwise load the PoS strings from an outer source.
    # NOTE(review): unpickling can execute arbitrary code; only load a
    # pos_sentences.pkl that comes from a trusted run.
    # Pickle streams are bytes, so the file is opened in binary mode.
    with open('hsd/Poleval2019/pos_sentences.pkl', 'rb') as f:
        pos_sentences = pickle.load(f)
    # One "__label__<label> <PoS string>" line per tweet.
    with open('hsd/Poleval2019/fasttext_pos.ft', 'a') as f:
        for ps, l in zip(pos_sentences, labels):
            f.write('__label__{} {}\n'.format(l, ps))


# load fasttext pos model or train & save if none
if os.path.exists('hsd/Poleval2019/fasttext_pos_{}.bin'.format(MODEL)):
    ft_pos_model = fasttext.load_model('hsd/Poleval2019/fasttext_pos_{}.bin'.format(MODEL))
else:
    ft_pos_model = fasttext.train_supervised('hsd/Poleval2019/fasttext_pos.ft',
                                             lr=0.5, epoch=50, wordNgrams=3, dim=dim)
    ft_pos_model.save_model('hsd/Poleval2019/fasttext_pos_{}.bin'.format(MODEL))




### Part of speech (PoS) features

In [12]:
def get_pos_fts(data, length=None, batch_data=None):
    """
    Turn PoS strings into one flat list of fastText values per sentence.

    `data` is not read by this function. The PoS strings come from
    `batch_data` when both `length` and `batch_data` are truthy (test data);
    in every other case they are loaded from
    'hsd/Poleval2019/pos_sentences.pkl'.

    Each PoS string is split on single spaces. With a truthy `length` the
    tags are regrouped/padded by `adjust_words`; otherwise they are
    cut/padded by `pad_words` to the median tag count. The `ft_pos_model`
    vectors of the resulting tags are concatenated in order.

    Returns a list with one flat vector (a list) per PoS string.
    """
    # Only if morfeusz2 is installed, the tags could be computed directly:
    #     pos_sentences = [pos(sentence) for sentence in tqdm(sentences)]
    # Otherwise the PoS strings come from an outer source.
    if length and batch_data:  # for test data
        pos_sentences = batch_data
    else:
        # Pickle streams are bytes, so the file is opened in binary mode.
        with open('hsd/Poleval2019/pos_sentences.pkl', 'rb') as f:
            pos_sentences = pickle.load(f)

    pos_tags = [ps.split(' ') for ps in pos_sentences]

    if length:
        pos_tags = [adjust_words(pt, length) for pt in pos_tags]
        print('Required length: {}'.format(length))
    else:
        opt_length = int(np.median([len(pt) for pt in pos_tags]))
        print('Optimal median length: {}'.format(opt_length))
        pos_tags = [pad_words(pt, opt_length) for pt in pos_tags]

    ft_vectors = []
    progress = tqdm(pos_tags)
    # This loop vectorizes tags that are already split, so the progress text
    # says so (it used to read "tokenization").
    progress.set_postfix_str('PoS features extraction: vectorization.')
    for pt in progress:
        ft_vector = []
        # A separate name for the tag keeps the progress bar handle intact.
        for tag in pt:
            ft_vector.extend(ft_pos_model[tag])
        ft_vectors.append(ft_vector)

    return ft_vectors

In [13]:
# PoS features for the training set. Without batch_data, get_pos_fts does not
# read its first argument and loads the PoS strings from pos_sentences.pkl.
pos_features = get_pos_fts(tweets)

Optimal median length: 27


HBox(children=(IntProgress(value=0, max=11041), HTML(value=u'')))




In [14]:
# Inspect the flat PoS vector of the first tweet.
pos_features[0]

[0.095117204,
 -0.0020177192,
 -0.083730265,
 -0.07730704,
 0.16947564,
 -0.04341364,
 -0.11852013,
 -0.09357776,
 0.25111187,
 0.21683273,
 -0.032175027,
 0.22070137,
 -0.017235378,
 0.06428934,
 -0.14010037,
 0.03259269,
 0.106180094,
 -0.093390636,
 0.08206671,
 0.0723995,
 0.04248942,
 0.09249842,
 0.08466029,
 -0.1960741,
 -0.04869621,
 -0.122711584,
 0.1836747,
 -0.11195623,
 0.08116391,
 0.34863436,
 -0.0035521043,
 -0.060784813,
 -0.17950287,
 0.0074286805,
 -0.24655534,
 -0.044192895,
 0.13288441,
 0.11346913,
 -0.03868216,
 0.018759185,
 -0.045990612,
 0.07301381,
 0.23778106,
 0.17884853,
 0.029925516,
 -0.16210057,
 0.13870086,
 0.14593707,
 0.03734735,
 0.14395522,
 -0.11735986,
 0.15883799,
 -0.11085931,
 0.07745205,
 -0.13142869,
 -0.07569433,
 -0.05521636,
 -0.0971961,
 -0.06669872,
 0.17593294,
 0.14186819,
 0.2102134,
 0.026650526,
 0.11786988,
 -0.054732822,
 0.053063396,
 -0.29811487,
 0.07922049,
 0.048171725,
 0.042521656,
 -0.0855644,
 -0.03641211,
 0.0121240765,

### Other features

In [15]:
def get_other_fts(data):
    """
    Build the hand-crafted feature row of every tweet in `data`.

    These include sentiment counts, text and readability scores, as well as
    Twitter specific features. Each row holds 16 values in this order:
    FKRA, FRE, syllables, avg_syl, num_chars, num_chars_total, num_terms,
    num_words, num_unique_terms, neg_cnt, neu_cnt, pos_cnt, hashtag count,
    mention count, url count, retweet flag.

    Returns a numpy array with one row per tweet.
    """
    # polyglot polarity value -> counter it increments; built once, not per word.
    mapping = {-1: 'neg_cnt', 0: 'neu_cnt', 1: 'pos_cnt'}

    other_features = []
    t = tqdm(data)
    t.set_postfix_str('Other features extraction.')
    for tweet in t:
        sent_analysis = Text(tweet)
        sentiment = {'neg_cnt': 0, 'neu_cnt': 0, 'pos_cnt': 0}
        for w in sent_analysis.words:
            try:
                sentiment[mapping[w.polarity]] += 1
            # Tuple form: "except ValueError, UnicodeError:" is a syntax error
            # on Python 3 and on Python 2 binds the caught exception to the
            # name UnicodeError instead of naming a second exception type.
            except (ValueError, UnicodeError):
                # A word whose polarity cannot be read counts as neutral.
                sentiment['neu_cnt'] += 1

        words = preprocess(tweet) #Get text only

        syllables = sylla.estimate(words)
        # `words` is a string, so this is the character count of the cleaned text.
        num_chars = len(words)
        num_chars_total = len(tweet)
        num_terms = len(tweet.split())
        num_words = len(words.split())
        # The 0.001 terms keep the division defined for a tweet with no words.
        avg_syl = round(float((syllables+0.001))/float(num_words+0.001),4)
        num_unique_terms = len(set(words.split()))

        ###Modified FK grade, where avg words per sentence is just num words/1
        FKRA = round(float(0.39 * float(num_words)/1.0) + float(11.8 * avg_syl) - 15.59, 1)
        ##Modified FRE score, where sentence fixed to 1
        FRE = round(206.835 - 1.015*(float(num_words)/1.0) - (84.6*float(avg_syl)), 2)

        twitter_objs = count_twitter_objs(tweet)
        # NOTE(review): this is a substring test on the cleaned text and yields
        # 0 when "rt" occurs, 1 otherwise. Kept as is so the values match
        # features that were already extracted; confirm the polarity is intended.
        retweet = 0 if "rt" in words else 1
        features = [FKRA, FRE, syllables, avg_syl, num_chars, num_chars_total, num_terms, num_words, num_unique_terms,
                    sentiment['neg_cnt'], sentiment['neu_cnt'], sentiment['pos_cnt'],
                    twitter_objs[2], twitter_objs[1],
                    twitter_objs[0], retweet]
        other_features.append(features)

    return np.array(other_features)

In [16]:
# Hand-crafted (sentiment, readability, Twitter) features for every tweet.
other_features = get_other_fts(tweets)

HBox(children=(IntProgress(value=0, max=11041), HTML(value=u'')))

No handlers could be found for logger "polyglot.detect.base"





In [17]:
# Inspect the feature rows of the first five tweets.
other_features[:5]

array([[ 11.7   ,  32.51  ,  23.    ,   1.9166,  82.    ,  83.    ,
         12.    ,  12.    ,  12.    ,   0.    ,  15.    ,   0.    ,
          0.    ,   0.    ,   0.    ,   1.    ],
       [  4.8   ,  78.25  ,  14.    ,   1.4   ,  47.    ,  86.    ,
         12.    ,  10.    ,  10.    ,   1.    ,  12.    ,   1.    ,
          0.    ,   2.    ,   0.    ,   1.    ],
       [ 14.4   ,  11.1   ,  24.    ,   2.1817,  92.    , 131.    ,
         13.    ,  11.    ,  11.    ,   0.    ,  17.    ,   1.    ,
          0.    ,   2.    ,   0.    ,   1.    ],
       [  5.2   ,  66.41  ,   8.    ,   1.5999,  30.    ,  69.    ,
          7.    ,   5.    ,   5.    ,   0.    ,  11.    ,   0.    ,
          0.    ,   2.    ,   0.    ,   1.    ],
       [ 12.3   ,  17.46  ,  13.    ,   2.1665,  49.    ,  50.    ,
          6.    ,   6.    ,   6.    ,   1.    ,   6.    ,   1.    ,
          0.    ,   0.    ,   0.    ,   1.    ]])

### All features and feature names

In [18]:
# Join the word-token, PoS and other feature blocks column-wise (axis=1), one row per tweet.
features = np.concatenate([wordtoken_features, pos_features, other_features],axis=1)

In [19]:
# Check the dimensions of the joined feature matrix.
features.shape

(11041, 8016)

## Save features & labels

In [20]:
# Store features and labels in a klepto directory archive named after MODEL,
# then drop the archive handle.
archive = dir_archive('hsd/Poleval2019/X_y_{}'.format(MODEL), {'features': features, 'labels': labels}, serialized=True)
archive.dump()
del archive

# Hate Speech Detector - PL - Features extraction for PL test data

In [None]:
# Number of batches the test tweets are split into; one archive is written per batch.
BATCHES = 1

# Fixed sequence lengths for the test data; they equal the median lengths
# printed when the training features were extracted above.
TOKENS_LENGTH = 13
POS_LENGTH = 27

if os.path.exists('tests/tweets_pl.csv'):
    with open('tests/tweets_pl.csv', 'r') as f:
        # The first CSV row is skipped; the tweet text is the first column.
        raw_tweets = [d[0] for d in tqdm(list(csv.reader(f))[1:])]
    # Not read anywhere else in this cell.
    batch_len = len(raw_tweets)/BATCHES

    # for polish test tweets
    # if no morfeusz2 installed then save preprocessed tweets and load pos strings from outer source
    sentences = [preprocess(t) for t in raw_tweets]
    # Pickle streams are bytes, so both pickle files are opened in binary mode.
    with open('tests/preprocessed_pl.pkl', 'wb') as f:
        pickle.dump(sentences, f)

    if not os.path.exists('tests/pos_sentences_pl.pkl'):
        raise Exception('Stay awhile! Use morfeusz2 and get pos features.')

    with open('tests/pos_sentences_pl.pkl', 'rb') as f:
        pos_sentences = pickle.load(f)

    # Split both lists into BATCHES contiguous, near-equal slices; the first
    # `r` slices get one extra item. The same bounds are applied to tweets and
    # PoS strings so the two stay aligned.
    q, r = divmod(len(raw_tweets), BATCHES)
    bounds = [(i * q + min(i, r), (i + 1) * q + min(i + 1, r)) for i in range(BATCHES)]
    tweets_batches = [raw_tweets[start:stop] for start, stop in bounds]
    pos_batches = [pos_sentences[start:stop] for start, stop in bounds]

    for batch in range(BATCHES):
        print('Batch {}/{}'.format(batch+1, BATCHES))
        tweets_batch = tweets_batches[batch]
        pos_batch = pos_batches[batch]

        wt_features = get_wordtoken_fts(tweets_batch, length=TOKENS_LENGTH)
        p_features = get_pos_fts(tweets_batch, length=POS_LENGTH, batch_data=pos_batch)
        o_features = get_other_fts(tweets_batch)

        all_features = np.concatenate([wt_features, p_features, o_features], axis=1)
        print('Done! Extracted dimensions: {}'.format(all_features.shape))

        # Batch index zero-padded to at least three digits (000, 001, ..., 010, ..., 100).
        batch_str = '{:03d}'.format(batch)
        archive = dir_archive('tests/pl_{}/X_{}'.format(MODEL, batch_str), {'features': all_features}, serialized=True)
        archive.dump()
        del archive
    print('All done!')