# Hate Speech Detector - PL - Features extraction for Conv. & LSTM model

Based on [this notebook](https://github.com/t-davidson/hate-speech-and-offensive-language/blob/master/classifier/final_classifier.ipynb).

In [1]:
import os
import csv
import pandas as pd
import numpy as np
import pickle
from klepto.archives import dir_archive

import sys
import string
import re
import fasttext

import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
%matplotlib inline

In [2]:
MODEL = 'lstm'
dim = 6*20 if MODEL == 'conv' else 200

## Poleval 2019 data loading

### Classes pre:
    0 - non-harmful
    1 - cyberbullying
    2 - hate speech

In [3]:
if not os.path.exists('hsd/Poleval2019/perfect_data.pkl'):
    with open('hsd/Poleval2019/train_texts.txt', 'r') as f:
        tweets = f.readlines()
    with open('hsd/Poleval2019/test_texts.txt', 'r') as f:
        tweets.extend(f.readlines())
    
    with open('hsd/Poleval2019/train_labels.txt', 'r') as f:
        labels = f.readlines()
    with open('hsd/Poleval2019/test_labels.txt', 'r') as f:
        labels.extend(f.readlines())
    
    with open('hsd/Poleval2019/perfect_data.pkl', 'w') as f:
        def chcl(c):
            return 0 if c=='0\r\n' else 1
        labels = list(map(chcl, labels))
        pickle.dump((tweets, labels), f)
else:
    with open('hsd/Poleval2019/perfect_data.pkl', 'rb') as f:
        tweets, labels = pickle.load(f)

UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 26: ordinal not in range(128)

### Classes post:
    0 - no hate
    1 - hate speech

In [None]:
list(zip(tweets[:5], labels[:5]))

## Features extraction

In [None]:
def preprocess(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE
    4) hashtags with HASHTAGHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    hashtag_regex = '#[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, '', parsed_text)
    parsed_text = re.sub(mention_regex, '', parsed_text)
    parsed_text = re.sub(hashtag_regex, '', parsed_text)
    return parsed_text

def pos(text):
    import morfeusz2
    morf = morfeusz2.Morfeusz()

    analysis = morf.analyse(line)
    
    return [interp[2] for i, j, interp in analysis]
    

def pad_words(words, length):
    if len(words) >= length:
        return words[:length]
    else:
        additional = length - len(words)
        return words + ['PUSTY']*additional

def adjust_words(words, length):
    # different from pad: output tokens may contain more than 1 words
    if len(words) >= length:
        q, r = divmod(len(words), length)
        return [' '.join(words[i * q + min(i, r):(i + 1) * q + min(i + 1, r)]) for i in xrange(length)]
    else:
        additional = length - len(words)
        return words + ['EMPTY']*additional

In [None]:
# if no morfeusz2 installed then save preprocessed tweets and load pos strings from outer source
sentences = [preprocess(t) for t in tweets]
with open('hsd/Poleval2019/preprocessed.pkl', 'w') as f:
    pickle.dump(sentences, f)

### Median sentences length

In [None]:
def median_sentences_length(data):
    all_lengths, wt_lengths, pos_lengths = [], [], []
    for d in tqdm(data):
        sentence = preprocess(d)
        all_lengths.append(len(sentence.split(' ')))
        wt_lengths.append(len(sentence.split(' ')))
        
        # only if morfeusz2 is installed
        '''all_lengths.append(len(pos(preprocess(d))))
        pos_lengths.append(len(pos(preprocess(d))))'''
    # otherwise load pos strings from outer source
    with open('hsd/Poleval2019/pos_sentences.pkl', 'r') as f:
        pos_sentences = pickle.load(f)
    for ps in tqdm(pos_sentences):
        all_lengths.append(len(ps.split(' ')))
        pos_lengths.append(len(ps.split(' ')))
    
    return int(np.median(all_lengths)), int(np.median(wt_lengths)), int(np.median(pos_lengths))

In [None]:
opt_length, opt_wt_length, opt_pos_length = median_sentences_length(tweets)

print('Optimal all length: {}'.format(opt_length))
print('Optimal sentence length: {}'.format(opt_wt_length))
print('Optimal pos sentence length: {}'.format(opt_pos_length))

### Supervised fastText wordtokens training

In [None]:
if not os.path.exists('hsd/Poleval2019/fasttext.ft'):
    with open('hsd/Poleval2019/fasttext.ft', 'a') as f:
        for t, l in list(zip(tweets, labels)):
            f.write('__label__{} {}\n'.format(l, preprocess(t)))

# load fasttext model or train & save if none
if os.path.exists('hsd/Poleval2019/fasttext_{}.bin'.format(MODEL)):
    ft_model = fasttext.load_model('hsd/Poleval2019/fasttext_{}.bin'.format(MODEL))
else:
    ft_model = fasttext.train_supervised('hsd/Poleval2019/fasttext.ft',
                                         lr=0.5, epoch=50, wordNgrams=3, dim=dim)
    ft_model.save_model('hsd/Poleval2019/fasttext_{}.bin'.format(MODEL))

### Wordtoken features

In [None]:
def get_wordtoken_fts(data, length, adjust=False):
    
    sentences_words = []
    t = tqdm(data)
    t.set_postfix_str('Wordtokens features extraction: tokenization.')
    for d in t:
        sentence = preprocess(d)
        sentences_words.append(sentence.split(' '))
    
    if adjust:
        sentences_words = [adjust_words(sw, length) for sw in sentences_words]
    else:
        sentences_words = [pad_words(sw, length) for sw in sentences_words]
    
    ft_matrices = []
    t = tqdm(sentences_words)
    t.set_postfix_str('Wordtokens features extraction: vectorization.')
    for sw in t:
        ft_matrix = []
        for w in sw:
            ft_matrix.append(ft_model[w])
        ft_matrices.append(ft_matrix)
    
    return ft_matrices

In [None]:
wordtoken_features = get_wordtoken_fts(tweets, opt_wt_length)

In [None]:
wordtoken_features[0]

### Supervised fastText wordtokens training

In [None]:
if not os.path.exists('hsd/Poleval2019/fasttext_pos.ft'):
    # only if morfeusz2 is installed
    '''with open('hsd/Poleval2019/fasttext_pos.ft', 'a') as f:
        for t, l in list(zip(tweets, labels)):
            f.write('__label__{} {}\n'.format(l, pos(t)))'''
    # otherwise load pos strings from outer source
    with open('hsd/Poleval2019/pos_sentences.pkl', 'r') as f:
        pos_sentences = pickle.load(f)
    with open('hsd/Poleval2019/fasttext_pos.ft', 'a') as f:
        for ps, l in list(zip(pos_sentences, labels)):
            f.write('__label__{} {}\n'.format(l, ps))
        

# load fasttext pos model or train & save if none
if os.path.exists('hsd/Poleval2019/fasttext_pos_{}.bin'.format(MODEL)):
    ft_pos_model = fasttext.load_model('hsd/Poleval2019/fasttext_pos_{}.bin'.format(MODEL))
else:
    ft_pos_model = fasttext.train_supervised('hsd/Poleval2019/fasttext_pos.ft',
                                             lr=0.5, epoch=50, wordNgrams=3, dim=dim)
    ft_pos_model.save_model('hsd/Poleval2019/fasttext_pos_{}.bin'.format(MODEL))

### Part of speech (PoS) features

In [None]:
def get_pos_fts(data, length, adjust=False, batch_data=None):
    
    # only if morfeusz2 is installed
    '''pos_sentences = [pos(sentence) for sentence in tqdm(sentences)]'''
    # otherwise load pos strings from outer source
    if adjust and batch_data:  # for test data
        pos_sentences = batch_data
    else:
        with open('hsd/Poleval2019/pos_sentences.pkl', 'r') as f:
            pos_sentences = pickle.load(f)
    
    
    pos_tags = []
    for ps in pos_sentences:
        pos_tags.append(ps.split(' '))
    
    if adjust:
        pos_tags = [adjust_words(pt, length) for pt in pos_tags]
    else:
        pos_tags = [pad_words(pt, length) for pt in pos_tags]
    
    ft_matrices = []
    t = tqdm(pos_tags)
    t.set_postfix_str('PoS features extraction: tokenization.')
    for pt in t:
        ft_matrix = []
        for t in pt:
            ft_matrix.append(ft_pos_model[t])
        ft_matrices.append(ft_matrix)
    
    return ft_matrices

In [None]:
pos_features = get_pos_fts(tweets, opt_pos_length)

In [None]:
pos_features[0]

In [None]:
np.array(wordtoken_features).shape

In [None]:
np.array(pos_features).shape

### All features

In [None]:
#Now join them all up
features = np.concatenate([wordtoken_features, pos_features],axis=1)

In [None]:
features.shape

In [None]:
features[0]

## Save features & labels

In [None]:
archive = dir_archive('hsd/Poleval2019/X_y_{}'.format(MODEL), {'features': features, 'labels': labels,
                                                                'wt_num': np.array(wordtoken_features).shape[1]}, serialized=True)
archive.dump()
del archive

# Hate Speech Detector - PL - Features extraction for PL test data

In [None]:
BATCHES = 1

TOKENS_LENGTH = 13
POS_LENGTH = 27

if os.path.exists('tests/tweets_pl.csv'):
    with open('tests/tweets_pl.csv', 'r') as f:
        raw_tweets = [d[0] for d in tqdm(list(csv.reader(f))[1:])]
    batch_len = len(raw_tweets)/BATCHES
    
    # for polish test tweets
    # if no morfeusz2 installed then save preprocessed tweets and load pos strings from outer source
    sentences = [preprocess(t) for t in raw_tweets]
    with open('tests/preprocessed_pl.pkl', 'w') as f:
        pickle.dump(sentences, f)
    
    if not os.path.exists('tests/pos_sentences_pl.pkl'):
        raise Exception('Stay awhile! Use morfeusz2 and get pos features.')
    
    with open('tests/pos_sentences_pl.pkl', 'r') as f:
        pos_sentences = pickle.load(f)
    
    q, r = divmod(len(raw_tweets), BATCHES)
    tweets_batches = [raw_tweets[i * q + min(i, r):(i + 1) * q + min(i + 1, r)] for i in xrange(BATCHES)]
    pos_batches = [pos_sentences[i * q + min(i, r):(i + 1) * q + min(i + 1, r)] for i in xrange(BATCHES)]
    
    for batch in range(BATCHES):
        print('Batch {}/{}'.format(batch+1, BATCHES))
        tweets_batch = tweets_batches[batch]
        pos_batch = pos_batches[batch]
    
        wt_features = get_wordtoken_fts(tweets_batch, length=TOKENS_LENGTH)
        p_features = get_pos_fts(tweets_batch, length=POS_LENGTH, batch_data=pos_batch)

        all_features = np.concatenate([wt_features, p_features], axis=1)
        print('Done! Extracted dimensions: {}'.format(all_features.shape))
        
        batch_str = str(batch) if batch >= 100 else '0'+str(batch) if batch >= 10 else '00'+str(batch)
        archive = dir_archive('tests/pl_{}/X_{}'.format(MODEL, batch_str), {'features': all_features,
                                                                            'wt_num': np.array(wt_features).shape[1]}, serialized=True)
        archive.dump()
        del archive
    print('All done!')