In [None]:
from hatespeech import preprocessing
from hatespeech import evaluation

In [None]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import Embedding, Flatten, Dense, Dropout, LSTM, Conv1D, MaxPooling1D, Concatenate, Input
from keras.utils.np_utils import to_categorical
from keras import layers

In [None]:
import pickle
import numpy as np
from gensim.models import FastText
from sklearn.metrics import confusion_matrix

### Load Data

In [None]:
# Relative locations of the three CSV splits consumed by preprocessing.load_datasets.
train_path, dev_path, test_path = (
    'Data/Datasets/train_data.csv',
    'Data/Datasets/dev_data.csv',
    'Data/Datasets/test_data.csv',
)

In [None]:
# Read all three splits in a single call.
# NOTE(review): the fixed row slices used further down (0-12000 / 12000-15000 /
# 15000-18000) suggest that `texts` and `labels` hold the splits back to back in
# train, dev, test order -- confirm against preprocessing.load_datasets.
texts, labels, cnt = preprocessing.load_datasets(
    train_path,
    dev_path,
    test_path,
)

### Create Ngrams

In [None]:
# Sequence length handed to preprocessing.reshape and, later, to the n-gram model input.
maxlen_ngrams = 100

# Tokenise the texts into n-gram sequences.
# NOTE(review): `chars=4` presumably selects character 4-grams -- confirm in
# preprocessing.tokenize_texts_ngrams.
(sequences,
 word_index,
 mfws,
 max_words_ngrams) = preprocessing.tokenize_texts_ngrams(texts, ngrams=True, chars=4)

# Bring sequences and labels into array form for the model.
data_reshaped, labels_reshaped = preprocessing.reshape(
    sequences,
    labels,
    maxlen=maxlen_ngrams,
)

### Create Characters

In [None]:
# Sequence length handed to preprocessing.reshape_characters and to the character model input.
maxlen_chars = 280
# Size of the character embedding's vocabulary (input_dim of the Embedding layer).
# NOTE(review): 670 is a fixed value, not derived from word_index_chars -- confirm it still fits the data.
max_words_chars = 670

# Character-level tokenisation of the same texts.
texts_chars = preprocessing.tokenize_texts_characters(texts)

(data_reshaped_chars,
 labels_reshaped_chars,
 word_index_chars) = preprocessing.reshape_characters(
    texts_chars,
    labels,
    maxlen=maxlen_chars,
)

### Prepare Datasets and Embeddings

In [None]:
# Fixed, contiguous row ranges: train = first 12000, dev = next 3000, test = next 3000.
# Train and dev labels are one-hot encoded for the categorical cross-entropy loss.
x_train, y_train = data_reshaped[:12000], to_categorical(labels_reshaped[:12000])
x_dev, y_dev = data_reshaped[12000:15000], to_categorical(labels_reshaped[12000:15000])

# Test labels are left un-encoded: they are later compared directly with the
# predicted class indices (F1 scores and confusion matrix).
x_test, y_test = data_reshaped[15000:18000], labels_reshaped[15000:18000]

In [None]:
# Character-level inputs, cut at the same row boundaries as the n-gram split so
# that both model inputs stay aligned sample by sample.
x_train_chars, x_dev_chars, x_test_chars = (
    data_reshaped_chars[:12000],
    data_reshaped_chars[12000:15000],
    data_reshaped_chars[15000:18000],
)

In [None]:
def create_embedding_matrix(path, word_index, embdding_dim=300, save=False, max_words=None):
    """
    Builds an embedding matrix for a vocabulary from pretrained FastText vectors.

    Row ``i`` of the result holds the FastText vector of the word that
    ``word_index`` maps to ``i``. Rows for which no vector could be looked up
    (and rows no word maps to) stay all-zero.

    Args:
        path: Location of the FastText model, passed to
            ``FastText.load_fasttext_format``.
        word_index: Mapping ``word -> integer index``.
        embdding_dim: Number of columns of the matrix; must equal the
            dimensionality of the loaded vectors. The misspelled name is kept
            on purpose so existing keyword callers keep working.
        save: If True, the matrix is additionally pickled to
            ``embeddings_ngrams_small.p`` in the working directory.
        max_words: Number of rows of the matrix; words whose index is
            ``>= max_words`` are skipped. Defaults to the largest index in
            ``word_index`` plus one. Pass the vocabulary size of the Embedding
            layer the matrix is meant for so that the shapes match.

    Returns:
        numpy array of shape ``(max_words, embdding_dim)``.
    """
    # The original body read an undefined `embedding_dim`; bind it to the
    # (misspelled) parameter instead of renaming the public keyword.
    embedding_dim = embdding_dim
    if max_words is None:
        max_words = max(word_index.values(), default=0) + 1

    vectors = FastText.load_fasttext_format(path, encoding='utf-8')

    embedding_matrix = np.zeros((max_words, embedding_dim))
    for word, i in word_index.items():
        if i >= max_words:
            continue
        try:
            embedding_vector = vectors.wv[word]
        except KeyError:
            # No vector available: report it and leave the row at zero instead
            # of re-using the vector of the previously processed word.
            print(word, 'ist nicht enthalten.')
            continue
        embedding_matrix[i] = embedding_vector

    if save:
        with open("embeddings_ngrams_small.p", "wb") as pickle_file:
            pickle.dump(embedding_matrix, pickle_file)
    return embedding_matrix

In [None]:
def save_f1_scores(f1_array, output_file):
    """
    Writes F1 scores to a text file, one per line, followed by their mean.

    The file is overwritten. After the last score comes an empty line and a
    final ``Average: <mean>`` line without a trailing newline.
    """
    with open(output_file, 'w') as handle:
        handle.writelines("%s\n" % value for value in f1_array)
        handle.write("\n")
        handle.write("Average: %s" % np.mean(f1_array))

**Note**: The n-gram embedding matrix was built once from the FastText vectors with `create_embedding_matrix` (defined above) and then pickled, which is why it is loaded from the pickle file here. If the pickled file is missing or cannot be loaded, the matrix can be rebuilt with the same function.

In [None]:
# Path to the pretrained FastText model; only needed when the matrix is rebuilt.
path = r'G:\Fasttext\cc.en.300.bin\\cc.en.300.bin'

# Load the cached n-gram embedding matrix. The context manager closes the file
# handle again (the bare open() used before left it open).
# NOTE: pickle.load executes code embedded in the file -- only load a pickle
# you created yourself or otherwise trust.
with open("embeddings_ngrams_small.p", "rb") as embedding_file:
    embedding_matrix_ngrams = pickle.load(embedding_file)

# To rebuild the matrix from the FastText model instead of loading the cache:
#embedding_matrix_ngrams = create_embedding_matrix(path, word_index=word_index, save=True)

### Create Model

In [None]:
def create_mixed_model(max_words_ngrams, maxlen_ngrams, max_words_chars, maxlen_chars, embedding_matrix_ngrams, embedding_matrix_chars, embedding_dim=300):
    """
    Builds the (uncompiled) two-input model: an LSTM over n-gram embeddings
    combined with a small CNN over character embeddings.

    Both branches are concatenated, passed through dropout and classified by a
    3-way softmax layer. The model inputs are named 'ngrams' and 'characters'.

    Args:
        max_words_ngrams: Vocabulary size (input_dim) of the n-gram embedding.
        maxlen_ngrams: Length of the n-gram input sequences.
        max_words_chars: Vocabulary size (input_dim) of the character embedding.
        maxlen_chars: Length of the character input sequences.
        embedding_matrix_ngrams: Pretrained weights for the n-gram embedding,
            shape (max_words_ngrams, embedding_dim). When given, the layer is
            initialised with them and frozen; when None, the layer keeps its
            random initialisation and stays trainable.
        embedding_matrix_chars: Currently unused; the character embedding is
            always trained from scratch.
        embedding_dim: Output dimensionality of both embedding layers.

    Returns:
        The Keras ``Model`` taking ``[ngram_input, char_input]``.
    """
    # --- n-gram branch: embedding -> LSTM ---------------------------------
    # Input length follows the parameter instead of a hard-coded 100.
    ngram_input = Input(shape=(maxlen_ngrams,), name='ngrams')
    # Keep a handle on the layer itself so the pretrained weights can be set
    # without relying on its position in model.layers.
    ngram_embedding = Embedding(max_words_ngrams, embedding_dim, input_length=maxlen_ngrams)
    ngram_embeds = ngram_embedding(ngram_input)

    x = LSTM(5)(ngram_embeds)

    # --- character branch: embedding -> 3 x (conv, pool) -> dense ----------
    # Input length follows the parameter instead of a hard-coded 280.
    char_input = Input(shape=(maxlen_chars,), name='characters')
    char_embeds = Embedding(max_words_chars, embedding_dim, input_length=maxlen_chars)(char_input)

    y = Conv1D(16, 5, activation='relu')(char_embeds)
    y = MaxPooling1D(3)(y)
    y = Conv1D(32, 5, activation='relu')(y)
    y = MaxPooling1D(3)(y)
    y = Conv1D(64, 5, activation='relu')(y)
    y = MaxPooling1D(3)(y)
    y = Flatten()(y)
    y = Dense(5, activation='relu')(y)

    # --- merge both branches and classify ----------------------------------
    concat = Concatenate()([x, y])
    z = Dropout(0.5)(concat)
    output_tensor = Dense(3, activation='softmax')(z)

    model = Model([ngram_input, char_input], output_tensor)

    # Load and freeze the pretrained n-gram vectors. This happens before the
    # model is compiled, so the frozen state is picked up at compile time.
    if embedding_matrix_ngrams is not None:
        ngram_embedding.set_weights([embedding_matrix_ngrams])
        ngram_embedding.trainable = False

    return model

In [None]:
def fit_model(x_train, y_train, x_dev, y_dev, x_train_chars, x_dev_chars, model, epochs=15, batch_size=32):
    """
    Compiles a two-input model and fits it on the training set, validating on
    the dev set after every epoch.

    Args:
        x_train: N-gram training inputs (fed to the 'ngrams' input).
        y_train: Training targets for the categorical cross-entropy loss.
        x_dev: N-gram dev inputs.
        y_dev: Dev targets.
        x_train_chars: Character training inputs (fed to the 'characters' input).
        x_dev_chars: Character dev inputs.
        model: Model exposing ``compile`` and ``fit`` with inputs named
            'ngrams' and 'characters'.
        epochs: Number of training epochs (default 15).
        batch_size: Mini-batch size (default 32).

    Returns:
        Tuple ``(model, history)`` where ``history`` is whatever ``model.fit``
        returned.
    """
    model.compile(optimizer='rmsprop',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    history = model.fit({'ngrams': x_train, 'characters': x_train_chars},
                        y_train,
                        epochs=epochs,
                        batch_size=batch_size,
                        validation_data=({'ngrams': x_dev, 'characters': x_dev_chars}, y_dev))
    return model, history

In [None]:
def get_test_predictions(model, x_test, x_test_chars):
    """
    Predicts one class index per test sample with a two-input model.

    Args:
        model: Model whose ``predict`` accepts a dict with the keys 'ngrams'
            and 'characters' and returns one row of class scores per sample.
        x_test: N-gram test inputs.
        x_test_chars: Character test inputs.

    Returns:
        List of Python ints: for every sample the index of its highest score
        (the first such index on ties). Empty list if there are no predictions.
    """
    predictions = np.asarray(model.predict({'ngrams': x_test, 'characters': x_test_chars}))
    if predictions.size == 0:
        return []
    # Row-wise argmax replaces the per-row Python loop; .tolist() keeps the
    # return type a plain list of ints, as before.
    return np.argmax(predictions, axis=1).tolist()

### Start Testing

In [None]:
# Number of independent train/evaluate repetitions.
N_RUNS = 20

f1_array = []
# Sum of the per-run confusion matrices (3 classes).
cm = np.zeros(shape=(3, 3))

for i in range(N_RUNS):
    print(i)

    # A fresh model per run, so no weights carry over between repetitions.
    model = create_mixed_model(max_words_ngrams, maxlen_ngrams, max_words_chars, maxlen_chars, embedding_matrix_ngrams, embedding_matrix_chars=None, embedding_dim=300)
    model, history = fit_model(x_train, y_train, x_dev, y_dev, x_train_chars, x_dev_chars, model)

    predictions = get_test_predictions(model, x_test, x_test_chars)
    f1 = evaluation.print_f1_scores(y_test, predictions)
    f1_array.append(f1)

    # Pin the label set so every run yields a 3x3 matrix that can be added to
    # `cm`, even if some class never occurs in a run's labels or predictions.
    cnf_matrix = confusion_matrix(y_test, predictions, labels=[0, 1, 2])
    cm = cm + cnf_matrix


evaluation.plot_confusion_matrix(cm, classes=['Hassrede', 'Beleidigung', 'Neutral'], normalize=True,
                      title=' ')
#save_f1_scores(f1_array, 'mixed_ver_results.txt')