In [None]:
from hatespeech import preprocessing
from hatespeech import evaluation

In [None]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, Dropout, LSTM, Conv1D, MaxPooling1D, Add
from keras.utils.np_utils import to_categorical
from keras import Input, layers, Model

In [None]:
import pickle
import numpy as np
from gensim.models import FastText
from sklearn.metrics import confusion_matrix

### Load Data

In [None]:
# CSV files holding the train / dev / test splits (relative to the working directory).
train_path, dev_path, test_path = (
    'Data/Datasets/train_data.csv',
    'Data/Datasets/dev_data.csv',
    'Data/Datasets/test_data.csv',
)

### Preprocess Data

In [None]:
# Read all three CSV splits through the project helper.
# NOTE(review): the exact meaning of `cnt` is defined in hatespeech.preprocessing — confirm there.
loaded = preprocessing.load_datasets(train_path, dev_path, test_path)
texts, labels, cnt = loaded

In [None]:
# Sequence length handed to preprocessing.reshape and to the Embedding layers (input_length).
maxlen = 100

# Tokenize the texts; `max_words` is later used as the number of rows of the embedding matrix.
# NOTE(review): `mfws` presumably means "most frequent words" — confirm in hatespeech.preprocessing.
tokenized = preprocessing.tokenize_texts(texts)
sequences, word_index, mfws, max_words = tokenized

In [None]:
# Bring sequences and labels into the form the models consume, using the fixed length `maxlen`.
reshaped = preprocessing.reshape(sequences, labels, maxlen=maxlen)
data_reshaped, labels_reshaped = reshaped

### Prepare Datasets and Embeddings

In [None]:
# Fixed row ranges of the three splits inside the reshaped arrays.
# NOTE(review): assumes load_datasets returns train, dev and test rows in that order with
# 12000 / 3000 / 3000 rows respectively — TODO confirm against the CSV files.
train_rows = slice(None, 12000)
dev_rows = slice(12000, 15000)
test_rows = slice(15000, 18000)

x_train, y_train = data_reshaped[train_rows], labels_reshaped[train_rows]
x_dev, y_dev = data_reshaped[dev_rows], labels_reshaped[dev_rows]
x_test, y_test = data_reshaped[test_rows], labels_reshaped[test_rows]

In [None]:
embedding_dim = 300
def create_embedding_matrix(path, word_index, embdding_dim=300, save=False, save_as="embeddings.p"):
    """
    Builds the embedding matrix for the tokenizer vocabulary from a FastText model.

    Row ``i`` receives the FastText vector of the word whose index in ``word_index``
    is ``i``. Rows without a word, and rows of words the model raises ``KeyError``
    for, stay all-zero.

    Args:
        path: location of the FastText model, passed to ``FastText.load_fasttext_format``.
        word_index: mapping word -> integer index; entries whose index is
            ``>= max_words`` are skipped.
        embdding_dim: width of each embedding vector. The misspelled name is kept
            so that existing keyword callers keep working.
        save: when truthy, the matrix is additionally pickled to ``save_as``.
        save_as: file name used for the pickled matrix.

    Returns:
        numpy array of shape ``(max_words, embdding_dim)``.

    Note:
        The number of rows is taken from the notebook-level variable ``max_words``,
        so the tokenization cell has to be executed before calling this function.
    """
    vectors = FastText.load_fasttext_format(path, encoding='utf-8')

    # Use the parameter (not the notebook-level `embedding_dim`) so the argument is honoured.
    embedding_matrix = np.zeros((max_words, embdding_dim))
    for word, i in word_index.items():
        if i >= max_words:
            continue
        try:
            embedding_vector = vectors.wv[word]
        except KeyError:
            print(word, 'ist nicht enthalten.')
            # Leave the row at zero; without this the previous word's vector
            # (or an unbound name on the first miss) would be used.
            continue
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    if save:
        # Context manager guarantees the file is flushed and closed.
        with open(save_as, "wb") as handle:
            pickle.dump(embedding_matrix, handle)
    return embedding_matrix

**Note**: These embeddings were created once from the FastText model with the function above and then saved with pickle, which is why they are loaded from the pickle file below. Should the pickled file not work, they can be recreated with the same function.

In [None]:
# Path to the FastText binary model that create_embedding_matrix reads.
# NOTE(review): this is a machine-specific absolute Windows path. Inside a raw string `\\`
# stays two literal backslashes, so the path contains a doubled separator — confirm intended.
path = r'G:\Fasttext\cc.en.300.bin\\cc.en.300.bin'
# One-off creation step; kept commented out because the resulting matrix is loaded from pickle:
#embedding_matrix = create_embedding_matrix(path, word_index=word_index, save=True, save_as="embeddings_words_small.p")

In [None]:
# Load the previously pickled embedding matrix; the `with` block closes the file handle
# (the bare open() call left it open until garbage collection).
# NOTE: pickle.load can execute arbitrary code — only load files you created yourself.
with open("embeddings_words_small.p", "rb") as embedding_file:
    embedding_matrix = pickle.load(embedding_file)

### Create Models

In [None]:
### Model LSTM
def create_LSTM_model(maxlen, max_words, embedding_dim, embedding_matrix):
    """
    Builds the (uncompiled) LSTM classifier:
    Embedding -> LSTM(5) -> Dropout(0.5) -> Dense(3, softmax).

    The embedding layer is initialised with ``embedding_matrix`` and frozen.
    """
    stack = (
        Embedding(max_words, embedding_dim, input_length=maxlen),
        LSTM(5),
        Dropout(0.5),
        Dense(3, activation='softmax'),
    )

    network = Sequential()
    for layer in stack:
        network.add(layer)

    # Pre-trained vectors go into the first layer and are excluded from training.
    embedding_layer = network.layers[0]
    embedding_layer.set_weights([embedding_matrix])
    embedding_layer.trainable = False

    return network

In [None]:
### Model CNN_LSTM
def create_CNN_LSTM_model(maxlen, max_words, embedding_dim, embedding_matrix):
    """
    Builds the (uncompiled) CNN-LSTM classifier:
    Embedding -> Conv1D(32, 5, relu) -> MaxPooling1D(3) -> LSTM(5) -> Dropout(0.5)
    -> Dense(3, softmax).

    The embedding layer is initialised with ``embedding_matrix`` and frozen.
    """
    stack = (
        Embedding(max_words, embedding_dim, input_length=maxlen),
        Conv1D(32, 5, activation='relu'),
        MaxPooling1D(3),
        LSTM(5),
        Dropout(0.5),
        Dense(3, activation='softmax'),
    )

    network = Sequential()
    for layer in stack:
        network.add(layer)

    # Pre-trained vectors go into the first layer and are excluded from training.
    embedding_layer = network.layers[0]
    embedding_layer.set_weights([embedding_matrix])
    embedding_layer.trainable = False

    return network

In [None]:
### Model Dense
def create_Dense_model(maxlen, max_words, embedding_dim, embedding_matrix):
    """
    Builds the (uncompiled) feed-forward classifier:
    Embedding -> Dense(16, relu) -> Dense(8, relu) -> Dense(5, relu) -> Dropout(0.5)
    -> Flatten -> Dense(3, softmax).

    The embedding layer is initialised with ``embedding_matrix`` and frozen.
    """
    stack = (
        Embedding(max_words, embedding_dim, input_length=maxlen),
        Dense(16, activation='relu'),
        Dense(8, activation='relu'),
        Dense(5, activation='relu'),
        Dropout(0.5),
        Flatten(),
        Dense(3, activation='softmax'),
    )

    network = Sequential()
    for layer in stack:
        network.add(layer)

    # Pre-trained vectors go into the first layer and are excluded from training.
    embedding_layer = network.layers[0]
    embedding_layer.set_weights([embedding_matrix])
    embedding_layer.trainable = False

    return network

In [None]:
### Model BiLSTM
def create_BiLSTM_model(max_words, maxlen, embedding_dim, embedding_matrix):
    """
    Builds the (uncompiled) bidirectional LSTM classifier with the functional API.

    The embedded sequence is read by a forward LSTM(5) and by a second LSTM(5) with
    ``go_backwards=True``; their outputs are summed element-wise (``Add``), passed
    through Dropout(0.5) and classified by Dense(3, softmax).

    The embedding layer is initialised with ``embedding_matrix`` and frozen.

    Note: unlike the other model builders in this notebook, the positional order
    of the first two parameters is ``(max_words, maxlen)``; call with keywords.
    """
    # The input length follows `maxlen` instead of a hard-coded 100, so it always
    # agrees with the Embedding layer's input_length.
    input_tensor = Input(shape=(maxlen,))
    embedding_layer = Embedding(max_words, embedding_dim, input_length=maxlen)
    x = embedding_layer(input_tensor)

    left = LSTM(5)(x)
    right = LSTM(5, go_backwards=True)(x)
    added = Add()([left, right])

    z = Dropout(0.5)(added)
    output_tensor = Dense(3, activation='softmax')(z)

    model = Model(input_tensor, output_tensor)

    # Address the embedding layer directly rather than through model.layers[1].
    embedding_layer.set_weights([embedding_matrix])
    embedding_layer.trainable = False

    return model

In [None]:
def fit_model(x_train, y_train, x_dev, y_dev, model, epochs=15, batch_size=32):
    """
    Fits a model on a given train set (data and labels). Returns model and history.

    The integer labels are one-hot encoded, the model is compiled with RMSprop,
    categorical cross-entropy and accuracy, and trained with the dev set as
    validation data.

    Args:
        x_train, y_train: training inputs and integer class labels.
        x_dev, y_dev: validation inputs and integer class labels.
        model: Keras model to compile and train (modified in place).
        epochs: number of training epochs (default 15).
        batch_size: mini-batch size (default 32).

    Returns:
        Tuple ``(model, history)`` where ``history`` is the value returned by ``model.fit``.
    """
    # Encode both label sets with the same width; inferring it per split would give
    # mismatching shapes whenever one split lacks the highest class id.
    num_classes = int(max(np.max(y_train), np.max(y_dev))) + 1
    cat_y_train = to_categorical(y_train, num_classes=num_classes)
    cat_y_dev = to_categorical(y_dev, num_classes=num_classes)

    model.compile(optimizer='rmsprop',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    history = model.fit(x_train, cat_y_train,
                        epochs=epochs,
                        batch_size=batch_size,
                        validation_data=(x_dev, cat_y_dev))
    return model, history

In [None]:
def save_f1_scores(f1_array, output_file):
    """
    Writes the F1 scores to ``output_file``: one score per line, followed by an
    empty line and a final ``Average: <mean>`` line (without trailing newline).
    """
    with open(output_file, 'w') as handle:
        handle.writelines("%s\n" % value for value in f1_array)
        handle.write("\n")
        mean_score = np.mean(f1_array)
        handle.write("Average: %s" % mean_score)

### Start Testing

In [None]:
# Train the LSTM model from scratch `n_runs` times, score each run on the test set and
# sum the per-run confusion matrices.
n_runs = 20
f1_array = []
cm = np.zeros(shape=(3, 3))  # confusion matrix summed over all runs
for i in range(n_runs):
    print(i)

    model = create_LSTM_model(maxlen=maxlen, max_words=max_words, embedding_dim=embedding_dim, embedding_matrix=embedding_matrix)
    model, history = fit_model(x_train, y_train, x_dev, y_dev, model)
    predictions = evaluation.get_test_predictions(model, x_test)
    f1 = evaluation.print_f1_scores(y_test, predictions)
    f1_array.append(f1)

    cnf_matrix = confusion_matrix(y_test, predictions)
    cm = cm + cnf_matrix

print(cm)
# Class names are German for hate speech / insult / neutral.
evaluation.plot_confusion_matrix(cm, classes=['Hassrede', 'Beleidigung', 'Neutral'], normalize=True,
                      title=' ')
# Uncomment to persist the per-run F1 scores:
#save_f1_scores(f1_array, 'word_LSTM_results.txt')

In [None]:
# Train the Dense model from scratch `n_runs` times, score each run on the test set and
# sum the per-run confusion matrices.
n_runs = 20
f1_array = []
cm = np.zeros(shape=(3, 3))  # confusion matrix summed over all runs
for i in range(n_runs):
    print(i)

    model = create_Dense_model(maxlen=maxlen, max_words=max_words, embedding_dim=embedding_dim, embedding_matrix=embedding_matrix)
    model, history = fit_model(x_train, y_train, x_dev, y_dev, model)
    predictions = evaluation.get_test_predictions(model, x_test)
    f1 = evaluation.print_f1_scores(y_test, predictions)
    f1_array.append(f1)

    cnf_matrix = confusion_matrix(y_test, predictions)
    cm = cm + cnf_matrix

print(cm)
# Plot the aggregated matrix `cm` (the one printed above) rather than `cnf_matrix`,
# which only holds the last run. Class names are German for hate speech / insult / neutral.
evaluation.plot_confusion_matrix(cm, classes=['Hassrede', 'Beleidigung', 'Neutral'], normalize=True,
                      title=' ')
# Uncomment to persist the per-run F1 scores:
#save_f1_scores(f1_array, 'word_Dense_results.txt')

In [None]:
# Train the CNN-LSTM model from scratch `n_runs` times, score each run on the test set and
# sum the per-run confusion matrices.
n_runs = 20
f1_array = []
cm = np.zeros(shape=(3, 3))  # confusion matrix summed over all runs
for i in range(n_runs):
    print(i)

    model = create_CNN_LSTM_model(maxlen=maxlen, max_words=max_words, embedding_dim=embedding_dim, embedding_matrix=embedding_matrix)
    model, history = fit_model(x_train, y_train, x_dev, y_dev, model)
    predictions = evaluation.get_test_predictions(model, x_test)
    f1 = evaluation.print_f1_scores(y_test, predictions)
    f1_array.append(f1)

    cnf_matrix = confusion_matrix(y_test, predictions)
    cm = cm + cnf_matrix

print(cm)
# Plot the aggregated matrix `cm` (the one printed above) rather than `cnf_matrix`,
# which only holds the last run. Class names are German for hate speech / insult / neutral.
evaluation.plot_confusion_matrix(cm, classes=['Hassrede', 'Beleidigung', 'Neutral'], normalize=True,
                      title=' ')
# Uncomment to persist the per-run F1 scores:
#save_f1_scores(f1_array, 'word_CNNLSTM_results.txt')

In [None]:
# Train the BiLSTM model from scratch `n_runs` times, score each run on the test set and
# sum the per-run confusion matrices.
n_runs = 20
f1_array = []
cm = np.zeros(shape=(3, 3))  # confusion matrix summed over all runs
for i in range(n_runs):
    print(i)

    model = create_BiLSTM_model(maxlen=maxlen, max_words=max_words, embedding_dim=embedding_dim, embedding_matrix=embedding_matrix)
    model, history = fit_model(x_train, y_train, x_dev, y_dev, model)
    predictions = evaluation.get_test_predictions(model, x_test)
    f1 = evaluation.print_f1_scores(y_test, predictions)
    f1_array.append(f1)

    cnf_matrix = confusion_matrix(y_test, predictions)
    cm = cm + cnf_matrix

print(cm)
# Plot the aggregated matrix `cm` (the one printed above) rather than `cnf_matrix`,
# which only holds the last run. Class names are German for hate speech / insult / neutral.
evaluation.plot_confusion_matrix(cm, classes=['Hassrede', 'Beleidigung', 'Neutral'], normalize=True,
                      title=' ')
# Uncomment to persist the per-run F1 scores:
#save_f1_scores(f1_array, 'word_BILSTM_results.txt')