In [None]:
import sys
sys.path.append('../')
from hatespeech import preprocessing
from hatespeech import evaluation

In [None]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, Dropout, LSTM, Conv1D, MaxPooling1D, Add
from keras.utils.np_utils import to_categorical

In [None]:
import pickle
import numpy as np
from gensim.models import FastText
from sklearn.metrics import confusion_matrix

### Load Data

In [None]:
# Locations of the three dataset splits, relative to this notebook.
train_path, dev_path, test_path = (
    '../Data/Datasets/train_data.csv',
    '../Data/Datasets/dev_data.csv',
    '../Data/Datasets/test_data.csv',
)

### Preprocess Data

In [None]:
# Load the train, dev and test CSV files through the project's preprocessing helper.
# NOTE(review): in the visible cells `cnt` is only referenced by a commented-out call
# in the test loop — confirm whether it is still needed.
texts, labels, cnt = preprocessing.load_datasets(train_path, dev_path, test_path)

In [None]:
# Sequence length: passed to preprocessing.reshape and used as the Embedding layer's input_length.
maxlen = 100
# Tokenize the loaded texts with the project's n-gram tokenizer (ngrams enabled, chars=4).
# `max_words` is later used as the vocabulary size of the Embedding layer.
sequences, word_index, mfws, max_words = preprocessing.tokenize_texts_ngrams(texts, ngrams = True, chars = 4)

### Prepare Datasets and Embeddings

**Note**: If the pickled version of the embeddings does not work, check the notebook "Test Models - NGram".

In [None]:
# Load the pre-computed embedding matrix from disk.
# SECURITY: pickle.load can execute arbitrary code while unpickling; only load this
# file if it comes from a trusted source.
# The context manager closes the file handle as soon as loading is done.
with open("../embeddings_ngrams_small.p", "rb") as embedding_file:
    embedding_matrix = pickle.load(embedding_file)

In [None]:
# Bring sequences and labels into model-ready arrays via the project's reshape helper,
# using `maxlen` as the sequence length.
# (presumably pads/truncates each sequence to `maxlen` — confirm in preprocessing.reshape)
data_reshaped, labels_reshaped = preprocessing.reshape(sequences, labels, maxlen = maxlen)

In [None]:
# Size of each embedding vector. It is passed to the Embedding layer, whose weights are
# then set from `embedding_matrix`, so it has to match the width of that matrix.
embedding_dim=300

In [None]:
# Fixed, contiguous index-based split: rows 0-11999 train, 12000-14999 dev, 15000-17999 test.
x_train, y_train = data_reshaped[:12000], labels_reshaped[:12000]
x_dev, y_dev = data_reshaped[12000:15000], labels_reshaped[12000:15000]
x_test, y_test = data_reshaped[15000:18000], labels_reshaped[15000:18000]

### Create Model

In [None]:
def create_LSTM_model(maxlen, max_words, embedding_dim, embedding_matrix):
    """
    Builds a two-layer LSTM classifier on top of a frozen, pre-trained embedding.

    The network is: Embedding -> LSTM(32, returning sequences) -> LSTM(32)
    -> Dropout(0.44) -> Dense(3, softmax). The embedding layer's weights are
    set from `embedding_matrix` and the layer is marked as non-trainable.
    Returns the (uncompiled) model.
    """
    layer_stack = [
        Embedding(max_words, embedding_dim, input_length=maxlen),
        LSTM(32, return_sequences=True),
        LSTM(32),
        Dropout(0.44),
        Dense(3, activation='softmax'),
    ]

    model = Sequential()
    for layer in layer_stack:
        model.add(layer)

    # Inject the pre-trained vectors into the first layer and freeze it.
    embedding_layer = model.layers[0]
    embedding_layer.set_weights([embedding_matrix])
    embedding_layer.trainable = False

    return model

In [None]:
def fit_model(x_train, y_train, x_dev, y_dev, model, epochs=15, batch_size=32):
    """
    Compiles and fits a model on a given train set, validating on the dev set.

    The label arrays are one-hot encoded with ``to_categorical`` before training.
    The model is compiled with the 'rmsprop' optimizer, categorical cross-entropy
    loss and accuracy as the reported metric.

    Parameters:
        x_train, y_train: training inputs and their class labels.
        x_dev, y_dev: validation inputs and their class labels.
        model: the Keras model to compile and train (modified in place).
        epochs: number of training epochs (default 15, the previous fixed value).
        batch_size: mini-batch size (default 32, the previous fixed value).

    Returns:
        (model, history): the trained model and the object returned by ``model.fit``.
    """
    cat_y_train = to_categorical(y_train)
    cat_y_dev = to_categorical(y_dev)

    model.compile(optimizer='rmsprop',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    history = model.fit(x_train, cat_y_train,
                        epochs=epochs,
                        batch_size=batch_size,
                        validation_data=(x_dev, cat_y_dev))
    return model, history

In [None]:
def save_f1_scores(f1_array, output_file):
    """
    Writes every score of f1_array to output_file (one per line), followed by
    an empty line and a final "Average: <mean>" line.
    """
    with open(output_file, 'w') as out:
        out.writelines("%s\n" % value for value in f1_array)
        out.write("\n")
        average = np.mean(f1_array)
        out.write("Average: %s" % average)

### Start Testing

In [None]:
# Train a fresh model 20 times on the same fixed split, collecting the F1 score of
# each run and summing the per-run confusion matrices into `cm`.
f1_array = []
cm = np.zeros(shape=(3,3))
for i in range(20):
    print(i)
    # x_train, y_train, x_test, y_test = preprocessing.create_train_and_test_data(data_reshaped, labels_reshaped, cnt)

    model = create_LSTM_model(maxlen=maxlen, max_words=max_words, embedding_dim=embedding_dim, embedding_matrix=embedding_matrix)
    model, history = fit_model(x_train, y_train, x_dev, y_dev, model)

    predictions = evaluation.get_test_predictions(model, x_test)
    f1 = evaluation.print_f1_scores(y_test, predictions)
    f1_array.append(f1)

    # Pin the label set so the matrix is always 3x3 and can be added to `cm`, even if
    # a class is missing from both y_test and the predictions of a run.
    # (assumes the classes are encoded as 0, 1, 2, matching the 3-unit softmax output)
    cnf_matrix = confusion_matrix(y_test, predictions, labels=[0, 1, 2])
    cm = cm + cnf_matrix

print(cm)
# Plot the matrix accumulated over all runs (the one printed above), not just the
# confusion matrix of the last run.
evaluation.plot_confusion_matrix(cm, classes=['Hassrede', 'Beleidigung', 'Neutral'], normalize=True,
                      title=' ')
#save_f1_scores(f1_array, 'ngram_LSTM_final_results.txt')

### Start Evaluation on jokes, sexism and racism

In [None]:
# For the qualitative evaluation below a single trained model is enough, so the
# model is built and fitted only once here (same split and settings as in the test loop).
model = create_LSTM_model(maxlen=maxlen, max_words=max_words, embedding_dim=embedding_dim, embedding_matrix=embedding_matrix)
model, history = fit_model(x_train, y_train, x_dev, y_dev, model)

In [None]:
# Choose the evaluation file accordingly (this section targets jokes, sexism and racism).
t='../Data/Data_Evaluation/racism.txt'
# Read the whole file and split it into lines; the context manager closes the handle.
# NOTE(review): if the file ends with a newline, split('\n') yields a trailing empty
# string that is tokenized and classified like any other line — confirm whether it
# should be dropped.
with open(t, "r") as text_file:
    lines = text_file.read().split('\n')

In [None]:
# Tokenize the evaluation lines with the same helper and settings as the training texts.
# NOTE(review): this call only receives `lines`; if the helper builds its vocabulary from
# its input, the resulting indices may not match the ones the model and embedding matrix
# were built with — verify against preprocessing.tokenize_texts_ngrams.
sequences2, word_index2, mfws2, max_words2 = preprocessing.tokenize_texts_ngrams(lines, ngrams = True, chars = 4)

In [None]:
# Reshape the evaluation sequences to the length the model expects.
# NOTE(review): `labels` here are the labels of the training corpus, not of the evaluation
# lines; `labels_reshaped2` is not used in the cells below — confirm that passing them is
# harmless for preprocessing.reshape.
data_reshaped2, labels_reshaped2 = preprocessing.reshape(sequences2, labels, maxlen = maxlen)

#### Get class counter

In [None]:
def get_accuracies_per_class(model, x_test):
    """
    Returns the mean prediction confidence for each of the classes 0, 1 and 2.

    Despite its name, this function does not compare against true labels: for every
    row returned by ``model.predict(x_test)`` it takes the highest score and the index
    of that score (the predicted class), then averages the highest scores separately
    for the rows predicted as class 0, 1 and 2.

    Parameters:
        model: object with a ``predict`` method returning one row of class scores
            per input sample.
        x_test: inputs forwarded unchanged to ``model.predict``.

    Returns:
        Tuple ``(mean0, mean1, mean2)``. An entry is NaN when no sample was
        predicted as the corresponding class.
    """
    scores = np.asarray(model.predict(x_test))
    if scores.size == 0:
        # No predictions at all: every class mean is undefined.
        return np.nan, np.nan, np.nan

    # argmax returns the first index of the maximum, like list.index(max(...)).
    predicted_class = scores.argmax(axis=1)
    top_score = scores.max(axis=1)

    def _mean_top_score(class_index):
        # Mean confidence of the rows predicted as `class_index`; NaN (without the
        # "mean of empty slice" warning) when the class was never predicted.
        selected = top_score[predicted_class == class_index]
        return selected.mean() if selected.size else np.nan

    return _mean_top_score(0), _mean_top_score(1), _mean_top_score(2)

In [None]:
# Per-class values for the evaluation lines, computed with the single model trained above.
acc0, acc1, acc2 = get_accuracies_per_class(model, data_reshaped2)

In [None]:
# Show the three per-class values (class 0, class 1, class 2) on one line.
print(acc0, acc1, acc2)

#### See which lines are in which class

In [None]:
# Predicted class for every evaluation line, obtained through the project's evaluation helper.
# Note: this rebinds `predictions`, which earlier held the test-set predictions of the last run.
predictions = evaluation.get_test_predictions(model, data_reshaped2)

In [None]:
from collections import Counter

In [None]:
# Count how often each predicted class occurs among the evaluation lines.
Counter(predictions)

In [None]:
def get_lines_per_class(lines, predictions, c):
    """
    Returns the entries of `lines` whose prediction (at the same position in
    `predictions`) equals the class `c`, in their original order.
    """
    return [lines[position]
            for position, predicted in enumerate(predictions)
            if predicted == c]

In [None]:
# Collect all evaluation lines that were predicted as class 0.
a = get_lines_per_class(lines, predictions, c=0)

In [None]:
# Display the selected lines as the cell output.
a