In [None]:
from hatespeech import preprocessing
from hatespeech import evaluation

In [None]:
from keras import layers
from keras import Sequential
from keras.utils.np_utils import to_categorical

In [None]:
from gensim.models import FastText
from sklearn.metrics import confusion_matrix
import numpy as np

### Load Data

In [None]:
# Relative paths of the three CSV splits handed to preprocessing.load_datasets.
train_path = 'Data/Datasets/train_data.csv'
dev_path = 'Data/Datasets/dev_data.csv'
test_path = 'Data/Datasets/test_data.csv'

### Preprocess Data

In [None]:
# Load all three splits with a single call; `cnt` is not referenced by any later
# cell visible in this notebook.
# NOTE(review): presumably texts/labels hold train, dev and test rows in that order,
# because later cells split them by fixed row offsets -- confirm in preprocessing.
texts, labels, cnt = preprocessing.load_datasets(train_path, dev_path, test_path)

In [None]:
# Size of the character-index axis: used as the one-hot depth and as the number of
# rows of the embedding layer / embedding matrix further down.
max_words = 670

In [None]:
# Fixed sequence length of every model input.
maxlen = 280
# Rebinds `texts` to its tokenised form, so this cell is not idempotent: running it
# twice feeds already-tokenised data back into the tokenizer.
texts = preprocessing.tokenize_texts_characters(texts)

In [None]:
# Convert the tokenised texts and labels into the arrays that are split below, and
# obtain the index mapping used later to build the embedding matrix.
# NOTE(review): presumably every sequence is padded/truncated to `maxlen` and
# word_index maps character -> integer index -- confirm in preprocessing.
data_reshaped, labels_reshaped, word_index = preprocessing.reshape_characters(texts, labels, maxlen = maxlen)

### Prepare Datasets and Embeddings

In [None]:
# Fixed, contiguous split of the reshaped corpus:
#   rows [0, 12000) -> train, [12000, 15000) -> dev, [15000, 18000) -> test.
# NOTE(review): assumes the rows are ordered train, dev, test -- confirm against
# preprocessing.load_datasets.
x_train = data_reshaped[:12000]
y_train = labels_reshaped[:12000]
x_dev = data_reshaped[12000:15000]
y_dev = labels_reshaped[12000:15000]
x_test = data_reshaped[15000:18000]
# Test labels stay as class indices: they are compared directly with the predicted
# classes (F1 score, confusion matrix) instead of being fed to the network.
y_test = labels_reshaped[15000:18000]

# Pin the class count to the 3-unit softmax output layer. Without num_classes,
# to_categorical infers the width from the largest label present in each split, so a
# split that happens to miss the highest class would get a narrower label matrix.
y_train = to_categorical(y_train, num_classes=3)
y_dev = to_categorical(y_dev, num_classes=3)

#### Prepare One-Hot Characters

In [None]:
def one_hot_gen(texts, labels, n=10, seq_len=None, vocab_size=None):
    """
    Endlessly yields one-hot encoded batches of `n` samples together with their labels.

    The samples are cycled through indefinitely. A batch that is still incomplete
    at the end of a pass is completed with the first samples of the next pass, so
    every yielded batch holds exactly `n` samples whose rows and labels line up.

    Args:
        texts: re-iterable, sized collection of samples; each sample is a sequence
            of integer indices (at most `seq_len` long, each index < `vocab_size`).
        labels: labels indexable by sample position.
        n: number of samples per yielded batch.
        seq_len: length of the sequence axis; defaults to the notebook-level `maxlen`.
        vocab_size: one-hot depth; defaults to the notebook-level `max_words`.

    Yields:
        Tuple (batch, batch_labels): a float array of shape (n, seq_len, vocab_size)
        and an array with the `n` matching labels.

    Raises:
        ValueError: if `texts` is empty (the generator could never yield a batch).
    """
    if seq_len is None:
        seq_len = maxlen
    if vocab_size is None:
        vocab_size = max_words
    if len(texts) == 0:
        # Without this guard the endless loop below would spin without ever yielding.
        raise ValueError('one_hot_gen needs at least one sample.')

    batch = np.zeros(shape=(n, seq_len, vocab_size))
    batch_labels = []

    while True:
        for sample_idx, sample in enumerate(texts):
            # The row is derived from the number of labels collected so far, which
            # keeps rows and labels aligned across the end of a pass over `texts`.
            row = len(batch_labels)
            batch_labels.append(labels[sample_idx])
            for position, char_index in enumerate(sample):
                batch[row, position, char_index] = 1.

            if len(batch_labels) >= n:
                yield batch, np.asarray(batch_labels)
                # Start the next batch on fresh buffers so the consumer keeps
                # exclusive ownership of the arrays that were just yielded.
                batch = np.zeros(shape=(n, seq_len, vocab_size))
                batch_labels = []

In [None]:
# Display the number of entries in the index mapping as the cell output.
# NOTE(review): presumably a sanity check that all indices fit below max_words -- confirm.
len(word_index.values())

In [None]:
def one_hot(texts, seq_len=None, vocab_size=None):
    """
    One-hot encodes a collection of index sequences into a single dense array.

    Args:
        texts: sized collection of samples; each sample is a sequence of integer
            indices (at most `seq_len` long, each index < `vocab_size`).
        seq_len: length of the sequence axis; defaults to the notebook-level `maxlen`.
        vocab_size: one-hot depth; defaults to the notebook-level `max_words`.

    Returns:
        Float array of shape (len(texts), seq_len, vocab_size) in which
        result[i, j, k] is 1 when element j of sample i equals k, else 0.
        Positions past the end of a shorter sample stay all-zero.
    """
    if seq_len is None:
        seq_len = maxlen
    if vocab_size is None:
        vocab_size = max_words

    results = np.zeros((len(texts), seq_len, vocab_size))
    for i, sample in enumerate(texts):
        for j, char_index in enumerate(sample):
            results[i, j, char_index] = 1.

    return results

In [None]:
# One-hot encode the whole test split up front; it is the input of the embed='none'
# model at prediction time. Note that this calls the implementation in the
# preprocessing package, not the notebook-local one_hot defined above.
oh_x_test = preprocessing.one_hot(x_test, maxlen, max_words)

In [None]:
# Generators feeding the embed='none' model during training and validation. These
# come from the preprocessing package, not from the notebook-local one_hot_gen above.
# NOTE(review): presumably n = 10 is the batch size, which would match
# steps_per_epoch=1200 (12000 train rows) and validation_steps=300 (3000 dev rows)
# in fit_model_gen -- confirm in preprocessing.
oh_x_train_gen = preprocessing.one_hot_gen(x_train, y_train, maxlen, max_words, n = 10)
oh_x_dev_gen = preprocessing.one_hot_gen(x_dev, y_dev, maxlen, max_words,n = 10)

#### Prepare Pretrained Embeddings

In [None]:
def create_embedding_matrix(path, word_index, max_words, embedding_dim=300):
    """
    Builds an embedding weight matrix from a saved gensim FastText model.

    Args:
        path: location of the model file passed to FastText.load.
        word_index: mapping token -> row index.
        max_words: number of rows of the matrix; entries of `word_index` whose
            index is not below this value are skipped.
        embedding_dim: number of columns, i.e. the length of one vector.

    Returns:
        Array of shape (max_words, embedding_dim). Row i holds the model's vector
        for the token mapped to i; rows without a token, and rows whose token
        lookup raised KeyError, are all zeros.
    """
    fasttext_model = FastText.load(path)
    weights = np.zeros((max_words, embedding_dim))

    for token, row in word_index.items():
        if row >= max_words:
            continue

        try:
            vector = fasttext_model.wv[token]
        except KeyError:
            # Report the token the model cannot represent and fall back to zeros.
            print(token, 'ist nicht enthalten.')
            vector = [0] * embedding_dim

        weights[row] = vector

    return weights

In [None]:
# Build the embedding weights from the FastText model stored in 'embeddings_chars.model'.
# That model was originally created in "Preprocessing/Twitter Character Embeddings".
embedding_matrix = create_embedding_matrix(path='embeddings_chars.model', word_index=word_index, max_words=max_words)

### Create Models

In [None]:
def create_model(maxlen, max_words, embed='pretrained', embedding_matrix=False, embedding_dim=300):
    """
    Creates the 1D-CNN keras classifier, optionally preceded by an embedding layer.

    Args:
        maxlen: length of every input sequence.
        max_words: number of distinct input indices (embedding rows / one-hot depth).
        embed: input representation, one of
            'pretrained' - embedding layer initialised from `embedding_matrix` and frozen,
            'self-train' - embedding layer with keras' default initialisation, trainable,
            'none'       - no embedding; the input is an array of shape (maxlen, max_words).
        embedding_matrix: weights of shape (max_words, embedding_dim). Required for
            'pretrained', ignored for the other modes.
        embedding_dim: length of one embedding vector.

    Returns:
        The uncompiled Sequential model, ending in a 3-unit softmax layer.

    Raises:
        ValueError: if `embed` is not one of the supported modes, or if
            'pretrained' is requested without an `embedding_matrix`.
    """
    # Fail early: an unknown mode would otherwise produce a model without any input
    # layer, and a missing matrix would only surface as an obscure set_weights error.
    if embed not in ('pretrained', 'self-train', 'none'):
        raise ValueError("embed must be 'pretrained', 'self-train' or 'none', got %r" % (embed,))
    if embed == 'pretrained' and (embedding_matrix is None or embedding_matrix is False):
        raise ValueError("embed='pretrained' requires an embedding_matrix")

    model = Sequential()
    if embed == 'none':
        model.add(layers.Conv1D(16, 5, activation='relu', input_shape=(maxlen, max_words)))
    else:
        model.add(layers.Embedding(max_words, embedding_dim, input_length=maxlen))
        model.add(layers.Conv1D(16, 5, activation='relu'))

    # Two more convolution blocks; every convolution is followed by max pooling.
    model.add(layers.MaxPooling1D(3))
    model.add(layers.Conv1D(32, 5, activation='relu'))
    model.add(layers.MaxPooling1D(3))
    model.add(layers.Conv1D(64, 5, activation='relu'))
    model.add(layers.MaxPooling1D(3))

    # Classification head.
    model.add(layers.Flatten())
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(5, activation='relu'))
    model.add(layers.Dense(3, activation='softmax'))

    if embed == 'pretrained':
        # Layer 0 is the embedding layer: load the given weights and freeze them.
        model.layers[0].set_weights([embedding_matrix])
        model.layers[0].trainable = False
    return model

In [None]:
def fit_model(x_train, y_train, x_dev, y_dev, model, epochs=15, batch_size=32):
    """
    Compiles a model and fits it on in-memory training data.

    The model is compiled with the rmsprop optimizer, categorical cross-entropy
    loss and the accuracy metric, then trained with (x_dev, y_dev) as validation data.

    Args:
        x_train: training inputs.
        y_train: training labels (one-hot, as required by the categorical loss).
        x_dev: validation inputs.
        y_dev: validation labels (one-hot).
        model: keras model to compile and train; it is modified in place.
        epochs: number of training epochs.
        batch_size: number of samples per gradient update.

    Returns:
        Tuple (model, history): the same model object and the value returned by
        model.fit.
    """
    model.compile(optimizer='rmsprop',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    history = model.fit(x_train, y_train,
                        epochs=epochs,
                        batch_size=batch_size,
                        validation_data=(x_dev, y_dev))
    return model, history

In [None]:
def fit_model_gen(oh_x_train_gen, oh_x_dev_gen, model, epochs=15, steps_per_epoch=1200, validation_steps=300):
    """
    Compiles a model and fits it on batches drawn from generators.

    The model is compiled with the rmsprop optimizer, categorical cross-entropy
    loss and the accuracy metric, then trained through model.fit_generator.

    Args:
        oh_x_train_gen: generator yielding (inputs, labels) training batches.
        oh_x_dev_gen: generator yielding (inputs, labels) validation batches.
        model: keras model to compile and train; it is modified in place.
        epochs: number of training epochs.
        steps_per_epoch: training batches drawn per epoch. The default of 1200
            covers 12000 samples once when the generator yields batches of 10.
        validation_steps: validation batches drawn per epoch. The default of 300
            covers 3000 samples once when the generator yields batches of 10.

    Returns:
        Tuple (model, history): the same model object and the value returned by
        model.fit_generator.
    """
    model.compile(optimizer='rmsprop',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    # NOTE(review): fit_generator is the legacy generator API of older keras
    # releases; newer releases expect model.fit to be called with the generator.
    history = model.fit_generator(oh_x_train_gen,
                                  epochs=epochs,
                                  steps_per_epoch=steps_per_epoch,
                                  validation_data=oh_x_dev_gen,
                                  validation_steps=validation_steps)
    return model, history

In [None]:
def save_f1_scores(f1_array, output_file):
    """
    Writes F1 scores and their mean to a text file.

    The file gets one score per line, then an empty line, then a final line
    "Average: <mean>" without a trailing newline. An existing file is overwritten.

    Args:
        f1_array: sequence of numeric F1 scores.
        output_file: path of the file to write.
    """
    # The original body opened `ouput_file`, an undefined name, and therefore
    # raised NameError on every call.
    with open(output_file, 'w') as f:
        f.writelines("%s\n" % score for score in f1_array)
        f.write("\n")
        f.write("Average: %s" % np.mean(f1_array))

### Start Testing

In [None]:
# Experiment: embedding layer trained from scratch (embed='self-train').
# The model is rebuilt and retrained 20 times; every run is scored on the test split.
f1_array = []                  # F1 result of every run
cm = np.zeros(shape=(3,3))     # confusion matrix summed over all runs
for i in range(20):
    print(i)

    # embedding_matrix is passed for symmetry only; create_model ignores it unless
    # embed == 'pretrained'.
    model = create_model(maxlen, max_words, embed='self-train', embedding_matrix=embedding_matrix)
    model, history = fit_model(x_train, y_train, x_dev, y_dev, model)

    predictions = evaluation.get_test_predictions(model, x_test)
    f1 = evaluation.print_f1_scores(y_test, predictions)
    f1_array.append(f1)

    cnf_matrix = confusion_matrix(y_test, predictions)
    cm = cm + cnf_matrix

# Plot the matrix aggregated over all runs. The previous version accumulated `cm`
# but then plotted a matrix recomputed from the last run only, so the accumulated
# result was never used.
evaluation.plot_confusion_matrix(cm, classes=['Hassrede', 'Beleidigung', 'Neutral'], normalize=True,
                      title=' ')
#save_f1_scores(f1_array, 'char_selftrained_results.txt')

In [None]:
# Experiment: frozen embedding layer initialised from embedding_matrix (embed='pretrained').
# The model is rebuilt and retrained 20 times; every run is scored on the test split.
f1_array = []                  # F1 result of every run
cm = np.zeros(shape=(3,3))     # confusion matrix summed over all runs
for i in range(20):
    print(i)

    model = create_model(maxlen, max_words, embed='pretrained', embedding_matrix=embedding_matrix)
    model, history = fit_model(x_train, y_train, x_dev, y_dev, model)

    predictions = evaluation.get_test_predictions(model, x_test)
    f1 = evaluation.print_f1_scores(y_test, predictions)
    f1_array.append(f1)

    cnf_matrix = confusion_matrix(y_test, predictions)
    cm = cm + cnf_matrix

# Plot the matrix aggregated over all runs. The previous version accumulated `cm`
# but then plotted a matrix recomputed from the last run only, so the accumulated
# result was never used.
evaluation.plot_confusion_matrix(cm, classes=['Hassrede', 'Beleidigung', 'Neutral'], normalize=True,
                      title=' ')
#save_f1_scores(f1_array, 'char_pretrained_results.txt')

In [None]:
# Experiment: no embedding layer, one-hot encoded input (embed='none').
# The model is rebuilt and retrained 20 times; every run is scored on the test split.
f1_array = []                  # F1 result of every run
cm = np.zeros(shape=(3,3))     # confusion matrix summed over all runs
for i in range(20):
    print(i)

    model = create_model(maxlen, max_words, embed='none')
    # The generators are created once, outside this loop, and shared by all runs,
    # so each run continues drawing batches where the previous run stopped.
    model, history = fit_model_gen(oh_x_train_gen, oh_x_dev_gen, model)

    predictions = evaluation.get_test_predictions(model, oh_x_test)
    f1 = evaluation.print_f1_scores(y_test, predictions)
    f1_array.append(f1)

    cnf_matrix = confusion_matrix(y_test, predictions)
    cm = cm + cnf_matrix

# Plot the matrix aggregated over all runs. The previous version accumulated `cm`
# but then plotted a matrix recomputed from the last run only, so the accumulated
# result was never used.
evaluation.plot_confusion_matrix(cm, classes=['Hassrede', 'Beleidigung', 'Neutral'], normalize=True,
                      title=' ')
#save_f1_scores(f1_array, 'char_oh_results.txt')