In [2]:
from models import get_model
import pandas as pd
import argparse
import pickle
import string
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score
import preprocessor as p
from collections import Counter
import os
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix 
from tensorflow.contrib import learn
from tflearn.data_utils import to_categorical, pad_sequences
from scipy import stats
import tflearn
import json

Instructions for updating:
non-resource variables are not supported in the long term


Using Theano backend.


In [3]:
def load_data(dataset):
    """Load the pickled corpus for `dataset` and return its texts and labels.

    The file path comes from ``get_filename(dataset)``, which also sets the
    notebook-level ``NUM_CLASSES`` and ``HASH_REMOVE`` globals as a side effect.
    When ``HASH_REMOVE`` is truthy every text is passed through
    ``p.tokenize`` (the ``preprocessor`` package) after UTF-8 encoding.
    For the "wiki" dataset only half of the samples are kept, drawn with a
    fixed-seed shuffle.

    :param str dataset: dataset name understood by ``get_filename``.
    :return: tuple ``(x_text, labels)`` of two parallel sequences.
    """
    filename = get_filename(dataset)
    print("Loading data from file: " + filename)
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    # The context manager guarantees the handle is closed even if unpickling fails.
    with open(filename, 'rb') as handle:
        data = pickle.load(handle)

    x_text = []
    labels = []
    for i in range(len(data)):
        text = data[i]['text']
        if HASH_REMOVE:
            text = p.tokenize(text.encode('utf-8'))
        x_text.append(text)
        labels.append(data[i]['label'])

    if dataset == "wiki":
        # Sub-sample to 50% to keep training time manageable; seed fixed for reproducibility.
        reduced_number_of_samples = int(len(x_text) * 0.5)
        x_text, labels = shuffle(x_text, labels,
                                 random_state=42,
                                 n_samples=reduced_number_of_samples)
        # Single-argument print() behaves the same on Python 2 and Python 3.
        print("WARNING: Wiki data set reduced from %d to %d number of samples!" % (len(data), reduced_number_of_samples))

    return x_text, labels

def get_filename(dataset):
    """Return the pickle path for `dataset` and configure per-dataset globals.

    Side effects: sets the module-level ``NUM_CLASSES`` and ``HASH_REMOVE``.
    ``HASH_REMOVE`` is now assigned for every dataset, so a previous
    "twitter" run in the same kernel can no longer leak ``True`` into a
    later "formspring" or "wiki" run.

    :param str dataset: one of "twitter", "formspring", "wiki".
    :return: relative path of the pickled dataset.
    :raises ValueError: if `dataset` is not one of the known names.
    """
    global NUM_CLASSES, HASH_REMOVE
    if dataset == "twitter":
        NUM_CLASSES = 3
        HASH_REMOVE = True
        filename = "data/twitter_data.pkl"
    elif dataset == "formspring":
        NUM_CLASSES = 2
        HASH_REMOVE = False
        filename = "data/formspring_data.pkl"
    elif dataset == "wiki":
        NUM_CLASSES = 2
        HASH_REMOVE = False
        filename = "data/wiki_data.pkl"
    else:
        # Previously this fell through to an UnboundLocalError on `filename`.
        raise ValueError("Unknown dataset: %r" % (dataset,))
    return filename

In [4]:
def get_embedding_weights(filename, sep):
    """Read a text embedding file into a dict of token -> list of fields.

    Each non-blank line is stripped and split on `sep`; the first field is
    the token and the remaining fields (kept as strings) are its vector.

    :param str filename: path of the embedding text file.
    :param str sep: field separator used in the file.
    :return: dict mapping token to a list of string components.
    """
    embed_dict = {}
    # Stream the file line by line: embedding files can be very large, and
    # the context manager closes the handle even if parsing fails.
    with open(filename, 'r') as handle:
        for line in handle:
            row = line.strip().split(sep)
            if not row[0]:
                # Blank line: skip instead of registering an empty '' token.
                continue
            embed_dict[row[0]] = row[1:]
    print('Loaded from file: ' + str(filename))
    return embed_dict

def map_embedding_weights(embed, vocab, embed_size):
    """Build an embedding matrix for `vocab` from pre-trained vectors.

    Row ``vocab[word]`` of the result receives ``embed[word]``. Words that
    are absent from `embed`, or whose vector cannot be converted to floats
    of length `embed_size`, keep an all-zero row and are counted as missed.

    :param embed: mapping word -> vector (numbers or numeric strings);
        ``None`` is treated as "no pre-trained vectors available".
    :param dict vocab: mapping word -> row index.
    :param int embed_size: number of columns of the matrix.
    :return: ``numpy`` array of shape ``(len(vocab), embed_size)``.
    """
    vocab_size = len(vocab)
    embeddingWeights = np.zeros((vocab_size, embed_size))
    if embed is None:
        embed = {}
    n_missed = 0
    # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
    for word, index in vocab.items():
        try:
            # Convert before assigning so a bad vector never leaves a
            # partially written row behind.
            embeddingWeights[index] = np.asarray(embed[word], dtype=float)
        except (KeyError, ValueError):
            # KeyError: word has no vector; ValueError: malformed / wrong-size vector.
            n_missed += 1
    print("%d embedding missed of %d" % (n_missed, vocab_size))
    return embeddingWeights

def get_embeddings_dict(vector_type, emb_dim, dataset=None):
    """Load the pre-trained word vectors selected by `vector_type`.

    * "sswe": reads ``word_vectors/sswe-u.txt`` (tab separated). The
      original code contained the no-op comparison ``emb_dim==50`` and an
      error message about the SSWE embed size, so SSWE is now only loaded
      when ``emb_dim == 50`` -- NOTE(review): confirm the file really holds
      50-dimensional vectors.
    * "glove": reads the 6B file for the "wiki" dataset and the
      twitter.27B file otherwise, at dimension `emb_dim`.

    :param str vector_type: "sswe" or "glove".
    :param int emb_dim: embedding dimension.
    :param dataset: dataset name; when omitted, the notebook-level global
        ``data`` is used (the original, implicit behaviour).
    :return: dict token -> vector fields, or ``None`` after printing an
        error for an unsupported combination.
    """
    if vector_type == 'sswe' and emb_dim == 50:
        sep = '\t'
        vector_file = 'word_vectors/sswe-u.txt'
    elif vector_type == "glove":
        sep = ' '
        if dataset is None:
            # Backward-compatible fallback to the global set by the calling cell.
            dataset = data
        if dataset == "wiki":
            vector_file = 'word_vectors/glove.6B.' + str(emb_dim) + 'd.txt'
        else:
            vector_file = 'word_vectors/glove.twitter.27B.' + str(emb_dim) + 'd.txt'
    else:
        print("ERROR: Please specify a correst model or SSWE cannot be loaded with embed size of: " + str(emb_dim))
        return None

    return get_embedding_weights(vector_file, sep)

In [5]:
def evaluate_model(model, testX, testY, dump_results, dump_results_file_name):
    """Score `model` on the test split and print a per-class report.

    Predictions and targets are reduced to class indices with ``argmax``
    along axis 1. When `dump_results` is truthy both index vectors are
    written to `dump_results_file_name` as CSV. The function reads the
    notebook-level global ``data``: for "twitter" the per-class scores are
    returned in label order ``[0, 2, 1]``; otherwise sklearn's default
    label order is used.

    :return: tuple ``(precision, recall, f1_score)`` of per-class arrays.
    """
    class_scores = model.predict(testX)
    y_pred = np.argmax(class_scores, 1)
    y_true = np.argmax(testY, 1)

    if dump_results:
        results_frame = pd.DataFrame(data={
            "y_true": y_true,
            "y_pred": y_pred
        })
        results_frame.to_csv(dump_results_file_name)
        print("Writter results to \"" + dump_results_file_name + "\"")

    # Per-class scores (average=None); twitter additionally pins the label order.
    score_kwargs = {"average": None}
    if data == "twitter":
        score_kwargs["labels"] = [0, 2, 1]

    precision = metrics.precision_score(y_true, y_pred, **score_kwargs)
    recall = metrics.recall_score(y_true, y_pred, **score_kwargs)
    f1_score = metrics.f1_score(y_true, y_pred, **score_kwargs)

    for heading, values in (("Precision: ", precision),
                            ("Recall: ", recall),
                            ("f1_score: ", f1_score)):
        print(heading + str(values) + "\n")

    print(confusion_matrix(y_true, y_pred))
    print(":: Classification Report")
    print(classification_report(y_true, y_pred))
    return precision, recall, f1_score

In [6]:
def dump_learned_embedding(data, model_type, vector_type, embed_size, embed, vocab_processor):
    """Pickle the embedding entries that correspond to the vocabulary.

    For every ``(word, index)`` pair of the vocabulary mapping, stores
    ``embed[word]`` under key ``index``; pairs whose lookup fails are
    counted as missed. The result is written to
    ``output_folder_name + "<data>_<model_type>_<vector_type>_<embed_size>.pkl"``.

    The original loop assigned to an undefined name (``embeddingDict``) and
    hid the resulting ``NameError`` behind a bare ``except``, so the dumped
    dict was always empty.

    NOTE(review): the key/lookup direction (index as key, word as lookup)
    mirrors the original statement -- confirm it matches the type of `embed`.

    :param str data: dataset name, used in the output file name.
    :param embed_size: embedding size; converted with ``str`` for the file name.
    :param embed: container indexed by vocabulary word.
    :param vocab_processor: object exposing ``vocabulary_._mapping``.
    """
    vocab = vocab_processor.vocabulary_._mapping
    vocab_size = len(vocab)
    embedDict = {}
    n_missed = 0
    for word, index in vocab.items():
        try:
            vector = embed[word]
        except (KeyError, IndexError, TypeError):
            # Lookup failed for this word: count it and move on.
            n_missed += 1
            continue
        embedDict[index] = vector
    print("%d embedding missed of %d" % (n_missed, vocab_size))

    # str(embed_size): callers pass an int, which cannot be concatenated to str.
    filename = output_folder_name + data + "_" + model_type + "_" + vector_type + "_" + str(embed_size) + ".pkl"
    with open(filename, 'wb') as handle:
        pickle.dump(embedDict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [7]:
def get_train_test(data, x_text, labels, oversampling_rate):
    """Split, oversample and vectorise the corpus for training.

    Steps, in order:
      1. 90/10 train/test split with a fixed seed.
      2. Oversampling of the training part only (``do_oversampling``).
      3. Document length: the maximum token count for "twitter", the 95th
         percentile otherwise, both measured on all of `x_text`.
      4. A ``VocabularyProcessor`` fitted on all of `x_text` (train and
         test documents alike), then used to encode both splits.
      5. Padding to the document length and one-hot encoding of the labels
         with the global ``NUM_CLASSES``.

    :return: dict with keys "data", "trainX", "trainY", "testX", "testY"
        and "vocab_processor".
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        x_text, labels, random_state=42, test_size=0.10)

    X_train, Y_train = do_oversampling(data, oversampling_rate, Y_train, X_train)

    post_length = np.array([len(document.split(" ")) for document in x_text])
    if data == "twitter":
        max_document_length = max(post_length)
    else:
        max_document_length = int(np.percentile(post_length, 95))
    print("Document length : " + str(max_document_length))

    vocab_processor = learn.preprocessing.VocabularyProcessor(
        max_document_length, MAX_FEATURES).fit(x_text)

    def encode(documents):
        # Materialise the generator returned by transform() as a 2-D array.
        return np.array(list(vocab_processor.transform(documents)))

    trainX = encode(X_train)
    testX = encode(X_test)

    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)

    trainY = to_categorical(np.asarray(Y_train), nb_classes=NUM_CLASSES)
    testY = to_categorical(np.asarray(Y_test), nb_classes=NUM_CLASSES)

    return {
        "data": data,
        "trainX": trainX,
        "trainY": trainY,
        "testX": testX,
        "testY": testY,
        "vocab_processor": vocab_processor,
    }

In [8]:
def return_data(data_dict):
    """Unpack the dict built by ``get_train_test`` into a fixed-order tuple.

    :return: ``(data, trainX, trainY, testX, testY, vocab_processor)``.
    """
    ordered_keys = ("data", "trainX", "trainY", "testX", "testY", "vocab_processor")
    return tuple(data_dict[key] for key in ordered_keys)

In [9]:
def shuffle_weights(model, weights=None):
    """Replace the model's weights with a random permutation of `weights`.

    A cheap stand-in for re-initialising a model: every weight tensor is
    flattened, its entries are permuted at random, and the result is
    reshaped to the tensor's original shape. This presumes the weights are
    identically distributed along every dimension of a tensor.

    :param Model model: model whose weights are overwritten via ``set_weights``.
    :param list(ndarray) weights: tensors to permute; when ``None`` the
        model's current weights (``model.get_weights()``) are used.
    """
    source_tensors = model.get_weights() if weights is None else weights

    permuted_tensors = []
    for tensor in source_tensors:
        scrambled = np.random.permutation(tensor.flat)
        permuted_tensors.append(scrambled.reshape(tensor.shape))
    # A faster but less random alternative would permute only along the
    # first dimension: [np.random.permutation(w) for w in weights]

    model.set_weights(permuted_tensors)

In [10]:
def train(data_dict, model_type, vector_type, embed_size, dump_results=False, dump_results_file_name=""):
    """Build, optionally initialise with pre-trained vectors, fit and evaluate a model.

    The "cnn" model is driven through the tflearn-style API
    (``n_epoch`` / ``show_metric``); every other model type goes through the
    Keras-style API (``epochs`` / ``verbose``) and has its initial weights
    permuted with ``shuffle_weights`` first. Unless `vector_type` is
    "random", the embedding layer is overwritten with the matrix produced by
    ``map_embedding_weights``. Uses the globals ``NUM_CLASSES``,
    ``LEARN_RATE``, ``EPOCHS`` and ``BATCH_SIZE``.

    :return: whatever ``evaluate_model`` returns for the test split.
    """
    _dataset, trainX, trainY, testX, testY, vocab_processor = return_data(data_dict)

    vocab_size = len(vocab_processor.vocabulary_)
    print("Vocabulary Size: {:d}".format(vocab_size))
    vocab = vocab_processor.vocabulary_._mapping

    print("Running Model: " + model_type + " with word vector initiliazed with " + vector_type + " word vectors.")
    model = get_model(model_type, trainX.shape[1], vocab_size, embed_size, NUM_CLASSES, LEARN_RATE)

    is_cnn = (model_type == 'cnn')
    use_pretrained = (vector_type != "random")

    if not is_cnn:
        # Crude workaround: get_weights does not exist on tflearn's DNN class,
        # so the weight shuffle is skipped for the cnn model.
        initial_weights = model.get_weights()
        shuffle_weights(model, initial_weights)

    if is_cnn:
        if use_pretrained:
            print("Word vectors used: " + vector_type)
            embeddingWeights = tflearn.get_layer_variables_by_name('EmbeddingLayer')[0]
            model.set_weights(embeddingWeights, map_embedding_weights(get_embeddings_dict(vector_type, embed_size), vocab, embed_size))
        model.fit(trainX, trainY, n_epoch=EPOCHS, shuffle=True, show_metric=True, batch_size=BATCH_SIZE)
    else:
        if use_pretrained:
            print("Word vectors used: " + vector_type)
            model.layers[0].set_weights([map_embedding_weights(get_embeddings_dict(vector_type, embed_size), vocab, embed_size)])
        model.fit(trainX, trainY, epochs=EPOCHS, shuffle=True, batch_size=BATCH_SIZE,
                  verbose=1)

    return evaluate_model(model, testX, testY, dump_results, dump_results_file_name)

In [11]:
def print_scores(precision_scores, recall_scores, f1_scores):
    """Print mean and two standard deviations of each metric, per class.

    Each argument is indexed as ``scores[:, i]`` for ``i`` in
    ``range(NUM_CLASSES)`` (global), i.e. one column per class.
    """
    report_rows = (
        ("\nPrecision Class %d (avg): %0.3f (+/- %0.3f)", precision_scores),
        ("\nRecall Class %d (avg): %0.3f (+/- %0.3f)", recall_scores),
        ("\nF1 score Class %d (avg): %0.3f (+/- %0.3f)", f1_scores),
    )
    for class_index in range(NUM_CLASSES):
        for template, scores in report_rows:
            column = scores[:, class_index]
            print(template % (class_index, column.mean(), column.std() * 2))

In [12]:
def do_oversampling(data, oversampling_rate, labels, x_text):
    """Replicate minority-class samples so each appears `oversampling_rate` times.

    For "twitter" the classes 2 and then 1 are replicated; for every other
    dataset only class 1 is. The extra copies are appended after the
    original samples, so callers that need a mixed order must shuffle.

    Both inputs are copied into lists first, so tuples or arrays are
    accepted for every dataset (the twitter branch used to require lists).

    :param str data: dataset name.
    :param int oversampling_rate: total number of occurrences wanted for
        each minority sample; 1 leaves the data unchanged.
    :param labels: sequence of integer class labels.
    :param x_text: sequence of documents, parallel to `labels`.
    :return: tuple ``(x_text, labels)`` of new lists.
    """
    x_text = list(x_text)
    labels = list(labels)

    # Order matters: it fixes the order in which the copies are appended.
    minority_classes = [2, 1] if data == "twitter" else [1]
    extra_copies = oversampling_rate - 1

    extra_text = []
    extra_labels = []
    for minority in minority_classes:
        members = [i for i, label in enumerate(labels) if label == minority]
        extra_text += [x_text[i] for i in members] * extra_copies
        extra_labels += [minority] * len(members) * extra_copies

    x_text = x_text + extra_text
    labels = labels + extra_labels

    print("Counter after oversampling")
    print(Counter(labels))

    return x_text, labels

In [13]:
# --- Notebook-wide configuration -------------------------------------------
# Model / word-vector names; not referenced by the cells visible in this
# notebook (the experiment loops define their own model_types / vector_types).
models = [ 'cnn', 'lstm', 'blstm', 'blstm_attention']
word_vectors = ["random", "glove" ,"sswe"]
# Training hyper-parameters read by train().
EPOCHS = 10
BATCH_SIZE = 128
# Passed as the second positional argument of VocabularyProcessor in
# get_train_test() -- presumably its min_frequency, despite the name; TODO confirm.
MAX_FEATURES = 2
# Placeholder; overwritten by get_filename() for the selected dataset.
NUM_CLASSES = None
# Not referenced by the cells visible in this notebook.
DROPOUT = 0.25
# Forwarded to get_model() by train().
LEARN_RATE = 0.01
# Placeholder; assigned by get_filename() and read by load_data().
HASH_REMOVE = None
# Prefix used by dump_learned_embedding() when building its output path.
output_folder_name = "results/"

In [14]:
def run_model(data, oversampling_rate, model_type, vector_type, embed_size, dump_results=False, dump_results_file_name=""):
    """Run one full experiment: load, split/oversample, train and evaluate.

    For the "twitter" dataset the string labels are first mapped to the
    integer ids none=0, sexism=1, racism=2.

    :return: ``(precision, recall, f1_score)`` as produced by ``train``.
    """
    x_text, labels = load_data(data)

    if data == "twitter":
        # Map the textual twitter labels onto integer class ids.
        label_to_id = {'racism': 2, 'sexism': 1, 'none': 0}
        labels = [label_to_id[name] for name in labels]

    data_dict = get_train_test(data, x_text, labels, oversampling_rate)
    return train(data_dict, model_type, vector_type, embed_size, dump_results, dump_results_file_name)


In [17]:
# Single run: BLSTM with attention on the wiki data, GloVe initialisation.
# `data` must stay a notebook-level global: evaluate_model() and
# get_embeddings_dict() read it.
data = "wiki"
model_type = "blstm_attention"
vector_type = "glove"
embed_size = 100
# Defined here so the dump file name and the run use the same rate; the
# original cell relied on a value left over from another cell's loop.
oversampling_rate = 3
run_model(data, oversampling_rate, model_type, vector_type, embed_size,
          True,
          "dumps/%s_%d_%s_%s.csv" % (data, oversampling_rate, vector_type, model_type))

Loading data from file: data/wiki_data.pkl
Counter after oversampling
Counter({0: 46014, 1: 18372})
Document length : 238
Vocabulary Size: 37183
Running Model: blstm_attention with word vector initiliazed with glove word vectors.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 238, 100)          3718300   
_________________________________________________________________
dropout_5 (Dropout)          (None, 238, 100)          0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 238, 200)          160800    
_________________________________________________________________
att_layer_3 (AttLayer)       (None, 200)               200       
_________________________________________________________________
dropout_6 (Dropout)          (None, 200)               0         
____________________________________________

(array([0.96233741, 0.72317263]),
 array([0.96533593, 0.70561457]),
 array([0.96383434, 0.71428571]))

In [18]:
# Single run: BLSTM on the wiki data, GloVe initialisation.
# `data` must stay a notebook-level global: evaluate_model() and
# get_embeddings_dict() read it.
data = "wiki"
model_type = "blstm"
vector_type = "glove"
embed_size = 100
# Defined here so the dump file name and the run use the same rate; the
# original cell relied on a value left over from another cell's loop.
oversampling_rate = 3
run_model(data, oversampling_rate, model_type, vector_type, embed_size,
          True,
          "dumps/%s_%d_%s_%s.csv" % (data, oversampling_rate, vector_type, model_type))

Loading data from file: data/wiki_data.pkl
Counter after oversampling
Counter({0: 46014, 1: 18372})
Document length : 238
Vocabulary Size: 37183
Running Model: blstm with word vector initiliazed with glove word vectors.
Word vectors used: glove
Loaded from file: word_vectors/glove.6B.100d.txt
('5577 embedding missed', ' of ', 37183)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Precision: [0.9627907  0.73659306]

Recall: [0.96747809 0.70864947]

f1_score: [0.9651287  0.72235112]

[[4968  167]
 [ 192  467]]
:: Classification Report
             precision    recall  f1-score   support

          0       0.96      0.97      0.97      5135
          1       0.74      0.71      0.72       659

avg / total       0.94      0.94      0.94      5794



(array([0.9627907 , 0.73659306]),
 array([0.96747809, 0.70864947]),
 array([0.9651287 , 0.72235112]))

## Reproducing table 4

In [None]:
# Grid for Table 4: wiki data, two models x two vector types x two
# oversampling rates. Each run dumps its predictions to dumps/ and appends
# its per-class scores to results_table_4.
data_sets = ["wiki"]
model_types = ["cnn", "blstm_attention"]
vector_types = ["random", "glove"]
#fixed embed size for faster computation
embed_size = 100
results_table_4 = []
# The loop variable must be called `data`: evaluate_model() and
# get_embeddings_dict() read the notebook-level global of that name.
for data in data_sets:
    for taken_oversampling_rate in [1, 3]:
        for vector_type in vector_types:
            for model_type in model_types:
                run_config = (data, taken_oversampling_rate, vector_type, model_type)
                precision, recall, f1_score = run_model(data,
                                                        taken_oversampling_rate,
                                                        model_type,
                                                        vector_type,
                                                        embed_size,
                                                        True,
                                                        "dumps/%s_%d_%s_%s.csv" % run_config)
                # Single-argument print() behaves the same on Python 2 and Python 3.
                print("Results done for %s %d with vector type %s and model %s" % run_config)
                results_table_4.append({
                    "dataset": data,
                    "oversampling_rate": taken_oversampling_rate,
                    "vector_type": vector_type,
                    "model_type": model_type,
                    "precision": precision,
                    "recall": recall,
                    "f1_score": f1_score
                })

Training Step: 5039  | total loss: [1m[32m0.13702[0m[0m | time: 116.043s
| Adam | epoch: 010 | loss: 0.13702 - acc: 0.9494 -- iter: 64384/64386
Training Step: 5040  | total loss: [1m[32m0.15092[0m[0m | time: 116.258s
| Adam | epoch: 010 | loss: 0.15092 - acc: 0.9443 -- iter: 64386/64386
--
Writter results to "dumps/wiki_3_glove_cnn.csv"
Precision: [0.95613369 0.81060606]

Recall: [0.9805258  0.64946889]

f1_score: [0.96817614 0.72114575]

[[5035  100]
 [ 231  428]]
:: Classification Report
             precision    recall  f1-score   support

          0       0.96      0.98      0.97      5135
          1       0.81      0.65      0.72       659

avg / total       0.94      0.94      0.94      5794

Results done for wiki 3 with vector type glove and model cnn
Loading data from file: data/wiki_data.pkl
Counter after oversampling
Counter({0: 46014, 1: 18372})
Document length : 238
Vocabulary Size: 37183
Running Model: blstm_attention with word vector initiliazed with glove word 

In [1]:
# Quick BLSTM + GloVe run on the wiki data with oversampling rate 3 and
# embed size 100; dump_results keeps its default (False), so no CSV is written.
# This cell needs the definition cells above to have been executed first:
# the NameError recorded below comes from running it before run_model existed.
data = "wiki"
model_type = "blstm"
vector_type = "glove"
run_model(data, 3, model_type, vector_type, 100)

NameError: name 'run_model' is not defined

## Reproducing table 6

In [15]:
# Grid for Table 6: LSTM and BLSTM with GloVe vectors on all three datasets
# at oversampling rate 3. Each run dumps its predictions to dumps/ and
# appends its per-class scores to results_table_6.
data_sets = ["twitter", "formspring", "wiki"]
model_types = [ 'lstm', 'blstm']
vector_types = ["glove"]
#fixed embed size for faster computation
embed_size = 100
results_table_6 = []
# Number of leading configurations to skip (in loop order). With 1 the
# first combination (twitter / lstm) is not run; set to 0 for the full grid.
skip = 1
# The loop variable must be called `data`: evaluate_model() and
# get_embeddings_dict() read the notebook-level global of that name.
for data in data_sets:
    for oversampling_rate in [3]:
        for vector_type in vector_types:
            for model_type in model_types:
                if skip > 0:
                    skip = skip - 1
                    continue
                run_config = (data, oversampling_rate, vector_type, model_type)
                precision, recall, f1_score = run_model(data,
                                                        oversampling_rate,
                                                        model_type,
                                                        vector_type,
                                                        embed_size,
                                                        True,
                                                        "dumps/%s_%d_%s_%s.csv" % run_config)
                # Single-argument print() behaves the same on Python 2 and Python 3.
                print("Results done for %s %d with vector type %s and model %s" % run_config)
                results_table_6.append({
                    "dataset": data,
                    "oversampling_rate": oversampling_rate,
                    "vector_type": vector_type,
                    "model_type": model_type,
                    "precision": precision,
                    "recall": recall,
                    "f1_score": f1_score
                })

Loading data from file: data/twitter_data.pkl
Counter after oversampling
Counter({0: 9946, 1: 8415, 2: 5190})
Document length : 38
Vocabulary Size: 5711
Running Model: blstm with word vector initiliazed with glove word vectors.
Word vectors used: glove
Loaded from file: word_vectors/glove.twitter.27B.100d.txt
('217 embedding missed', ' of ', 5711)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Writter results to "dumps/twitter_3_glove_blstm.csv"
Precision: [0.87370405 0.74647887 0.65970149]

Recall: [0.85045872 0.76811594 0.70833333]

f1_score: [0.86192469 0.75714286 0.68315301]

[[927 112  51]
 [ 88 221   3]
 [ 46   2 159]]
:: Classification Report
             precision    recall  f1-score   support

          0       0.87      0.85      0.86      1090
          1       0.66      0.71      0.68       312
          2       0.75      0.77      0.76       207

avg / total       0.82      0.81      0.81      1609

Results do

KeyboardInterrupt: 