In [1]:
import tensorflow 
from models import get_model
import argparse
import pickle
import string
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_auc_score
import preprocessor as p
from collections import Counter
import os
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix 
from tensorflow.contrib import learn
from tflearn.data_utils import to_categorical, pad_sequences
from scipy import stats
import tflearn
import json

curses is not supported on this machine (please install/reinstall curses for an optimal experience)








Using TensorFlow backend.


In [2]:
global NUM_CLASSES
NUM_CLASSES = 2

In [3]:
# Loading The file from CSV
def load_data(filename):
    data = pd.read_csv(filename, header = 0)

    x = data['text'].to_list()
    y = data['HS'].to_list()
    return x,y

# Entering the file details to get the data
def get_filename(dataset):
    global NUM_CLASSES, HASH_REMOVE
    filename = "data\hateval2019_en_train.csv"
    return filename

In [4]:
# Evaluation of the model based on F1 score
def evaluate_model(model, testX, testY):
    temp = model.predict(testX)
    y_pred  = np.argmax(temp, 1)
    y_true = np.argmax(testY, 1)
    precision = metrics.precision_score(y_true, y_pred, average=None)
    recall = metrics.recall_score(y_true, y_pred, average=None)
    f1_score = metrics.f1_score(y_true, y_pred, average=None)
    print("f1_score: " + str(f1_score) + "\n")
    print(confusion_matrix(y_true, y_pred))
    return precision, recall, f1_score

In [5]:
def get_train_test(data, x_text, labels):
    
    NUM_CLASSES = 2
    # Splitting the dataset
    X_train, X_test, Y_train, Y_test = train_test_split( x_text, labels, random_state=42, test_size=0.10)
    
    
    # Adding a padding to make all the text uniform
    post_length = np.array([len(x.split(" ")) for x in x_text])
    if(data != "twitter"):
        max_document_length = int(np.percentile(post_length, 95))
    else:
        max_document_length = max(post_length)
    print("Document length : " + str(max_document_length))
    
    
    # Mapping words with vocubulary
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length, MAX_FEATURES)
    vocab_processor = vocab_processor.fit(x_text)
    
    # Mapping it with vocabulary
    trainX = np.array(list(vocab_processor.transform(X_train)))
    testX = np.array(list(vocab_processor.transform(X_test)))
    
    # Converting the variables into Array
    trainY = np.asarray(Y_train)
    testY = np.asarray(Y_test)
    
    # Padding the sequences
    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)


    # Making a categorical conversion
    trainY = to_categorical(trainY, nb_classes=NUM_CLASSES)
    testY = to_categorical(testY, nb_classes=NUM_CLASSES)
    
    
    # Making the final dictionary
    data_dict = {
        "data": data,
        "trainX" : trainX,
        "trainY" : trainY,
        "testX" : testX,
        "testY" : testY,
        "vocab_processor" : vocab_processor
    }
    
    return data_dict

In [6]:
def return_data(data_dict):
    return data_dict["data"], data_dict["trainX"], data_dict["trainY"], data_dict["testX"], data_dict["testY"], data_dict["vocab_processor"]

In [7]:
def shuffle_weights(model, weights=None):
    """Randomly permute the weights in `model`, or the given `weights`.
    This is a fast approximation of re-initializing the weights of a model.
    Assumes weights are distributed independently of the dimensions of the weight tensors
      (i.e., the weights have the same distribution along each dimension).
    :param Model model: Modify the weights of the given model.
    :param list(ndarray) weights: The model's weights will be replaced by a random permutation of these weights.
      If `None`, permute the model's current weights.
    """
    if weights is None:
        weights = model.get_weights()
    weights = [np.random.permutation(w.flat).reshape(w.shape) for w in weights]
    # Faster, but less random: only permutes along the first dimension
    # weights = [np.random.permutation(w) for w in weights]
    model.set_weights(weights)

In [8]:
def train(data_dict, model_type, vector_type, embed_size, dump_embeddings=False):

    data, trainX, trainY, testX, testY, vocab_processor = return_data(data_dict)

    vocab_size = len(vocab_processor.vocabulary_)
    print("Vocabulary Size: {:d}".format(vocab_size))
    vocab = vocab_processor.vocabulary_._mapping
    
    # Training the model
    print("Running Model: " + model_type + " with word vector initiliazed with " + vector_type + " word vectors.")
    model = get_model(model_type, trainX.shape[1], vocab_size, embed_size, NUM_CLASSES, LEARN_RATE)
    model.summary()
    initial_weights = model.get_weights()
    shuffle_weights(model, initial_weights)
    print("Loading start")
    model.fit(trainX, trainY, epochs=EPOCHS, shuffle=True, batch_size=BATCH_SIZE, verbose=1)

    # Storing the trained Model
    model.save("Saved_model_Twitter_Hate_speech.h5")
        
    return  evaluate_model(model, trainX, trainY), model

In [9]:
def print_scores(precision_scores, recall_scores, f1_scores):
    for i in range(NUM_CLASSES):
        print("\nPrecision Class %d (avg): %0.3f (+/- %0.3f)" % (i, precision_scores[:, i].mean(), precision_scores[:, i].std() * 2))
        print( "\nRecall Class %d (avg): %0.3f (+/- %0.3f)" % (i, recall_scores[:, i].mean(), recall_scores[:, i].std() * 2))
        print( "\nF1 score Class %d (avg): %0.3f (+/- %0.3f)" % (i, f1_scores[:, i].mean(), f1_scores[:, i].std() * 2))

In [10]:
def get_data(data, oversampling_rate):
    
    x_text, labels = load_data(get_filename(data)) 
    filter_data = []
    for text in x_text:
        filter_data.append("".join(l for l in text if l not in string.punctuation)) 
    return x_text, labels

In [11]:
# All the hyperparameters and model selection
EPOCHS = 1
BATCH_SIZE = 128
MAX_FEATURES = 2
NUM_CLASSES = 1
DROPOUT = 0.25
LEARN_RATE = 0.01

In [12]:
def run_model(data, oversampling_rate, model_type, vector_type, embed_size):    
    x_text, labels = get_data(data, oversampling_rate)
    data_dict = get_train_test(data,  x_text, labels)
    accuracy, model = train(data_dict, model_type, vector_type, embed_size)
    return model

In [None]:
data = "twitter"
model_type = "blstm"
vector_type = "random"

# Train and test the model
model = run_model(data, 3, model_type, vector_type, 200)

Document length : 63
Instructions for updating:
Please use tensorflow/transform or tf.data.
Instructions for updating:
Please use tensorflow/transform or tf.data.
Instructions for updating:
Please use tensorflow/transform or tf.data.
Vocabulary Size: 7038
Running Model: blstm with word vector initiliazed with random word vectors.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 63, 200)           1407600   
_________________________________________________________________
dropout_1 (Dropout)          (None, 63, 200)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 63, 400)           641600    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 6

In [51]:
from models import feature

x_text,labels = get_data("twitter",3)
data_dict = get_train_test(data,  x_text, labels)
input_value = data_dict["testX"]

# Get the intermediate output of the Bidirectional LSTM for a X_batch
layer_idx = 4
bilstm_output = feature(model, layer_idx, input_value)

False
Document length : 63
Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 63, 200)           1407600   
_________________________________________________________________
dropout_5 (Dropout)          (None, 63, 200)           0         
_________________________________________________________________
bidirectional_7 (Bidirection (None, 63, 400)           641600    
_________________________________________________________________
bidirectional_8 (Bidirection (None, 63, 400)           961600    
_________________________________________________________________
bidirectional_9 (Bidirection (None, 400)               961600    
_________________________________________________________________
dropout_6 (Dropout)          (None, 400)               0         
_________________________________________________________________
dense_3 (Dense)            