## Requirements


In [1]:
# Unable warnings
import os
import warnings

warnings.filterwarnings("ignore")
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'


#### Directory adjustment


In [2]:
from pathlib import Path
import sys

# Back to main folder.
# The project root is resolved only once per kernel session, so re-running this
# cell does not climb one more directory level (nor grow sys.path) every time.
if "_project_root" not in globals():
    _project_root = Path.cwd().parent

# `path` keeps its original form: a string ending with "/".
path = f"{_project_root}/"
os.chdir(path)
if path not in sys.path:
    sys.path.append(path)


#### Data Processing


In [3]:
# ETL
import numpy as np
import pandas as pd
import multiprocessing


#### Natural language processing


In [4]:
from gensim.models import KeyedVectors
from gensim import models
import gensim


#### Models
[Check](https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/)

In [5]:
# ML preprocessing
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split


# Deep learning model
#from keras.wrappers.scikit_learn import KerasClassifier
from scikeras.wrappers import KerasClassifier
from keras_preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from sklearn.model_selection import KFold
from sklearn.base import TransformerMixin
from sklearn.base import BaseEstimator
from sklearn.pipeline import Pipeline
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import Bidirectional
import tensorflow as tf


In [6]:
# Metrics
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

# Metrics tracked by Keras while training (passed to `compile`)
_keras_metrics = tf.keras.metrics

METRICS = [
    # Confusion-matrix counts
    _keras_metrics.TruePositives(name="tp"),
    _keras_metrics.FalsePositives(name="fp"),
    _keras_metrics.TrueNegatives(name="tn"),
    _keras_metrics.FalseNegatives(name="fn"),
    # Threshold-based scores
    _keras_metrics.BinaryAccuracy(name="accuracy"),
    _keras_metrics.Precision(name="precision"),
    _keras_metrics.Recall(name="recall"),
    # Areas under the ROC and the precision-recall curves
    _keras_metrics.AUC(name="auc"),
    _keras_metrics.AUC(name="prc", curve="PR"),
]


#### Tracking

In [7]:
import os

import mlflow

# Tracking configuration. The server address can be overridden through the
# MLFLOW_TRACKING_URI environment variable; the local server remains the default.
MLFLOW_TRACKING_URI = os.environ.get("MLFLOW_TRACKING_URI", "http://127.0.0.1:5000")
MLFLOW_EXPERIMENT = "Hate Speech"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(MLFLOW_EXPERIMENT)

<Experiment: artifact_location='./artifacts/1', creation_time=1665929754799, experiment_id='1', last_update_time=1665929754799, lifecycle_stage='active', name='Hate Speech', tags={}>

## Split dataset


In [8]:
# Get data
df = pd.read_csv("data/corpus/augmented_corpus_fortuna.csv")

# Set target and features
target = "label"
features = "text_nonstop"
count = f"length_{features}"
SEED = 42  # shared by the hold-out split and the k-fold shuffling

# Break apart dataset
# NOTE(review): the cast to unicode would turn a missing text into the literal
# string "nan" -- confirm the corpus has no missing values in this column.
X = df[features].values.astype("U")
y = df[target]

# Class counts, read from the configured target column instead of a
# hard-coded column name so they follow `target` if it changes.
pos = int((y == 1).sum())
neg = int((y == 0).sum())
if pos == 0 or neg == 0:
    raise ValueError(
        f"Both classes must be present in '{target}' (pos={pos}, neg={neg})"
    )

# Split train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=SEED
)

# Set k-fold criteria
k_fold = KFold(n_splits=10, shuffle=True, random_state=SEED)

# Classes balancing
longest_text = df[count].max()
# Log-odds of the positive class, used as the initial bias of the output unit
initial_bias = np.log([pos / neg])

# Each class is weighted by total / (2 * class size), so the rarer class
# receives the larger weight.
weight_for_0 = (1 / neg) * (len(df) / 2.0)
weight_for_1 = (1 / pos) * (len(df) / 2.0)
class_weight = {0: weight_for_0, 1: weight_for_1}


## LSTM with pretrained GloVe embeddings (word2vec format)

In [9]:
# Load the pretrained vectors (text file in word2vec format)
w2v = KeyedVectors.load_word2vec_format(
    "data/pretrained-glove/glove_s50.txt", binary=False
)

# Embedding props
vec_dim = w2v.vectors.shape[1]
vocab_size = len(w2v) + 1  # one extra slot: index 0 is reserved for padding

# Row 0 is an all-zero vector for the padding index; row i (i >= 1) holds the
# i-th pretrained vector. Token ids fed to the Embedding layer must therefore
# follow the pretrained vocabulary order.
embedding_weights = np.vstack([np.zeros((1, vec_dim)), w2v.vectors])


In [10]:
class TokenizerTransformer(BaseEstimator, TransformerMixin, Tokenizer):
    """Keras ``Tokenizer`` exposed as a scikit-learn transformer.

    ``fit`` learns the vocabulary from raw texts and ``transform`` converts
    texts into lists of integer token ids.

    The constructor takes arbitrary ``**tokenizer_params``. scikit-learn's
    default ``get_params`` ignores ``**kwargs``, so ``clone()`` (used by
    cross-validation and grid search) would silently rebuild the tokenizer
    with default settings. The parameters are therefore stored and exposed
    through explicit ``get_params`` / ``set_params``.
    """

    def __init__(self, **tokenizer_params):
        # Remember the constructor arguments so the estimator can be cloned.
        self.tokenizer_params = tokenizer_params
        Tokenizer.__init__(self, **tokenizer_params)

    def get_params(self, deep=True):
        """Return the keyword arguments the tokenizer was created with."""
        return dict(self.tokenizer_params)

    def set_params(self, **params):
        """Update tokenizer settings and return ``self``.

        The underlying tokenizer is re-initialised with the merged settings,
        which discards any previously fitted vocabulary.
        """
        if params:
            self.tokenizer_params = {**self.tokenizer_params, **params}
            Tokenizer.__init__(self, **self.tokenizer_params)
        return self

    def fit(self, X, y=None):
        """Learn the vocabulary from the texts in ``X``; ``y`` is ignored."""
        self.fit_on_texts(X)
        return self

    def transform(self, X, y=None):
        """Convert the texts in ``X`` into sequences of token ids."""
        X_transformed = self.texts_to_sequences(X)
        return X_transformed


class PadSequencesTransformer(BaseEstimator, TransformerMixin):
    """Stateless pipeline step that brings token sequences to a common length.

    Parameters
    ----------
    maxlen : int
        Length forwarded to ``pad_sequences`` for every sequence.
    """

    def __init__(self, maxlen):
        self.maxlen = maxlen

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self`` to satisfy the estimator API."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` processed by ``pad_sequences`` with ``maxlen``."""
        return pad_sequences(X, maxlen=self.maxlen)

class ModelWrapper(mlflow.pyfunc.PythonModel):
    """MLflow pyfunc adapter around any object exposing ``predict``."""

    def __init__(self, model):
        # Wrapped estimator (here: the fitted scikit-learn pipeline).
        self.model = model

    def predict(self, context, model_input):
        """Delegate to the wrapped model; ``context`` is not used."""
        predictions = self.model.predict(model_input)
        return predictions
    


#### Basic pipeline

In [21]:
# Preset parameters
# Hyper-parameters of the experiment, read by `lstm_builder` and by the
# classifier wrapper.
experiment_parameters = {"classifier": "LSTM",
                         "class_weight": class_weight,
                         "epochs": 15,
                         "units": 50,
                         "dropout": 0.4,
                         "recurrent_dropout": 0.2,
                         "kernel_initializer": 'glorot_uniform',
                         "loss": "binary_crossentropy",
                         "optimizer": "adamax",
                         # Input dimension of the Embedding layer is the vocabulary
                         # size (pretrained words + padding row), not the vector size.
                         "embedding_input_dim": vocab_size,
                         "batch_size": 64}


In [22]:
# LSTM model

def lstm_builder(embedding_input_dim, embedding_output_dim, embedding_weights):
    """Build and compile the bidirectional LSTM binary classifier.

    Architecture: frozen pretrained Embedding (index 0 masked) ->
    Bidirectional LSTM -> Dropout -> single sigmoid unit.

    Parameters
    ----------
    embedding_input_dim : int
        Number of rows of the embedding matrix.
    embedding_output_dim : int
        Size of each embedding vector.
    embedding_weights : array-like
        Initial (non-trainable) embedding matrix.

    Returns
    -------
    A compiled Keras ``Sequential`` model.

    Notes
    -----
    Reads the notebook globals ``experiment_parameters``, ``initial_bias``
    and ``METRICS``.
    """
    params = experiment_parameters

    # Pretrained vectors are kept frozen; id 0 is treated as padding.
    embedding_layer = Embedding(
        input_dim=embedding_input_dim,
        output_dim=embedding_output_dim,
        weights=[embedding_weights],
        trainable=False,
        mask_zero=True,
    )

    recurrent_layer = Bidirectional(
        LSTM(
            units=params["units"],
            dropout=params["dropout"],
            recurrent_dropout=params["recurrent_dropout"],
            kernel_initializer=params["kernel_initializer"],
        )
    )

    # The output bias starts at the class log-odds computed on the dataset.
    output_layer = Dense(
        units=1,
        activation="sigmoid",
        bias_initializer=tf.keras.initializers.Constant(initial_bias),
    )

    network = Sequential(
        [embedding_layer, recurrent_layer, Dropout(0.20), output_layer]
    )
    network.compile(
        loss=params["loss"],
        optimizer=params["optimizer"],
        metrics=METRICS,
    )
    return network


In [23]:
# Model execution
# NOTE(review): the callback watches the *training* loss (no validation split is
# configured), and its patience of 10 is close to the 15 training epochs.
early_stopping = EarlyStopping(
    monitor="loss", patience=10, restore_best_weights=True
)

lstm = KerasClassifier(
    model=lstm_builder,
    # Arguments matching the signature of `lstm_builder`
    embedding_input_dim=vocab_size,
    embedding_output_dim=vec_dim,
    embedding_weights=embedding_weights,
    # Training settings
    epochs=experiment_parameters["epochs"],
    batch_size=experiment_parameters["batch_size"],
    class_weight=class_weight,
    callbacks=[early_stopping],
)


In [24]:
# Token ids must be row numbers of `embedding_weights`: row 0 is the padding
# vector and row i holds the i-th pretrained word. A tokenizer fitted on the
# corpus numbers words by corpus frequency instead, which would make the frozen
# Embedding layer return the vector of an unrelated word for every token.
pretrained_word_index = {
    word: row for row, word in enumerate(w2v.index_to_key, start=1)
}


class PretrainedVocabTokenizer(TokenizerTransformer):
    """Tokenizer whose ids follow the pretrained embedding vocabulary."""

    def fit(self, X, y=None):
        # The vocabulary is fixed by the embedding file; nothing is learned
        # from X. Words missing from it are dropped by `texts_to_sequences`.
        self.word_index = pretrained_word_index
        return self


mlflow.sklearn.autolog()
with mlflow.start_run():

    ml_pipe = Pipeline(
        [("tokenizer", PretrainedVocabTokenizer()),
         ("padder", PadSequencesTransformer(maxlen=longest_text)),
         ("model", lstm)])

    # Model fit
    ml_pipe.fit(X_train, y_train)
    y_predict = ml_pipe.predict(X_test)
    # Probability of the positive class: ROC AUC needs a ranking score,
    # hard 0/1 labels would collapse the curve to a single operating point.
    y_score = ml_pipe.predict_proba(X_test)[:, 1]

    # Tracking
    mlflow.log_metric('precision', precision_score(y_test, y_predict))
    mlflow.log_metric('accuracy', accuracy_score(y_test, y_predict))
    mlflow.log_metric('recall', recall_score(y_test, y_predict))
    mlflow.log_metric('auc', roc_auc_score(y_test, y_score))
    mlflow.log_metric('f1', f1_score(y_test, y_predict))
    mlflow.pyfunc.log_model(
        python_model=ModelWrapper(ml_pipe),
        artifact_path="LSTM",
    )


	model=<function lstm_builder at 0x7fa461dbf940>
	build_fn=None
	warm_start=False
	random_state=None
	optimizer=rmsprop
	loss=None
	metrics=None
	batch_size=64
	validation_batch_size=None
	verbose=1
	callbacks=[<keras.callbacks.EarlyStopping object at 0x7fa463b56fd0>]
	validation_split=0.0
	shuffle=True
	run_eagerly=False
	epochs=15
	embedding_input_dim=929606
	embedding_output_...`
	model=<function lstm_builder at 0x7fa461dbf940>
	build_fn=None
	warm_start=False
	random_state=None
	optimizer=rmsprop
	loss=None
	metrics=None
	batch_size=64
	validation_batch_size=None
	verbose=1
	callbacks=[<keras.callbacks.EarlyStopping object at 0x7fa463b56fd0>]
	validation_split=0.0
	shuffle=True
	run_eagerly=False
	epochs=15
	embedding_input_dim=929606
	embedding_output_dim=50
	embedding_weights=[[ 0.          0.          0.         ...  0.          0.
   0.        ]
...`


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
INFO:tensorflow:Assets written to: ram:///tmp/tmp1idr1qb7/assets
INFO:tensorflow:Assets written to: ram:///tmp/tmpykrdtk3o/assets
