## Requirements


In [1]:
# Disable warnings
import os
import warnings

# Set before TensorFlow is imported further down in the notebook.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

# Ignore every warning category for the rest of the session.
warnings.simplefilter("ignore")

#### Directory adjustment


In [2]:
from pathlib import Path
import sys
import os

# Back to main folder: switch the working directory to the parent of the
# current one and make that folder importable.
# NOTE(review): this cell is not idempotent; every re-run climbs one more level.
path = str(Path.cwd().parent) + "/"
os.chdir(path)
sys.path.append(path)


#### Data Processing


In [3]:
# ETL
import numpy as np
import pandas as pd
import multiprocessing


#### Natural language processing


In [4]:
from gensim.models import KeyedVectors
from gensim import models
import gensim


#### Models

In [5]:
# ML preprocessing
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split
from src.TextVectorization import MeanEmbeddingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Deep learning model
from keras.wrappers.scikit_learn import KerasClassifier
from keras_preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from sklearn.model_selection import KFold
from sklearn.base import TransformerMixin
from sklearn.base import BaseEstimator
from sklearn.pipeline import Pipeline
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Dense
from keras.layers import LSTM
import tensorflow as tf

# Metrics
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score


#### Tracking

In [6]:
import mlflow

# Tracking server address and the experiment the runs below are recorded under.
TRACKING_URI = 'http://127.0.0.1:5000'
EXPERIMENT_NAME = 'Hate Speech'

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='./artifacts/1', creation_time=1665929754799, experiment_id='1', last_update_time=1665929754799, lifecycle_stage='active', name='Hate Speech', tags={}>

## Split dataset


In [7]:
# Load the augmented corpus
df = pd.read_csv("data/corpus/augmented_corpus_fortuna.csv")

# Column names: label column, text column and the text-length column
target = "label"
features = "text_nonstop"
count = f"length_{features}"

# Break apart dataset: texts as a unicode array, labels as a Series
X = df[features].values.astype("U")
y = df[target]

# Number of rows carrying label 1 and label 0
pos = len(df[df[target] == 1])
neg = len(df[df[target] == 0])

# Split train and test (stratified on the label, 20% held out)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Set k-fold criteria
k_fold = KFold(n_splits=10, shuffle=True, random_state=42)

# Corpus considerations
corpus = X
longest_text = df[count].max()

# Log of the positive/negative ratio
initial_bias = np.log([pos / neg])

# Weight each class inversely to its frequency, scaled by half the dataset size
n_samples = len(df)
weight_for_0 = (1 / neg) * (n_samples / 2.0)
weight_for_1 = (1 / pos) * (n_samples / 2.0)
class_weight = {0: weight_for_0, 1: weight_for_1}


##  MLP with Tf-idf

#### Basic pipeline

In [8]:
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
# Vectorizer: TF-IDF over word uni- and bi-grams, capped at 10,000 features
vectorizer = TfidfVectorizer(lowercase=False, analyzer="word",
                             norm='l2', ngram_range=(1, 2), max_features=10000)

# Basic pipeline
mlflow.sklearn.autolog()
with mlflow.start_run():
    ml_pipe = Pipeline(
        [
            ("vectorizer", vectorizer),
            ("classifier", MLPClassifier(
                alpha=0.00001,
                warm_start=True,
                # One hidden layer of 200 units; the trailing comma makes
                # this a real tuple (`(200)` is just the int 200).
                hidden_layer_sizes=(200,),
                activation='relu',
                solver='lbfgs',
                # NOTE(review): scikit-learn documents `learning_rate` as only
                # used with solver='sgd'; confirm it is intended here.
                learning_rate="adaptive",
                max_iter=1000,
                max_fun=30000,
                random_state=42)),
        ]
    )

    # Train
    ml_pipe.fit(X_train, y_train)
    y_predict = ml_pipe.predict(X_test)

    # Score of the positive class (label 1). ROC AUC must be computed from
    # scores/probabilities; feeding it hard 0/1 predictions collapses the
    # curve to a single operating point.
    y_score = ml_pipe.predict_proba(X_test)[:, 1]

    # Tracking
    mlflow.log_params(ml_pipe.get_params())
    mlflow.log_metric('precision', precision_score(y_test, y_predict))
    mlflow.log_metric('accuracy', accuracy_score(y_test, y_predict))
    mlflow.log_metric('recall', recall_score(y_test, y_predict))
    mlflow.log_metric('auc', roc_auc_score(y_test, y_score))
    mlflow.log_metric('f1', f1_score(y_test, y_predict))
    mlflow.sklearn.log_model(ml_pipe, "MLP")

# View: per-class report, reusing the predictions computed inside the run
pd.DataFrame(classification_report(y_test, y_predict, output_dict=True))


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.805594,0.520286,0.700176,0.66294,0.715775
recall,0.741313,0.610644,0.700176,0.675978,0.700176
f1-score,0.772118,0.561856,0.700176,0.666987,0.705924
support,777.0,357.0,0.700176,1134.0,1134.0


## MLP with word2vec

In [10]:
# Define a corpus
# NOTE(review): X holds one raw string per document, whereas gensim's Word2Vec
# expects every sentence to be a list of tokens. Confirm whether the texts
# should be tokenised (e.g. `text.split()`) here, consistently with how
# MeanEmbeddingVectorizer looks tokens up.
corpus = X
cores = multiprocessing.cpu_count()

# Leave one core free, but never ask gensim for zero worker threads
# (cores - 1 would be 0 on a single-core machine).
n_workers = max(1, cores - 1)


def _train_skipgram(sentences, vector_size, workers):
    """Train a skip-gram Word2Vec model and return it as a {token: vector} dict.

    Parameters
    ----------
    sentences : iterable
        Training corpus handed to ``gensim.models.Word2Vec`` as-is.
    vector_size : int
        Dimensionality of the learned vectors.
    workers : int
        Number of training threads.

    Returns
    -------
    dict
        Maps every token in the trained vocabulary to its vector.
    """
    model = gensim.models.Word2Vec(
        sentences,
        vector_size=vector_size,
        window=4,
        min_count=10,
        sg=1,
        workers=workers,
        batch_words=10000,
        alpha=0.1,
        min_alpha=0.0001,
        negative=20,
    )
    # Make embedding dictionary {token:vector}
    return dict(zip(model.wv.index_to_key, model.wv.vectors))


# Train our own word2vec models, one per embedding size. The 100d model was
# previously trained with vector_size=50, making it a duplicate of the 50d one.
my_embedding_50d = _train_skipgram(corpus, 50, n_workers)
my_embedding_100d = _train_skipgram(corpus, 100, n_workers)
my_embedding_300d = _train_skipgram(corpus, 300, n_workers)

# Embeddings
embedding = {"skip_50": my_embedding_50d,
             "skip_100": my_embedding_100d,
             "skip_300": my_embedding_300d}


In [11]:
# Model: MLP configuration shared by the embedding experiments below
classifier = MLPClassifier(
    # `(200)` without a trailing comma is the plain int 200, written as such.
    hidden_layer_sizes=200,
    activation='relu',
    solver='lbfgs',
    alpha=1e-5,
    learning_rate="adaptive",
    max_iter=10000,
    max_fun=30000,
    random_state=42,
)

#### Multi embedding test

In [12]:
# Autologging only needs to be switched on once, not on every iteration.
mlflow.sklearn.autolog()

# One MLflow run per embedding; the same `classifier` instance is refit each time.
for embedding_name, w2v in embedding.items():
    with mlflow.start_run():

        ml_pipe = Pipeline([('vectorizer', MeanEmbeddingVectorizer(w2v)),
                            ('classifier', classifier)])

        # Model fit
        ml_pipe.fit(X_train, y_train)
        y_predict = ml_pipe.predict(X_test)

        # Score of the positive class (label 1). ROC AUC must be computed
        # from scores/probabilities rather than hard 0/1 predictions.
        y_score = ml_pipe.predict_proba(X_test)[:, 1]

        # Tracking
        mlflow.log_params(ml_pipe.get_params())
        mlflow.log_metric('precision', precision_score(y_test, y_predict))
        mlflow.log_metric('accuracy', accuracy_score(y_test, y_predict))
        mlflow.log_metric('recall', recall_score(y_test, y_predict))
        mlflow.log_metric('auc', roc_auc_score(y_test, y_score))
        mlflow.log_metric('f1', f1_score(y_test, y_predict))
        mlflow.sklearn.log_model(ml_pipe, embedding_name)


