## Requirements


In [1]:
# Unable warnings
import os
import warnings

warnings.filterwarnings("ignore")
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

#### Directory adjustment


In [2]:
from pathlib import Path
import sys
import os

# Back to main folder
path = os.path.dirname(os.getcwd()) + "/"
os.chdir(path)
sys.path.append(path)


#### Data Processing


In [3]:
# ETL
import numpy as np
import pandas as pd
import multiprocessing


#### Natural language processing


In [4]:
from src.utils.TextVectorization import MeanEmbeddingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from src.transformers.text import TextNormalizer
from gensim.models import KeyedVectors
from gensim import models
import gensim


#### Models

In [5]:
# Tracking
from src.experiment.tracking import experiment

# ML preprocessing
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split

# Pipe
from sklearn.pipeline import Pipeline

# Deep neural network
from sklearn.neural_network import MLPClassifier

# Metrics
from sklearn.metrics import classification_report


## Data version control (DVC)

In [6]:
from src.data.control import version

df_train, df_test = version().split(test_size=0.2)


## Data manipulation


In [7]:
# Set target and features
target = "label"
features = "text"

# Set train and test
X_train, y_train = df_train[features], df_train[target]
X_test, y_test = df_test[features], df_test[target]


##  MLP with Tf-idf

#### Base model

In [8]:
# Text normalizer
wordlist = [
    "nomeusuario",
    "paginaweb",
    "emailusario",
    "numerotelefone",
    "simbolomonetario",
]

normalizer = TextNormalizer(
    stopwords=True, wordlist=wordlist, stemmer=False, lemma=False
)

# Text vectorizer
vectorizer = TfidfVectorizer(
    lowercase=False,
    analyzer="word",
    norm="l2",
    ngram_range=(1, 2),
    max_features=1500,
    sublinear_tf=True,
)

# Classfier
classifier = MLPClassifier(
    warm_start=True,
    hidden_layer_sizes=(100),
    activation="relu",
    solver="lbfgs",
    learning_rate="adaptive",
    random_state=42,
    max_iter=30,
)


In [9]:
# Build a classifier pipeline
ml_pipe = Pipeline(
    [("normalizer", normalizer), ("vectorizer", vectorizer), ("classifier", classifier)]
)
# Set experiment
lab = experiment(
    exp_name="Hate Speech",
    model_name="MLP",
    model=ml_pipe,
)
# Evaluate experiment
y_pred = lab.run(X_train, y_train, X_test, y_test, predictions=True)
pd.DataFrame(classification_report(y_test, y_pred, output_dict=True))


[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment




[MLFLOW][FINISHED] experiment executed successfully


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.794903,0.606452,0.743386,0.700677,0.735576
recall,0.842986,0.526611,0.743386,0.684798,0.743386
f1-score,0.818239,0.563718,0.743386,0.690978,0.738112
support,777.0,357.0,0.743386,1134.0,1134.0


## MLP with word2vec

In [10]:
# Define a corpus
corpus = X_train
cores = multiprocessing.cpu_count()

# Train a own word2vec model
my_embedding_50d = gensim.models.Word2Vec(
    corpus,
    vector_size=50,
    window=4,
    min_count=10,
    sg=1,
    workers=cores - 1,
    batch_words=10000,
    alpha=0.1,
    min_alpha=0.0001,
    negative=20,
)

my_embedding_100d = gensim.models.Word2Vec(
    corpus,
    vector_size=50,
    window=4,
    min_count=10,
    sg=1,
    workers=cores - 1,
    batch_words=10000,
    alpha=0.1,
    min_alpha=0.0001,
    negative=20,
)

my_embedding_300d = gensim.models.Word2Vec(
    corpus,
    vector_size=300,
    window=4,
    min_count=10,
    sg=1,
    workers=cores - 1,
    batch_words=10000,
    alpha=0.1,
    min_alpha=0.0001,
    negative=20,
)


# Make embedding dictionary {token:vector}
my_embedding_50d = dict(
    zip(my_embedding_50d.wv.index_to_key, my_embedding_50d.wv.vectors)
)

my_embedding_100d = dict(
    zip(my_embedding_100d.wv.index_to_key, my_embedding_100d.wv.vectors)
)

my_embedding_300d = dict(
    zip(my_embedding_300d.wv.index_to_key, my_embedding_300d.wv.vectors)
)

# Embeddings
embedding = {
    "skip_50": my_embedding_50d,
    "skip_100": my_embedding_100d,
    "skip_300": my_embedding_300d,
}


#### Multi embedding test

In [11]:
for embedding_name, w2v in embedding.items():

    # Build pipeline
    temp_pipe = Pipeline(
        [
            ("normalizer", normalizer),
            ("vectorizer", MeanEmbeddingVectorizer(w2v)),
            ("classifier", classifier),
        ],
    )

    # Set experiment
    lab = experiment(
        exp_name="Hate Speech",
        model_name=f"MLP_{embedding_name}",
        model=ml_pipe,
    )

    # Evaluate experiment
    y_pred = lab.run(X_train, y_train, X_test, y_test, predictions=False)


[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment




[MLFLOW] [FINISHED] experiment executed successfully
model:MLP_skip_50 - acc:0.7257495590828924 - rec:0.5686274509803921 - auc:0.683284124460595 - f1:0.5662482566248257 

[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment




[MLFLOW] [FINISHED] experiment executed successfully
model:MLP_skip_100 - acc:0.7098765432098766 - rec:0.5742296918767507 - auc:0.6732152320387614 - f1:0.5548037889039241 

[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment




[MLFLOW] [FINISHED] experiment executed successfully
model:MLP_skip_300 - acc:0.6922398589065256 - rec:0.5658263305322129 - auc:0.6580740404269816 - f1:0.5365205843293491 

