## Requirements


In [1]:
# Unable warnings
import os
import warnings

warnings.filterwarnings("ignore")
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

#### Directory adjustment


In [2]:
from pathlib import Path
import sys
import os

# Back to main folder
path = os.path.dirname(os.getcwd()) + "/"
os.chdir(path)
sys.path.append(path)


#### Data Processing


In [3]:
# ETL
import numpy as np
import pandas as pd
import multiprocessing

# ML preprocessing
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.feature_selection import chi2


#### Natural language processing


In [4]:
# Basic
import re

# Tracking
from src.experiment.tracking import experiment

# Vectorization
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from src.transformers.text import TextNormalizer

# Embedding
from src.transformers.vectorizer import MeanEmbeddingVectorizer
from gensim.models import KeyedVectors
from gensim import models
import gensim


#### Models


In [5]:
# Pipe
from sklearn.pipeline import Pipeline

# Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier

# Metrics
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score


### DVC


In [6]:
from src.data.control import version

df_train, df_test = version().split(test_size=0.2)


## Data manipulation


In [7]:
# Set target and features
target = "label"
features = "text"

# Set train and test
X_train, y_train = df_train[features], df_train[target]
X_test, y_test = df_test[features], df_test[target]


# Set k-fold criteria
k_fold = KFold(n_splits=10, shuffle=True, random_state=42)


# Class weights
pos = len(df_train.query("label==1"))
neg = len(df_train.query("label==0"))
weight_for_0 = (1 / neg) * (len(df_train) / 2.0)
weight_for_1 = (1 / pos) * (len(df_train) / 2.0) 
relative_weight = {0: weight_for_0, 1: weight_for_1}


## Model steps

In [8]:
# Text normalizer
wordlist = [
    "nomeusuario",
    "paginaweb",
    "emailusario",
    "numerotelefone",
    "simbolomonetario",
]

normalizer = TextNormalizer(
    stopwords=True, wordlist=wordlist, stemmer=False, lemma=False
)


# Classifiers
classifiers = {
    "GradientBoosting": GradientBoostingClassifier(),
    "KNeighbors": KNeighborsClassifier(),
    "Bernoulli": BernoulliNB(alpha=1.0, binarize=0.0, fit_prior=True, class_prior=None),
    "SVC": LinearSVC(
        penalty="l2",
        loss="squared_hinge",
        dual=True,
        tol=1e-6,
        C=1.1,
        multi_class="crammer_singer",
        fit_intercept=True,
        intercept_scaling=1,
        class_weight=relative_weight,
        random_state=42,
        max_iter=1000,
    ),
    "LogisticRegression": LogisticRegression(penalty="l2", max_iter=200, C=1),
    "SGDC": SGDClassifier(loss="hinge", max_iter=200),
    "DecisionTree": DecisionTreeClassifier(
        random_state=42,
        class_weight=relative_weight,
        min_samples_split=2,
        max_features="auto",
    ),
    "RandomForest": RandomForestClassifier(
        random_state=42,
        class_weight=relative_weight,
        min_samples_split=2,
        max_features="auto",
        oob_score=True,
    ),
    "SVM": svm.SVC(kernel="rbf"),
}


## My own word2vec embedding model

In [9]:
# Define a corpus
corpus = X_train
cores = multiprocessing.cpu_count()

# Train a own word2vec model
my_embedding_50d = gensim.models.Word2Vec(
    corpus,
    vector_size=50,
    window=4,
    min_count=10,
    sg=1,
    workers=cores - 1,
    batch_words=10000,
    alpha=0.1,
    min_alpha=0.0001,
    negative=20,
)

my_embedding_100d = gensim.models.Word2Vec(
    corpus,
    vector_size=50,
    window=4,
    min_count=10,
    sg=1,
    workers=cores - 1,
    batch_words=10000,
    alpha=0.1,
    min_alpha=0.0001,
    negative=20,
)

my_embedding_300d = gensim.models.Word2Vec(
    corpus,
    vector_size=300,
    window=4,
    min_count=10,
    sg=1,
    workers=cores - 1,
    batch_words=10000,
    alpha=0.1,
    min_alpha=0.0001,
    negative=20,
)


my_embedding_1000d = gensim.models.Word2Vec(
    corpus,
    vector_size=1000,
    window=4,
    min_count=10,
    sg=1,
    workers=cores - 1,
    batch_words=10000,
    alpha=0.1,
    min_alpha=0.0001,
    negative=20,
)

# Make embedding dictionary {token:vector}
my_embedding_50d = dict(
    zip(my_embedding_50d.wv.index_to_key, my_embedding_50d.wv.vectors)
)

my_embedding_100d = dict(
    zip(my_embedding_100d.wv.index_to_key, my_embedding_100d.wv.vectors)
)

my_embedding_300d = dict(
    zip(my_embedding_300d.wv.index_to_key, my_embedding_300d.wv.vectors)
)

my_embedding_1000d = dict(
    zip(my_embedding_1000d.wv.index_to_key, my_embedding_1000d.wv.vectors)
)


# Make a vectorizer
vec_50 = MeanEmbeddingVectorizer(my_embedding_50d)
vec_100 = MeanEmbeddingVectorizer(my_embedding_100d)
vec_300 = MeanEmbeddingVectorizer(my_embedding_300d)
vec_1000 = MeanEmbeddingVectorizer(my_embedding_1000d)

# List embeddings
embeddings = {
    "w2v_50": vec_50,
    "w2v_100": vec_100,
    "w2v_300": vec_300,
    "w2v_1000": vec_1000,
}


### Basic pipeline

In [10]:
# Basic pipeline
ml_pipe = Pipeline(
    [
        ("normalizer", normalizer),
        ("vectorizer", vec_50),
        ("classifier", classifiers["SVC"]),
    ]
)

# Train
ml_pipe.fit(X_train, y_train)

pd.DataFrame(classification_report(y_test, ml_pipe.predict(X_test), output_dict=True))


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.767318,0.395797,0.580247,0.581557,0.650358
recall,0.555985,0.633053,0.580247,0.594519,0.580247
f1-score,0.644776,0.487069,0.580247,0.565923,0.595128
support,777.0,357.0,0.580247,1134.0,1134.0


### Multi model x Multi Embedding

In [11]:
for model_name, classifier in classifiers.items():
    for embedding_name, embedding in embeddings.items():

        # Build a classifier pipeline
        ml_pipe = Pipeline(
            [
                ("normalizer", normalizer),
                ("vectorizer", embedding),
                ("classifier", classifier),
            ]
        )

        # Set experiment
        lab = experiment(
            exp_name="Hate Speech",
            model_name=f"{model_name} - {embedding_name}",
            model=ml_pipe,
        )

        # Evaluate experiment
        y_pred = lab.run(X_train, y_train, X_test, y_test)


[MLFLOW] [START] starting server


[2022-11-03 10:51:36 -0300] [46766] [INFO] Starting gunicorn 20.1.0
[2022-11-03 10:51:36 -0300] [46766] [INFO] Listening at: http://127.0.0.1:7500 (46766)
[2022-11-03 10:51:36 -0300] [46766] [INFO] Using worker: sync
[2022-11-03 10:51:36 -0300] [46777] [INFO] Booting worker with pid: 46777
[2022-11-03 10:51:36 -0300] [46778] [INFO] Booting worker with pid: 46778
[2022-11-03 10:51:36 -0300] [46779] [INFO] Booting worker with pid: 46779
[2022-11-03 10:51:36 -0300] [46780] [INFO] Booting worker with pid: 46780


[MLFLOW][EXECUTION] running experiment




[MLFLOW] [FINISHED] experiment executed successfully
model:GradientBoosting - w2v_50 - acc:0.6948853615520282 - rec:0.12605042016806722 - auc:0.5411461882050118 - f1:0.20642201834862386 

[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment




[MLFLOW] [FINISHED] experiment executed successfully
model:GradientBoosting - w2v_100 - acc:0.699294532627866 - rec:0.13165266106442577 - auc:0.545877810583693 - f1:0.2160919540229885 

[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment




[MLFLOW] [FINISHED] experiment executed successfully
model:GradientBoosting - w2v_300 - acc:0.6966490299823633 - rec:0.16246498599439776 - auc:0.55227496403967 - f1:0.2521739130434783 

[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment




[MLFLOW] [FINISHED] experiment executed successfully
model:GradientBoosting - w2v_1000 - acc:0.6975308641975309 - rec:0.16806722689075632 - auc:0.5544325838443486 - f1:0.2591792656587473 

[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment




[MLFLOW] [FINISHED] experiment executed successfully
model:KNeighbors - w2v_50 - acc:0.6604938271604939 - rec:0.3473389355742297 - auc:0.5758573699750171 - f1:0.39178515007898895 

[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment




[MLFLOW] [FINISHED] experiment executed successfully
model:KNeighbors - w2v_100 - acc:0.644620811287478 - rec:0.31092436974789917 - auc:0.5544325838443486 - f1:0.35519999999999996 

[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment




[MLFLOW] [FINISHED] experiment executed successfully
model:KNeighbors - w2v_300 - acc:0.6701940035273368 - rec:0.38095238095238093 - auc:0.592020592020592 - f1:0.42105263157894735 

[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment




[MLFLOW] [FINISHED] experiment executed successfully
model:KNeighbors - w2v_1000 - acc:0.6507936507936508 - rec:0.3165266106442577 - auc:0.5604512075100311 - f1:0.36334405144694537 

[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment




[MLFLOW] [FINISHED] experiment executed successfully
model:Bernoulli - w2v_50 - acc:0.6851851851851852 - rec:0.0 - auc:0.5 - f1:0.0 

[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment




[MLFLOW] [FINISHED] experiment executed successfully
model:Bernoulli - w2v_100 - acc:0.6851851851851852 - rec:0.0 - auc:0.5 - f1:0.0 

[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment




[MLFLOW] [FINISHED] experiment executed successfully
model:Bernoulli - w2v_300 - acc:0.671957671957672 - rec:0.24369747899159663 - auc:0.5562116738587327 - f1:0.31868131868131866 

[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment




[MLFLOW] [FINISHED] experiment executed successfully
model:Bernoulli - w2v_1000 - acc:0.562610229276896 - rec:0.6526610644257703 - auc:0.5869482928306458 - f1:0.48440748440748443 

[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment




[MLFLOW] [FINISHED] experiment executed successfully
model:SVC - w2v_50 - acc:0.5802469135802469 - rec:0.6330532212885154 - auc:0.5945188886365357 - f1:0.4870689655172414 

[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment




[MLFLOW] [FINISHED] experiment executed successfully
model:SVC - w2v_100 - acc:0.5828924162257496 - rec:0.6358543417366946 - auc:0.5972064501476266 - f1:0.4897518878101402 

[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment




[MLFLOW] [FINISHED] experiment executed successfully
model:SVC - w2v_300 - acc:0.58994708994709 - rec:0.6526610644257703 - auc:0.6068968127791657 - f1:0.5005370569280344 

[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment




[MLFLOW] [FINISHED] experiment executed successfully
model:SVC - w2v_1000 - acc:0.5961199294532628 - rec:0.6470588235294118 - auc:0.6098871981224923 - f1:0.5021739130434781 

[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment




[MLFLOW] [FINISHED] experiment executed successfully
model:LogisticRegression - w2v_50 - acc:0.6843033509700176 - rec:0.0 - auc:0.49935649935649934 - f1:0.0 

[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment




[MLFLOW] [FINISHED] experiment executed successfully
model:LogisticRegression - w2v_100 - acc:0.6843033509700176 - rec:0.0 - auc:0.49935649935649934 - f1:0.0 

[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment




[MLFLOW] [FINISHED] experiment executed successfully
model:LogisticRegression - w2v_300 - acc:0.6843033509700176 - rec:0.0 - auc:0.49935649935649934 - f1:0.0 

[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment




[MLFLOW] [FINISHED] experiment executed successfully
model:LogisticRegression - w2v_1000 - acc:0.6851851851851852 - rec:0.0 - auc:0.5 - f1:0.0 

[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment




[MLFLOW] [FINISHED] experiment executed successfully
model:SGDC - w2v_50 - acc:0.6851851851851852 - rec:0.0 - auc:0.5 - f1:0.0 

[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment




[MLFLOW] [FINISHED] experiment executed successfully
model:SGDC - w2v_100 - acc:0.6851851851851852 - rec:0.0 - auc:0.5 - f1:0.0 

[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment




[MLFLOW] [FINISHED] experiment executed successfully
model:SGDC - w2v_300 - acc:0.6851851851851852 - rec:0.0 - auc:0.5 - f1:0.0 

[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment




[MLFLOW] [FINISHED] experiment executed successfully
model:SGDC - w2v_1000 - acc:0.6851851851851852 - rec:0.0 - auc:0.5 - f1:0.0 

[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment




[MLFLOW] [FINISHED] experiment executed successfully
model:DecisionTree - w2v_50 - acc:0.6031746031746031 - rec:0.3585434173669468 - auc:0.5370580664698312 - f1:0.3626062322946175 

[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment




[MLFLOW] [FINISHED] experiment executed successfully
model:DecisionTree - w2v_100 - acc:0.6084656084656085 - rec:0.37815126050420167 - auc:0.5462184873949579 - f1:0.37815126050420167 

[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment




[MLFLOW] [FINISHED] experiment executed successfully
model:DecisionTree - w2v_300 - acc:0.6137566137566137 - rec:0.36134453781512604 - auc:0.545537133772428 - f1:0.3706896551724138 

[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment




[MLFLOW] [FINISHED] experiment executed successfully
model:DecisionTree - w2v_1000 - acc:0.5970017636684304 - rec:0.3473389355742297 - auc:0.5295253236429708 - f1:0.3517730496453901 

[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment




[MLFLOW] [FINISHED] experiment executed successfully
model:RandomForest - w2v_50 - acc:0.6887125220458554 - rec:0.10644257703081232 - auc:0.5313422666363843 - f1:0.17715617715617715 

[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment




[MLFLOW] [FINISHED] experiment executed successfully
model:RandomForest - w2v_100 - acc:0.7037037037037037 - rec:0.12044817927170869 - auc:0.5460670754788403 - f1:0.2037914691943128 

[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment




[MLFLOW] [FINISHED] experiment executed successfully
model:RandomForest - w2v_300 - acc:0.6966490299823633 - rec:0.11484593837535013 - auc:0.5394049511696571 - f1:0.19248826291079812 

[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment




[MLFLOW] [FINISHED] experiment executed successfully
model:RandomForest - w2v_1000 - acc:0.6948853615520282 - rec:0.11764705882352941 - auc:0.5388750094632448 - f1:0.1953488372093023 

[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment




[MLFLOW] [FINISHED] experiment executed successfully
model:SVM - w2v_50 - acc:0.6851851851851852 - rec:0.0 - auc:0.5 - f1:0.0 

[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment




[MLFLOW] [FINISHED] experiment executed successfully
model:SVM - w2v_100 - acc:0.6851851851851852 - rec:0.0 - auc:0.5 - f1:0.0 

[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment




[MLFLOW] [FINISHED] experiment executed successfully
model:SVM - w2v_300 - acc:0.6851851851851852 - rec:0.0 - auc:0.5 - f1:0.0 

[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment




[MLFLOW] [FINISHED] experiment executed successfully
model:SVM - w2v_1000 - acc:0.6851851851851852 - rec:0.0 - auc:0.5 - f1:0.0 



## Pre-trained embeddings


In [14]:
# Load a pre-trained model
pretrained_model = models.KeyedVectors.load_word2vec_format(
    "data/pretrained-skipgram/skip_s50.txt", binary=False
)

# Make embedding dictionary {token:vector}
pre_embedding_50d = dict(zip(pretrained_model.index_to_key, pretrained_model.vectors))

# Make a vectorizer
pre_vec_50 = MeanEmbeddingVectorizer(pre_embedding_50d)


### Basic pipeline

In [15]:
# Basic pipeline
ml_pipe = Pipeline(
    [
        ("normalizer", normalizer),
        ("vectorizer", pre_vec_50),
        ("classifier", classifiers["SVC"]),
    ]
)

# Train
ml_pipe.fit(X_train, y_train)

pd.DataFrame(classification_report(y_test, ml_pipe.predict(X_test), output_dict=True))


2022/11/03 11:08:35 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'e76583d22cc1406698bae430bd6e0c7a', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.760902,0.422175,0.620811,0.591539,0.654266
recall,0.651223,0.554622,0.620811,0.602922,0.620811
f1-score,0.701803,0.479419,0.620811,0.590611,0.631793
support,777.0,357.0,0.620811,1134.0,1134.0


### Multi model x Multi embedding
Run `src/TrainMultiModel.py`