## Scope 
This notebook presents the basic workflow to follow when running and tracking machine learning experiments.

## Requirements


In [1]:
# Disable warnings and verbose native logging so cell outputs stay readable
import os
import warnings

# Install a blanket "ignore" filter for every Python warning raised from here on
warnings.filterwarnings(action="ignore")

# Must be set before any library that reads TF_CPP_MIN_LOG_LEVEL is imported
# (presumably TensorFlow's native logger — it is not imported in this notebook's
# visible cells, so confirm it is still needed).
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"


#### Directory adjustment


In [2]:
from pathlib import Path
import sys

# Back to main folder.
# The project root is the parent of the directory the kernel started in.
# It is resolved only on the first execution and cached in `_PROJECT_ROOT`:
# re-running this cell would otherwise climb one more directory level each
# time (the cwd has already been changed) and register a wrong import path.
if "_PROJECT_ROOT" not in globals():
    _PROJECT_ROOT = Path.cwd().parent

# Keep the trailing separator so `path` holds the same value as before.
path = str(_PROJECT_ROOT) + "/"

# Relative paths used later (e.g. "data/...") and `src.*` imports both rely on
# the project root being the working directory and being importable.
os.chdir(path)
if path not in sys.path:
    sys.path.append(path)


#### ETL tools


In [3]:
# ETL
import numpy as np
import pandas as pd

#### NLP tools


In [4]:
# Transformers
from src.transformers.text import TextNormalizer
from sklearn.feature_extraction.text import TfidfVectorizer


#### Models


In [5]:
# Tracking
from src.experiment.tracking import experiment

# Pipe
from sklearn.pipeline import Pipeline

# Models
from sklearn.svm import LinearSVC

# Evaluation
from sklearn.metrics import classification_report


## Data manipulation


In [6]:
# Get data (paths are relative to the working directory set by the
# directory-adjustment cell)
df_train = pd.read_csv("data/corpus/train_data.csv")
df_test = pd.read_csv("data/corpus/test_data.csv")

# Set target and features
target = "label"
features = "text"

# Set train and test
X_train, y_train = df_train[features], df_train[target]
X_test, y_test = df_test[features], df_test[target]


# Class weights
# Count each class through `target` instead of a hard-coded column name, so
# the weights always follow the column that is actually used as the label.
pos = int((y_train == 1).sum())
neg = int((y_train == 0).sum())
if pos == 0 or neg == 0:
    # Fail with an explicit message rather than a bare ZeroDivisionError below.
    raise ValueError(
        f"Both classes must be present in '{target}' to compute class weights "
        f"(found {pos} positive and {neg} negative rows)."
    )

# Additional multiplier applied on top of the balanced weight of class 1.
extra = 1.25

# Balanced weighting: each class is weighted by n_samples / (2 * class_count).
weight_for_0 = (1 / neg) * (len(df_train) / 2.0)
weight_for_1 = (1 / pos) * (len(df_train) / 2.0) * extra
class_weight = {0: weight_for_0, 1: weight_for_1}


## Pipeline steps

In [7]:
# --- Text normalizer -------------------------------------------------------
# Words handed to TextNormalizer through `wordlist` (presumably placeholder
# tokens produced upstream — verify against src.transformers.text).
# NOTE(review): 'emailusario' looks like a typo of 'emailusuario' (compare
# 'nomeusuario'); confirm which spelling the corpus actually contains.
wordlist = [
    'nomeusuario',
    'paginaweb',
    'emailusario',
    'numerotelefone',
    'simbolomonetario',
]
normalizer = TextNormalizer(stopwords=True, wordlist=wordlist)

# --- Text vectorizer -------------------------------------------------------
# Word-level TF-IDF over unigrams and bigrams, capped at 1500 features.
# Lowercasing is switched off here, so casing is left as the previous step
# delivers it.
tfidf_params = {
    "lowercase": False,
    "analyzer": "word",
    "norm": 'l2',
    "ngram_range": (1, 2),
    "max_features": 1500,
    "sublinear_tf": True,
    "min_df": 2,
}
vectorizer = TfidfVectorizer(**tfidf_params)

# --- Classifier ------------------------------------------------------------
# Linear SVM using the class weights computed in the data-manipulation cell.
# NOTE(review): per the scikit-learn docs, `penalty`, `loss` and `dual` are
# ignored when multi_class='crammer_singer' — confirm this strategy is intended.
svc_params = {
    "penalty": 'l2',
    "loss": 'squared_hinge',
    "dual": True,
    "tol": 1e-6,
    "C": 1.1,
    "multi_class": 'crammer_singer',
    "fit_intercept": True,
    "intercept_scaling": 1,
    "class_weight": class_weight,
    "random_state": 42,
    "max_iter": 1000,
}
classifier = LinearSVC(**svc_params)


## Experiment

In [8]:
# Assemble the classifier pipeline: normalize -> vectorize -> classify
pipeline_steps = [
    ("normalizer", normalizer),
    ("vectorizer", vectorizer),
    ("classifier", classifier),
]
ml_pipe = Pipeline(pipeline_steps)

# Configure the tracked experiment; the tracking server is reached on
# localhost:7500 (see the server log printed below this cell)
lab = experiment(
    exp_name="Hate Speech",
    host="localhost",
    port=7500,
    model_name="Linear SVC",
    model=ml_pipe,
)

# Run the experiment; with predictions=True the call returns the predictions
# that are compared against y_test below
y_pred = lab.run(X_train, y_train, X_test, y_test, predictions=True)

# Show the per-class metrics as a table (last expression = cell output)
report = classification_report(y_test, y_pred, output_dict=True)
pd.DataFrame(report)


[MLFLOW] [START] starting server


[2022-10-25 17:05:38 -0300] [134750] [INFO] Starting gunicorn 20.1.0
[2022-10-25 17:05:38 -0300] [134750] [INFO] Listening at: http://127.0.0.1:7500 (134750)
[2022-10-25 17:05:38 -0300] [134750] [INFO] Using worker: sync
[2022-10-25 17:05:38 -0300] [134752] [INFO] Booting worker with pid: 134752
[2022-10-25 17:05:38 -0300] [134753] [INFO] Booting worker with pid: 134753
[2022-10-25 17:05:38 -0300] [134754] [INFO] Booting worker with pid: 134754
[2022-10-25 17:05:38 -0300] [134755] [INFO] Booting worker with pid: 134755


[MLFLOW][EXECUTION] running experiment




[MLFLOW][FINISHED] experiment executed successfully


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.848485,0.544304,0.72134,0.696394,0.752456
recall,0.721649,0.72067,0.72134,0.72116,0.72134
f1-score,0.779944,0.620192,0.72134,0.700068,0.729511
support,388.0,179.0,0.72134,567.0,567.0
