## Requirements


In [1]:
# Unable warnings
import os
import warnings

warnings.filterwarnings("ignore")
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'


#### Directory adjustment


In [2]:
from pathlib import Path
import sys
import os
# Back to main folder
path = os.path.dirname(os.getcwd())+"/"
os.chdir(path)
sys.path.append(path)


#### Data Processing


In [3]:
# ETL
import numpy as np
import pandas as pd
from pandas import MultiIndex, Int64Index

# ML preprocessing
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold


#### Natural language processing


In [4]:
import gensim
import spacy
import re

# Vectorization
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from src.transformers.text import TextNormalizer
from gensim.models import KeyedVectors


#### Models


In [5]:
# Tracking
from src.experiment.tracking import experiment

# Pipe
from sklearn.pipeline import Pipeline
from sklearn.utils import class_weight

# Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier

# Metrics
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score


## Data version control (DVC)


In [6]:
# DVC
from src.data.control import version

df_train, df_test = version().split(test_size=0.2)


## Data manipulation

In [7]:
# Set target and features
target = "label"
features = "text"

# Set train and test
X_train, y_train = df_train[features], df_train[target]
X_test, y_test = df_test[features], df_test[target]


# Set k-fold criteria
k_fold = KFold(n_splits=10, shuffle=True, random_state=42)


# Class weights
pos = len(df_train.query("label==1"))
neg = len(df_train.query("label==0"))
weight_for_0 = (1 / neg) * (len(df_train) / 2.0)
weight_for_1 = (1 / pos) * (len(df_train) / 2.0) * 1.1
relative_weight = {0: weight_for_0, 1: weight_for_1}


## Model building 

#### Pipe structures


In [8]:
# Text normalizer
wordlist = [
    "nomeusuario",
    "paginaweb",
    "emailusario",
    "numerotelefone",
    "simbolomonetario",
]

normalizer = TextNormalizer(    stopwords=True, wordlist=wordlist, stemmer=False, lemma=False)

# Text vectorizer
vectorizer = TfidfVectorizer(
    lowercase=False,
    analyzer="char_wb",
    norm="l2",
    ngram_range=(1, 5),
    max_features=2500,
    sublinear_tf=True,
    min_df=2,
)


#### Base model

In [9]:
# Fit train weights
fit_weights = class_weight.compute_sample_weight(
    class_weight=relative_weight, y=y_train
)
# Classifeir
clf = XGBClassifier(booster="gblinear", eta="0.1" ,feature_selector="shuffle" )

# Vectorizer
vectorizer.fit(X_train)
XX_train = vectorizer.transform(normalizer.transform(X_train))
XX_test = vectorizer.transform(normalizer.transform(X_test))

# Train
clf.fit(XX_train, y_train, sample_weight=fit_weights, eval_metric="error")

# Evaluate
pd.DataFrame(classification_report(y_test, clf.predict(XX_test), output_dict=True))


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.78963,0.46841,0.659612,0.62902,0.688505
recall,0.685972,0.602241,0.659612,0.644106,0.659612
f1-score,0.73416,0.526961,0.659612,0.63056,0.66893
support,777.0,357.0,0.659612,1134.0,1134.0


#### Multi model test

In [10]:
# Classifiers
classifiers = {
    "DecisionTree": DecisionTreeClassifier(
        random_state=42,
        class_weight=relative_weight,
        min_samples_split=2,
        max_features="auto",
    ),
    "RandomForest": RandomForestClassifier(
        random_state=42,
        class_weight=relative_weight,
        min_samples_split=2,
        max_features="auto",
        oob_score=True,
    ),
}


# Run experiments
for model_name, classifier in classifiers.items():

    # Build a classifier pipeline
    ml_pipe = Pipeline(
        [
            ("normalizer", normalizer),
            ("vectorizer", vectorizer),
            ("classifier", classifier),
        ]
    )

    # Set experiment
    lab = experiment(
        exp_name="Hate Speech",
        host="localhost",
        port=7500,
        model_name=model_name,
        model=ml_pipe,
    )

    # Evaluate experiment
    y_pred = lab.run(X_train, y_train, X_test, y_test)


[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment


                         'numerotelefone', 'simbolomonetario'])), ('vectorizer', TfidfVectorizer(analyzer='char_wb', lowercase=False, max_features=2500,
                min_df=2, ngram_range=(1, 5), sublinear_tf=True)), ('classifier', DecisionTreeClassifier(class_weight={0: 0.7299645960733827,
                                     1: 1.7458362491252624},
                       max_features='auto', random_state=...`
                         'numerotelefone', 'simbolomonetario'])), ('vectorizer', TfidfVectorizer(analyzer='char_wb', lowercase=False, max_features=2500,
                min_df=2, ngram_range=(1, 5), sublinear_tf=True)), ('classifier', RandomForestClassifier(class_weight={0: 0.7299645960733827,
                                     1: 1.7458362491252624},
                       max_features='auto', oob_score=Tru...`


[MLFLOW] [FINISHED] experiment executed successfully
model:DecisionTree - acc:0.6305114638447972 - rec:0.48179271708683474 - auc:0.5903172079642668 - f1:0.45085190039318473 

[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment




[MLFLOW] [FINISHED] experiment executed successfully
model:RandomForest - acc:0.7654320987654321 - rec:0.3949579831932773 - auc:0.6653039594216065 - f1:0.5145985401459854 

