## Requirements


In [1]:
# Unable warnings
import os
import warnings

warnings.filterwarnings("ignore")
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

#### Directory adjustment


In [2]:
from pathlib import Path
import sys
import os

# Back to main folder
path = os.path.dirname(os.getcwd()) + "/"
os.chdir(path)
sys.path.append(path)


#### Data Processing


In [3]:
# ETL
import numpy as np
import pandas as pd
import multiprocessing

# ML preprocessing
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.feature_selection import chi2


#### Natural language processing


In [4]:
# Basic
import re

# Vectorization
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Embedding
from src.TextVectorization import MeanEmbeddingVectorizer
from gensim.models import KeyedVectors
from gensim import models
import gensim


#### Models


In [5]:
# Pipe
from sklearn.pipeline import Pipeline
# Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier
# Metrics
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from src.ModelAnalysis import ranking_recall

from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score


### Tracking


In [6]:
import mlflow
mlflow.set_tracking_uri('http://127.0.0.1:5000')
mlflow.set_experiment('Hate Speech')

<Experiment: artifact_location='./artifacts/1', creation_time=1665929754799, experiment_id='1', last_update_time=1665929754799, lifecycle_stage='active', name='Hate Speech', tags={}>

## Set and split train and test data


In [7]:
# Get data
df = pd.read_csv("data/corpus/augmented_corpus_fortuna.csv")
df.head(2)


Unnamed: 0,text_nonstop,text_lemma,text,length_text_nonstop,length_text_lemma,length_text,label,count_word_text_nonstop,count_word_text_lemma,count_word_text,...,pron,adp,aux,cconj,num,space,intj,sym,punct,part
0,cara vive outro mundo nao mundo real refugiado...,caro viver outro mundo nao mundo real refugiad...,nomeusuario o cara vive em outro mundo nao no ...,85,82,124,1,19,19,20,...,0,0,0,0,0,0,0,0,0,0
1,incompetentes nao cuidam povo brasileiro pouco...,incompetente nao cuidar povo brasileiro pouco ...,nomeusuario estes incompetentes nao cuidam nem...,69,66,108,0,20,20,20,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Set target and features
target = "label"
features = "text_nonstop"

# Break apart dataset
X = df[features].values.astype("U")
y = df[target]

# Split train abd test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Class weights
pos = len(df.query('label==1'))
neg = len(df.query('label==0'))
weight_for_0 = (1 / neg) * (len(df) / 2.0)
weight_for_1 = (1 / pos) * (len(df) / 2.0)
class_weight = {0: weight_for_0, 1: weight_for_1}


In [9]:
classifiers = {'GradientBoosting': GradientBoostingClassifier(),
               'KNeighbors': KNeighborsClassifier(),
               'Bernoulli': BernoulliNB(alpha=1.0, binarize=0.0, fit_prior=True, class_prior=None),
               'SVC': LinearSVC(penalty='l2', loss='squared_hinge', dual=True, tol=0.0001, C=1.0, multi_class='crammer_singer', fit_intercept=True, intercept_scaling=1, class_weight=class_weight, verbose=0, random_state=42, max_iter=1000),
               'LogisticRegression': LogisticRegression(penalty='l2', max_iter=200, C=1),
               'SGDC': SGDClassifier(loss='hinge', max_iter=200),
               'DecisionTree': DecisionTreeClassifier(random_state=42, class_weight=class_weight),
               'RandomForest': RandomForestClassifier(random_state=42, class_weight=class_weight),
               'SVM': svm.SVC(kernel='rbf')}

## My own word2vec embedding model

In [10]:
# Define a corpus
corpus = X
cores = multiprocessing.cpu_count()

# Train a own word2vec model
my_embedding_50d = gensim.models.Word2Vec(
    corpus,
    vector_size=50,
    window=4,
    min_count=10,
    sg=1,
    workers=cores - 1,
    batch_words=10000,
    alpha=0.1,
    min_alpha=0.0001,
    negative=20,
)

my_embedding_100d = gensim.models.Word2Vec(
    corpus,
    vector_size=50,
    window=4,
    min_count=10,
    sg=1,
    workers=cores - 1,
    batch_words=10000,
    alpha=0.1,
    min_alpha=0.0001,
    negative=20,
)

my_embedding_300d = gensim.models.Word2Vec(
    corpus,
    vector_size=300,
    window=4,
    min_count=10,
    sg=1,
    workers=cores - 1,
    batch_words=10000,
    alpha=0.1,
    min_alpha=0.0001,
    negative=20,
)


my_embedding_1000d = gensim.models.Word2Vec(
    corpus,
    vector_size=1000,
    window=4,
    min_count=10,
    sg=1,
    workers=cores - 1,
    batch_words=10000,
    alpha=0.1,
    min_alpha=0.0001,
    negative=20,
)

# Make embedding dictionary {token:vector}
my_embedding_50d = dict(
    zip(my_embedding_50d.wv.index_to_key, my_embedding_50d.wv.vectors))

my_embedding_100d = dict(
    zip(my_embedding_100d.wv.index_to_key, my_embedding_100d.wv.vectors))

my_embedding_300d = dict(
    zip(my_embedding_300d.wv.index_to_key, my_embedding_300d.wv.vectors))

my_embedding_1000d = dict(
    zip(my_embedding_1000d.wv.index_to_key, my_embedding_1000d.wv.vectors))


### Basic pipeline

In [11]:
# Basic pipeline
ml_pipe = Pipeline(
    [
        ("vectorizer", MeanEmbeddingVectorizer(my_embedding_50d)),
        ("classifier", LinearSVC(penalty='l2', loss='squared_hinge', dual=True, tol=0.0001, C=1, multi_class='crammer_singer',
         fit_intercept=True, intercept_scaling=1, class_weight=class_weight, verbose=0, random_state=42, max_iter=100000),),
    ]
)

# Train
ml_pipe.fit(X_train, y_train)

pd.DataFrame(classification_report(
    y_test, ml_pipe.predict(X_test), output_dict=True))


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.786195,0.425926,0.614638,0.606061,0.672777
recall,0.60103,0.644258,0.614638,0.622644,0.614638
f1-score,0.681255,0.512821,0.614638,0.597038,0.628229
support,777.0,357.0,0.614638,1134.0,1134.0


### Multi model test - 50D

In [12]:
for model_name, classifier in classifiers.items():
    mlflow.sklearn.autolog()
    with mlflow.start_run():
        ml_pipe = Pipeline([('vectorizer', MeanEmbeddingVectorizer(my_embedding_50d)),
                           ('classifier', classifier)])

        # Model fit
        ml_pipe.fit(X_train, y_train)
        y_predict = ml_pipe.predict(X_test)

        # Tracking
        mlflow.log_params(ml_pipe.get_params())
        mlflow.log_metric('precision', precision_score(y_test, y_predict))
        mlflow.log_metric('accuracy', accuracy_score(y_test, y_predict))
        mlflow.log_metric('recall', recall_score(y_test, y_predict))
        mlflow.log_metric('auc', roc_auc_score(y_test, y_predict))
        mlflow.log_metric('f1', f1_score(y_test, y_predict))
        mlflow.sklearn.log_model(ml_pipe, model_name + 'my_embedding_50d')




### Multi model test - 100D

In [13]:
for model_name, classifier in classifiers.items():
    mlflow.sklearn.autolog()
    with mlflow.start_run():
        ml_pipe = Pipeline([('vectorizer', MeanEmbeddingVectorizer(my_embedding_100d)),
                           ('classifier', classifier)])

        # Model fit
        ml_pipe.fit(X_train, y_train)
        y_predict = ml_pipe.predict(X_test)

        # Tracking
        mlflow.log_params(ml_pipe.get_params())
        mlflow.log_metric('precision', precision_score(y_test, y_predict))
        mlflow.log_metric('accuracy', accuracy_score(y_test, y_predict))
        mlflow.log_metric('recall', recall_score(y_test, y_predict))
        mlflow.log_metric('auc', roc_auc_score(y_test, y_predict))
        mlflow.log_metric('f1', f1_score(y_test, y_predict))
        mlflow.sklearn.log_model(ml_pipe, model_name + 'my_embedding_100d')




### Multi model test - 300D

In [14]:
for model_name, classifier in classifiers.items():
    mlflow.sklearn.autolog()
    with mlflow.start_run():
        ml_pipe = Pipeline([('vectorizer', MeanEmbeddingVectorizer(my_embedding_300d)),
                           ('classifier', classifier)])

        # Model fit
        ml_pipe.fit(X_train, y_train)
        y_predict = ml_pipe.predict(X_test)

        # Tracking
        mlflow.log_params(ml_pipe.get_params())
        mlflow.log_metric('precision', precision_score(y_test, y_predict))
        mlflow.log_metric('accuracy', accuracy_score(y_test, y_predict))
        mlflow.log_metric('recall', recall_score(y_test, y_predict))
        mlflow.log_metric('auc', roc_auc_score(y_test, y_predict))
        mlflow.log_metric('f1', f1_score(y_test, y_predict))
        mlflow.sklearn.log_model(ml_pipe, model_name + 'my_embedding_300d')




### Multi model test - 1000D

In [15]:
for model_name, classifier in classifiers.items():
    mlflow.sklearn.autolog()
    with mlflow.start_run():
        ml_pipe = Pipeline([('vectorizer', MeanEmbeddingVectorizer(my_embedding_1000d)),
                           ('classifier', classifier)])

        # Model fit
        ml_pipe.fit(X_train, y_train)
        y_predict = ml_pipe.predict(X_test)

        # Tracking
        mlflow.log_params(ml_pipe.get_params())
        mlflow.log_metric('precision', precision_score(y_test, y_predict))
        mlflow.log_metric('accuracy', accuracy_score(y_test, y_predict))
        mlflow.log_metric('recall', recall_score(y_test, y_predict))
        mlflow.log_metric('auc', roc_auc_score(y_test, y_predict))
        mlflow.log_metric('f1', f1_score(y_test, y_predict))
        mlflow.sklearn.log_model(ml_pipe, model_name + 'my_embedding_1000d')




## Pre-trained model Wor2vec


In [16]:
# # Load a pre-trained model
pretrained_model = models.KeyedVectors.load_word2vec_format(
    "data/pretrained-skipgram/skip_s50.txt", binary=False
)
pretrained_w2v = dict(
    zip(pretrained_model.index_to_key, pretrained_model.vectors))


### Basic pipeline

In [17]:
# Basic pipeline
ml_pipe = Pipeline(
    [
        ("vectorizer", MeanEmbeddingVectorizer(pretrained_w2v)),
        ("classifier", LinearSVC(penalty='l2', loss='squared_hinge', dual=True, tol=0.0001, C=1, multi_class='crammer_singer',
         fit_intercept=True, intercept_scaling=1, class_weight=class_weight, verbose=0, random_state=42, max_iter=100000),),
    ]
)

# Train
ml_pipe.fit(X_train, y_train)

pd.DataFrame(classification_report(
    y_test, ml_pipe.predict(X_test), output_dict=True))


2022/10/19 11:34:42 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'fab14c9d475745bb85b3d7f05033761e', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.758308,0.417373,0.616402,0.587841,0.650977
recall,0.646075,0.551821,0.616402,0.598948,0.616402
f1-score,0.697707,0.475271,0.616402,0.586489,0.627681
support,777.0,357.0,0.616402,1134.0,1134.0


### Multi model
Run `src/TrainMultiModel.py`