## Requirements


In [1]:
# Disable warnings and native-library log noise for the whole session
import os
import warnings

# Must be in the environment before TensorFlow is first imported to take effect.
# NOTE(review): '3' is presumably the most restrictive TF C++ log level - confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Blanket "ignore" filter: no Python warning is displayed after this line.
warnings.filterwarnings("ignore")


#### Directory adjustment


In [2]:
from pathlib import Path
import sys
import os
# Back to main folder.
# The notebook lives one level below the project root, so the root is the
# parent of the initial working directory. It is resolved only once: after the
# first run the working directory has already moved, and recomputing the parent
# on a re-run of this cell would climb one level higher each time.
if "path" not in globals():
    path = os.path.dirname(os.getcwd()) + "/"
os.chdir(path)
# Make project-local packages importable without duplicating the entry on
# re-runs.
if path not in sys.path:
    sys.path.append(path)


#### Data Processing


In [3]:
# ETL
import numpy as np
import pandas as pd
from pandas import MultiIndex, Int64Index
# ML preprocessing
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold


#### Natural language processing


In [4]:
import gensim
import spacy
import re 

# Vectorization
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import KeyedVectors


#### Models


In [5]:
# Pipe
from sklearn.pipeline import Pipeline
# Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier
# Metrics
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from src.ModelAnalysis import ranking_recall

from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score


### Tracking

In [6]:
import mlflow

# Where runs are recorded, and the experiment they are grouped under
TRACKING_URI = 'http://127.0.0.1:5000'
EXPERIMENT_NAME = 'Hate Speech'

mlflow.set_tracking_uri(TRACKING_URI)
# Last expression of the cell, so the selected Experiment is displayed
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='./artifacts/1', creation_time=1665929754799, experiment_id='1', last_update_time=1665929754799, lifecycle_stage='active', name='Hate Speech', tags={}>

## Set and split train and test data


In [7]:
# Get data: read the corpus (path is relative to the working directory)
corpus_csv = 'data/corpus/augmented_corpus_fortuna.csv'
df = pd.read_csv(corpus_csv)

# Preview the first two rows
df.head(2)


Unnamed: 0,text_nonstop,text_lemma,text,length_text_nonstop,length_text_lemma,length_text,label,count_word_text_nonstop,count_word_text_lemma,count_word_text,...,pron,adp,aux,cconj,num,space,intj,sym,punct,part
0,cara vive outro mundo nao mundo real refugiado...,caro viver outro mundo nao mundo real refugiad...,nomeusuario o cara vive em outro mundo nao no ...,85,82,124,1,19,19,20,...,0,0,0,0,0,0,0,0,0,0
1,incompetentes nao cuidam povo brasileiro pouco...,incompetente nao cuidar povo brasileiro pouco ...,nomeusuario estes incompetentes nao cuidam nem...,69,66,108,0,20,20,20,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Set target and features
target = 'label'
features = 'text_nonstop'

# Break apart dataset (texts are cast to NumPy unicode strings)
X = df[features].values.astype('U')
y = df[target]


# Split train and test; stratified on y so both splits keep the class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42)

# Set k-fold criteria
k_fold = KFold(n_splits=10, shuffle=True, random_state=42)


# Class weights: n_samples / (n_classes * class_count) for the two classes.
# Counted on the training labels only, so the held-out split does not influence
# how the models are fitted, and derived from `y` rather than a hard-coded
# column name so it follows `target`.
pos = int((y_train == 1).sum())
neg = int((y_train == 0).sum())
n_train = len(y_train)
weight_for_0 = (1 / neg) * (n_train / 2.0)
weight_for_1 = (1 / pos) * (n_train / 2.0)
class_weight = {0: weight_for_0, 1: weight_for_1}


## Model building 

#### Basic structure


In [10]:
# Pipe: TF-IDF over word uni-/bi-grams feeding an XGBoost classifier
tfidf_step = TfidfVectorizer(
    lowercase=False,
    analyzer="word",
    norm='l2',
    ngram_range=(1, 2),
    max_features=1500,
)
xgb_step = XGBClassifier(
    random_state=42,
    seed=42,
    colsample_bytree=0.6,
    subsample=0.7,
)
clf = Pipeline([('vectorizer', tfidf_step),
                ('classifier', xgb_step)])

# Train
clf.fit(X_train, y_train)

# Evaluate on the held-out split; the report dict is shown as a table
test_predictions = clf.predict(X_test)
report = classification_report(y_test, test_predictions, output_dict=True)
pd.DataFrame(report)




Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.774834,0.671053,0.753968,0.722944,0.742162
recall,0.903475,0.428571,0.753968,0.666023,0.753968
f1-score,0.834225,0.523077,0.753968,0.678651,0.736271
support,777.0,357.0,0.753968,1134.0,1134.0


#### Multi model test

In [11]:
# TF-IDF settings used by every pipeline in the multi-model comparison
tfidf_options = dict(
    lowercase=False,
    analyzer="word",
    norm='l2',
    ngram_range=(1, 2),
    max_features=1500,
)
vectorizer = TfidfVectorizer(**tfidf_options)


In [12]:
# Candidate estimators for the comparison, keyed by the name under which each
# fitted pipeline is logged.
# GradientBoosting and SGDC are seeded with random_state=42, like the other
# seeded entries, so repeated runs produce comparable tracked metrics.
# NOTE(review): per the scikit-learn docs, multi_class='crammer_singer' makes
# LinearSVC ignore `penalty`, `loss` and `dual` - confirm this is intended for
# a binary target.
classifiers = {'GradientBoosting': GradientBoostingClassifier(random_state=42),
               'KNeighbors': KNeighborsClassifier(),
               'MultinomialNB': MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None),
               'Bernoulli': BernoulliNB(alpha=1.0, binarize=0.0, fit_prior=True, class_prior=None),
               'SVC': LinearSVC(penalty='l2', loss='squared_hinge', dual=True, tol=0.0001, C=1.0, multi_class='crammer_singer', fit_intercept=True, intercept_scaling=1, class_weight=class_weight, verbose=0, random_state=42, max_iter=1000),
               'LogisticRegression': LogisticRegression(penalty='l2', max_iter=200, C=1),
               'SGDC': SGDClassifier(loss='hinge', max_iter=200, random_state=42),
               'DecisionTree': DecisionTreeClassifier(random_state=42, class_weight=class_weight),
               'RandomForest': RandomForestClassifier(random_state=42, class_weight=class_weight),
               'SVM': svm.SVC(kernel='rbf')}


In [13]:
# Enable scikit-learn autologging once; it does not depend on the loop variable.
mlflow.sklearn.autolog()

# Fit every candidate on the same split and record each as its own MLflow run.
for model_name, classifier in classifiers.items():
    with mlflow.start_run():
        ml_pipe = Pipeline([('vectorizer', vectorizer),
                           ('classifier', classifier)])

        # Model fit
        ml_pipe.fit(X_train, y_train)
        y_predict = ml_pipe.predict(X_test)

        # ROC AUC needs a continuous ranking score, not thresholded labels
        # (hard 0/1 predictions collapse the curve to a single operating point).
        # decision_function is tried first because some estimators in the
        # comparison (hinge-loss SGD, LinearSVC, SVC without probability=True)
        # expose no predict_proba; the rest fall back to the positive-class
        # probability column.
        if hasattr(ml_pipe, 'decision_function'):
            y_score = ml_pipe.decision_function(X_test)
        else:
            y_score = ml_pipe.predict_proba(X_test)[:, 1]

        # Tracking
        mlflow.log_params(ml_pipe.get_params())
        mlflow.log_metric('precision', precision_score(y_test, y_predict))
        mlflow.log_metric('accuracy', accuracy_score(y_test, y_predict))
        mlflow.log_metric('recall', recall_score(y_test, y_predict))
        mlflow.log_metric('auc', roc_auc_score(y_test, y_score))
        mlflow.log_metric('f1', f1_score(y_test, y_predict))
        mlflow.sklearn.log_model(ml_pipe, model_name)
