## Requirements


In [1]:
# Disable warnings
import os
import warnings

# Set the TensorFlow log-level variable first (it presumably has to be in the
# environment before TensorFlow is imported — confirm), then mute every
# Python warning for the rest of the session.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
warnings.filterwarnings("ignore")


#### Directory adjustment


In [2]:
from pathlib import Path
import sys
import os

# Back to main folder.
# The starting directory is captured only the first time this cell runs in a
# kernel. Without that anchor, every re-run would call dirname(getcwd())
# again and climb one directory further up, breaking the relative data paths.
_NOTEBOOK_DIR = globals().get("_NOTEBOOK_DIR", os.getcwd())

# Parent of the notebook folder, with a trailing slash.
path = os.path.dirname(_NOTEBOOK_DIR)+"/"
os.chdir(path)

# Make modules under the main folder importable, without stacking duplicate
# entries on sys.path when the cell is executed again.
if path not in sys.path:
    sys.path.append(path)


#### Charts


In [3]:
from IPython.display import SVG, display
import matplotlib.pyplot as plt
import seaborn as sns


#### Data Processing


In [4]:
# ETL
import numpy as np
import pandas as pd
# ML preprocessing
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.feature_selection import chi2

#### Natural language processing


In [5]:
# Processing
from nltk.stem.snowball import PortugueseStemmer
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from cleantext import clean
import nltk
import spacy
import re
# Vectorization
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


#### Models


In [6]:
# Tracking
import mlflow
# Pipe
from sklearn.pipeline import Pipeline
# Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn import svm
# Metrics
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report


## Set and split train and test data


In [7]:
# Get data (paths are relative to the main project folder)
df_train = pd.read_csv('data/corpus/train_data.csv')
df_test = pd.read_csv('data/corpus/test_data.csv')

# Set target and features
target = 'label'
features = 'text'

# Set train and test
X_train, y_train = df_train[features], df_train[target]
X_test, y_test = df_test[features], df_test[target]


# Class weights
# Count each class through `y_train` so the column name is defined in a single
# place (`target`) instead of being repeated inside a query string.
pos = int((y_train == 1).sum())
neg = int((y_train == 0).sum())
# Fail early with a readable message rather than a ZeroDivisionError below.
assert pos > 0 and neg > 0, (
    "both classes (0 and 1) must be present in the training data")

# Each weight is inversely proportional to its class frequency and scaled by
# n / 2, so with extra == 1 the weighted sample count still sums to n.
# `extra` then boosts the positive class by a further 25%.
extra = 1.25
weight_for_0 = (1 / neg) * (len(df_train) / 2.0)
weight_for_1 = (1 / pos) * (len(df_train) / 2.0)*extra
class_weight = {0: weight_for_0, 1: weight_for_1}


# Set k-fold criteria (seeded so the folds are reproducible)
k_fold = KFold(n_splits=10, shuffle=True, random_state=42)


In [8]:
def Normalizer(text):
    """Mask user mentions and run the text through ``clean``.

    Every ``@``-prefixed run of non-whitespace characters is first replaced
    by the placeholder ``nome_usuario``. The result is then handed to
    ``clean`` together with the fixed set of options listed below.

    Parameters
    ----------
    text : str
        Raw text to normalize.

    Returns
    -------
    Whatever ``clean`` returns for the masked text.
    """
    masked = re.sub(r"@[^\s]+", "nome_usuario", text)

    # Options grouped by purpose; the values are passed to `clean` unchanged.
    switches = dict(
        fix_unicode=True,
        to_ascii=True,
        lower=True,
        no_emoji=True,
        no_line_breaks=True,
        no_urls=True,
        no_emails=True,
        no_phone_numbers=True,
        no_numbers=False,
        no_digits=False,
        no_currency_symbols=False,
        no_punct=True,
    )
    placeholders = dict(
        replace_with_punct="",
        replace_with_url="pagina_web",
        replace_with_email="email_usario",
        replace_with_phone_number="numero_telefone",
        replace_with_currency_symbol="simbolo_monetario",
    )
    return clean(masked, **switches, **placeholders)


# Lazily filled cache of NLTK's Portuguese stopwords (None until first use).
_PORTUGUESE_STOPWORDS = None


def _portuguese_stopwords():
    """Return NLTK's Portuguese stopwords as a frozenset, loading them once."""
    global _PORTUGUESE_STOPWORDS
    if _PORTUGUESE_STOPWORDS is None:
        _PORTUGUESE_STOPWORDS = frozenset(
            nltk.corpus.stopwords.words("portuguese"))
    return _PORTUGUESE_STOPWORDS


def StopRemover(text, wordslist):
    """Remove Portuguese stopwords and extra unwanted words from a text.

    The text is split on whitespace; every token that is an NLTK Portuguese
    stopword or appears in ``wordslist`` is dropped, and the remaining tokens
    are joined back with single spaces. Matching is exact and case-sensitive.

    Parameters
    ----------
    text : str
        Text to filter.
    wordslist : iterable of str or None
        Additional words to drop; ``None`` or an empty iterable adds nothing.

    Returns
    -------
    str
        The filtered text (empty string if every token was removed).
    """
    # This function runs once per document, so the corpus is cached instead of
    # being re-read on each call, and a set replaces the list so that each
    # membership test is O(1) rather than a scan of the whole stopword list.
    stopwords = _portuguese_stopwords().union(wordslist or ())
    return ' '.join(word for word in text.split() if word not in stopwords)


In [9]:
class TextNormalizer(BaseEstimator, TransformerMixin):
    """Scikit-learn compatible transformer that normalizes raw texts.

    Parameters
    ----------
    stopwords : bool, default=True
        When true, drop stopwords (and the words in ``wordlist``) through
        ``StopRemover``.
    wordlist : list of str, default=[]
        Extra words removed together with the stopwords.
    stemmer : bool, default=False
        When true, stem every token with NLTK's ``PortugueseStemmer``.
    lemma : bool, default=False
        When true, replace every token by its lower-cased spaCy lemma
        (``pt_core_news_sm`` model).
    """

    def __init__(self, stopwords=True, wordlist=[], stemmer=False, lemma=False):
        # Parameters are stored untouched. The shared default list is only
        # ever read by this class, never mutated.
        self.stopwords = stopwords
        self.wordlist = wordlist
        self.stemmer = stemmer
        self.lemma = lemma

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Normalize each text and return them as a NumPy unicode array.

        Parameters
        ----------
        X : pandas.Series, list or 1-D array of texts
            Items are converted with ``str`` before processing.

        Returns
        -------
        numpy.ndarray
            Array of unicode strings, one entry per input text.
        """
        # Wrapping in a Series lets callers pass a plain list or array (for
        # example `pipeline.predict(["some text"])`), not only a pandas Series.
        # `apply` returns new objects, so the caller's data is never modified.
        texts = pd.Series(X).apply(str).apply(Normalizer)

        if self.stopwords:
            texts = texts.apply(
                lambda text: StopRemover(text=text, wordslist=self.wordlist))

        if self.lemma:
            nlp = spacy.load('pt_core_news_sm')
            texts = texts.apply(
                lambda text: " ".join(tok.lemma_.lower() for tok in nlp(text)))

        if self.stemmer:
            # The Snowball stemmer works on ONE word at a time. Feeding it the
            # whole document only stemmed the tail of the text, so each token
            # is stemmed separately and the text is rebuilt.
            stem = PortugueseStemmer().stem
            texts = texts.apply(
                lambda text: " ".join(stem(tok) for tok in text.split()))

        return texts.values.astype('U')


In [10]:
# Text normalizer
# Placeholder tokens to drop together with the stopwords. They are the
# `Normalizer` placeholders written without the underscore (presumably because
# punctuation removal strips it — confirm against the cleaning library).
wordlist = [
    'nomeusuario',
    'paginaweb',
    'emailusario',
    'numerotelefone',
    'simbolomonetario',
]
normalizer = TextNormalizer(wordlist=wordlist, stopwords=True)

# Text vectorizer
tfidf_params = {
    'analyzer': "word",
    'lowercase': False,      # texts are already lower-cased by the normalizer
    'ngram_range': (1, 2),
    'min_df': 2,
    'max_features': 1500,
    'sublinear_tf': True,
    'norm': 'l2',
}
vectorizer = TfidfVectorizer(**tfidf_params)

# Classifier
# NOTE(review): per the scikit-learn docs, `penalty`, `loss` and `dual` are
# ignored when multi_class='crammer_singer' — confirm this is intended.
svc_params = {
    'C': 1.1,
    'tol': 1e-6,
    'max_iter': 1000,
    'penalty': 'l2',
    'loss': 'squared_hinge',
    'dual': True,
    'multi_class': 'crammer_singer',
    'fit_intercept': True,
    'intercept_scaling': 1,
    'class_weight': class_weight,
    'random_state': 42,
}
classifier = LinearSVC(**svc_params)


In [11]:
# Build a classifier pipeline: normalize -> vectorize -> classify
pipeline_steps = [
    ('normalizer', normalizer),
    ('vectorizer', vectorizer),
    ('classifier', classifier),
]
ml_pipe = Pipeline(steps=pipeline_steps)

# Train on the training split, then predict the held-out test split
# (`fit` returns the fitted pipeline, so the calls can be chained).
y_pred = ml_pipe.fit(X_train, y_train).predict(X_test)

# Evaluate: per-class and averaged precision / recall / f1 as a table
report = classification_report(y_test, y_pred, output_dict=True)
pd.DataFrame(report)


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.848485,0.544304,0.72134,0.696394,0.752456
recall,0.721649,0.72067,0.72134,0.72116,0.72134
f1-score,0.779944,0.620192,0.72134,0.700068,0.729511
support,388.0,179.0,0.72134,567.0,567.0


## MLflow tracking


In [12]:
# mlflow.set_tracking_uri('http://127.0.0.1:5000')
# mlflow.set_experiment('Hate Speech')


In [13]:
# from sklearn.metrics import accuracy_score
# from sklearn.metrics import recall_score
# from sklearn.metrics import roc_auc_score
# from sklearn.metrics import f1_score


In [14]:
# with mlflow.start_run():

#     ml_pipe = Pipeline([('vectorizer', TfidfVectorizer(lowercase=False,
#                                                        analyzer="word",
#                                                        norm='l2',
#                                                        ngram_range=(1, 3),
#                                                        max_features=100)),
#                         ('classifier', DecisionTreeClassifier(random_state=42,
#                                                               class_weight={0: 1, 1: 1.5}))])

#     ml_pipe.fit(X_train, y_train)
#     y_predict = ml_pipe.predict(X_test)
    
#     mlflow.log_params(ml_pipe.get_params())
#     mlflow.log_metric('accuracy', accuracy_score(y_test, y_predict))
#     mlflow.log_metric('recall', recall_score(y_test, y_predict))
#     mlflow.log_metric('auc', roc_auc_score(y_test, y_predict))
#     mlflow.log_metric('f1', f1_score(y_test, y_predict))
#     mlflow.sklearn.log_model(ml_pipe, 'DecisionTreeClassifier')
