## Requirements


In [1]:
# Disable warnings and reduce TensorFlow's native (C++) log verbosity
import warnings
import os

# Set the log-level variable first; it is independent of the warnings filter
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
warnings.filterwarnings(action="ignore")


#### Directory adjustment


In [2]:
from pathlib import Path
import sys
import os

# Back to main folder: the parent of the notebook's working directory becomes
# the working directory and is made importable through sys.path.
# The root is resolved only once per kernel. On a re-run of this cell the
# existing `path` is reused, so the cell neither climbs one more directory
# level nor appends a duplicate entry to sys.path.
if "path" not in globals():
    path = os.path.dirname(os.getcwd()) + "/"
os.chdir(path)
if path not in sys.path:
    sys.path.append(path)


#### Data Processing


In [3]:
# ETL
import numpy as np
import pandas as pd
from pandas import MultiIndex, Int64Index
# ML preprocessing
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold


#### Natural language processing


In [4]:
import gensim
import spacy
import re 

# Vectorization
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import KeyedVectors


#### Models


In [5]:
# Tracking
from src.experiment.tracking import experiment

# Pipe
from sklearn.pipeline import Pipeline

# Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier

# Metrics
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report


from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score


#### DVC

In [6]:
from src.data.control import version

# Ask the data-version helper for a split (presumably a 10% hold-out, given
# test_size=0.1 -- verify against src.data.control), then unpack it.
train_test_frames = version().split(test_size=0.1)
df_train, df_test = train_test_frames


## Set and split train and test data


In [7]:
# Set target and features
target = "label"
features = "text"

# Set train and test
X_train, y_train = df_train[features], df_train[target]
X_test, y_test = df_test[features], df_test[target]

# Set k-fold criteria (shuffled with a fixed seed so the folds are reproducible)
k_fold = KFold(n_splits=10, shuffle=True, random_state=42)


# Class weights: each class gets n_samples / (2 * class_count), so the rarer
# class receives the larger weight.
# Counts come from the `target` column (via y_train) rather than a hard-coded
# "label", so changing the target only needs one edit above.
pos = int((y_train == 1).sum())
neg = int((y_train == 0).sum())
if pos == 0 or neg == 0:
    # Fail with an explicit message instead of a bare ZeroDivisionError below
    raise ValueError(
        f"Both classes must be present in the training data (pos={pos}, neg={neg})"
    )

weight_for_0 = (1 / neg) * (len(df_train) / 2.0)
weight_for_1 = (1 / pos) * (len(df_train) / 2.0)
class_weight = {0: weight_for_0, 1: weight_for_1}


## Model optimization 

#### Base model


In [8]:
# Baseline classifier pipeline: TF-IDF features feeding a linear SVM.
# Several of these starting values are overridden by the grid search below.
tfidf_options = {
    "analyzer": "word",
    "lowercase": False,
    "ngram_range": (1, 2),  # unigrams and bigrams
    "min_df": 5,
    "max_features": 1500,
    "norm": "l2",
}
vectorizer = TfidfVectorizer(**tfidf_options)

svc_options = {
    "C": 1.1,
    "tol": 1e-6,
    "max_iter": 1000,
    "penalty": "l2",
    "loss": "squared_hinge",
    "dual": True,
    # NOTE(review): scikit-learn documents that penalty, loss and dual are
    # ignored when multi_class="crammer_singer" -- confirm this is intended.
    "multi_class": "crammer_singer",
    "fit_intercept": True,
    "intercept_scaling": 1,
    "class_weight": class_weight,
    "random_state": 42,
}
classifier = LinearSVC(**svc_options)

# Step names are referenced by the "<step>__<param>" keys of the grid search
ml_pipe = Pipeline(steps=[("vectorizer", vectorizer), ("classifier", classifier)])




#### Grid search

In [9]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score


In [10]:
# Candidate class weights: the weights computed above, plus a variant that
# scales the weight of class 1 by 1.1.
weights_list = [
    {0: weight_for_0, 1: weight_for_1 * boost} for boost in (1.0, 1.1)
]

# Search space, keyed as "<pipeline step>__<parameter>"
grid_params = dict(
    classifier__C=[1.1],
    classifier__class_weight=weights_list,
    vectorizer__analyzer=["word", "char_wb"],
    vectorizer__ngram_range=[(1, 2), (1, 5)],
    vectorizer__max_features=[1500, 2500],
    vectorizer__sublinear_tf=[True],
    vectorizer__min_df=[1, 2, 3],
)


In [11]:
# Exhaustive parameter search over grid_params, scored by weighted F1 on the
# k-fold splits defined earlier.
grid = GridSearchCV(
    estimator=ml_pipe,
    param_grid=grid_params,
    scoring="f1_weighted",
    cv=k_fold,
).fit(X_train, y_train)

# Report the best cross-validated score and the pipeline that achieved it
for caption, value in (
    ("Best Score:  ", grid.best_score_),
    ("Best Estimator:  ", grid.best_estimator_),
):
    print(caption, value)


Best Score:   0.7042666070483065
Best Estimator:   Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(analyzer='char_wb', lowercase=False,
                                 max_features=2500, min_df=2,
                                 ngram_range=(1, 5), sublinear_tf=True)),
                ('classifier',
                 LinearSVC(C=1.1,
                           class_weight={0: 0.7298340961098398,
                                         1: 1.5877411325451152},
                           multi_class='crammer_singer', random_state=42,
                           tol=1e-06))])


## Experiment

In [12]:
# Register the best pipeline found by the grid search as a tracked experiment
lab = experiment(
    model=grid.best_estimator_,
    model_name="Linear SVC",
    exp_name="Hate Speech",
)

# Run the experiment and collect its predictions
# (presumably for X_test, given predictions=True -- verify against src.experiment.tracking)
y_pred = lab.run(X_train, y_train, X_test, y_test, predictions=True)

# Per-class and averaged metrics, shown as a table (last expression of the cell)
report = classification_report(y_test, y_pred, output_dict=True)
pd.DataFrame(report)


[MLFLOW] [START] server already running
[MLFLOW][EXECUTION] running experiment
[MLFLOW][FINISHED] experiment executed successfully


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.820728,0.547619,0.719577,0.684174,0.734509
recall,0.755155,0.642458,0.719577,0.698806,0.719577
f1-score,0.786577,0.59126,0.719577,0.688918,0.724916
support,388.0,179.0,0.719577,567.0,567.0
