# Import

Import all packages that will be used

In [1]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import sklearn
import joblib
import time
from string import punctuation

In [2]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, f1_score, plot_roc_curve, make_scorer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.utils.class_weight import compute_class_weight
import lightgbm as lgb

import nltk
from nltk.corpus import stopwords

# Hook tqdm into pandas so progress-bar variants of apply become available.
tqdm.pandas()

# Load and Split Dataset

Load and split dataset:
- Make each processing step as a function.
- Make main function.
- Make "tunable" parameters.

In [3]:
def split_xy(df, x_col, y_col):
    """
    Separate the predictor and target columns of a dataframe.

    The 'id' column is placed in front of both selections, so each returned
    frame can still be matched row by row.

    Args:
    - df(DataFrame): initial input dataframe
    - x_col(list): List of x variable columns
    - y_col(list): List of y variable columns

    Returns:
    - DataFrame: 'id' followed by the x columns
    - DataFrame: 'id' followed by the y columns
    """
    id_col = ['id']
    # Build new lists so the caller's x_col / y_col are never mutated.
    predictor_cols, target_cols = id_col + x_col, id_col + y_col
    return df[predictor_cols], df[target_cols]


def get_stratify_col(y, stratify_col):
    """
    Pick the column that serves as the stratification reference.

    Args:
    - y(DataFrame): DataFrame contains target variables and id
    - stratify_col(str): column name of the reference column, or None to
      disable stratification.

    Returns:
    - stratification: the selected column of y, or None when stratify_col
      is None
    """
    return None if stratify_col is None else y[stratify_col]


def run_split_data(x, y, stratify_col=None, TEST_SIZE=0.2):
    """
    Split x and y into train, validation and test parts.

    Args:
    - x(DataFrame): DataFrame contains predictor variables and id
    - y(DataFrame): DataFrame contains target variables and id
    - stratify_col(str): column name of the reference column, or None for a
      non-stratified split.
    - TEST_SIZE(float): fraction of the data given to the validation set and,
      separately, to the test set.

    Returns:
    - x_train, y_train, x_valid, y_valid, x_test, y_test (in this order)
    """
    # First cut: hold out 2 * TEST_SIZE of the rows, to be shared between the
    # validation and the test sets.
    x_train, x_holdout, y_train, y_holdout = train_test_split(
        x, y,
        stratify=get_stratify_col(y, stratify_col),
        test_size=TEST_SIZE * 2,
        random_state=42,
    )

    # Second cut: halve the hold-out into validation and test.
    x_valid, x_test, y_valid, y_test = train_test_split(
        x_holdout, y_holdout,
        stratify=get_stratify_col(y_holdout, stratify_col),
        test_size=0.5,
        random_state=42,
    )

    return x_train, y_train, x_valid, y_valid, x_test, y_test

### Main Function

Main function for loading and splitting dataset. It runs:
- X & Y split
- Train-Valid-Test split with stratification

In [4]:
def main_load(params):
    """
    Load the raw CSV, split it, and persist every split as a pickle.

    Args:
    - params(dict): needs the keys 'file_loc', 'x_col', 'y_col', 'stratify',
      'test_size' and 'out_path'.

    Returns:
    - x_train, y_train, x_valid, y_valid, x_test, y_test (in this order)
    """
    raw = pd.read_csv(params["file_loc"])
    x_all, y_all = split_xy(raw, [params['x_col']], [params['y_col']])
    x_train, y_train, x_valid, y_valid, x_test, y_test = run_split_data(
        x_all, y_all, params['stratify'], params['test_size'])

    # Dump the splits in the same order as they are returned.
    outputs = (
        (x_train, "x_train.pkl"),
        (y_train, "y_train.pkl"),
        (x_valid, "x_valid.pkl"),
        (y_valid, "y_valid.pkl"),
        (x_test, "x_test.pkl"),
        (y_test, "y_test.pkl"),
    )
    for part, file_name in outputs:
        joblib.dump(part, params["out_path"]+file_name)

    return x_train, y_train, x_valid, y_valid, x_test, y_test

In [5]:
# Settings read by main_load: input CSV location, predictor and target column
# names, stratification column, output directory, and the hold-out fraction
# passed to run_split_data.
params = {'file_loc': '../data/comments_data.csv', 
          'x_col':'comment_text', 
          'y_col':'toxic', 
          'stratify': 'toxic',
          'out_path': "../output/",
          'test_size':0.2}

In [6]:
# Load the CSV, split it into train/valid/test and pickle every split.
x_train, y_train, x_valid, y_valid, x_test, y_test = main_load(params)

# Data Preprocessing
Data preprocessing:
- Make each processing step as a function.
- Make main function.
- Make parameters for preprocessing on/off for experimentation.

In [7]:
def lowercase_char(df_in, do=True):
    """
    Lowercase the 'comment_text' column.

    Works on a copy, so df_in is left untouched. When do is False the copy
    is returned unchanged.
    """
    out = df_in.copy()
    if not do:
        return out
    out['comment_text'] = out['comment_text'].str.lower()
    return out

def phrase_decontraction(phrase):
    """
    Expand English contractions in a phrase (e.g. "won't" -> "will not").

    The rules are applied one after another, in the order listed, so the
    two specific cases are handled before the general "n't" rule.
    """
    rules = (
        # specific
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, expansion in rules:
        phrase = re.sub(pattern, expansion, phrase)
    return phrase

def decontract(df_in, do=True):
    """
    Expand contractions in every row of the 'comment_text' column.

    Works on a copy, so df_in is left untouched. When do is False the copy
    is returned unchanged.
    """
    out = df_in.copy()
    if not do:
        return out
    out['comment_text'] = out['comment_text'].apply(phrase_decontraction)
    return out

def remove_numbers(df_in, do=True):
    """
    Drop every digit character from the 'comment_text' column.

    Works on a copy, so df_in is left untouched. When do is False the copy
    is returned unchanged.
    """
    def strip_digits(text):
        # Keep each character for which str.isdigit() is False.
        kept = [char for char in text if not char.isdigit()]
        return ''.join(kept)

    out = df_in.copy()
    if do:
        out['comment_text'] = out['comment_text'].apply(strip_digits)
    return out

def remove_punc(df_in, do=True):
    """
    Replace every punctuation character in 'comment_text' with a space.

    Works on a copy, so df_in is left untouched. When do is False the copy
    is returned unchanged.
    """
    df = df_in.copy()  # Avoid modifying the main dataframe
    if do:
        # Escape the punctuation before placing it in a character class:
        # unescaped, the sequence \] is read as an escaped ']' and the
        # backslash itself would never be matched.
        punc_pattern = f'[{re.escape(punctuation)}]'
        df['comment_text'] = df['comment_text'].str.replace(punc_pattern, ' ', regex=True)
    return df

def remove_whitespace(df_in, do=True):
    """
    Collapse whitespace runs in 'comment_text' to single spaces and trim
    the ends.

    Works on a copy, so df_in is left untouched. When do is False the copy
    is returned unchanged.
    """
    def squeeze(text):
        words = text.split()  # split() without arguments drops all whitespace runs
        return " ".join(words)

    out = df_in.copy()
    if not do:
        return out
    out['comment_text'] = out['comment_text'].apply(squeeze)
    return out

def remove_stop(df_in, eng_stopwords, do=True):
    """
    Remove stop words from the 'comment_text' column.

    Each text is tokenized with nltk.word_tokenize, tokens found in
    eng_stopwords are dropped, and the rest are joined with single spaces.

    Args:
    - df_in(DataFrame): Input dataframe with a 'comment_text' column
    - eng_stopwords(iterable of str): words to remove
    - do(bool): when False, an unchanged copy of df_in is returned

    Returns:
    - df(DataFrame): copy of df_in with stop words removed
    """
    df = df_in.copy()  # Avoid modifying the main dataframe
    if do:
        # Build the lookup once: set membership is O(1), whereas scanning the
        # stop-word list for every token is linear in the list length.
        stop_set = set(eng_stopwords)
        df['comment_text'] = df['comment_text'].apply(
            lambda x: " ".join([word for word in nltk.word_tokenize(x) if word not in stop_set]))
    return df

Make preprocess function which executes each of the preprocessing steps

In [8]:
def preprocess(df_in, params):
    """
    Run the text-cleaning steps in a fixed order.

    Each step is switched on or off by its flag in params.

    Args:
    - df_in(DataFrame): Input dataframe
    - params(dict): preprocessing parameters; needs the keys 'lowercase',
      'decontract', 'remove_num', 'remove_punc', 'remove_space' and
      'remove_stop'.

    Return:
    - df(DataFrame): preprocessed data
    """
    eng_stopwords = stopwords.words('english')

    # Steps that share the (df, flag) signature, in execution order.
    # Decontraction runs before punctuation removal, which would otherwise
    # strip the apostrophes it relies on.
    flagged_steps = (
        (lowercase_char, 'lowercase'),
        (decontract, 'decontract'),
        (remove_numbers, 'remove_num'),
        (remove_punc, 'remove_punc'),
        (remove_whitespace, 'remove_space'),
    )

    cleaned = df_in.copy()
    for step, flag in flagged_steps:
        cleaned = step(cleaned, params[flag])

    # Stop-word removal goes last and needs the word list as an extra argument.
    return remove_stop(cleaned, eng_stopwords, params['remove_stop'])

Main function: runs the preprocessing on each of the train, valid, and test datasets.

In [9]:
def main_prep(x_train,x_valid,x_test, params):
    """
    Preprocess the train, valid and test predictors and pickle each result.

    Args:
    - x_train, x_valid, x_test(DataFrame): raw predictor splits
    - params(dict): preprocessing flags plus 'out_path'

    Returns:
    - list: preprocessed [train, valid, test] dataframes
    """
    x_preprocessed = [preprocess(part, params)
                      for part in tqdm([x_train, x_valid, x_test])]

    split_names = ['train','valid','test']
    for split_name, part in tqdm(zip(split_names, x_preprocessed)):
        joblib.dump(part, f"{params['out_path']}x_{split_name}_preprocessed.pkl")

    return x_preprocessed

Create a params variable containing a dictionary that controls which preprocessing steps are turned on or off.

In [10]:
# One on/off flag per preprocessing step (read by preprocess), plus the
# directory where main_prep stores the preprocessed splits.
params_preprocess = { 'lowercase': True, 
                      'decontract':True, 
                      'remove_num':True, 
                      'remove_punc': True, 
                      'remove_space': True, 
                      'remove_stop': True, 
                      'out_path': "../output/"}

In [11]:
# Preprocess all three splits; the result is the list [train, valid, test].
x_preprocessed_list = main_prep(x_train,x_valid,x_test,params_preprocess)

100%|██████████| 3/3 [01:56<00:00, 38.97s/it]
3it [00:00,  6.24it/s]


# Feature Engineering

Create functions that perform feature engineering. In the current project, we only use TF-IDF, without any additional feature-creation or feature-selection step.

In [12]:
def vectorize_tfidf(df_in, params, vectorizer=None):
    """
    Turn the 'comment_text' column into TF-IDF feature columns.

    Args:
    - df_in(DataFrame): Input data with a 'comment_text' column
    - params(dict): Vectorizer parameters. 'min_df' and 'vectorizer_file'
      are read when a new vectorizer is fitted; 'out_path' is optional and
      defaults to "../output/".
    - vectorizer(callable): tfidf vectorizer, default to None.
      If None, a new vectorizer is fitted on df_in and pickled; otherwise
      the given vectorizer is only used to transform df_in.

    Returns:
    - df_final(DataFrame): TF-IDF columns followed by the remaining columns
      of df_in, sharing df_in's index
    - vectorizer: the vectorizer that was fitted or passed in
    """
    df = df_in.copy()
    if vectorizer is None:  # fit to train data
        vectorizer = TfidfVectorizer(
            analyzer='word',
            stop_words='english',
            min_df = params['min_df']
        )
        vectorized = vectorizer.fit_transform(df['comment_text'])
        # Honour the configured output directory; fall back to the previous
        # hard-coded location when the caller does not provide one.
        out_path = params.get('out_path', '../output/')
        joblib.dump(vectorizer, f"{out_path}{params['vectorizer_file']}.pkl")
    else:
        vectorized = vectorizer.transform(df['comment_text'])

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and only fall back on older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # NOTE: toarray() builds a dense matrix; memory grows with rows * vocabulary.
    vectorized_df = pd.DataFrame(vectorized.toarray(),
                                 columns=feature_names,
                                 index = df.index)
    df_non_sentence = df.drop(['comment_text'],axis=1)
    df_final = pd.concat([vectorized_df, df_non_sentence],axis=1)
    return df_final, vectorizer

Main function for executing feature engineering

In [13]:
def main_feat(x_preprocessed_list, params):
    """
    Vectorize the preprocessed splits and pickle the results.

    The vectorizer is fitted on the train split only and then reused to
    transform the valid and test splits.

    Args:
    - x_preprocessed_list(list): preprocessed [train, valid, test] dataframes
    - params(dict): vectorizer parameters plus 'out_path'

    Returns:
    - df_train_vect, df_valid_vect, df_test_vect(DataFrame)
    """
    train_part, valid_part, test_part = x_preprocessed_list

    df_train_vect, fitted_vectorizer = vectorize_tfidf(train_part, params)
    df_valid_vect = vectorize_tfidf(valid_part, params, fitted_vectorizer)[0]
    df_test_vect = vectorize_tfidf(test_part, params, fitted_vectorizer)[0]

    joblib.dump(df_train_vect, f"{params['out_path']}x_train_vect.pkl")
    joblib.dump(df_valid_vect, f"{params['out_path']}x_valid_vect.pkl")
    joblib.dump(df_test_vect, f"{params['out_path']}x_test_vect.pkl")

    return df_train_vect, df_valid_vect, df_test_vect

In [14]:
# Vectorizer settings: min_df is forwarded to TfidfVectorizer,
# 'vectorizer_file' is the pickle base name for the fitted vectorizer, and
# 'out_path' is where main_feat stores the vectorized splits.
param_vec = {'min_df':0.01, 
             'vectorizer_file': 'vectorizer', 
             'out_path': "../output/"}

In [15]:
# Fit TF-IDF on the train split and transform all three splits.
x_train_vect, x_valid_vect, x_test_vect = main_feat(x_preprocessed_list, param_vec)



# Modeling

Functions to initiate classifier models

In [None]:
def model_logreg(class_weight = None):
    """
    Build an unfitted Logistic Regression and its hyperparameter grid.

    Args:
    - class_weight: forwarded to LogisticRegression (None means no weighting)

    Returns:
    - dict: candidate values for 'C'
    - LogisticRegression: the untrained base model
    """
    estimator = LogisticRegression(random_state=42, solver='liblinear', class_weight=class_weight)
    search_space = {'C' : [0.25, 0.5, 1]}

    return search_space, estimator

def model_rf(class_weight = None):
    """
    Build an unfitted Random Forest and its hyperparameter grid.

    Args:
    - class_weight: forwarded to RandomForestClassifier (None means no weighting)

    Returns:
    - dict: candidate values for 'n_estimators'
    - RandomForestClassifier: the untrained base model
    """
    estimator = RandomForestClassifier(random_state=42, n_jobs=-1, class_weight=class_weight)
    search_space = {'n_estimators' : [25, 50, 100]}

    return search_space, estimator

def model_lgb(class_weight = None):
    """
    Build an unfitted LightGBM classifier and its hyperparameter grid.

    Args:
    - class_weight: forwarded to LGBMClassifier (None means no weighting)

    Returns:
    - dict: candidate values for 'n_estimators' and 'boosting_type'
    - LGBMClassifier: the untrained base model
    """
    estimator = lgb.LGBMClassifier(random_state=42, n_jobs=-1, class_weight=class_weight)
    search_space = {'n_estimators' : [25, 50, 100], 'boosting_type':['gbdt', 'goss']}

    return search_space, estimator

Base function for hyperparameter search and classifier calibration

In [None]:
def random_search_cv(model, param, scoring, n_iter, x, y, verbosity=0):
    """
    Run a 5-fold randomized hyperparameter search and return the fitted search.

    Args:
    - model: estimator to tune
    - param(dict): parameter distributions to sample from
    - scoring: scoring name or scorer passed to RandomizedSearchCV
    - n_iter(int): number of sampled parameter settings
    - x, y: training features and labels
    - verbosity(int): verbosity level of the search

    Returns:
    - RandomizedSearchCV: the search object after fitting on (x, y)
    """
    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param,
        n_iter=n_iter,
        scoring=scoring,
        cv=5,
        verbose=verbosity,
        random_state=42,
    )
    search.fit(x, y)
    return search

def calibrate_classifier(model, x_valid, y_valid):
    """
    Wrap an already fitted model in a probability calibrator.

    The calibrator is created with cv='prefit' and fitted on the validation
    data passed in.

    Returns:
    - CalibratedClassifierCV: the calibrator after fitting
    """
    calibrator = CalibratedClassifierCV(model, cv='prefit')
    calibrator.fit(x_valid, y_valid)

    return calibrator

In [None]:
def tune_threshold(model, x_valid, y_valid, scorer):
    """
    Find the decision threshold that maximises a metric on validation data.

    The thresholds 0.00, 0.01, ..., 1.00 are tried. A sample is predicted as
    class 1 when its positive-class probability is strictly greater than the
    threshold.

    Args:
        - model(callable): fitted model exposing predict_proba
        - x_valid(DataFrame): validation features
        - y_valid(DataFrame): validation ground-truth labels
        - scorer(callable): Sklearn scorer function with the signature
          scorer(y_true, y_pred, average=...), for example: f1_score

    Returns:
    - metric_score(float): Maximum metric score
    - best_threshold(float): Best threshold value (the lowest one when
      several thresholds reach the maximum)
    """
    thresholds = np.linspace(0, 1, 101)
    proba = np.asarray(model.predict_proba(x_valid))[:, 1]

    # Ground truth goes first, predictions second. The reverse order only
    # happens to give the same value for symmetric metrics such as F1.
    scores = np.array([
        scorer(y_valid, np.where(proba > threshold_value, 1, 0), average='macro')
        for threshold_value in thresholds
    ], dtype=float)

    best_idx = int(np.argmax(scores))  # argmax returns the first maximum
    return scores[best_idx], thresholds[best_idx]

def select_model(train_log_dict):
    """
    Pick the entry with the highest 'model_score' from the training log.

    When several models share the top score, the first one wins.

    Returns:
    - best_model, best_report, best_threshold, name: the 'model_fit',
      'model_report', 'threshold' and 'model_name' entries of the winner
    """
    scores = train_log_dict['model_score']
    winner = scores.index(max(scores))

    wanted_fields = ('model_fit', 'model_report', 'threshold', 'model_name')
    return tuple(train_log_dict[field][winner] for field in wanted_fields)

In [None]:
def classif_report(model_obj, x_test, y_test, best_threshold=None, calc_auc=True):
    """
    Predict on a dataset and build a classification report for it.

    Args:
    - model_obj: fitted model exposing predict and predict_proba
    - x_test, y_test: features and ground-truth labels to evaluate on
    - best_threshold(float): when given, class 1 is predicted where the
      positive-class probability is strictly greater than it; when None the
      model's own predict() is used
    - calc_auc(bool): when True, also compute the ROC AUC and print it
      together with the macro F1

    Returns:
    - pred: predicted labels
    - res(DataFrame): transposed classification report
    """
    # presumably the report's class keys are the strings '0' and '1' — TODO confirm
    code2rel = {'0': 'Non-Toxic', '1': 'Toxic'}

    if best_threshold is not None:
        positive_proba = model_obj.predict_proba(x_test)[:, 1]
        pred = np.where(positive_proba > best_threshold, 1, 0)
    else:
        pred = model_obj.predict(x_test)

    report_dict = classification_report(
        y_test, pred, output_dict=True, zero_division=0)
    res = pd.DataFrame(report_dict).rename(columns=code2rel).T

    if calc_auc:
        auc_score = roc_auc_score(y_test, model_obj.predict_proba(x_test)[:, 1])

        print(
            f"AUC score: {auc_score}, F1-Macro: {res['f1-score']['macro avg']}")
    return pred, res

Create a wrapper function to fit and validate the model:
- Fit : Performs hyperparameter optimization for each model
- Validate: Calibrate model, tune model threshold, and validate model.

In [None]:
def fit(x_train, y_train, model, model_param, scoring='f1', n_iter=3, verbosity=3):
    """
    Tune a model with a randomized search and report the best candidate.

    Args:
        - x_train, y_train: training features and labels
        - model(callable): sklearn model
        - model_param(dict): sklearn's RandomizedSearchCV params_distribution
        - scoring: scoring name or scorer used by the search
        - n_iter(int): number of sampled parameter settings
        - verbosity(int): verbosity level of the search

    Return:
        - model_fitted(callable): the fitted search object returned by
          random_search_cv
    """
    model_fitted = random_search_cv(model=model,
                                    param=model_param,
                                    scoring=scoring,
                                    n_iter=n_iter,
                                    x=x_train,
                                    y=y_train,
                                    verbosity=verbosity)
    summary = f'Model: {model_fitted.best_estimator_}, {scoring}: {model_fitted.best_score_}'
    print(summary)

    return model_fitted

def validate(x_valid, y_valid, model_fitted, tune = True):
    """
    Calibrate a fitted model on the validation data and evaluate it.

    Args:
        - x_valid(DataFrame): Validation independent variables
        - y_valid(DataFrame): Validation Dependent variables
        - model_fitted(callable): Sklearn / imblearn fitted model
        - tune(bool): when True, search for the decision threshold with the
          best macro F1; when False, evaluate with the model's default
          predict()

    Return:
        - report_model: sklearn model report
        - model_calibrated(callable): Calibrated model
        - best_threshold(float): Best threshold, or None when tune is False
    """
    # Reuse the shared helper. It passes the estimator positionally, which
    # works both with scikit-learn releases that name the argument
    # `base_estimator` and with those that name it `estimator`.
    model_calibrated = calibrate_classifier(model_fitted, x_valid, y_valid)

    if tune:
        metric_score, best_threshold = tune_threshold(model_calibrated,
                                                      x_valid,
                                                      y_valid,
                                                      f1_score)

        print(f'Best threshold is: {best_threshold}, with score: {metric_score}')
        _, report_model = classif_report(model_calibrated,
                                         x_valid,
                                         y_valid,
                                         best_threshold,
                                         True)
    else:
        # Report default. Keyword arguments are required here: a positional
        # True would be taken as best_threshold and every probability would
        # be compared against it.
        best_threshold = None
        _, report_model = classif_report(model_calibrated,
                                         x_valid,
                                         y_valid,
                                         best_threshold=None,
                                         calc_auc=True)

    return report_model, model_calibrated, best_threshold

Main training function that executes the `fit` and `validate` functions:
- Drop the `id` column
- Compute class weight for imbalanced problem
- Create a logging dictionary
- Try each model: Fit and validate each model
- Select best performing model
- Dump model to pickle

In [None]:
def main(x_train, y_train, x_valid, y_valid, params):
    """
    Train, validate and compare all candidate models, then persist the best.

    Steps: drop the 'id' column, optionally compute balanced class weights,
    fit and validate each candidate, select the one with the highest macro
    F1 on the validation data, and pickle the model, its threshold and the
    training log.

    Args:
    - x_train, x_valid(DataFrame): vectorized predictors including 'id'
    - y_train, y_valid(DataFrame): targets including 'id'
    - params(dict): needs the keys 'use_weight', 'verbosity' and 'out_path'

    Returns:
    - best_model: the calibrated model with the highest validation score
    """
    x_train = x_train.drop(columns='id')
    y_train = y_train.drop(columns='id')
    x_valid = x_valid.drop(columns='id')
    y_valid = y_valid.drop(columns='id')

    y_train = y_train.values.ravel()
    y_valid = y_valid.values.ravel()

    # Add class weight
    if params['use_weight']:
        classes = np.unique(y_train)
        weights = compute_class_weight(class_weight = 'balanced',
                                       classes = classes,
                                       y = y_train)
        class_weights = dict(zip(classes, weights))
    else:
        class_weights = None

    # Initiate logs. The factories are referenced directly so that no local
    # name shadows the imported `lgb` module.
    train_log_dict = {'model': [model_logreg, model_rf, model_lgb],
                      'model_name': [],
                      'model_fit': [],
                      'model_report': [],
                      'model_score': [],
                      'threshold': [],
                      'fit_time': []}

    # The scorer is identical for every candidate, so build it once.
    scoring = make_scorer(f1_score,average='macro')

    # Try each model
    for model in train_log_dict['model']:
        param_model, base_model = model(class_weights)
        train_log_dict['model_name'].append(base_model.__class__.__name__)
        print(f'Fitting {base_model.__class__.__name__}')

        # Train
        t0 = time.time()
        fitted_model = fit(
            x_train, y_train, base_model, param_model,
            scoring=scoring, verbosity=params['verbosity'])
        elapsed_time = time.time() - t0
        print(f'elapsed time: {elapsed_time} s \n')
        train_log_dict['fit_time'].append(elapsed_time)

        # Validate
        report, calibrated_model, best_threshold = validate(
            x_valid, y_valid, fitted_model)
        train_log_dict['model_fit'].append(calibrated_model)
        train_log_dict['threshold'].append(best_threshold)
        train_log_dict['model_report'].append(report)
        train_log_dict['model_score'].append(report['f1-score']['macro avg'])

    best_model, best_report, best_threshold, name = select_model(
        train_log_dict)
    print(
        f"Model: {name}, Score: {best_report['f1-score']['macro avg']}")

    # Write every artefact to the configured output directory. For
    # out_path "../model/" these are the same files as before.
    joblib.dump(best_model, params['out_path']+'mantab_model.pkl')
    joblib.dump(best_threshold, params['out_path']+'threshold.pkl')
    joblib.dump(train_log_dict, params['out_path']+'train_log.pkl')
    print(f'\n {best_report}')

    return best_model


In [None]:
# Training settings read by main: whether to use balanced class weights, the
# verbosity of the hyperparameter search, and the output directory.
param_model={'use_weight':True, 
             'verbosity':2, 
             'out_path': "../model/"}

In [None]:
# Fit, validate and compare every candidate model; keep the best one.
best_model = main(x_train_vect, y_train, x_valid_vect, y_valid, param_model)

Fitting LogisticRegression
Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] END .............................................C=0.25; total time=   0.8s
[CV] END .............................................C=0.25; total time=   0.7s
[CV] END .............................................C=0.25; total time=   0.7s
[CV] END .............................................C=0.25; total time=   0.7s
[CV] END .............................................C=0.25; total time=   0.7s
[CV] END ..............................................C=0.5; total time=   0.7s
[CV] END ..............................................C=0.5; total time=   0.7s
[CV] END ..............................................C=0.5; total time=   0.7s
[CV] END ..............................................C=0.5; total time=   0.7s
[CV] END ..............................................C=0.5; total time=   0.8s
[CV] END ................................................C=1; total time=   0.8s
[CV] END .............

# Prediction

To create a prediction function, you must first know how the data will be passed to the predictor. This often requires agreement with your Backend Engineer, MLOps Engineer, and Project Manager.

Let's assume that the data will be predicted one by one.

In [None]:
# In the previous preprocessing, we work with DataFrame.
# It'll be easier for us to also work with DataFrame in the prediction stage

def df_constructor(text, id=0):
    """
    Wrap a single text in a one-row DataFrame with the columns 'id' and
    'comment_text', i.e. the layout the preprocessing functions work on.
    """
    record = {'id':[id], 'comment_text':[text]}
    return pd.DataFrame(data=record)

In [None]:
def main_predict(text, tfidf_vectorizer, model, threshold, param_preprocess, param_vec, id=0):
    """
    Classify a single text as 'Toxic' or 'Non-Toxic'.

    The text goes through the same preprocessing and TF-IDF transformation
    as the training data. It is labelled 'Toxic' when the positive-class
    probability is strictly greater than threshold.

    Args:
    - text(str): raw comment
    - tfidf_vectorizer: fitted vectorizer
    - model: fitted model exposing predict_proba
    - threshold(float): decision threshold
    - param_preprocess(dict): preprocessing flags
    - param_vec(dict): vectorizer parameters
    - id: identifier stored in the temporary DataFrame

    Returns:
    - str: 'Toxic' or 'Non-Toxic'
    - proba: positive-class probability as returned by predict_proba
    """
    cleaned = preprocess(df_constructor(text, id), param_preprocess)
    features, _ = vectorize_tfidf(cleaned, param_vec, tfidf_vectorizer)

    # The model was trained without the 'id' column.
    features = features.drop(columns='id')
    proba = model.predict_proba(features)[:, 1]

    label = 'Toxic' if proba > threshold else 'Non-Toxic'
    return label, proba

In [None]:
# Reload the artefacts written by the earlier steps: the fitted vectorizer,
# the selected model and its decision threshold.
# NOTE: joblib.load unpickles its input; only load files you trust.
tfidf_vect = joblib.load("../output/vectorizer.pkl")
model = joblib.load('../model/mantab_model.pkl')
threshold = joblib.load('../model/threshold.pkl')

In [None]:
# Example: run the full prediction pipeline on one abusive comment.
text = "fuck fuck fuck you cunt, you can't do anything but whining, you useless trash piece of shit."
predict, proba = main_predict(text, tfidf_vect, model, threshold, params_preprocess, param_vec)



In [None]:
predict

'Toxic'