In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas  as pd
import numpy as np
from scipy.sparse import hstack, vstack
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, confusion_matrix, classification_report

from sqlalchemy import create_engine
from joblib import load, dump
from text_preproc_pipeline import preproc
from env import *
from multiprocessing import Pool, cpu_count
import os
import time

In [2]:
def vects(df):
    """Turn the four pre-processed text columns into one sparse feature matrix.

    Each column is transformed by its own pickled vectoriser (presumably
    TF-IDF, judging by the file names), which is loaded from
    ``vectoriser_models/`` on every call. The four results are stacked
    horizontally in the order title, description, sub-category, page link.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``title_proc``, ``desc_proc``,
        ``subcat_proc`` and ``pl_proc``. Missing values are replaced with
        ``''`` before transforming.

    Returns
    -------
    scipy.sparse matrix in CSR format
        The horizontally stacked output of the four vectorisers.
    """
    # (input column, pickle file stem); the order here fixes the column layout
    # of the returned matrix, so it must not be changed.
    sources = (
        ('title_proc', 'title'),
        ('desc_proc', 'desc'),
        ('subcat_proc', 'subcat'),
        ('pl_proc', 'pl'),
    )

    blocks = []
    for column, stem in sources:
        # Forward slashes are valid on Windows and POSIX alike; a backslash in
        # the path would be treated as part of the file name outside Windows.
        vectoriser = load(f'vectoriser_models/{stem}_tfidf_cat5_v4_1.pkl')
        blocks.append(vectoriser.transform(df[column].fillna('')))

    # Ask for CSR directly instead of hstack's default COO result, which does
    # not support row indexing.
    return hstack(blocks, format='csr')

### tokenising the independent variables
def preproc2(df_proc, isbase=False):
    """Pre-process the raw text columns of ``df_proc`` and vectorise them.

    ``preproc`` is applied element-wise to ``title``, ``description``,
    ``shopify_subcategory`` and ``page_link``. The results are stored on
    ``df_proc`` itself as ``title_proc``, ``desc_proc``, ``subcat_proc`` and
    ``pl_proc`` -- the caller's frame is modified in place -- and those four
    columns are then passed to ``vects``.

    Parameters
    ----------
    df_proc : pandas.DataFrame
        Frame holding the four raw text columns listed above.
    isbase : bool, default False
        When true, ``page_link`` is split on ``'/products/'`` and only the
        segment right after the first separator is pre-processed. Rows that
        do not contain the separator contribute ``None``.

    Returns
    -------
    The feature matrix returned by ``vects``.
    """
    df_proc['title_proc'] = df_proc['title'].apply(preproc)
    df_proc['desc_proc'] = df_proc['description'].apply(preproc)
    df_proc['subcat_proc'] = df_proc['shopify_subcategory'].apply(preproc)

    if isbase:
        pieces = df_proc['page_link'].str.split('/products/', expand=True)
        if 1 in pieces.columns:
            link_text = pieces[1]
        else:
            # No row contained the separator, so the expanded frame has no
            # column 1 and indexing it would raise KeyError. Fall back to the
            # same None placeholder a separator-less row gets otherwise.
            link_text = pd.Series([None] * len(df_proc), index=df_proc.index, dtype=object)
    else:
        link_text = df_proc['page_link']
    df_proc['pl_proc'] = link_text.apply(preproc)

    return vects(df_proc[['title_proc', 'subcat_proc', 'desc_proc', 'pl_proc']])
    

In [7]:
def train_models(models, df_main, baseline_df):
    """Fit every candidate classifier, score it, and persist model and scores.

    Parameters
    ----------
    models : iterable of (str, estimator)
        Name / estimator pairs. Each estimator must provide ``fit``,
        ``predict`` and ``predict_proba``. The name is used for the pickle
        file and for the ``model_name`` column of the score table.
    df_main : pandas.DataFrame
        Training frame: the raw text columns read by ``preproc2`` plus the
        label column ``category5_token``.
    baseline_df : pandas.DataFrame
        Baseline frame: the raw text columns read by ``preproc2`` plus the
        label column ``cat5_tokens``.

    Returns
    -------
    None

    Side effects
    ------------
    * ``classification_models/<name>_model.pkl`` is written for each model as
      soon as it has been fitted; the directory is created when missing.
    * ``final_scores.csv`` is rewritten after every model, so rows for the
      models already completed survive an interruption of a later one. With
      an empty ``models`` nothing is written.

    Notes
    -----
    ``Accuracy`` and ``roc_auc_score_ovr`` are computed on the very rows the
    model was fitted on; only ``Baseline_accuracy`` is computed on
    ``baseline_df``.
    """
    X_mat = preproc2(df_main)
    y = df_main['category5_token']
    X_base_mat = preproc2(baseline_df, isbase=True)
    y_base = baseline_df['cat5_tokens']
    # One indicator column per class, used as the target for roc_auc_score.
    y_bin = LabelBinarizer().fit_transform(y)

    model_dir = 'classification_models'
    os.makedirs(model_dir, exist_ok=True)

    score_rows = []
    for name, model in models:
        start = time.perf_counter()
        clf = model.fit(X_mat, y)
        elapsed_min = (time.perf_counter() - start) / 60

        # Save before scoring: a failure in any metric below must not discard
        # a model that may have taken a long time to fit.
        dump(clf, f'{model_dir}/{name}_model.pkl')

        score_rows.append({
            'model_name': name,
            'Accuracy': accuracy_score(y, clf.predict(X_mat)),
            'Baseline_accuracy': accuracy_score(y_base, clf.predict(X_base_mat)),
            'roc_auc_score_ovr': roc_auc_score(y_bin, clf.predict_proba(X_mat), multi_class="ovr"),
            'training_time_min': elapsed_min,
        })
        # Checkpoint the table after each model instead of once at the end.
        pd.DataFrame(score_rows).to_csv('final_scores.csv', index=False)

In [3]:
# Training data for the classifiers; the bare `.shape` is this cell's output.
train_csv_path = 'new_train_set_93k.csv'
df_train = pd.read_csv(train_csv_path)
df_train.shape

(93638, 6)

In [100]:
# The baseline export names its columns differently; rename them so that the
# text columns match what preproc2 reads (page_link, description,
# shopify_subcategory).
baseline_column_map = {
    'id': '_id',
    'store_id': '_store_id',
    'handle': 'page_link',
    'body_html': 'description',
    'product_type': 'shopify_subcategory',
}
baseline_raw = pd.read_csv('baseline_14112022.csv')
base_df = baseline_raw.rename(columns=baseline_column_map)
base_df.shape

(823, 13)

In [8]:
# Candidate classifiers, each built with its library-default settings. The key
# is the name train_models uses for the score row and the pickle file.
model_factories = {
    'naive_bayes': MultinomialNB,
    'decision_tree': DecisionTreeClassifier,
    'random_forest': RandomForestClassifier,
    'xgboost': XGBClassifier,
    'lightgbm': LGBMClassifier,
}
models = [(label, factory()) for label, factory in model_factories.items()]

train_models(models, df_train, base_df)



In [11]:
# Reload the score table written by train_models and display it.
scores_csv_path = 'final_scores.csv'
score_df = pd.read_csv(scores_csv_path)
score_df

Unnamed: 0,model_name,Accuracy,Baseline_accuracy,training_time
0,naive_bayes,0.919819,0.741191,0.048277
1,decision_tree,0.999178,0.918591,11.727274
2,random_forest,0.999178,0.972053,7.843192
3,xgboost,0.999028,0.922236,41.518883
4,lightgbm,0.013563,0.001215,56.002119


In [110]:
# Recompute the one-vs-rest ROC AUC of every saved model on the training
# frame and attach it to the score table.
model_names = ['naive_bayes',
               'decision_tree',
               'random_forest',
               'xgboost',
               'lightgbm'
              ]
# Same '<name>_model.pkl' pattern that train_models uses when saving.
pkl_files = [f'{model_name}_model.pkl' for model_name in model_names]

lb = LabelBinarizer()
X_mat = preproc2(df_train)
y = df_train['category5_token']
y_bin = lb.fit_transform(y)

roc_list = []
roc_by_model = {}
for model_name, pkl_file in zip(model_names, pkl_files):
    model = load(f'classification_models/{pkl_file}')
    y_proba = model.predict_proba(X_mat)
    roc_sc = roc_auc_score(y_bin, y_proba, multi_class='ovr')
    roc_list.append(roc_sc)
    roc_by_model[model_name] = roc_sc

score_df = pd.read_csv('final_scores.csv')
# Align on model_name rather than on row position: a positional assignment
# silently mislabels scores if the CSV rows are in a different order, and
# raises if the row count differs. Models without a score get NaN.
score_df['roc_auc_score_ovr'] = score_df['model_name'].map(roc_by_model)
score_df

KeyboardInterrupt: 