In [3]:
import warnings
warnings.filterwarnings("ignore")

import pandas  as pd
import numpy as np
from scipy.sparse import hstack, vstack
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, confusion_matrix, classification_report

from sqlalchemy import create_engine
import joblib
from joblib import load
from text_preproc_pipeline import preproc
from env import *
from multiprocessing import Pool, cpu_count
import os
import time

In [None]:
def vects(df):
    """Fit one TF-IDF vectorizer per processed text column and stack the results.

    A brand-new ``TfidfVectorizer(max_df=0.3, min_df=0.0001)`` is fitted for
    each of the four columns on every call, so the returned feature space is
    specific to ``df``: matrices from two separate calls are not comparable.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'title_proc', 'desc_proc', 'subcat_proc' and 'pl_proc'.
        Missing values are replaced by the empty string before vectorizing.

    Returns
    -------
    scipy sparse matrix
        The per-column TF-IDF matrices concatenated side by side in the order
        title, description, sub-category, page link.
    """
    text_columns = ('title_proc', 'desc_proc', 'subcat_proc', 'pl_proc')

    blocks = []
    for column in text_columns:
        vectorizer = TfidfVectorizer(max_df=0.3, min_df=0.0001)
        documents = df[column].fillna('')
        blocks.append(vectorizer.fit_transform(documents))

    return hstack(blocks)

def preproc2(df_proc):
    """Preprocess the raw text columns and return their TF-IDF feature matrix.

    Each raw column is passed element-wise through ``preproc`` (imported from
    ``text_preproc_pipeline``; presumably tokenises/normalises the text --
    confirm there) and stored on ``df_proc`` itself under a ``*_proc`` name,
    so the caller's frame gains four columns as a side effect.

    Parameters
    ----------
    df_proc : pandas.DataFrame
        Must contain 'title', 'description', 'shopify_subcategory' and
        'page_link'. Modified in place.

    Returns
    -------
    Whatever ``vects`` returns for the four processed columns.
    """
    # (raw column, processed column), in the order the new columns are added.
    column_map = (
        ('title', 'title_proc'),
        ('description', 'desc_proc'),
        ('shopify_subcategory', 'subcat_proc'),
        ('page_link', 'pl_proc'),
    )
    for raw_col, proc_col in column_map:
        df_proc[proc_col] = df_proc[raw_col].apply(preproc)

    processed_only = df_proc[['title_proc', 'subcat_proc', 'desc_proc', 'pl_proc']]
    return vects(processed_only)
    

In [None]:
# (raw text column, key of its vectorizer), in the order the feature blocks are stacked.
_TEXT_COLUMNS = (
    ('title', 'title_proc'),
    ('description', 'desc_proc'),
    ('shopify_subcategory', 'subcat_proc'),
    ('page_link', 'pl_proc'),
)


def _text_feature_matrix(df, vectorizers=None):
    """Build the stacked TF-IDF matrix for the raw text columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the raw columns named in ``_TEXT_COLUMNS``. Not modified.
    vectorizers : dict or None
        ``None`` fits a new ``TfidfVectorizer`` per column on ``df``. Passing
        the dict returned by an earlier call re-uses those fitted vectorizers,
        so the result has exactly the same columns as the earlier matrix.

    Returns
    -------
    (scipy.sparse CSR matrix, dict)
        The feature matrix and the vectorizers that produced it.
    """
    fit = vectorizers is None
    if fit:
        vectorizers = {key: TfidfVectorizer(max_df=0.3, min_df=0.0001)
                       for _, key in _TEXT_COLUMNS}

    blocks = []
    for raw_col, key in _TEXT_COLUMNS:
        # Same order as the notebook's preproc2/vects: preprocess, then blank out missing values.
        docs = df[raw_col].apply(preproc).fillna('')
        vectorizer = vectorizers[key]
        blocks.append(vectorizer.fit_transform(docs) if fit else vectorizer.transform(docs))

    return hstack(blocks).tocsr(), vectorizers


def train_models(models, df_main, baseline_df):
    """Fit classifiers on ``df_main``, score them, and persist models and scores.

    For every trained model this records the accuracy on the training data
    itself ('Accuracy'), the accuracy on ``baseline_df`` ('Baseline_accuracy')
    and the fit time in minutes ('training_time'). Each fitted model is dumped
    to ``classification_models/<name>_model.pkl`` and the score table is
    written to ``final_scores.csv`` in the working directory.

    Parameters
    ----------
    models : list of (str, estimator)
        Name / unfitted estimator pairs exposing ``fit`` and ``predict``.
    df_main : pandas.DataFrame
        Training rows: the four raw text columns plus 'category5_token'.
    baseline_df : pandas.DataFrame
        Evaluation rows: the four raw text columns plus 'cat5_tokens'.

    Returns
    -------
    pandas.DataFrame
        One row per trained model (the same table written to CSV).
    """
    # The TF-IDF vocabulary is learned from the training frame only and then
    # re-used for the baseline frame. Fitting separate vectorizers per frame
    # (which is what preproc2/vects do on each call) would give the baseline
    # matrix different columns from the ones the model was trained on.
    X_mat, vectorizers = _text_feature_matrix(df_main)
    y = df_main['category5_token']
    X_base_mat, _ = _text_feature_matrix(baseline_df, vectorizers)
    y_base = baseline_df['cat5_tokens']
    # NOTE(review): the fitted vectorizers are not persisted, so a pickled model
    # cannot be applied to new text without re-creating them -- confirm whether
    # they should be saved alongside the models.

    # Make sure the dump target exists before spending time on training.
    os.makedirs('classification_models', exist_ok=True)

    score_rows = []
    # NOTE(review): the [0:1] slice means only the first model is trained. It is
    # kept from the original code; it looks like a debugging leftover -- confirm
    # before widening it to every entry in `models`.
    for name, model in models[0:1]:
        start = time.time()
        clf = model.fit(X_mat, y)
        stop = time.time()

        acc = accuracy_score(y, clf.predict(X_mat))
        base_acc = accuracy_score(y_base, clf.predict(X_base_mat))

        score_rows.append({
            'model_name': name,
            'Accuracy': acc,
            'Baseline_accuracy': base_acc,
            'training_time': (stop - start) / 60,
        })
        joblib.dump(clf, f'classification_models/{name}_model.pkl')

    # Build the table once from plain records; a dict of bare scalars cannot be
    # turned into a DataFrame without an explicit index.
    final_scores = pd.DataFrame(
        score_rows,
        columns=['model_name', 'Accuracy', 'Baseline_accuracy', 'training_time'],
    )
    final_scores.to_csv('final_scores.csv')
    return final_scores

In [None]:
# Training frame handed to train_models as `df_main`; the path is relative to
# the notebook's working directory. The trailing bare `.shape` displays
# (rows, columns) as the cell output.
df_train = pd.read_csv('new_train_set_96k.csv')
df_train.shape

In [6]:
# Baseline frame handed to train_models as `baseline_df` for the
# 'Baseline_accuracy' score; the recorded output of this cell is (823, 13).
base_df = pd.read_csv('baseline_14112022.csv')
base_df.shape

(823, 13)

In [None]:
# Fixed seed so the stochastic estimators give the same result on every
# Restart & Run All.
RANDOM_STATE = 42

# (name, estimator) pairs: the name labels the score row and the pickle file.
# NOTE: train_models iterates `models[0:1]`, so only the first entry is fitted.
models = [('naive_bayes', MultinomialNB()),
          ('decision_tree', DecisionTreeClassifier(random_state=RANDOM_STATE)),
          ('rfc', RandomForestClassifier(random_state=RANDOM_STATE)),
          ('xgboost', XGBClassifier(random_state=RANDOM_STATE)),
          ('lightgbm', LGBMClassifier(random_state=RANDOM_STATE))
        ]

train_models(models, df_train, base_df)