In [9]:
import pandas  as pd
import numpy as np
from scipy.sparse import hstack, vstack
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, confusion_matrix, classification_report

from sqlalchemy import create_engine
from joblib import load
from text_preproc_pipeline import preproc
from env import *
from multiprocessing import Pool, cpu_count
import warnings
warnings.filterwarnings("ignore")

In [2]:
con = create_engine(PRIMARY_DATABASE_URI)


prod_df2 = pd.read_sql("select shopify_product_id, id, title, store_id, category_id, category2_id, category3_id, category5_id, shopify_subcategory, description, page_link, openai_summary, price, is_deleted from products where is_deleted = False ", con=con)
# cat3_df = pd.read_sql("select id, display_name, token_name from category3",con=con).rename(columns={"id":"category3_id","display_name":"category3"})

cat5_df = pd.read_csv('cat5_df.csv')
# prod_df = prod_df.merge(cat1_df).merge(cat2_df).merge(cat5_df)
# prod_df = prod_df.merge(cat3_df).merge(cat5_df)
store_df = pd.read_sql("select id, store_name from brands", con=con).rename(columns={"id":"store_id"})

prod_df2 = prod_df2.merge(store_df, on=['store_id'], how='left')
prod_df2 = prod_df2[~(prod_df2.store_id == 581)]
prod_df2 = prod_df2[~(prod_df2.store_id == 605)]
prod_df2.shape

(30886, 15)

In [3]:
prod_df2.columns

Index(['shopify_product_id', 'id', 'title', 'store_id', 'category_id',
       'category2_id', 'category3_id', 'category5_id', 'shopify_subcategory',
       'description', 'page_link', 'openai_summary', 'price', 'is_deleted',
       'store_name'],
      dtype='object')

In [4]:
prod_df2 = prod_df2.dropna(subset=['category5_id'])
prod_df2['category5_token'] = prod_df2['category5_id'].apply(lambda x: cat5_df[cat5_df['category5_id'] == x]['category5_tokens'].values[0])
prod_df3 = prod_df2[['shopify_product_id', 'title', 'page_link', 'description', 'shopify_subcategory', 'category5_token']]

In [44]:
# df_train = pd.read_csv("cat5_training_set_final_08112022.csv")
df_train = pd.read_json("cat5_training_set_final_08112022.json", orient='split')
df_train.shape

(90292, 6)

In [45]:
df_train2 = pd.concat([df_train, prod_df3])
df_train2.shape

(121146, 6)

In [46]:
df_train3 = df_train2.drop_duplicates(subset=['shopify_product_id'], keep='last')
df_train3.shape

(93638, 6)

In [None]:
new_cat_df

In [49]:
# correcting the training set

new_cat_df = pd.read_csv('correct_prod.csv')
to_update_df = df_train3[df_train3['shopify_product_id'].isin(new_cat_df['shopify_product_id'])]
to_update_df['category5_token'] = to_update_df['shopify_product_id'].apply(lambda x: new_cat_df[new_cat_df['shopify_product_id'] == x]['cat5_final'].values)
df_train3 = pd.concat([df_train3, to_update_df])
df_train3.drop_duplicates(subset=['shopify_product_id'], keep='last', inplace=True)

In [18]:
def parallel_proc(df, fn, n_cores=5):
    if cpu_count() < n_cores:
        raise ValueError("The number of CPU's specified exceed the amount available")

    df_list = np.array_split(df, n_cores)
    pool = Pool(n_cores)
    res = pool.map(fn, df_list)
    pool.close()
    pool.join()
    return pd.concat(res)

In [None]:
### tokenising the independent variables

df_train3['title_proc'] = df_train3['title'].apply(lambda x: preproc(x))
df_train3['desc_proc'] = df_train3['description'].apply(lambda x: preproc(x))
df_train3['subcat_proc'] = df_train3['shopify_subcategory'].apply(lambda x: preproc(x))
df_train3['pl_proc'] = df_train3['page_link'].apply(lambda x: preproc(x))

In [10]:
### creating X (independent variables) and y (dependent variable) matrices

X = df_train3[['title_proc','subcat_proc','desc_proc', 'pl_proc']]

# X = df_train3[['title_proc',','desc_proc', 'pl_proc']]

y = df_train3['category5_token']

In [11]:
### Train/Test Split

X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.8, test_size=0.2, stratify=y, random_state=10)

In [12]:
### Term-Frequency and Inverse document Frequency vectorizer with bigrams

title_vec_model = TfidfVectorizer(max_df=0.3, min_df=0.0001, ngram_range = (1,1), stop_words = ["english"])
desc_vec_model = TfidfVectorizer(max_df=0.3, min_df=0.0001, ngram_range = (1,1), stop_words = ["english"])
subcat_vec_model = TfidfVectorizer(max_df=0.3, min_df=0.0001, ngram_range = (1,1), stop_words = ["english"])
pl_vec_model = TfidfVectorizer(max_df=0.3, min_df=0.0001, ngram_range = (1,1), stop_words = ["english"])

In [13]:
### Fitting the Vectorisers on dependent variables

title_mat_train = title_vec_model.fit_transform(X_train['title_proc'].fillna(''))
desc_mat_train = desc_vec_model.fit_transform(X_train['desc_proc'].fillna(''))
subcat_mat_train = subcat_vec_model.fit_transform(X_train['subcat_proc'].fillna(''))
pl_mat_train = pl_vec_model.fit_transform(X_train['pl_proc'].fillna(''))

In [14]:
X_mat_train = hstack((title_mat_train, desc_mat_train, subcat_mat_train, pl_mat_train))
# X_mat_train = hstack((title_mat_train, desc_mat_train, pl_mat_train))

X_mat_train

<74910x25643 sparse matrix of type '<class 'numpy.float64'>'
	with 4458446 stored elements in Compressed Sparse Row format>

In [15]:
title_mat_test = title_vec_model.transform(X_test['title_proc'].fillna(''))
desc_mat_test = desc_vec_model.transform(X_test['desc_proc'].fillna(''))
subcat_mat_test = subcat_vec_model.transform(X_test['subcat_proc'].fillna(''))
pl_mat_test = pl_vec_model.transform(X_test['pl_proc'].fillna(''))

In [16]:
X_mat_test = hstack((title_mat_test, desc_mat_test, subcat_mat_test, pl_mat_test))
# X_mat_test = hstack((title_mat_test, desc_mat_test, pl_mat_test))

X_mat_test

<18728x25643 sparse matrix of type '<class 'numpy.float64'>'
	with 1114000 stored elements in Compressed Sparse Row format>

In [17]:
lr_model = LogisticRegression(n_jobs=6)

lr_model.fit(X_mat_train, y_train)

LogisticRegression(n_jobs=6)

In [18]:
yhat = lr_model.predict(X_mat_test)

In [19]:
print("Accuracy Score Is : ", accuracy_score(y_test, yhat))
# print("ROC_Score : " +str(roc_auc_score(y_test, yhat,multi_class="ovr")))
# print('classification Score =','\n', classification_report(y_test,yhat))
# print("Confusion Matrix HeatMap : \n", confusion_matrix(y_test, yhat))

Accuracy Score Is :  0.9679624092268262


### Building the final model

In [20]:
X = pd.concat([X_train,X_test])
y = pd.concat([y_train,y_test])

In [21]:
X

Unnamed: 0,title_proc,subcat_proc,desc_proc,pl_proc
1951419,loungewear lazy lama round green printed botto...,nightsuit,made finest cotton classic never go wrong incl...,loungewear lazy lama round green printed botto...
561168,tot mom organic drink mix almond date jaggery,health supplement,tot mom almond date drink mix nutritious drink...,tot mom organic drink mix almond date jaggery
285244,cricket birthday theme cake,cake,minimum prep time hr theme cake cricket lover ...,cricket birthday theme cake
27364,fostino suspender dark green round neck shirt,shirt,fostino shirt great way add pop color wardrobe...,http fostino com product fostino suspender dar...
14943,floral printed tote bag,luggage bag shopping tote,description introducing extra large suitcase b...,http refash product floral printed tote bag ut...
...,...,...,...,...
17640,carmen carrot dress,dress,description endless summer would amazing fit l...,http refash product carmen carrot dress utm so...
12746,gold plated kundan maang tikka,none,polish gold plated stone kundan maang tikka le...,gold plated kundan maang tikka
13438,tinted solid men crew neck shirt,basic shirt,care instruction mild wash fit type regular fi...,http shoptinted com product tinted solid men c...
12133,yellow silk embroidery lehenga,none,lehenga fabric art silklehenga embroidery thre...,yellow silk embroidery lehenga


In [22]:
title_vec_model_final = TfidfVectorizer(max_df=0.3, min_df=0.0001)
desc_vec_model_final = TfidfVectorizer(max_df=0.3, min_df=0.0001)
subcat_vec_model_final = TfidfVectorizer(max_df=0.3, min_df=0.0001)
pl_vec_model_final = TfidfVectorizer(max_df=0.3, min_df=0.0001)

In [23]:
title_mat = title_vec_model_final.fit_transform(X['title_proc'].fillna(''))
desc_mat = desc_vec_model_final.fit_transform(X['desc_proc'].fillna(''))
subcat_mat = subcat_vec_model_final.fit_transform(X['subcat_proc'].fillna(''))
pl_mat = pl_vec_model_final.fit_transform(X['pl_proc'].fillna(''))

In [24]:
X_mat = hstack((title_mat, desc_mat, subcat_mat, pl_mat))
# X_mat = hstack((title_mat, desc_mat, pl_mat))
X_mat

<93638x25262 sparse matrix of type '<class 'numpy.float64'>'
	with 5570695 stored elements in Compressed Sparse Row format>

In [25]:
lr_model_final = LogisticRegression(n_jobs=6)
lr_model_final.fit(X_mat, y)

LogisticRegression(n_jobs=6)

In [26]:
yhat = lr_model_final.predict(X_mat)
print("Accuracy Score Is : ", accuracy_score(y, yhat))
# print("ROC_Score : " +str(roc_auc_score(y, lr_model_final.predict(X_mat))))
# al.predict_proba(X_mat),multi_class="ovr")))
# print('classification Score =','\n', classification_report(y,yhat))
# print("Confusion Matrix HeatMap : \n", confusion_matrix(y, yhat))

Accuracy Score Is :  0.9902710438070015


### Exporting the model

In [28]:
import joblib

joblib.dump(title_vec_model_final,'vectoriser_models/title_tfidf_cat5_v4_1.pkl')
joblib.dump(desc_vec_model_final,'vectoriser_models/desc_tfidf_cat5_v4_1.pkl')
joblib.dump(subcat_vec_model_final,'vectoriser_models/subcat_tfidf_cat5_v4_1.pkl')
joblib.dump(pl_vec_model_final,'vectoriser_models/pl_tfidf_cat5_v4_1.pkl')
joblib.dump(lr_model_final,'classification_models/logistic_regression_model_cat5_v4_1.pkl')

['classification_models/logistic_regression_model_cat5_v4_1.pkl']

### Testing on baseline

In [184]:
# prod_df = pd.read_csv("df_baseline_trainingset.csv").rename(columns={'id':'_id', 'store_id':'_store_id', 'handle':'page_link', 'body_html':'description', 'product_type' :'shopify_subcategory'})
prod_df = pd.read_csv("baseline_14112022.csv").rename(columns={'id':'_id', 'store_id':'_store_id', 'handle':'page_link', 'body_html':'description', 'product_type' :'shopify_subcategory'})

prod_df.shape

(823, 13)

In [185]:
prod_df['title_proc'] = prod_df['title'].apply(lambda x: preproc(x))
prod_df['desc_proc'] = prod_df['description'].apply(lambda x: preproc(x))
prod_df['subcat_proc'] = prod_df['shopify_subcategory'].apply(lambda x: preproc(x))
prod_df['pl_proc'] = prod_df['page_link'].str.split('/products/',expand=True)[1].apply(lambda x: preproc(x))

In [186]:
X_df_rt = prod_df[['title_proc','subcat_proc','desc_proc', 'pl_proc']]
# X_df_rt = prod_df[['title_proc','desc_proc', 'pl_proc']]

In [187]:
title_mat_df_rt = title_vec_model_final.transform(X_df_rt['title_proc'].fillna(''))
desc_mat_df_rt = desc_vec_model_final.transform(X_df_rt['desc_proc'].fillna(''))
pl_mat_df_rt= pl_vec_model_final.transform(X_df_rt['pl_proc'].fillna(''))
subcat_mat_df_rt= subcat_vec_model_final.transform(X_df_rt['subcat_proc'].fillna(''))

X_mat_prod_df = hstack((title_mat_df_rt, desc_mat_df_rt, subcat_mat_df_rt, pl_mat_df_rt))
# X_mat_prod_df = hstack((title_mat_df_rt, desc_mat_df_rt, pl_mat_df_rt))

X_mat_prod_df

<823x25262 sparse matrix of type '<class 'numpy.float64'>'
	with 50290 stored elements in Compressed Sparse Row format>

In [190]:
prod_df['cat5_pred'] = lr_model_final.predict(X_mat_prod_df)
prod_df['predict_proba'] =  lr_model_final.predict_proba(X_mat_prod_df).max(1)

In [192]:
print("Accuracy Score Is : ", accuracy_score(prod_df['cat5_tokens'], lr_model_final.predict(X_mat_prod_df)))

Accuracy Score Is :  0.8918590522478737


In [195]:
#  df_review = prod_df[prod_df['cat5_tokens'] != prod_df['cat5_pred']]
#  df_review.to_csv('prod_df_review.csv')

In [None]:
# prod_df['cat5_predict'] = lr_model_final.predict(X_mat_prod_df)
# prod_df

In [None]:
# prod_df['check'] = prod_df['cat5_tokens'] == prod_df['cat5_predict']
# prod_df['cat5_tokens'] == prod_df['cat5_predict']

In [None]:
# prod_df[prod_df.check == False]

In [None]:
# prod_df[prod_df.check == False].to_clipboard(index=False)

In [None]:
# prod_df[prod_df.check == True].to_clipboard(index=False)

In [None]:
# ## Loading prev cat5 Models

# latest_title_vec_model = load('/Users/apoorvatiwari/Documents/Apps_RecSy/downtown-product-classifcation/cat_level3/Old_Files/models_Backup/08112022/vectoriser_models/title_tfidf_cat5_gd_ct_5.pkl')
# latest_desc_vec_model = load('/Users/apoorvatiwari/Documents/Apps_RecSy/downtown-product-classifcation/cat_level3/Old_Files/models_Backup/08112022/vectoriser_models/desc_tfidf_cat5_gd_ct_5.pkl')
# latest_pl_vec_model = load('/Users/apoorvatiwari/Documents/Apps_RecSy/downtown-product-classifcation/cat_level3/Old_Files/models_Backup/08112022/vectoriser_models/pl_tfidf_cat5_gd_ct_5.pkl')
# # latest_subcat_vec_model = load('vectoriser_models/subcat_tfidf_cat5_gd_ct.pkl')
# latest_lr_model = load('/Users/apoorvatiwari/Documents/Apps_RecSy/downtown-product-classifcation/cat_level3/Old_Files/models_Backup/08112022/classification_models/logistic_regression_model_cat5_gd_ct_5.pkl')

In [None]:
title_mat_df_rt = latest_title_vec_model.transform(X_df_rt['title_proc'].fillna(''))
desc_mat_df_rt = latest_desc_vec_model.transform(X_df_rt['desc_proc'].fillna(''))
pl_mat_df_rt= latest_pl_vec_model.transform(X_df_rt['pl_proc'].fillna(''))
# subcat_mat_df_rt= latest_subcat_vec_model.transform(X_df_rt['subcat_proc'].fillna(''))

# X_mat_prod_df = hstack((title_mat_df_rt, desc_mat_df_rt, subcat_mat_df_rt, pl_mat_df_rt))
X_mat_prod_df = hstack((title_mat_df_rt, desc_mat_df_rt,  pl_mat_df_rt))

X_mat_prod_df

In [None]:
print("Accuracy Score Is : ", accuracy_score(prod_df['cat5_tokens'], latest_lr_model.predict(X_mat_prod_df)))

In [1]:
0.945321992709599 - 0.9198055893074119

0.025516403402187193