In [19]:
from multiprocessing import Pool, cpu_count
import pandas  as pd
import numpy as np
from scipy.sparse import hstack, vstack
from sklearn.model_selection import train_test_split

from text_preproc_pipeline import preproc
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression


from sqlalchemy import create_engine
from joblib import load

from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, confusion_matrix, classification_report


# from env import *

In [3]:
# df_train = pd.read_csv("cat5_training_set_final_08112022.csv")
df_train = pd.read_json("cat5_training_set_final_08112022.json", orient='split')
df_train.shape

(90292, 6)

In [17]:
def parallel_proc(df, fn, n_cores=5):
    if cpu_count() < n_cores:
        raise ValueError("The number of CPU's specified exceed the amount available")

    df_list = np.array_split(df, n_cores)
    pool = Pool(n_cores)
    res = pool.map(fn, df_list)
    pool.close()
    pool.join()
    return pd.concat(res)

In [4]:
### tokenising the independent variables

df_train['title_proc'] = df_train['title'].apply(lambda x: preproc(x))
df_train['desc_proc'] = df_train['description'].apply(lambda x: preproc(x))
df_train['subcat_proc'] = df_train['shopify_subcategory'].apply(lambda x: preproc(x))
df_train['pl_proc'] = df_train['page_link'].apply(lambda x: preproc(x))


In [16]:
df_train['title_proc'].to_frame()

Unnamed: 0,title_proc
1715584,dhokra metal craft anklet
1097228,pramud silver kid anklet pair yr
1690836,taraash sterling silver combo anklet toe ring ...
1099625,akshara silver gemstone anklet red
800672,anklet woman girl pack
...,...
4495,whiskey glass piece clear ml mart
4496,luigi bormioli michelangelo f whisky glass set
4497,single barrel whiskey glass set
4498,crystal glass twist design whiskey glass set m...


In [21]:
parallel_proc(df_train['title_proc'], preproc)

TypeError: cannot concatenate object of type '<class 'str'>'; only Series and DataFrame objs are valid

In [5]:
### creating X (independent variables) and y (dependent variable) matrices

X = df_train[['title_proc','subcat_proc','desc_proc', 'pl_proc']]

# X = df_train[['title_proc',','desc_proc', 'pl_proc']]

y = df_train['category5_token']

In [6]:
### Train/Test Split

X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.8, test_size=0.2, stratify=y, random_state=10)

In [7]:
### Term-Frequency and Inverse document Frequency vectorizer with bigrams

title_vec_model = TfidfVectorizer(max_df=0.3, min_df=0.0001, ngram_range = (1,1), stop_words = ["english"])
desc_vec_model = TfidfVectorizer(max_df=0.3, min_df=0.0001, ngram_range = (1,1), stop_words = ["english"])
subcat_vec_model = TfidfVectorizer(max_df=0.3, min_df=0.0001, ngram_range = (1,1), stop_words = ["english"])
pl_vec_model = TfidfVectorizer(max_df=0.3, min_df=0.0001, ngram_range = (1,1), stop_words = ["english"])

In [8]:
### Fitting the Vectorisers on dependent variables

title_mat_train = title_vec_model.fit_transform(X_train['title_proc'].fillna(''))
desc_mat_train = desc_vec_model.fit_transform(X_train['desc_proc'].fillna(''))
subcat_mat_train = subcat_vec_model.fit_transform(X_train['subcat_proc'].fillna(''))
pl_mat_train = pl_vec_model.fit_transform(X_train['pl_proc'].fillna(''))

In [9]:
X_mat_train = hstack((title_mat_train, desc_mat_train, subcat_mat_train, pl_mat_train))
# X_mat_train = hstack((title_mat_train, desc_mat_train, pl_mat_train))

X_mat_train

<72233x25131 sparse matrix of type '<class 'numpy.float64'>'
	with 4269542 stored elements in Compressed Sparse Row format>

In [10]:
title_mat_test = title_vec_model.transform(X_test['title_proc'].fillna(''))
desc_mat_test = desc_vec_model.transform(X_test['desc_proc'].fillna(''))
subcat_mat_test = subcat_vec_model.transform(X_test['subcat_proc'].fillna(''))
pl_mat_test = pl_vec_model.transform(X_test['pl_proc'].fillna(''))

In [11]:
X_mat_test = hstack((title_mat_test, desc_mat_test, subcat_mat_test, pl_mat_test))
# X_mat_test = hstack((title_mat_test, desc_mat_test, pl_mat_test))

X_mat_test

<18059x25131 sparse matrix of type '<class 'numpy.float64'>'
	with 1055566 stored elements in Compressed Sparse Row format>

In [12]:
lr_model = LogisticRegression(n_jobs=-1)

lr_model.fit(X_mat_train, y_train)

KeyboardInterrupt: 

In [13]:
yhat = lr_model.predict(X_mat_test)

In [14]:
print("Accuracy Score Is : ", accuracy_score(y_test, yhat))
# print("ROC_Score : " +str(roc_auc_score(y_test, yhat,multi_class="ovr")))
# print('classification Score =','\n', classification_report(y_test,yhat))
# print("Confusion Matrix HeatMap : \n", confusion_matrix(y_test, yhat))

Accuracy Score Is :  0.9576942244864056


In [15]:
### Building the final model

In [16]:
X = pd.concat([X_train,X_test])
y = pd.concat([y_train,y_test])

In [17]:
X

Unnamed: 0,title_proc,subcat_proc,desc_proc,pl_proc
40927,rodium plated cufflink men sj,cufflink,powerdressed work everyday premium workwear ac...,rodium plated cufflink men sj
73716,skore chocolate flavoured condom raised dot x,health beauty health care condom,,skore chocolate flavoured condom raised dot x
23194,party pack drink,none,party pack drink contains sachet flavour unlea...,party pack drink
11850,crochet craving laptop sleeve,none,popitout brings decent classy range laptop mes...,crochet craving laptop sleeve
6447,jumbo golden raisin organic healthy naturally ...,none,jumbo golden raisin world huge almost big grap...,raisin gm
...,...,...,...,...
12311,lip hydration balm pack,none,type lip quantity ml country origin india,lip hydration balm pack
14383,black net emboridered kurta set,salwar kameez,buy magnificent black net embroidered kurta se...,magnificent black net thread work kurta set
24522,customized name plate victorian pink flower,nameplate,style painting style watercolour paper victori...,victorian pink nameplate
2924,cold coffee bundle,drinkware,featured experience make summer cold coffee bu...,cold coffee bundle


In [None]:
title_vec_model_final = TfidfVectorizer(max_df=0.3, min_df=0.0001)
desc_vec_model_final = TfidfVectorizer(max_df=0.3, min_df=0.0001)
subcat_vec_model_final = TfidfVectorizer(max_df=0.3, min_df=0.0001)
pl_vec_model_final = TfidfVectorizer(max_df=0.3, min_df=0.0001)

In [None]:
title_mat = title_vec_model_final.fit_transform(X['title_proc'].fillna(''))
desc_mat = desc_vec_model_final.fit_transform(X['desc_proc'].fillna(''))
subcat_mat = subcat_vec_model_final.fit_transform(X['subcat_proc'].fillna(''))
pl_mat = pl_vec_model_final.fit_transform(X['pl_proc'].fillna(''))

In [None]:
X_mat = hstack((title_mat, desc_mat, subcat_mat, pl_mat))
# X_mat = hstack((title_mat, desc_mat, pl_mat))
X_mat

In [None]:
lr_model_final = LogisticRegression(verbose=1, max_iter=75)
lr_model_final.fit(X_mat, y)

In [None]:
yhat = lr_model_final.predict(X_mat)
print("Accuracy Score Is : ", accuracy_score(y, yhat))
# print("ROC_Score : " +str(roc_auc_score(y, lr_model_final.predict(X_mat))))
# al.predict_proba(X_mat),multi_class="ovr")))
# print('classification Score =','\n', classification_report(y,yhat))
# print("Confusion Matrix HeatMap : \n", confusion_matrix(y, yhat))

In [None]:
### Exporting the model

import joblib

joblib.dump(title_vec_model_final,'vectoriser_models/title_tfidf_cat5_v4.pkl')
joblib.dump(desc_vec_model_final,'vectoriser_models/desc_tfidf_cat5_v4.pkl')
joblib.dump(subcat_vec_model_final,'vectoriser_models/subcat_tfidf_cat5_v4.pkl')
joblib.dump(pl_vec_model_final,'vectoriser_models/pl_tfidf_cat5_v4.pkl')

joblib.dump(lr_model_final,'classification_models/logistic_regression_model_cat5_v4.pkl')

In [None]:
### Testing on baseline

In [None]:
# prod_df = pd.read_csv("df_baseline_trainingset.csv").rename(columns={'id':'_id', 'store_id':'_store_id', 'handle':'page_link', 'body_html':'description', 'product_type' :'shopify_subcategory'})
prod_df = pd.read_csv("baseline_checks - Sheet2.csv").rename(columns={'id':'_id', 'store_id':'_store_id', 'handle':'page_link', 'body_html':'description', 'product_type' :'shopify_subcategory'})

prod_df.shape

In [None]:
prod_df['title_proc'] = prod_df['title'].apply(lambda x: preproc(x))
prod_df['desc_proc'] = prod_df['description'].apply(lambda x: preproc(x))
prod_df['subcat_proc'] = prod_df['shopify_subcategory'].apply(lambda x: preproc(x))
prod_df['pl_proc'] = prod_df['page_link'].str.split('/products/',expand=True)[1].apply(lambda x: preproc(x))

In [None]:
X_df_rt = prod_df[['title_proc','subcat_proc','desc_proc', 'pl_proc']]
# X_df_rt = prod_df[['title_proc','desc_proc', 'pl_proc']]

In [None]:
title_mat_df_rt = title_vec_model_final.transform(X_df_rt['title_proc'].fillna(''))
desc_mat_df_rt = desc_vec_model_final.transform(X_df_rt['desc_proc'].fillna(''))
pl_mat_df_rt= pl_vec_model_final.transform(X_df_rt['pl_proc'].fillna(''))
subcat_mat_df_rt= subcat_vec_model_final.transform(X_df_rt['subcat_proc'].fillna(''))

X_mat_prod_df = hstack((title_mat_df_rt, desc_mat_df_rt, subcat_mat_df_rt, pl_mat_df_rt))
# X_mat_prod_df = hstack((title_mat_df_rt, desc_mat_df_rt, pl_mat_df_rt))

X_mat_prod_df

In [None]:
print("Accuracy Score Is : ", accuracy_score(prod_df['cat5_tokens'], lr_model_final.predict(X_mat_prod_df)))

In [None]:
# prod_df['cat5_predict'] = lr_model_final.predict(X_mat_prod_df)
# prod_df

In [None]:
# prod_df['check'] = prod_df['cat5_tokens'] == prod_df['cat5_predict']
# prod_df['cat5_tokens'] == prod_df['cat5_predict']

In [None]:
# prod_df[prod_df.check == False]

In [None]:
# prod_df[prod_df.check == False].to_clipboard(index=False)

In [None]:
# prod_df[prod_df.check == True].to_clipboard(index=False)

In [None]:
## Loading prev cat5 Models

latest_title_vec_model = load('/Users/apoorvatiwari/Documents/Apps_RecSy/downtown-product-classifcation/cat_level3/Old_Files/models_Backup/08112022/vectoriser_models/title_tfidf_cat5_gd_ct_5.pkl')
latest_desc_vec_model = load('/Users/apoorvatiwari/Documents/Apps_RecSy/downtown-product-classifcation/cat_level3/Old_Files/models_Backup/08112022/vectoriser_models/desc_tfidf_cat5_gd_ct_5.pkl')
latest_pl_vec_model = load('/Users/apoorvatiwari/Documents/Apps_RecSy/downtown-product-classifcation/cat_level3/Old_Files/models_Backup/08112022/vectoriser_models/pl_tfidf_cat5_gd_ct_5.pkl')
# latest_subcat_vec_model = load('vectoriser_models/subcat_tfidf_cat5_gd_ct.pkl')
latest_lr_model = load('/Users/apoorvatiwari/Documents/Apps_RecSy/downtown-product-classifcation/cat_level3/Old_Files/models_Backup/08112022/classification_models/logistic_regression_model_cat5_gd_ct_5.pkl')

In [None]:
title_mat_df_rt = latest_title_vec_model.transform(X_df_rt['title_proc'].fillna(''))
desc_mat_df_rt = latest_desc_vec_model.transform(X_df_rt['desc_proc'].fillna(''))
pl_mat_df_rt= latest_pl_vec_model.transform(X_df_rt['pl_proc'].fillna(''))
# subcat_mat_df_rt= latest_subcat_vec_model.transform(X_df_rt['subcat_proc'].fillna(''))

# X_mat_prod_df = hstack((title_mat_df_rt, desc_mat_df_rt, subcat_mat_df_rt, pl_mat_df_rt))
X_mat_prod_df = hstack((title_mat_df_rt, desc_mat_df_rt,  pl_mat_df_rt))

X_mat_prod_df

In [None]:
print("Accuracy Score Is : ", accuracy_score(prod_df['cat5_tokens'], latest_lr_model.predict(X_mat_prod_df)))

In [None]:
0.945321992709599 - 0.9198055893074119