In [16]:
#Packages
import pandas as pd
import numpy as np
import os
import pickle
import random
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV,RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer
import warnings
warnings.filterwarnings("ignore")

In [17]:
#Environment variables
DATASET_PATH=os.getenv('DATASET_PATH')
MODEL_PATH=os.getenv('MODEL_PATH')
METRICS_PATH=os.getenv('METRICS_PATH')

In [18]:
#Import data
df = pd.read_csv(DATASET_PATH)
print(df.shape)
df.head()

(38000, 15)


Unnamed: 0,product_id,seller_id,query,search_page,position,title,concatenated_tags,creation_date,price,weight,express_delivery,minimum_quantity,view_counts,order_counts,category
0,11394449,8324141,espirito santo,2,6,Mandala Espírito Santo,mandala mdf,2015-11-14 19:42:12,171.89,1200.0,1,4,244,,Decoração
1,15534262,6939286,cartao de visita,2,0,Cartão de Visita,cartao visita panfletos tag adesivos copos lon...,2018-04-04 20:55:07,77.67,8.0,1,5,124,,Papel e Cia
2,16153119,9835835,expositor de esmaltes,1,38,Organizador expositor p/ 70 esmaltes,expositor,2018-10-13 20:57:07,73.920006,2709.0,1,1,59,,Outros
3,15877252,8071206,medidas lencol para berco americano,1,6,Jogo de Lençol Berço Estampado,t jogo lencol menino lencol berco,2017-02-27 13:26:03,118.770004,0.0,1,1,180,1.0,Bebê
4,15917108,7200773,adesivo box banheiro,3,38,ADESIVO BOX DE BANHEIRO,adesivo box banheiro,2017-05-09 13:18:38,191.81,507.0,1,6,34,,Decoração


In [19]:
#Check NaN values
df.isnull().sum()

product_id               0
seller_id                0
query                    0
search_page              0
position                 0
title                    0
concatenated_tags        2
creation_date            0
price                    0
weight                  58
express_delivery         0
minimum_quantity         0
view_counts              0
order_counts         20105
category                 0
dtype: int64

In [20]:
#Treating the data and dropping irrelevant columns
df=df.drop(['product_id','seller_id','creation_date','order_counts'],axis=1)
df=df.dropna()
df.shape

(37940, 11)

In [21]:
#Split data for training and test
X = df.drop(['category'],axis=1)
y = df['category']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.33)
cv = StratifiedKFold(n_splits=5)

print(X_train.shape)
print(y_train.shape)

(25419, 10)
(25419,)


In [22]:
#Select best parameters for each model tested
def choose_model(pip,params,cv,X,y,seed):
    
    CV = RandomizedSearchCV(estimator=pip, 
                            param_distributions=params,
                            cv=cv, 
                            n_jobs= -1,
                            scoring='roc_auc',
                            random_state=seed)            
    CV.fit(X_train, y_train) 
    best_choice = CV.best_estimator_

    print('Best choice:','\n',best_choice)

    return best_choice
    

In [23]:
#Treat text data separately
preprocess = ColumnTransformer(
    [('query_countvec', CountVectorizer(), 'query'),
     ('title_countvec', CountVectorizer(), 'title'),
     ('concatenated_tags_tfidf', TfidfVectorizer(ngram_range=(1,3)), 'concatenated_tags')],
    remainder='passthrough')

In [24]:
#Logistic regression model
pipeline = Pipeline([

    ('union',preprocess),
    
    ('scaler', StandardScaler(with_mean=False)),
    

    ('clf',LogisticRegression(random_state=42))
    
    ])

params = [
    {
        "clf__penalty": ['l2', 'l1'],
        "clf__C": np.logspace(0, 2, 10),
        "clf__solver":['newton-cg','saga', 'liblinear']
    }
] 

logreg_result = choose_model(pip=pipeline,params=params,cv=cv,X=X_train,y=y_train,seed=42)
acc = cross_val_score(logreg_result, X_train, y_train, cv=cv,scoring='accuracy')
f1 =  cross_val_score(logreg_result, X_train, y_train, cv=cv,scoring='f1_weighted')
print("%f accuracy with std of %f, and f1-score of %f with std of %f." % (acc.mean(), acc.std(),f1.mean(),f1.std()))

Best choice: 
 Pipeline(steps=[('union',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('query_countvec',
                                                  CountVectorizer(), 'query'),
                                                 ('title_countvec',
                                                  CountVectorizer(), 'title'),
                                                 ('concatenated_tags_tfidf',
                                                  TfidfVectorizer(ngram_range=(1,
                                                                               3)),
                                                  'concatenated_tags')])),
                ('scaler', StandardScaler(with_mean=False)),
                ('clf',
                 LogisticRegression(random_state=42, solver='newton-cg'))])
0.886227 accuracy with std of 0.003398, and f1-score of 0.884573 with std of 0.003645.


In [25]:
#Random forest model
pipeline = Pipeline([

    ('union',preprocess),
    
    ('scaler', StandardScaler(with_mean=False)),

    ('clf',RandomForestClassifier(random_state=42))])

params = [
    {
        "clf__n_estimators": [10, 50, 100, 1000],
        "clf__max_depth":[5, 10, 15, 25, 50, None],
        "clf__min_samples_leaf":[1, 2, 5, 10, 15, 100],
        "clf__max_leaf_nodes": [2, 3,5, 15,20]
    }
] 

rf_result = choose_model(pip=pipeline,params=params,cv=cv,X=X_train,y=y_train,seed=42)
acc = cross_val_score(rf_result, X_train, y_train, cv=cv,scoring='accuracy')
f1 =  cross_val_score(rf_result, X_train, y_train, cv=cv,scoring='f1_weighted')
print("%f accuracy with std of %f, and f1-score of %f with std of %f." % (acc.mean(), acc.std(),f1.mean(),f1.std()))

Best choice: 
 Pipeline(steps=[('union',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('query_countvec',
                                                  CountVectorizer(), 'query'),
                                                 ('title_countvec',
                                                  CountVectorizer(), 'title'),
                                                 ('concatenated_tags_tfidf',
                                                  TfidfVectorizer(ngram_range=(1,
                                                                               3)),
                                                  'concatenated_tags')])),
                ('scaler', StandardScaler(with_mean=False)),
                ('clf',
                 RandomForestClassifier(max_depth=15, max_leaf_nodes=20,
                                        min_samples_leaf=2, n_estimators=10,
                                        random_state

In [26]:
#Check metrics and choose final model
pipelines = [logreg_result,rf_result]

result = pd.DataFrame(columns=['Model','train_acc','test_acc','train_prec','test_prec',
                               'train_rec','test_rec','train_f1','test_f1','train_auc','test_auc'])

for p in pipelines:
    
    model = p['clf'].__class__.__name__
    tracc = accuracy_score(y_train, p.predict(X_train))
    teacc = accuracy_score(y_test, p.predict(X_test))
    trprec = precision_score(y_train, p.predict(X_train),average='weighted')
    teprec = precision_score(y_test, p.predict(X_test), average='weighted')
    trrec = recall_score(y_train, p.predict(X_train), average='weighted')
    terec = recall_score(y_test, p.predict(X_test), average='weighted')
    trf1 = f1_score(y_train, p.predict(X_train), average='weighted')
    tef1 = f1_score(y_test, p.predict(X_test), average='weighted')
    trauc = roc_auc_score(y_train, p.predict_proba(X_train), multi_class="ovo")
    teauc = roc_auc_score(y_test, p.predict_proba(X_test), multi_class="ovo")
    
    result = result.append({'Model':model,
                            'train_acc':tracc,
                            'test_acc':teacc,
                            'train_prec':trprec,
                            'test_prec':teprec,
                            'train_rec':trrec,
                            'test_rec':terec,
                            'train_f1':trf1,
                            'test_f1':tef1,
                            'train_auc':trauc,
                            'test_auc':teauc},ignore_index=True)

result    

Unnamed: 0,Model,train_acc,test_acc,train_prec,test_prec,train_rec,test_rec,train_f1,test_f1,train_auc,test_auc
0,LogisticRegression,0.999725,0.889865,0.999725,0.888605,0.999725,0.889865,0.999725,0.888291,1.0,0.962894
1,RandomForestClassifier,0.600456,0.592045,0.733721,0.730936,0.600456,0.592045,0.529985,0.520235,0.868424,0.863796


In [27]:
#Save best model
model=logreg_result
filepath=MODEL_PATH
pickle.dump(model,open(filepath,'wb'))

In [36]:
#Save metrics
pd.DataFrame(result.loc[0]).to_csv(METRICS_PATH)