# InStyle Net Promoter Score

## 0 - IMPORTS AND FUNCTIONS

In [1]:
# Data Analysis and Data Manipulation Libraries
import inflection
import joblib
import pandas                                       as pd
import numpy                                        as np
import seaborn                                      as sns
import matplotlib.pyplot                            as plt
import scikitplot                                   as skplt
import plotly.express                               as px
import plotly.graph_objects                         as go

# Data Processing Libraries
from sklearn                 import model_selection as ms
from sklearn                 import preprocessing   as pp
from sklearn.model_selection import KFold

# ML Models and Optimization Libraries
import optuna
import lightgbm                                     as lgbm

from catboost                import CatBoostClassifier
from xgboost                 import XGBClassifier
from sklearn.metrics         import precision_score 
from sklearn.metrics         import confusion_matrix, ConfusionMatrixDisplay

# Display
%matplotlib inline
from IPython.core.display    import HTML
from IPython.display         import Image

import warnings
warnings.filterwarnings( 'ignore' )

### 0.1 - HELPER FUNCTIONS

In [None]:
# These functions apply the data cleaning, feature engineering and data transformations to our data, so that the same steps are applied to every dataset used in this notebook.
def train_prep(dftr):
    """Prepare the raw training DataFrame for modelling and persist the fitted transformers.

    Steps: rename the raw columns to snake_case, fill missing delivery delays with the
    carrier delay, frequency-encode `gender` and `type_of_purchase`, ordinal-encode
    `customer_type` and `store_size`, min-max scale `age`, robust-scale `store_distance`,
    log-transform both delay columns and binarise `satisfaction`.

    Side effect: every fitted encoder/scaler is written with joblib to the current working
    directory (fe_gender, fe_tpurchase, oe_customer, oe_store, mm_age, rs_stored) so that
    test_prep can load them and apply the same transformations.

    Parameters
    ----------
    dftr : pandas.DataFrame
        Training data whose 24 columns are in the order listed in `cols_old`
        (columns are renamed by position, not by name).

    Returns
    -------
    pandas.DataFrame
        A transformed copy; the frame passed in is left untouched.
    """
    # Work on a copy so the caller's DataFrame (e.g. df_raw) is not overwritten
    dftr = dftr.copy()

    ## Renaming Columns
    cols_old = ['id', 'Gender', 'Customer Type', 'Age', 'Type of Purchase','Store size', 'Store distance', 'InStore wifi',
                'Open/Close time convenient', 'Easy of online shopping', 'Store location', 'Toilet cleaning', 'Dressing room', 
                'Waiting room','Kids entertainment', 'Seller service', 'Showroom ', 'Self-Store', 'Purchase service', 
                'Store Service', 'Cleanliness', 'Carrier delay in minutes', 'Delivery delay in minutes', 'Satisfaction']

    snakecase = lambda x: inflection.underscore(x.replace(' ', '_'))
    dftr.columns = [snakecase(col) for col in cols_old]

    # FEATURE ENGINEERING
    # The 'distancia' and 'age_group' features are disabled in this cycle, so nothing is added here.

    # Fill NaN: a missing delivery delay takes the carrier delay of the same row
    dftr['delivery_delay_in_minutes'] = dftr['delivery_delay_in_minutes'].fillna(dftr['carrier_delay_in_minutes'])

    # FREQUENCY ENCODER
    # gender
    fe_gender = dftr.groupby('gender').size() / len(dftr)
    dftr['gender'] = dftr['gender'].map( fe_gender )
    joblib.dump(fe_gender, 'fe_gender.joblib')

    # type_of_purchase
    fe_type_of_purchase = dftr.groupby( 'type_of_purchase' ).size() / len( dftr )
    dftr['type_of_purchase'] = dftr['type_of_purchase'].map( fe_type_of_purchase )
    joblib.dump(fe_type_of_purchase, 'fe_tpurchase.joblib')

    # ORDINAL ENCODER
    # customer type
    oe_customer = pp.OrdinalEncoder()
    dftr['customer_type'] = oe_customer.fit_transform(dftr[['customer_type']].values)
    joblib.dump(oe_customer, 'oe_customer.joblib')

    # store size
    oe_store = pp.OrdinalEncoder()
    dftr['store_size'] = oe_store.fit_transform( dftr[['store_size']].values )
    joblib.dump(oe_store, 'oe_store.joblib')

    # RESCALING
    # age
    mms = pp.MinMaxScaler()
    dftr['age'] = mms.fit_transform( dftr[['age']].values )
    joblib.dump(mms, 'mm_age.joblib')

    # ROBUST SCALER
    # store distance
    rs = pp.RobustScaler()
    dftr['store_distance'] = rs.fit_transform( dftr[['store_distance']].values )
    joblib.dump(rs, 'rs_stored.joblib')

    # LOG TRANSFORMATION
    # Only strictly positive delays are logged; zeros (and NaN) pass through unchanged.
    # NOTE(review): a delay of 1 maps to log(1) = 0, the same value as a delay of 0.
    log_if_positive = lambda x: np.log(x) if x > 0 else x
    dftr['carrier_delay_in_minutes'] = dftr['carrier_delay_in_minutes'].apply(log_if_positive)
    dftr['delivery_delay_in_minutes'] = dftr['delivery_delay_in_minutes'].apply(log_if_positive)

    # Response: 0 = 'Satisfied', 1 = any other value
    dftr['satisfaction'] = dftr['satisfaction'].apply(lambda x: 0 if x=='Satisfied' else 1)

    return dftr

def test_prep(dfte):
    """Apply to unlabeled data the transformations that train_prep fitted and saved.

    Loads the joblib artifacts written by train_prep from the current working directory
    (fe_gender, fe_tpurchase, oe_customer, oe_store, mm_age, rs_stored) and applies them
    without refitting.

    Parameters
    ----------
    dfte : pandas.DataFrame
        Test data whose 23 columns are in the order listed in `cols_old`
        (same as the training data minus the target; renamed by position).

    Returns
    -------
    pandas.DataFrame
        A transformed copy; the frame passed in is left untouched.
    """
    # Work on a copy so the caller's DataFrame is not overwritten
    dfte = dfte.copy()

    ## Renaming Columns
    cols_old = ['id', 'Gender', 'Customer Type', 'Age', 'Type of Purchase','Store size', 'Store distance', 'InStore wifi',
                'Open/Close time convenient', 'Easy of online shopping', 'Store location', 'Toilet cleaning', 'Dressing room', 
                'Waiting room','Kids entertainment', 'Seller service', 'Showroom ', 'Self-Store', 'Purchase service', 
                'Store Service', 'Cleanliness', 'Carrier delay in minutes', 'Delivery delay in minutes']

    snakecase = lambda x: inflection.underscore(x.replace(' ', '_'))
    dfte.columns = [snakecase(col) for col in cols_old]

    # FEATURE ENGINEERING
    # The 'distancia' and 'age_group' features are disabled in this cycle, so nothing is added here.

    # Fill NaN exactly as train_prep does, so train and test features are built the same way
    dfte['delivery_delay_in_minutes'] = dfte['delivery_delay_in_minutes'].fillna(dfte['carrier_delay_in_minutes'])

    # TRANSFORMERS fitted on the training data
    fe_gender    = joblib.load( 'fe_gender.joblib')
    fe_tpurchase = joblib.load( 'fe_tpurchase.joblib')
    oe_customer  = joblib.load( 'oe_customer.joblib')
    oe_store     = joblib.load( 'oe_store.joblib')
    mm_age       = joblib.load( 'mm_age.joblib')
    rs_stored    = joblib.load( 'rs_stored.joblib')

    # FREQUENCY ENCODER (categories absent from the training data become NaN)
    dfte['gender'] = dfte['gender'].map(fe_gender)
    dfte['type_of_purchase'] = dfte['type_of_purchase'].map( fe_tpurchase )

    # ORDINAL ENCODER
    dfte['customer_type'] = oe_customer.transform(dfte[['customer_type']].values)
    dfte['store_size'] = oe_store.transform(dfte[['store_size']].values)

    # RESCALING
    dfte['age'] = mm_age.transform(dfte[['age']].values)
    dfte['store_distance'] = rs_stored.transform(dfte[['store_distance']].values)

    # LOG TRANSFORMATION
    # Only strictly positive delays are logged; zeros (and NaN) pass through unchanged.
    log_if_positive = lambda x: np.log(x) if x > 0 else x
    dfte['carrier_delay_in_minutes'] = dfte['carrier_delay_in_minutes'].apply(log_if_positive)
    dfte['delivery_delay_in_minutes'] = dfte['delivery_delay_in_minutes'].apply(log_if_positive)

    return dfte

def cross_validation(kfold, modelName, model, X, y, average='micro'):
    """Return the mean precision of `model` over a stratified, shuffled k-fold split.

    Parameters
    ----------
    kfold : int
        Number of folds.
    modelName : str
        Label of the model; kept for call compatibility, not used in the computation.
    model : estimator
        Unfitted scikit-learn compatible classifier. A fresh clone is fitted on each
        fold, so the object passed in is never fitted or modified.
    X : pandas.DataFrame
        Feature matrix (indexed positionally with `.iloc`).
    y : pandas.Series
        Target vector aligned with `X`.
    average : str, default 'micro'
        Averaging mode forwarded to `precision_score`. Note that for single-label
        classification the micro average is numerically equal to accuracy.

    Returns
    -------
    float
        Mean of the per-fold precision scores.
    """
    from sklearn.base import clone

    # Fixed seed so every model is scored on the same folds
    folds = ms.StratifiedKFold(n_splits=kfold, shuffle=True, random_state=42)

    precision_list = []

    for train_idx, val_idx in folds.split(X, y):
        # Separate train and validation data for this fold
        x_train_fold, y_train_fold = X.iloc[train_idx], y.iloc[train_idx]
        x_val_fold, y_val_fold = X.iloc[val_idx], y.iloc[val_idx]

        # Fit an independent copy so state never carries over between folds
        fold_model = clone(model).fit(x_train_fold, y_train_fold)

        yhat = fold_model.predict(x_val_fold)

        precision_list.append(precision_score(y_val_fold, yhat, average=average))

    return np.mean(precision_list)

In [None]:
def jupyter_settings():
    """Stretch the notebook container to the full page width and apply seaborn's default theme."""
    full_width_css = '<style>.container {width:100% !important; }</style>'
    display( HTML( full_width_css ) )
    sns.set()
jupyter_settings()

## 1 - DATA EXTRACTION

In [None]:
# All Data - Windows
# Single place to edit when the project data lives in a different folder
DATA_DIR = 'C:/Users/perot/Documents/ds_repos/projects/InStyle_Net_Promoter_Score/data'

df_raw = pd.read_csv(f'{DATA_DIR}/train.csv')

df_test = pd.read_csv(f'{DATA_DIR}/test.csv')

In [None]:
## Renaming Columns
# Raw header names, in file order; they are mapped to snake_case and assigned by position
cols_old = [
    'id', 'Gender', 'Customer Type', 'Age', 'Type of Purchase', 'Store size', 'Store distance', 'InStore wifi',
    'Open/Close time convenient', 'Easy of online shopping', 'Store location', 'Toilet cleaning', 'Dressing room',
    'Waiting room', 'Kids entertainment', 'Seller service', 'Showroom ', 'Self-Store', 'Purchase service',
    'Store Service', 'Cleanliness', 'Carrier delay in minutes', 'Delivery delay in minutes', 'Satisfaction',
]


def snakecase(x):
    """Replace spaces with underscores, then let inflection convert the name to snake_case."""
    return inflection.underscore(x.replace(' ', '_'))


cols_new = [snakecase(name) for name in cols_old]

df_raw.columns = cols_new

In [None]:
df1 = df_raw.copy()

### 1.1 - Data Descriptive

In [None]:
#prof = ProfileReport(df_raw)
#prof.to_file(output_file='data_descriptive.html')

In [None]:
# Data Dimensions
print( 'Number of Rows: {}'.format( df1.shape[0] ) )
print( 'Number of Cols: {}'.format( df1.shape[1] ) )

In [None]:
# Data Types
df1.dtypes

In [None]:
# Verifying if there are NaN values
df1.isna().sum()

There are 310 missing values in the variable delivery_delay_in_minutes. We decided to fill these NaN values with the corresponding value from the column carrier_delay_in_minutes, because we noticed that when the carrier delay is 0 the delivery delay is also 0, and carrier_delay_in_minutes is 0 for around 50% of the rows with a missing delivery delay.

In [None]:
# Analyze and Fill NaN

df1['delivery_delay_in_minutes'] = df1['delivery_delay_in_minutes'].fillna(df1['carrier_delay_in_minutes'])

### 1.1.1 - Distribution Analysis

In [None]:
# numerical features
num_attributes = df1.select_dtypes(include=['int64', 'float64'])

# categorical features
cat_attributes = df1.select_dtypes(exclude=['int64', 'float64'])

In [None]:
# Central Tendency - mean, median
# Each statistic is computed per numeric column and transposed into a single-row frame
ct1 = pd.DataFrame( num_attributes.apply( np.mean ) ).T
ct2 = pd.DataFrame( num_attributes.apply( np.median ) ).T

# Dispersion - std, min, max, range, skew, kurtosis
d1 = pd.DataFrame( num_attributes.apply( np.std ) ).T
d2 = pd.DataFrame( num_attributes.apply( min ) ).T
d3 = pd.DataFrame( num_attributes.apply( max ) ).T
d4 = pd.DataFrame( num_attributes.apply( lambda x: x.max() - x.min() ) ).T
d5 = pd.DataFrame( num_attributes.apply( lambda x: x.skew() ) ).T
d6 = pd.DataFrame( num_attributes.apply( lambda x: x.kurtosis() ) ).T

# Concatenate: stack the single-row frames, then transpose so each attribute becomes one row.
# The order of the list below must match the column names assigned on the next line.
m = pd.concat( [ d2, d3, d4, ct1, ct2, d1, d5, d6 ] ).T.reset_index()
m.columns = ['attributes', 'min', 'max', 'range', 'mean', 'median', 'std', 'skew', 'kurtosis']
m

## 2 - FEATURE ENGINEERING

In [None]:
df2 = df1.copy()

In [None]:
# New Features

# Distance bucket: 'close' up to and including 2500, 'far' otherwise (missing values fall in 'far')
df2['distancia'] = np.where(df2['store_distance'] <= 2500, 'close', 'far')

# Age bucket: 'young' up to 18, 'adult' up to 60, 'elderly' otherwise (missing values fall in 'elderly');
# np.select takes the first matching condition, so the second test only sees ages above 18
df2['age_group'] = np.select(
    [df2['age'] <= 18, df2['age'] <= 60],
    ['young', 'adult'],
    default='elderly',
)

# 3. DATA FILTERING

In [None]:
df3 = df2.copy()

# 4. EDA

In [None]:
df4_eda = df3.copy()

## 4.1 Univariate Analysis

### Numerical Attributes

In [None]:
columns_to_plot = num_attributes.columns

# Size the grid from the number of columns instead of assuming it fits in 7 x 3
n_cols = 3
n_rows = max(1, -(-len(columns_to_plot) // n_cols))  # ceiling division

fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(30, 15), squeeze=False)
fig.subplots_adjust(hspace=0.5)

# divmod turns the running index into the (row, col) position of the subplot
for i, column in enumerate(columns_to_plot):
    row, col = divmod(i, n_cols)
    ax = axes[row, col]
    # One boxplot per numeric attribute; `data` is passed by keyword for older seaborn versions
    sns.boxplot(data=df1, x=column, ax=ax)
    ax.set_title(column)

# Hide the axes left over in the last row
for unused_ax in axes.ravel()[len(columns_to_plot):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

### Categorical Attributes

In [None]:
labels = cat_attributes.columns
label = 'satisfaction'

In [None]:
# Three plots per row; the number of rows follows from the number of categorical columns
n_cols = 3
n_rows = max(1, -(-len(labels) // n_cols))  # ceiling division

plt.figure(figsize=(14, len(labels) * 2))
for i, col in enumerate(labels):
    plt.subplot(n_rows, n_cols, i + 1)
    sns.countplot(x=col, hue=label, data=df1)
    plt.title(f"{col} vs {label}")

# Adjust the spacing once, after every subplot exists
plt.tight_layout()

## 4.2 Bivariate Analysis

In [None]:
print(px.colors.qualitative.Plotly)

In [None]:
# 1 - As the dressing room score increases, the volume of dissatisfied customers decreases.
# Hypothesis is true.
# Number of customers (counted through their id) per score and satisfaction class
aux = df4_eda.groupby(['dressing_room', 'satisfaction'], as_index=False)['id'].count()

fig = px.bar(
    aux,
    x="dressing_room",
    y="id",
    text='id',
    color='satisfaction',
    barmode='group',
    color_discrete_sequence=['mediumturquoise', 'lightgreen'],
    width=1000,
    height=600,
)

fig.update_traces(textposition='outside', texttemplate='%{text:.2s}')

fig.update_layout(
    title={ 'text':'Dressing Room Score vs Customer Satisfaction', 'x':0.45, 'y':0.95},
    uniformtext_minsize=8,
    uniformtext_mode='hide',
    plot_bgcolor='white',
    paper_bgcolor='white',
)

fig.show()

In [None]:
# 2 - As the toilet cleaning score increases, the volume of dissatisfied customers decreases.
# Number of customers (counted through their id) per score and satisfaction class
aux = df4_eda.groupby(['toilet_cleaning', 'satisfaction'], as_index=False)['id'].count()

fig = px.bar(
    aux,
    x="toilet_cleaning",
    y="id",
    text='id',
    color='satisfaction',
    barmode='group',
    color_discrete_sequence=['mediumturquoise', 'lightgreen'],
    width=1000,
    height=600,
)

fig.update_traces(textposition='outside', texttemplate='%{text:.2s}')

fig.update_layout(
    title={ 'text':'Toilet Cleaning Score vs Customer Satisfaction', 'x':0.45, 'y':0.95},
    uniformtext_minsize=8,
    uniformtext_mode='hide',
    plot_bgcolor='white',
    paper_bgcolor='white',
)

fig.show()

In [None]:
# 3 - Male customers are, in general, more satisfied than female customers.

aux2 = pd.crosstab(df4_eda['gender'], df4_eda['satisfaction'])
aux2['percentage'] = aux2['Neutral or Dissatisfaction']/(aux2['Neutral or Dissatisfaction']+aux2['Satisfied'])

colors = ['mediumturquoise', 'lightgreen']

# Plot the share computed above instead of hand-typed numbers: one slice per gender, whose value
# is the percentage of that gender's customers classed as 'Neutral or Dissatisfaction'
dissatisfied_pct = (aux2['percentage'] * 100).round().astype(int)

fig = go.Figure(data=[go.Pie(labels=dissatisfied_pct.index.tolist(), values=dissatisfied_pct.tolist())])

fig.update_traces(hoverinfo='label+percent', textinfo='value', textfont_size=20,
                  marker=dict(colors=colors, line=dict(color='#000000', width=2)))


fig.update_layout(uniformtext_minsize=8, 
                  uniformtext_mode='hide',
                  title={ 'text':'Percent. of Gender vs Customer Satisfaction', 'x':0.40, 'y':0.95},
                  height=600,  # Adjust the height as needed
                  width=1000    # Adjust the width as needed
                  )
fig.show()

In [None]:
# 4 - The larger the store, the more satisfied its customers tend to be.
# Number of customers (counted through their id) per store size and satisfaction class
aux1 = df4_eda.groupby(['store_size', 'satisfaction'], as_index=False)['id'].count()

fig = px.bar(
    aux1,
    x="store_size",
    y="id",
    text='id',
    color='satisfaction',
    barmode='group',
    color_discrete_sequence=['mediumturquoise', 'lightgreen'],
    width=1000,
    height=600,
)

fig.update_traces(textposition='outside', texttemplate='%{text:.2s}')

fig.update_layout(
    title={ 'text':'Store Size vs Customer Satisfaction', 'x':0.35, 'y':0.95},
    uniformtext_minsize=8,
    uniformtext_mode='hide',
    plot_bgcolor='white',
    paper_bgcolor='white',
)

fig.show()

In [None]:
# Even unsatisfied customers are happy with the store service.
# Number of customers (counted through their id) per score and satisfaction class
aux = df4_eda.groupby(['store_service', 'satisfaction'], as_index=False)['id'].count()

fig = px.bar(
    aux,
    x="store_service",
    y="id",
    text='id',
    color='satisfaction',
    barmode='group',
    color_discrete_sequence=['mediumturquoise', 'lightgreen'],
    width=1000,
    height=600,
)

fig.update_traces(textposition='outside', texttemplate='%{text:.2s}')

fig.update_layout(
    title={ 'text':'Store Service Score vs Customer Satisfaction', 'x':0.50, 'y':0.95},
    uniformtext_minsize=8,
    uniformtext_mode='hide',
    plot_bgcolor='white',
    paper_bgcolor='white',
)

fig.show()


## 4.3 Multivariate Analysis

In [None]:
aux = df4_eda.copy()

# Correlation only exists for numeric columns: keep those explicitly, because recent pandas
# versions raise on string columns instead of silently dropping them. Computed once and reused.
corr = aux.select_dtypes(include='number').corr()

# Mask the upper triangle (diagonal included) so each pair is shown only once
mask = np.zeros_like(corr, dtype=bool) 
mask[np.triu_indices_from(mask)] = True 

f, ax = plt.subplots(figsize=(16, 12))

plt.title('Matriz de Correlação',fontsize=18)
sns.heatmap(corr,linewidths=0.25,fmt=".2f", cmap="Blues", linecolor='w',annot=True,annot_kws={"size":9},mask=mask,cbar_kws={"shrink": .9});

# 5. DATA PREPARATION

In [None]:
df5 = df4_eda.copy()

In [None]:
# The engineered string columns are not encoded in this cycle (their ordinal encoders are disabled
# here and in train_prep). Drop them so no object-dtype column reaches the models fitted on this
# frame in section 7, and so the feature set matches the one produced by train_prep.
df5 = df5.drop(columns=['distancia', 'age_group'], errors='ignore')

# FREQUENCY ENCODER
# gender
fe_gender = df5.groupby('gender').size() / len(df5)
df5['gender'] = df5['gender'].map( fe_gender )

# type_of_purchase
fe_type_of_purchase = df5.groupby( 'type_of_purchase' ).size() / len( df5 ) 
df5['type_of_purchase'] = df5['type_of_purchase'].map( fe_type_of_purchase )

# ORDINAL ENCODER
oec = pp.OrdinalEncoder()
oed = pp.OrdinalEncoder()

# customer type
df5['customer_type'] = oec.fit_transform(df5[['customer_type']].values)

# store size
df5['store_size'] = oed.fit_transform( df5[['store_size']].values )

# RESCALING
mms = pp.MinMaxScaler()

# age
df5['age'] = mms.fit_transform( df5[['age']].values )

# ROBUST SCALER
rs = pp.RobustScaler()

# store distance
df5['store_distance'] = rs.fit_transform( df5[['store_distance']].values )

# LOG TRANSFORMATION
# Only strictly positive delays are logged; zeros (and NaN) pass through unchanged
# carrier_delay_in_minutes
df5['carrier_delay_in_minutes'] = df5['carrier_delay_in_minutes'].apply(lambda x: np.log(x) if x > 0 else x) 

# delivery_delay_in_minutes
df5['delivery_delay_in_minutes'] = df5['delivery_delay_in_minutes'].apply(lambda x: np.log(x) if x > 0 else x) 

# Response
# satisfaction: 0 = 'Satisfied', 1 = any other value
df5['satisfaction'] = df5['satisfaction'].apply(lambda x: 0 if x=='Satisfied' else 1)

# 6. FEATURE SELECTION

In [None]:
df6 = df_raw.copy()
df6_prep = train_prep(df6)
feat_x = df6_prep.drop(['satisfaction'], axis=1)
feat_y = df6_prep['satisfaction'].copy() 

In [None]:
# model definition
xgboost = XGBClassifier()

# model training
xgboost.fit(feat_x, feat_y)

In [None]:
feature_importances = xgboost.feature_importances_
feature_names = feat_x.columns 
feature_importance_dict = dict(zip(feature_names, feature_importances))

# Rank the (name, score) pairs from most to least important; the sort is stable, so ties keep column order
sorted_feature_importance = sorted(
    feature_importance_dict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
sorted_feature_names = tuple(name for name, _ in sorted_feature_importance)
sorted_importance_scores = tuple(score for _, score in sorted_feature_importance)

plt.figure(figsize=(10, 10))
plt.barh(sorted_feature_names, sorted_importance_scores)
plt.title("Feature Importance")
plt.xlabel("Feature Importance")
plt.ylabel("Feature Name")
plt.show()

As this is the first cycle, we will consider all features for the model. Feature selection is something that could be tested in further cycles to see its impact on performance, resources, etc.

# 7. MACHINE LEARNING

In [None]:
df7 = df5.copy()

## 7.1 - DATASET SPLIT

In [None]:
# Split the dataset into train, validation and test sets
X = df7.drop( ['satisfaction'], axis=1 )
y = df7['satisfaction'].copy()

# Fixed seed: without it every run draws different splits, so the scores reported below and the
# tuned hyper-parameters cannot be reproduced
SPLIT_SEED = 42

# 15% of all rows are held out for validation, then 15% of the remainder for test
train_x, val_x, train_y, val_y   = ms.train_test_split( X, y, test_size=0.15, shuffle=True, random_state=SPLIT_SEED )
train_x, test_x, train_y, test_y   = ms.train_test_split( train_x, train_y, test_size=0.15, shuffle=True, random_state=SPLIT_SEED )
print(f" train_x: {train_x.shape}\n train_y: {train_y.shape}\n val_x: {val_x.shape}\n val_y: {val_y.shape}\n test_x: {test_x.shape}\n test_y: {test_y.shape}\n")

## 7.2 Machine Learning Models

### 7.2.1 XGB

In [None]:
clf_xgb = XGBClassifier()

evalxgb = cross_validation(5, 'XGB', clf_xgb, train_x, train_y)
evalxgb

In [None]:
def plot_lgbm_param_scores(param_name, param_range, fixed_params, X_train, y_train, X_val, y_val,
                           model_cls=None, title=None):
    """Plot train vs validation precision while sweeping one hyper-parameter.

    Parameters
    ----------
    param_name : str
        Name of the hyper-parameter to vary.
    param_range : iterable
        Values to try for `param_name`.
    fixed_params : dict
        Remaining hyper-parameters, held constant. The dict is not modified.
    X_train, y_train : training features and target.
    X_val, y_val : validation features and target.
    model_cls : estimator class, optional
        Class instantiated as `model_cls(**params)`. Defaults to `XGBClassifier`,
        which is what this function has always fitted despite its name.
    title : str, optional
        Plot title. Defaults to the name of `model_cls`.
    """
    if model_cls is None:
        model_cls = XGBClassifier

    train_scores = []
    val_scores = []

    for param_value in param_range:
        # Build a fresh dict per value so the caller's fixed_params is never mutated
        params = {**fixed_params, param_name: param_value}

        model = model_cls(**params)
        model.fit(X_train, y_train)

        train_scores.append(precision_score(y_train, model.predict(X_train)))
        val_scores.append(precision_score(y_val, model.predict(X_val)))

    plt.figure(figsize=(8, 4))
    plt.plot(param_range, train_scores, label="Train", color="#264653")
    plt.plot(param_range, val_scores, label="Val", color="#e76f51")

    # Title reflects the model actually fitted
    plt.title(title if title is not None else model_cls.__name__)
    plt.xlabel(param_name)
    plt.ylabel("Precision")
    plt.tight_layout()
    plt.legend(loc="best");

In [None]:
# Hand-rounded XGBoost hyper-parameters.
# NOTE(review): the next cell reassigns params_xgb before any cell reads it, so these values are
# never used as the notebook stands.
params_xgb = {
    'max_depth': 3,
    'eta': 0.02,
    'learning_rate': 0.7,
    'n_estimators': 2000,
    'min_child_weight': 2,
    'gamma': 0.3,
    'subsample': 0.7,
    'colsample_bytree': 0.6,
    'reg_alpha': 4.2e-06,
    'reg_lambda': 7.2e-05
}

In [None]:
# XGBoost hyper-parameters used as the fixed baseline for the n_estimators sweep below.
# NOTE(review): XGBoost documents `eta` as an alias of `learning_rate`; both are set here with
# different values - confirm which one takes effect and drop the other.
params_xgb = {
    'max_depth': 12,
    'eta': 0.019834339603301256,
    'learning_rate': 0.07,
    'n_estimators': 2000,
    'min_child_weight': 2,
    'gamma': 0.3113641371820694,
    'subsample': 0.7039764982894372,
    'colsample_bytree': 0.6313892474468165,
    'reg_alpha': 4.239283976142879e-06,
    'reg_lambda': 7.270434099537632e-05
}

In [None]:
param_name = "n_estimators"
param_range = np.arange(1000, 3000, 500)
fixed_params = params_xgb.copy()

plot_lgbm_param_scores(param_name, param_range, fixed_params, train_x, train_y, val_x, val_y)

In [None]:
# model training
clf_xgb.fit( train_x, train_y )

# model prediction - Check if the model perform well
yhat_xgb = clf_xgb.predict( val_x )

In [None]:
precision_score(val_y, yhat_xgb, average='micro') 

In [None]:
cm_xgb = confusion_matrix(val_y, yhat_xgb,  labels=clf_xgb.classes_)
cm_xgb_disp = ConfusionMatrixDisplay(confusion_matrix=cm_xgb,
                                     display_labels=clf_xgb.classes_,
                                    )

cm_xgb_disp.plot(cmap=plt.cm.Blues)
plt.grid(False) # removing grid
plt.gca().set_facecolor('white') # changing background color to white

plt.show()

0 represents Satisfied
1 represents Neutral or Dissatisfaction

### 7.2.2 LGBM

In [None]:
clf_lgbm = lgbm.LGBMClassifier()

evallgbm = cross_validation(5, 'LGBM', clf_lgbm, train_x, train_y)
evallgbm

In [None]:
clf_lgbm = lgbm.LGBMClassifier()

# model training
clf_lgbm.fit( train_x, train_y )

# model prediction - Check if the model perform well
yhat_lgbm = clf_lgbm.predict( val_x )

In [None]:
precision_score(val_y, yhat_lgbm, average='micro') 

In [None]:
cm_lgbm = confusion_matrix(val_y, yhat_lgbm,  labels = clf_lgbm.classes_)
cm_lgbm_disp = ConfusionMatrixDisplay(confusion_matrix = cm_lgbm,
                                      display_labels = clf_lgbm.classes_,
                                     )

cm_lgbm_disp.plot(cmap=plt.cm.Blues)
plt.grid(False) # removing grid
plt.gca().set_facecolor('white') # changing background color to white

plt.show()

# 8. FINE TUNING

## 8.1 XGB

In [None]:
def objective_xgb(trial):
    """Optuna objective: fit an XGBClassifier with sampled hyper-parameters and score it.

    The model is trained on the notebook-level `train_x` / `train_y` split and scored on
    `test_x` / `test_y` with micro-averaged precision, which the study maximises.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        Micro-averaged precision on the held-out split.
    """
    params = {
        'max_depth': trial.suggest_int('max_depth', 5, 20),
        # Only learning_rate is searched: XGBoost treats `eta` as an alias of it, so sampling
        # both spent a search dimension on the same setting with conflicting values
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 15),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
        # The target is binary, so use the binary log-loss rather than the multiclass one
        'eval_metric': 'logloss',
        'use_label_encoder': False
    }

    # Fit the model on the training split defined in section 7.1
    optuna_xgb = XGBClassifier(**params)
    optuna_xgb.fit(train_x, train_y)

    # Make predictions
    optuna_xgb_pred = optuna_xgb.predict(test_x)

    # Evaluate predictions
    precision = precision_score(test_y, optuna_xgb_pred, average='micro')
    return precision

In [None]:
# %%time
# study_xgb = optuna.create_study(direction='maximize')
# study_xgb.optimize(objective_xgb, n_trials=150)

In [None]:
print('Number of finished trials: {}'.format(len(study_xgb.trials)))
print('Best trial:')
trial = study_xgb.best_trial

print('  Value: {}'.format(trial.value))
print('  Params: ')

for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))

In [None]:
# XGBoost hyper-parameters used for the tuned model in section 9.1.
# NOTE(review): presumably copied from the best Optuna trial printed above - confirm.
# NOTE(review): XGBoost documents `eta` as an alias of `learning_rate`; both are set here with
# different values - confirm which one takes effect and drop the other.
params_xgb = {
    'max_depth': 12,
    'eta': 0.019834339603301256,
    'learning_rate': 0.01249243707719708,
    'n_estimators': 759,
    'min_child_weight': 2,
    'gamma': 0.3113641371820694,
    'subsample': 0.7039764982894372,
    'colsample_bytree': 0.6313892474468165,
    'reg_alpha': 4.239283976142879e-06,
    'reg_lambda': 7.270434099537632e-05
}

In [None]:
fig = optuna.visualization.plot_slice(study_xgb, params=["learning_rate", "max_depth", "n_estimators"])#, "reg_alpha", "reg_lambda"])
fig.show()

In [None]:
param_name = "max_depth"
param_range = np.arange(6, 24, 1)
fixed_params = params_xgb.copy()

# Use the sweep helper and the splits that this notebook actually defines (section 7)
plot_lgbm_param_scores(param_name, param_range, fixed_params, train_x, train_y, val_x, val_y)

## 8.2 LGBM

In [None]:
def objective_lgbm(trial):
    """Optuna objective for LightGBM.

    Samples a hyper-parameter set from `trial`, fits an LGBMClassifier on the notebook-level
    `train_x` / `train_y` split and returns the micro-averaged precision obtained on
    `test_x` / `test_y`.
    """
    # Search space (sampled in this order) plus the two fixed settings
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 1000),
        boosting_type=trial.suggest_categorical('boosting_type', ['gbdt','dart']),
        reg_alpha=trial.suggest_float('reg_alpha', 1e-3, 1e-1, log=True),
        reg_lambda=trial.suggest_float('reg_lambda', 1e-6, 1e-4, log=True),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1),
        max_depth=trial.suggest_int('max_depth', 5, 20),
        objective="binary",
        metric="binary_logloss",
        num_leaves=trial.suggest_int("num_leaves", 2, 256),
        bagging_fraction=trial.suggest_float("bagging_fraction", 0.4, 1.0),
    )

    # Model definition and training
    candidate = lgbm.LGBMClassifier(**search_space)
    candidate.fit(train_x, train_y)

    # Score the predictions on the held-out split
    return precision_score(test_y, candidate.predict(test_x), average='micro')

In [None]:
# %%time
# study_lgbm = optuna.create_study(direction='maximize')
# study_lgbm.optimize(objective_lgbm, n_trials=150)

In [None]:
print('Number of finished trials: {}'.format(len(study_lgbm.trials)))
print('Best trial:')
trial = study_lgbm.best_trial

print('  Value: {}'.format(trial.value))
print('  Params: ')

for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))

In [None]:
params_lgbm = {
    'n_estimators': 615,
    'boosting_type': 'dart',
    'reg_alpha': 0.001060107983785464,
    'reg_lambda': 4.014338101780075e-06,
    'learning_rate': 0.06976461270675846,
    'max_depth': 18,
    'num_leaves': 74,
    'bagging_fraction': 0.6210384794469442
}

In [None]:
fig = optuna.visualization.plot_slice(study_lgbm, params=["learning_rate", "max_depth", "n_estimators", "reg_alpha", "reg_lambda"])
fig.show()

In [None]:
param_name = "max_depth"
param_range = np.arange(6, 24, 1)
fixed_params = params_lgbm.copy()

# Sweep max_depth for LightGBM on the splits defined in section 7. The loop is written out here
# because the notebook's sweep helper always fits an XGBClassifier.
train_scores = []
val_scores = []
for param_value in param_range:
    sweep_model = lgbm.LGBMClassifier(**{**fixed_params, param_name: int(param_value)})
    sweep_model.fit(train_x, train_y)
    train_scores.append(precision_score(train_y, sweep_model.predict(train_x)))
    val_scores.append(precision_score(val_y, sweep_model.predict(val_x)))

plt.figure(figsize=(8, 4))
plt.plot(param_range, train_scores, label="Train", color="#264653")
plt.plot(param_range, val_scores, label="Val", color="#e76f51")
plt.title("LightGBM")
plt.xlabel(param_name)
plt.ylabel("Precision")
plt.tight_layout()
plt.legend(loc="best");

# 9.0 MACHINE LEARNING MODELS AFTER TUNING

In [None]:
# Prepare a copy so df_raw keeps the untouched raw values
X = train_prep(df_raw.copy())

In [None]:
X_training = X.drop( ['satisfaction'], axis=1 )
y_training = X['satisfaction'].copy()

## 9.1 XGB

In [None]:
xgb_tuned = XGBClassifier(**params_xgb)

eval_xgb = cross_validation(5, 'XGB', xgb_tuned, X_training, y_training)
eval_xgb

## 9.2 LGBM

In [None]:
lgbm_tuned = lgbm.LGBMClassifier(**params_lgbm)

eval_lgbm = cross_validation(5, 'LGBM', lgbm_tuned, X_training, y_training)
eval_lgbm

## Last Training to save for deploy

In [None]:
#lgbm_tunado = lgbm.LGBMClassifier(**params_lgbm).fit(X, y)

In [None]:
# Saving trained model to production
#joblib.dump(lgbm_tunado, 'lgbm.joblib')

## 9.3 ENSEMBLE

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
# Hard-voting ensemble of the two tuned models defined in sections 9.1 and 9.2
Ensemble = VotingClassifier(estimators = [('lgbm', lgbm_tuned), ('xgb', xgb_tuned)], 
                            voting='hard',
                            weights = [0.8,0.2])
# Fit on the feature matrix only: X still contains the 'satisfaction' column, so training on it
# would leak the target into the features
Ensemble.fit(X_training, y_training)

# 10. SUBMISSION

In [None]:
# Model Prediction
yhat_submission = Ensemble.predict(x_test)

In [None]:
x_test['satisfaction'] = yhat_submission

In [None]:
submission = df_test[['id','satisfaction']]

In [None]:
submission.head()

In [None]:
submission.to_csv("submission.csv", index=None, sep=',')

# 11. TO PRODUCTION

## 11.1 InStyle Class

In [None]:

import json
import joblib
import inflection
import pandas   as pd
import numpy    as np

class instyle( object ):
    """Production pipeline: loads the fitted transformers, prepares request data and predicts."""

    def __init__( self, home_path=None ):
        """Load every fitted transformer from `home_path`.

        Parameters
        ----------
        home_path : str, optional
            Folder holding the .joblib artifacts. Defaults to the original deployment folder.
        """
        self.home_path     = home_path or 'C:/Users/perot/Documents/ds_repos/API e WebAPP/InStyle_API/src/features'
        # joblib.load is given the path directly: it opens and closes the file itself, so no
        # file handle is left open
        self.fe_gender     = joblib.load( self.home_path + '/fe_gender.joblib' )
        self.fe_tpurchase  = joblib.load( self.home_path + '/fe_tpurchase.joblib' )
        self.mm_age        = joblib.load( self.home_path + '/mm_age.joblib' )
        self.oe_ageg       = joblib.load( self.home_path + '/oe_ageg.joblib' )
        self.oe_customer   = joblib.load( self.home_path + '/oe_customer.joblib' )
        self.oe_distance   = joblib.load( self.home_path + '/oe_distance.joblib' )
        self.oe_store      = joblib.load( self.home_path + '/oe_store.joblib' )
        self.rs_stored     = joblib.load( self.home_path + '/rs_stored.joblib' )
        
    def test_prep(self, df):
        """Return a transformed copy of `df`, ready to be passed to the model.

        Parameters
        ----------
        df : pandas.DataFrame
            Raw request data whose 23 columns are in the order listed in `cols_old`
            (columns are renamed by position).

        Returns
        -------
        pandas.DataFrame
            Transformed copy, including the engineered `distancia` and `age_group` columns.
            The frame passed in is left untouched.
        """
        # Work on a copy so the raw request data stays available to the caller
        df = df.copy()

        ## Rename Columns
        cols_old = ['id', 'Gender', 'Customer Type', 'Age', 'Type of Purchase','Store size', 'Store distance', 'InStore wifi',
                    'Open/Close time convenient', 'Easy of online shopping', 'Store location', 'Toilet cleaning', 'Dressing room', 
                    'Waiting room','Kids entertainment', 'Seller service', 'Showroom ', 'Self-Store', 'Purchase service', 
                    'Store Service', 'Cleanliness', 'Carrier delay in minutes', 'Delivery delay in minutes']

        snakecase = lambda x: inflection.underscore(x.replace(' ', '_'))
        df.columns = [snakecase(col) for col in cols_old]

        # Fill NaN: a missing delivery delay takes the carrier delay of the same row,
        # the same rule the training data was prepared with
        df['delivery_delay_in_minutes'] = df['delivery_delay_in_minutes'].fillna(df['carrier_delay_in_minutes'])

        # FEATURE ENGINEERING
        df['distancia'] = df['store_distance'].apply(lambda x: 'close' if x <= 2500 else 
                                                               'far' )

        df['age_group'] = df['age'].apply(lambda x: 'young' if x <= 18 else
                                                    'adult' if ((x > 18) and (x <= 60)) else
                                                    'elderly')

        # TRANSFORMERS
        # FREQUENCY ENCODER (categories absent from the fitted mapping become NaN)
        df['gender'] = df['gender'].map(self.fe_gender)
        df['type_of_purchase'] = df['type_of_purchase'].map( self.fe_tpurchase )

        # ORDINAL ENCODER
        df['distancia'] = self.oe_distance.transform(df[['distancia']].values)
        df['age_group'] = self.oe_ageg.transform(df[['age_group']].values)
        df['customer_type'] = self.oe_customer.transform(df[['customer_type']].values)
        df['store_size'] = self.oe_store.transform(df[['store_size']].values)

        # RESCALING
        df['age'] = self.mm_age.transform(df[['age']].values)
        df['store_distance'] = self.rs_stored.transform(df[['store_distance']].values)

        # LOG TRANSFORMATION
        # Only strictly positive delays are logged; zeros (and NaN) pass through unchanged
        log_if_positive = lambda x: np.log(x) if x > 0 else x
        df['carrier_delay_in_minutes'] = df['carrier_delay_in_minutes'].apply(log_if_positive)
        df['delivery_delay_in_minutes'] = df['delivery_delay_in_minutes'].apply(log_if_positive)

        return df
    
    def get_prediction (self, model, original_data, test_data ):
        """Predict on `test_data` and return `original_data` plus a `satisfaction` column as JSON.

        Parameters
        ----------
        model : object exposing `predict(test_data)`.
        original_data : pandas.DataFrame
            Data to return to the client; it is not modified.
        test_data : pandas.DataFrame
            Prepared features, row-aligned with `original_data`.

        Returns
        -------
        str
            JSON array with one record per row.
        """
        # prediction
        pred = model.predict( test_data )
        
        # join pred into a copy of the original data, so the caller's frame is not modified
        response = original_data.copy()
        response['satisfaction'] = pred
        
        return response.to_json( orient='records', date_format='iso' )

In [None]:
import joblib
import pandas as pd
import os
from flask                           import Flask, request, Response
from instyle.instyle import instyle

# logading model
model = joblib.load( open('src/models/lgbm.joblib', 'rb') )
                          
# initialize API
app = Flask( __name__ )

@app.route( '/instyle/predict', methods=['POST'] )
def instyle_predict():
    """Handle POST /instyle/predict.

    Accepts a JSON object (one example) or a JSON array of objects (several examples),
    runs the preparation pipeline and the model, and answers with the submitted records
    plus a `satisfaction` field. An empty payload is answered with `{}`.
    """
    test_json = request.get_json()
    
    if test_json: #there is data
               
        if isinstance( test_json, dict ): # unique example
            test_raw = pd.DataFrame( test_json, index=[0] )
    
        else:
            test_raw = pd.DataFrame( test_json, columns=test_json[0].keys() ) # multiple examples
            
        # Instantiate Instyle Class (this loads the transformers from disk on every request)
        pipeline = instyle()

        # Data Preparation - on a copy, so test_raw keeps the values exactly as they were sent
        df1 = pipeline.test_prep( test_raw.copy() )
                              
        # Prediction
        df_response = pipeline.get_prediction( model, test_raw, df1 )
        
        # Send the JSON string with the JSON content type, like the empty-payload branch
        return Response( df_response, status=200, mimetype='application/json' )
        
    else:
        return Response( '{}', status=200, mimetype='application/json' )

if __name__ == '__main__':
    app.run('0.0.0.0')
    #port = os.environ.get('PORT', 5000)
    #app.run( host='0.0.0.0', port=port )

# API CALL

In [None]:
import requests
import pandas as pd

In [None]:
test = pd.read_csv('C:/Users/perot/Documents/ds_repos/projects/InStyle_Net_Promoter_Score/data/test.csv')

In [None]:
teste = test.head(10)

In [None]:
data = json.dumps( test.to_dict( orient='records'))

In [None]:
# API Call
url = 'http://127.0.0.1:5000/instyle/predict'
header = {'Content-type': 'application/json'}

# `data` is the JSON payload built in the previous cell
r = requests.post( url, data=data , headers=header )
print( 'Status Code {}'.format( r.status_code ) )

In [None]:
d1 = pd.DataFrame( r.json(), columns=r.json()[0].keys() )