In this notebook, I verify that the work done previously behaves correctly before building the infrastructure to deploy it to production. I check that the data-processing steps work, and that the selected algorithm runs and performs in line with the earlier tests.


## 0 - LIBRARIES AND HELPER FUNCTIONS

### 0.1 - LIBRARIES

**Loading the libraries to be used on the project**

In [1]:
# Data Analysis and Data Manipulation Libraries
import inflection
import joblib
import pandas                                       as pd
import numpy                                        as np
import seaborn                                      as sns
import scikitplot                                   as skplt
import plotly.express                               as px
import plotly.graph_objects                         as go
from matplotlib              import pyplot          as plt

# Data Processing
from sklearn                 import model_selection as ms
from sklearn                 import preprocessing   as pp
from sklearn.model_selection import KFold

#import optuna

# ML Models and Metrics
import lightgbm                                     as lgbm
from sklearn                 import ensemble        as en
from xgboost                 import XGBClassifier
from sklearn.linear_model    import LogisticRegression

from sklearn.metrics         import precision_score 
from sklearn.metrics         import precision_recall_curve
from sklearn.metrics         import confusion_matrix, ConfusionMatrixDisplay

# Dysplay
from IPython.core.display    import HTML
from IPython.display         import Image

import warnings
warnings.filterwarnings( 'ignore' )

### 0.2 - HELPER FUNCTIONS

**Personalized functions created to optimize the project**

In [2]:
# Helper to identify the best value for one hyperparameter at a time
def plot_scores(param_name, param_range, fixed_params, model, model_name, X_train, y_train, X_val, y_val):
    """Plot train vs. validation precision while sweeping a single hyperparameter.

    For every value in ``param_range`` a new estimator is built as
    ``model(**fixed_params, param_name=value)``, fitted on the training data and
    scored with ``precision_score`` on both the training and validation sets.

    Parameters
    ----------
    param_name : str
        Name of the hyperparameter being swept (also used as the x-axis label).
    param_range : iterable
        Values to try for ``param_name``; used as the x-axis values.
    fixed_params : dict
        Keyword arguments passed unchanged to every estimator. This dict is
        NOT modified by the function.
    model : callable
        Estimator class/factory called as ``model(**params)``; the result must
        expose ``fit`` and ``predict``.
    model_name : str
        Title of the plot.
    X_train, y_train, X_val, y_val
        Training and validation features/targets.

    Returns
    -------
    None
        The function only draws a matplotlib figure.
    """
    train_scores = []
    val_scores = []

    for param_value in param_range:
        # Build a fresh dict on every iteration: assigning into `fixed_params`
        # directly would leak the swept value back into the caller's dict.
        params = {**fixed_params, param_name: param_value}

        # Named `estimator` (not `lgbm`) so the module-level lightgbm alias is not shadowed.
        estimator = model(**params)
        estimator.fit(X_train, y_train)

        train_scores.append(precision_score(y_train, estimator.predict(X_train)))
        val_scores.append(precision_score(y_val, estimator.predict(X_val)))

    plt.figure(figsize=(8, 4))
    plt.plot(param_range, train_scores, label="Train", color="#264653", marker='o')
    plt.plot(param_range, val_scores, label="Val", color="#e76f51", marker='o')

    plt.title(model_name)
    plt.xlabel(param_name)
    plt.ylabel("Precision")
    plt.tight_layout()
    plt.legend(loc="best")

def cross_validation(kfold, modelName, model, X, y):
    """Return the mean micro-averaged precision of ``model`` over stratified folds.

    Parameters
    ----------
    kfold : int
        Number of splits passed to ``StratifiedKFold`` (shuffled, ``random_state=42``).
    modelName : str
        Not used by the function body; kept for the existing call signature.
    model : estimator
        Object exposing ``fit(X, y)`` (returning the fitted estimator) and
        ``predict(X)``. The same object is re-fitted on every fold.
    X, y : pandas objects
        Features and target; rows are selected positionally with ``.iloc``.

    Returns
    -------
    float
        ``np.mean`` of the per-fold precision scores.
    """
    splitter = ms.StratifiedKFold(n_splits=kfold, shuffle=True, random_state=42)

    fold_scores = []
    for train_idx, val_idx in splitter.split(X, y):
        # Fit on this fold's training rows ...
        fitted = model.fit(X.iloc[train_idx], y.iloc[train_idx])

        # ... then score the predictions for the held-out rows.
        predictions = fitted.predict(X.iloc[val_idx])
        fold_scores.append(precision_score(y.iloc[val_idx], predictions, average='micro'))

    return np.mean(fold_scores)

def jupyter_settings():
    """Apply notebook-wide display defaults for matplotlib, pandas and seaborn.

    Intended to be called once near the top of the notebook. It uses the
    ``%matplotlib`` line magic and calls ``display`` without importing it, so it
    only works inside an IPython/Jupyter session.
    """
    %matplotlib inline
   
    # Matplotlib defaults: style sheet, figure size and base font size.
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [18, 8]
    plt.rcParams['font.size'] = 24
    
    # Widen the notebook container to the full browser width.
    display( HTML( '<style>.container {width:100% !important; }</style>') )
    # Remove pandas' column/row display limits and keep wide frames on one line.
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option( 'display.expand_frame_repr', False )
    
    # NOTE(review): sns.set() runs last and may override some of the matplotlib
    # settings applied above (e.g. style / font size) — confirm the intended order.
    sns.set()

In [3]:
jupyter_settings()

## 11.2 Running the application locally

In [None]:
# Loading the Test Dataset
# NOTE(review): absolute, machine-specific path — this cell fails wherever that
# directory does not exist; consider a path relative to a configurable data dir.
df_test_raw = pd.read_csv('C:/Users/ricardo/Documents/ds_repos/projects/InStyle_Net_Promoter_Score/data/test.csv')

In [None]:
# Function to apply the pre-proceesing onto the test dataset

def test_prep(dfte):
    
    ## Renaming Columns
    cols_old = ['id', 'Gender', 'Customer Type', 'Age', 'Type of Purchase','Store size', 'Store distance', 'InStore wifi',
                'Open/Close time convenient', 'Easy of online shopping', 'Store location', 'Toilet cleaning', 'Dressing room', 
                'Waiting room','Kids entertainment', 'Seller service', 'Showroom ', 'Self-Store', 'Purchase service', 
                'Store Service', 'Cleanliness', 'Carrier delay in minutes', 'Delivery delay in minutes']

    snakecase = lambda x: inflection.underscore(x.replace(' ', '_'))
    cols_new = list( map( snakecase, cols_old ) )

    dfte.columns = cols_new
    
    # Fill NaN
    dfte['delivery_delay_in_minutes'] = dfte['delivery_delay_in_minutes'].fillna(dfte['carrier_delay_in_minutes'])
    
    # FEATURE ENGINEERING
    # New Features
    dfte['distance'] = dfte['store_distance'].apply(lambda x: 'close' if x <= 2500 else 
                                                              'far' )



    dfte['age_group'] = dfte['age'].apply(lambda x: 'young' if x <= 18 else
                                                    'adult' if ((x > 18) and (x <= 60)) else
                                                    'elderly')
    
    # TRANSFORMERS
    
    fe_gender    = joblib.load( 'C:/Users/ricardo/Documents/ds_repos/api/InStyle_API/src/features/fe_gender.joblib')
    fe_tpurchase = joblib.load( 'C:/Users/ricardo/Documents/ds_repos/api/InStyle_API/src/features/fe_tpurchase.joblib')
    oe_distance  = joblib.load( 'C:/Users/ricardo/Documents/ds_repos/api/InStyle_API/src/features/oe_distance.joblib')
    oe_ageg      = joblib.load( 'C:/Users/ricardo/Documents/ds_repos/api/InStyle_API/src/features/oe_ageg.joblib')
    oe_customer  = joblib.load( 'C:/Users/ricardo/Documents/ds_repos/api/InStyle_API/src/features/oe_customer.joblib')
    oe_store     = joblib.load( 'C:/Users/ricardo/Documents/ds_repos/api/InStyle_API/src/features/oe_store_size.joblib')
    mm_age       = joblib.load( 'C:/Users/ricardo/Documents/ds_repos/api/InStyle_API/src/features/mms_age.joblib')
    rs_stored    = joblib.load( 'C:/Users/ricardo/Documents/ds_repos/api/InStyle_API/src/features/rs_stored.joblib')
    
    # gender
    dfte['gender'] = dfte['gender'].map(fe_gender)
    
    # type_of_purchase
    dfte['type_of_purchase'] = dfte['type_of_purchase'].map(fe_tpurchase)

    # distance
    dfte['distance'] = oe_distance.transform(dfte[['distance']].values)

    # age_group
    dfte['age_group'] = oe_ageg.transform(dfte[['age_group']].values)

    # Customer
    dfte['customer_type'] =oe_customer.transform(dfte[['customer_type']].values)

    # Store Size
    dfte['store_size'] = oe_store.transform(dfte[['store_size']].values)

    # REESCALING
 
    # age
    dfte['age'] = mm_age.transform(dfte[['age']].values)

    # Store Distance
    dfte['store_distance'] = rs_stored.transform(dfte[['store_distance']].values)

    # TRANSFORMACAO LOGARITMICA 
    
    # carrier_delay_in_minutes
    dfte['carrier_delay_in_minutes'] = dfte['carrier_delay_in_minutes'].apply(lambda x: np.log(x) if x > 0 else x) 

    # delivery_delay_in_minutes
    dfte['delivery_delay_in_minutes'] = dfte['delivery_delay_in_minutes'].apply(lambda x: np.log(x) if x > 0 else x) 

    return(dfte)

In [None]:
# Build the model-ready test frame by running the raw test data through test_prep.
x_test = test_prep( df_test_raw)

In [None]:
# Model Prediction
# NOTE(review): `lgbm_final` is not defined in any cell visible above — it has to be
# trained or loaded before this cell for a fresh-kernel "Run All" to succeed; confirm its origin.
yhat_submission = lgbm_final.predict(x_test)

In [None]:
# Attach the predictions to the prepared test frame as the 'satisfaction' column.
x_test['satisfaction'] = yhat_submission

In [None]:
# Keep only the row identifier and the predicted label for the submission file.
submission = x_test[['id','satisfaction']]

In [None]:
# Preview the first rows of the submission frame.
submission.head()

In [None]:
# Write the submission as a comma-separated file without the row index.
# `index` is a boolean option, so pass False explicitly instead of relying on None being falsy.
submission.to_csv("submission.csv", index=False, sep=',')

## 11.3 InStyle Class

In [None]:
import json
import joblib
import inflection
import pandas   as pd
import numpy    as np

class instyle( object ):
    """Pre-processing and prediction pipeline used by the InStyle prediction API.

    On construction the persisted encoders/scalers are loaded from ``home_path``;
    ``test_prep`` turns a raw request frame into model features and
    ``get_prediction`` appends the model output to the original data as JSON.

    NOTE(review): the artefact file names here ('mm_age.joblib', 'oe_store.joblib')
    and the engineered column name 'distancia' differ from the notebook-level
    ``test_prep`` ('mms_age.joblib', 'oe_store_size.joblib', 'distance') — confirm
    which set matches the files on disk and the trained model.
    """

    def __init__( self, home_path='C:/Users/perot/Documents/ds_repos/API e WebAPP/InStyle_API/src/features' ):
        """Load every transformer artefact from ``home_path``.

        Parameters
        ----------
        home_path : str, optional
            Directory containing the ``*.joblib`` artefacts. Defaults to the
            location that was previously hard-coded.
        """
        self.home_path     = home_path
        # joblib.load is given the path directly so it opens and closes the file
        # itself; the previous `open(...)` handles were never closed.
        self.fe_gender     = joblib.load( self.home_path + '/fe_gender.joblib' )
        self.fe_tpurchase  = joblib.load( self.home_path + '/fe_tpurchase.joblib' )
        self.mm_age        = joblib.load( self.home_path + '/mm_age.joblib' )
        self.oe_ageg       = joblib.load( self.home_path + '/oe_ageg.joblib' )
        self.oe_customer   = joblib.load( self.home_path + '/oe_customer.joblib' )
        self.oe_distance   = joblib.load( self.home_path + '/oe_distance.joblib' )
        self.oe_store      = joblib.load( self.home_path + '/oe_store.joblib' )
        self.rs_stored     = joblib.load( self.home_path + '/rs_stored.joblib' )

    def test_prep(self, df):
        """Return a pre-processed copy of ``df``; the input frame is not modified.

        Parameters
        ----------
        df : pandas.DataFrame
            Raw data with exactly the 23 columns listed in ``cols_old``, in that
            order (columns are renamed positionally).

        Returns
        -------
        pandas.DataFrame
            New frame with snake_case columns, the engineered ``distancia`` and
            ``age_group`` features, encoded/scaled values and log-transformed delays.
        """
        # Work on a copy so the caller's frame keeps the raw values: the API passes
        # the same frame to get_prediction() as the "original data" of the response.
        df = df.copy()

        ## Rename Columns
        cols_old = ['id', 'Gender', 'Customer Type', 'Age', 'Type of Purchase','Store size', 'Store distance', 'InStore wifi',
                    'Open/Close time convenient', 'Easy of online shopping', 'Store location', 'Toilet cleaning', 'Dressing room', 
                    'Waiting room','Kids entertainment', 'Seller service', 'Showroom ', 'Self-Store', 'Purchase service', 
                    'Store Service', 'Cleanliness', 'Carrier delay in minutes', 'Delivery delay in minutes']

        snakecase = lambda x: inflection.underscore(x.replace(' ', '_'))
        df.columns = list( map( snakecase, cols_old ) )

        # Fill NaN — same rule as the notebook-level test_prep: a missing delivery
        # delay falls back to the carrier delay of the same row.
        df['delivery_delay_in_minutes'] = df['delivery_delay_in_minutes'].fillna(df['carrier_delay_in_minutes'])

        # FEATURE ENGINEERING
        df['distancia'] = df['store_distance'].apply(lambda x: 'close' if x <= 2500 else 'far')

        df['age_group'] = df['age'].apply(lambda x: 'young' if x <= 18 else
                                                    'adult' if ((x > 18) and (x <= 60)) else
                                                    'elderly')

        # TRANSFORMERS
        # gender
        df['gender'] = df['gender'].map(self.fe_gender)

        # type_of_purchase
        df['type_of_purchase'] = df['type_of_purchase'].map( self.fe_tpurchase )

        # distance
        df['distancia'] = self.oe_distance.transform(df[['distancia']].values)

        # age_group
        df['age_group'] = self.oe_ageg.transform(df[['age_group']].values)

        # customer_type
        df['customer_type'] = self.oe_customer.transform(df[['customer_type']].values)

        # Store Size
        df['store_size'] = self.oe_store.transform(df[['store_size']].values)

        # RESCALING
        # age
        df['age'] = self.mm_age.transform(df[['age']].values)

        # Store Distance
        df['store_distance'] = self.rs_stored.transform(df[['store_distance']].values)

        # LOG TRANSFORMATION
        # Only strictly positive delays are logged; other values pass through unchanged.
        df['carrier_delay_in_minutes'] = df['carrier_delay_in_minutes'].apply(lambda x: np.log(x) if x > 0 else x)
        df['delivery_delay_in_minutes'] = df['delivery_delay_in_minutes'].apply(lambda x: np.log(x) if x > 0 else x)

        return df

    def get_prediction (self, model, original_data, test_data ):
        """Predict on ``test_data`` and return ``original_data`` plus predictions as JSON.

        Parameters
        ----------
        model : object
            Fitted model exposing ``predict``.
        original_data : pandas.DataFrame
            Frame that receives a new ``satisfaction`` column (modified in place).
        test_data : pandas.DataFrame
            Pre-processed features passed to ``model.predict``.

        Returns
        -------
        str
            JSON string with one record per row (``orient='records'``).
        """
        # prediction
        pred = model.predict( test_data )

        # join pred into the original data
        original_data['satisfaction'] = pred

        return original_data.to_json( orient='records', date_format='iso' )

In [None]:
import joblib
import pandas as pd
import os
from flask                           import Flask, request, Response
from instyle.instyle import instyle

# loading model
# Pass the path straight to joblib.load so it opens and closes the file itself;
# the previous `open(...)` handle was never closed.
model = joblib.load( 'src/models/lgbm.joblib' )

# initialize API
app = Flask( __name__ )

@app.route( '/instyle/predict', methods=['POST'] )
def instyle_predict():
    """Handle POST /instyle/predict: run the pipeline on the JSON body and return predictions.

    The body may be a single JSON object (one example) or a list of objects
    (several examples). An empty/missing payload yields an empty JSON object.
    Both branches answer with status 200 and the ``application/json`` mimetype.
    """
    test_json = request.get_json()

    # No data in the request: answer with an empty JSON object.
    if not test_json:
        return Response( '{}', status=200, mimetype='application/json' )

    if isinstance( test_json, dict ): # unique example
        test_raw = pd.DataFrame( test_json, index=[0] )
    else: # multiple examples
        test_raw = pd.DataFrame( test_json, columns=test_json[0].keys() )

    # Instantiate Instyle Class
    # NOTE(review): this reloads every transformer artefact on each request;
    # consider creating the pipeline once at module level.
    pipeline = instyle()

    # Data Preparation
    df1 = pipeline.test_prep( test_raw )

    # Prediction
    df_response = pipeline.get_prediction( model, test_raw, df1 )

    # get_prediction returns a JSON string; wrap it so the response is labelled
    # as JSON, consistent with the empty-payload branch above.
    return Response( df_response, status=200, mimetype='application/json' )

# Start the Flask server only when this file is executed as a script.
if __name__ == '__main__':
    # Listen on all network interfaces; no port argument is passed here.
    app.run('0.0.0.0')
    # Disabled alternative that takes the port from the PORT environment variable.
    # NOTE(review): os.environ.get returns a string when PORT is set — convert it to int if re-enabled.
    #port = os.environ.get('PORT', 5000)
    #app.run( host='0.0.0.0', port=port )

# 12. API TESTING

In [None]:
import requests
import pandas as pd

In [None]:
# Load the raw test set used to exercise the API.
# NOTE(review): absolute, machine-specific path (and a different user directory than
# the one used earlier in this notebook) — consider a configurable, relative data path.
test = pd.read_csv('C:/Users/perot/Documents/ds_repos/projects/InStyle_Net_Promoter_Score/data/test.csv')

In [None]:
# Take the first 10 rows as a small sample for exercising the API.
teste = test.head(10)

In [None]:
# `json` is not imported by the import cell of this section, so import it here to keep
# the API test runnable on a fresh kernel.
import json

# Serialize the 10-row sample (`teste`) as a list of records; the sample was being
# created but the whole `test` frame was sent instead.
data = json.dumps( teste.to_dict( orient='records' ) )

In [None]:
# API Call
# POST the JSON payload built in the previous cell to the locally running API.
url = 'http://127.0.0.1:5000/instyle/predict'
header = {'Content-type': 'application/json'}

r = requests.post( url, data=data , headers=header )
print( 'Status Code {}'.format( r.status_code ) )

In [None]:
# Decode the JSON response once and rebuild a DataFrame whose columns follow the
# key order of the first returned record.
response_records = r.json()
d1 = pd.DataFrame( response_records, columns=response_records[0].keys() )