# Imports

In [105]:
import pandas as pd
import sys 
import os
import math
import numpy as np  
pd.set_option('display.float_format', lambda x: '%.4f' % x)


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


sys.path.insert(0, '../src/')
from data.data_load import DataLoad

# Helper Functions

In [106]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def regression_metrics(y_true, y_pred):
    """
    Calcula várias métricas de regressão e retorna um DataFrame.

    Parâmetros:
    - y_true: Valores reais.
    - y_pred: Valores preditos.

    Retorna:
    - DataFrame contendo métricas de regressão.
    """

    absolute_errors = np.abs(y_true - y_pred)
    percentage_errors = absolute_errors / np.maximum(np.abs(y_true), 1e-8)

    metrics_dict = {
        'mse': mean_squared_error(y_true, y_pred),
        'rmse': np.sqrt(mean_squared_error(y_true, y_pred)),
        'mae': mean_absolute_error(y_true, y_pred),
        'r2': r2_score(y_true, y_pred),
        'mape': np.mean(percentage_errors) * 100
        # Adicione mais métricas conforme necessário
    }

    metrics_df = pd.DataFrame(list(metrics_dict.items()), columns=['Métrica', 'Valor'])
    return metrics_df

# Data Load

In [107]:
dl = DataLoad()
df = dl.load_data('dados_train')

c:\Users\thale\Documents\Projetos_DS\projeto_aluno_mlflow\delivery_fast\notebooks\../src\data
2023-12-24 11:16:56 [info     ] Iniciando o carregamento
c:\Users\thale\Documents\Projetos_DS\projeto_aluno_mlflow\delivery_fast\src\data
c:\Users\thale\Documents\Projetos_DS\projeto_aluno_mlflow\delivery_fast\data\raw\dados.csv


# 1.0 Descrição dos Dados

In [108]:
df1 = df.copy()

## 1.1 Dimensão dos Dados

In [109]:
df1.shape

(197428, 16)

## 1.2 Tipo dos Dados

In [110]:
df1.dtypes

market_id                                       float64
created_at                                       object
actual_delivery_time                             object
store_id                                          int64
store_primary_category                           object
order_protocol                                  float64
total_items                                       int64
subtotal                                          int64
num_distinct_items                                int64
min_item_price                                    int64
max_item_price                                    int64
total_onshift_dashers                           float64
total_busy_dashers                              float64
total_outstanding_orders                        float64
estimated_order_place_duration                    int64
estimated_store_to_consumer_driving_duration    float64
dtype: object

## 1.3 Check Na

In [111]:
df1.isna().sum()

market_id                                         987
created_at                                          0
actual_delivery_time                                7
store_id                                            0
store_primary_category                           4760
order_protocol                                    995
total_items                                         0
subtotal                                            0
num_distinct_items                                  0
min_item_price                                      0
max_item_price                                      0
total_onshift_dashers                           16262
total_busy_dashers                              16262
total_outstanding_orders                        16262
estimated_order_place_duration                      0
estimated_store_to_consumer_driving_duration      526
dtype: int64

# 2.0 Fill Na

In [112]:
df2 = df1.copy()

In [113]:
df2 = df2.dropna()

In [114]:
# # total_onshift_dashers
# df2['total_onshift_dashers'] = df2['total_onshift_dashers'].apply(lambda x: 0 if math.isnan(x) else x)

# # total_busy_dashers
# df2['total_busy_dashers'] = df2['total_busy_dashers'].apply(lambda x: 0 if math.isnan(x) else x)

# # total_outstanding_orders
# df2['total_outstanding_orders'] = df2['total_outstanding_orders'].apply(lambda x: 0 if math.isnan(x) else x)

In [115]:
# df2[['store_id','store_primary_category']].groupby('store_id').count().reset_index().sort_values('store_primary_category', ascending=False)

In [116]:
# df2.loc[df2['store_id'] == 6865]['store_primary_category'].value_counts()

# 3.0 Change Data Type

In [117]:
df3 = df2.copy()

In [118]:
df3.dtypes

market_id                                       float64
created_at                                       object
actual_delivery_time                             object
store_id                                          int64
store_primary_category                           object
order_protocol                                  float64
total_items                                       int64
subtotal                                          int64
num_distinct_items                                int64
min_item_price                                    int64
max_item_price                                    int64
total_onshift_dashers                           float64
total_busy_dashers                              float64
total_outstanding_orders                        float64
estimated_order_place_duration                    int64
estimated_store_to_consumer_driving_duration    float64
dtype: object

In [119]:
df3['created_at'] = pd.to_datetime(df3['created_at'])

df3['actual_delivery_time'] = pd.to_datetime(df3['actual_delivery_time'])

In [120]:
df3.head()

Unnamed: 0,market_id,created_at,actual_delivery_time,store_id,store_primary_category,order_protocol,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,total_onshift_dashers,total_busy_dashers,total_outstanding_orders,estimated_order_place_duration,estimated_store_to_consumer_driving_duration
0,1.0,2015-02-06 22:24:17,2015-02-06 23:27:16,1845,american,1.0,4,3441,4,557,1239,33.0,14.0,21.0,446,861.0
1,2.0,2015-02-10 21:49:25,2015-02-10 22:56:29,5477,mexican,2.0,1,1900,1,1400,1400,1.0,2.0,2.0,446,690.0
8,2.0,2015-02-16 00:11:35,2015-02-16 00:38:01,5477,indian,3.0,4,4771,3,820,1604,8.0,6.0,18.0,446,289.0
14,1.0,2015-02-12 03:36:46,2015-02-12 04:14:39,2841,italian,1.0,1,1525,1,1525,1525,5.0,6.0,8.0,446,795.0
15,1.0,2015-01-27 02:12:36,2015-01-27 03:02:24,2841,italian,1.0,2,3620,2,1425,2195,5.0,5.0,7.0,446,205.0


# 4.0 Feature Engineering

In [121]:
df4 = df3.copy()

In [122]:
df4['target'] = (df4['actual_delivery_time'] - df4['created_at']).dt.total_seconds()

# 5.0 Data Preparation

In [123]:
df5 = df4.copy()

In [124]:
df5 = df5[['market_id', 'store_id',
       'store_primary_category', 'order_protocol', 'total_items', 'subtotal',
       'num_distinct_items', 'min_item_price', 'max_item_price',
       'total_onshift_dashers', 'total_busy_dashers',
       'total_outstanding_orders', 'estimated_order_place_duration',
       'estimated_store_to_consumer_driving_duration', 'target']]

In [125]:
X = df5.drop('target', axis=1)
y = df5['target']

In [126]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# 6.0 Data Preprocess

In [127]:
le = LabelEncoder()

## 6.1 Encoding

In [128]:
all_categories = set(X_train['store_primary_category']).union(set(X_valid['store_primary_category']))
le.fit(list(all_categories))
# Dados de Treino
X_train['store_primary_category'] = le.transform(X_train['store_primary_category'])



In [129]:
default_label = 'desconhecido'
X_valid['store_primary_category'] = X_valid['store_primary_category'].replace(to_replace={cat: default_label for cat in all_categories.difference(X_valid['store_primary_category'])})

# Aplicar o LabelEncoder para transformar as categorias
X_valid['store_primary_category'] = le.transform(X_valid['store_primary_category'])

In [130]:
X_train

Unnamed: 0,market_id,store_id,store_primary_category,order_protocol,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,total_onshift_dashers,total_busy_dashers,total_outstanding_orders,estimated_order_place_duration,estimated_store_to_consumer_driving_duration
114965,2.0000,2179,45,2.0000,1,820,1,745,745,35.0000,39.0000,56.0000,251,758.0000
106226,4.0000,4234,20,3.0000,1,995,1,995,995,110.0000,72.0000,116.0000,251,779.0000
56155,1.0000,1585,36,3.0000,4,2040,4,150,895,6.0000,3.0000,7.0000,251,454.0000
42322,3.0000,1874,39,1.0000,2,2376,2,579,1399,16.0000,17.0000,19.0000,446,836.0000
103126,2.0000,3937,6,1.0000,1,1430,1,1320,1320,64.0000,47.0000,52.0000,251,789.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134700,2.0000,3019,58,5.0000,1,1111,1,1111,1111,31.0000,31.0000,34.0000,251,926.0000
116075,2.0000,915,24,1.0000,4,2880,3,495,795,95.0000,80.0000,111.0000,446,693.0000
148324,6.0000,3853,46,5.0000,2,1799,2,300,899,18.0000,18.0000,15.0000,446,548.0000
164942,5.0000,5923,36,2.0000,10,4015,4,195,850,26.0000,24.0000,28.0000,251,590.0000


# 7.0 Machine Learning

In [131]:
import mlflow
from mlflow.tracking import MlflowClient

## 7.1 Linear Regression

In [132]:
mlflow.set_tracking_uri('http://127.0.0.1:5000')

In [133]:
# Nome do experimento
experiment_name = 'linear_regression'

# Obtém o experimento (se existir)
experiment = mlflow.get_experiment_by_name(experiment_name)

# Cria o experimento se não existir
if experiment is None:
    mlflow.create_experiment(experiment_name)

# Define o experimento como ativo
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1703426525900, experiment_id='1', last_update_time=1703426525900, lifecycle_stage='active', name='linear_regression', tags={}>

In [134]:
from sklearn.linear_model import LinearRegression

with mlflow.start_run(run_name='baseline'):
    mlflow.set_tag('model_name', 'linear_regression_baseline')

    #model definition
    lr_model = LinearRegression()

    # model train
    lr_model.fit(X_train, y_train)

    # model predict
    y_pred_lr = lr_model.predict(X_valid)

    params_lr = lr_model.get_params()

    # log nos parametros do modelo
    for i in params_lr:
        mlflow.log_param(i, params_lr[i])
    
    # Calculando as métricas de performance 
    df_result_lr = regression_metrics(y_valid, y_pred_lr)

    # log nas métricas que vem de df_result_lr
    for index, row in df_result_lr.iterrows():
        mlflow.log_metric(row['Métrica'], row['Valor'])
    
    mlflow.sklearn.log_model(lr_model, "linear_regression_model")




## 7.2 Ridge

In [135]:
mlflow.set_tracking_uri('http://127.0.0.1:5000')
mlflow.set_experiment('ridge')

<Experiment: artifact_location='mlflow-artifacts:/2', creation_time=1703426538433, experiment_id='2', last_update_time=1703426538433, lifecycle_stage='active', name='ridge', tags={}>

In [136]:
from sklearn.linear_model import Ridge

with mlflow.start_run(run_name='baseline'):
    mlflow.set_tag('model_name', 'ridge_baseline')

    #model definition
    rd_model = Ridge()

    # model train
    rd_model.fit(X_train, y_train)

    # model predict
    y_pred_rd = rd_model.predict(X_valid)

    # Model Evaluation
    df_result_rd = regression_metrics(y_valid, y_pred_rd)

    params = rd_model.get_params()

    # log nos parametros do modelo
    for i in params:
        mlflow.log_param(i, params[i])
    

    # log nas métricas que vem de df_result_rd
    for index, row in df_result_rd.iterrows():
        mlflow.log_metric(row['Métrica'], row['Valor'])
    
    mlflow.sklearn.log_model(rd_model, "ridge_model")



## Lasso

In [137]:
mlflow.set_tracking_uri('http://127.0.0.1:5000')
mlflow.set_experiment('lasso')

<Experiment: artifact_location='mlflow-artifacts:/3', creation_time=1703426545086, experiment_id='3', last_update_time=1703426545086, lifecycle_stage='active', name='lasso', tags={}>

In [138]:
from sklearn.linear_model import Lasso

with mlflow.start_run(run_name='baseline'):
    mlflow.set_tag('model_name', 'lasso_baseline')

    ls_model = Lasso()

    # model train
    ls_model.fit(X_train, y_train)

    # model predict
    y_pred_ls = ls_model.predict(X_valid)

    # Model Evaluation
    df_result_ls = regression_metrics(y_valid, y_pred_ls)

    params = ls_model.get_params()

    # log nos parametros do modelo
    for i in params:
        mlflow.log_param(i, params[i])
    

    # log nas métricas que vem de df_result_rd
    for index, row in df_result_ls.iterrows():
        mlflow.log_metric(row['Métrica'], row['Valor'])
    
    mlflow.sklearn.log_model(ls_model, "lasso_model")



## KNeighbors

In [139]:
mlflow.set_experiment('KNeighbors')

<Experiment: artifact_location='mlflow-artifacts:/4', creation_time=1703426998729, experiment_id='4', last_update_time=1703426998729, lifecycle_stage='active', name='KNeighbors', tags={}>

In [140]:
from sklearn.neighbors import KNeighborsRegressor

with mlflow.start_run(run_name='baseline'):
    mlflow.set_tag('model_name', 'kn_baseline')

    kn_model = KNeighborsRegressor()

    # model train
    kn_model.fit(X_train, y_train)

    # model predict
    y_pred_kn = kn_model.predict(X_valid)

    # Model Evaluation
    df_result_kn = regression_metrics(y_valid, y_pred_kn)

    params = kn_model.get_params()

    # log nos parametros do modelo
    for i in params:
        mlflow.log_param(i, params[i])
    

    # log nas métricas que vem de df_result_rd
    for index, row in df_result_kn.iterrows():
        mlflow.log_metric(row['Métrica'], row['Valor'])
    
    mlflow.sklearn.log_model(kn_model, "kn_model")



## Decision Tree

In [141]:
mlflow.set_experiment('Decision_Tree')

<Experiment: artifact_location='mlflow-artifacts:/5', creation_time=1703427017943, experiment_id='5', last_update_time=1703427017943, lifecycle_stage='active', name='Decision_Tree', tags={}>

In [142]:
from sklearn.tree import DecisionTreeRegressor

with mlflow.start_run(run_name='baseline'):
    mlflow.set_tag('model_name', 'dt_baseline')

    #model definition
    dt_model = DecisionTreeRegressor()

    # model train
    dt_model.fit(X_train, y_train)

    # model predict
    y_pred_dt = dt_model.predict(X_valid)

    # Model Evaluation
    df_result_dt = regression_metrics(y_valid, y_pred_dt)

    params = dt_model.get_params()

    # log nos parametros do modelo
    for i in params:
        mlflow.log_param(i, params[i])
    

    # log nas métricas que vem de df_result_rd
    for index, row in df_result_dt.iterrows():
        mlflow.log_metric(row['Métrica'], row['Valor'])
    
    mlflow.sklearn.log_model(df_result_dt, "dt_model")



## Gradient Boosting

In [143]:
mlflow.set_experiment('Gradient_Boosting')

<Experiment: artifact_location='mlflow-artifacts:/6', creation_time=1703427140031, experiment_id='6', last_update_time=1703427140031, lifecycle_stage='active', name='Gradient_Boosting', tags={}>

In [144]:
from sklearn.ensemble import GradientBoostingRegressor

with mlflow.start_run(run_name='baseline'):
    mlflow.set_tag('model_name', 'gb_baseline')

    #model definition
    gb_model = GradientBoostingRegressor()

    # model train
    gb_model.fit(X_train, y_train)

    # model predict
    y_pred_gb = gb_model.predict(X_valid)

    # Model Evaluation
    df_result_gb = regression_metrics(y_valid, y_pred_gb)

    params = gb_model.get_params()

    # log nos parametros do modelo
    for i in params:
        mlflow.log_param(i, params[i])
    

    # log nas métricas que vem de df_result_rd
    for index, row in df_result_gb.iterrows():
        mlflow.log_metric(row['Métrica'], row['Valor'])
    
    mlflow.sklearn.log_model(gb_model, "gb_model")



## 7.3 Random Forest Regression

In [145]:
mlflow.set_experiment('Random_Forest')

<Experiment: artifact_location='mlflow-artifacts:/7', creation_time=1703427307447, experiment_id='7', last_update_time=1703427307447, lifecycle_stage='active', name='Random_Forest', tags={}>

In [147]:
from sklearn.ensemble import RandomForestRegressor

with mlflow.start_run(run_name='baseline'):
    mlflow.set_tag('model_name', 'rf_baseline')

    #model definition
    rf_model = RandomForestRegressor()

    # model train
    rf_model.fit(X_train, y_train)

    # model predict
    y_pred_rf = rf_model.predict(X_valid)

    # Model Evaluation
    df_result_rf = regression_metrics(y_valid, y_pred_rf)

    params = rf_model.get_params()

    # log nos parametros do modelo
    for i in params:
        mlflow.log_param(i, params[i])
    

    # log nas métricas que vem de df_result_rd
    for index, row in df_result_rf.iterrows():
        mlflow.log_metric(row['Métrica'], row['Valor'])
    
    mlflow.sklearn.log_model(rf_model, "rf_model")