# Imports

In [34]:
import pandas as pd
import sys 
import os
import math
import numpy as np  
pd.set_option('display.float_format', lambda x: '%.4f' % x)


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


sys.path.insert(0, '../src/')
from data.data_load import DataLoad

# Helper Functions

In [35]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def regression_metrics(y_true, y_pred):
    """
    Compute several regression metrics and return them as a DataFrame.

    Parameters:
    - y_true: Observed target values (array-like).
    - y_pred: Predicted target values (array-like, same length as y_true).

    Returns:
    - DataFrame with one row per metric (mse, rmse, mae, r2, mape) and the
      columns 'Métrica' (metric name) and 'Valor' (metric value). MAPE is
      expressed in percent.
    """

    # Work on plain arrays so the arithmetic below is strictly positional.
    # Two pandas Series would otherwise be aligned by index label rather than
    # by position, which produces NaNs (and a silently wrong MAPE) whenever
    # the indexes differ, while the sklearn metrics compare by position.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    absolute_errors = np.abs(y_true - y_pred)
    # Clamp the denominator so that targets equal to zero do not divide by zero.
    percentage_errors = absolute_errors / np.maximum(np.abs(y_true), 1e-8)

    # Computed once and reused for the RMSE.
    mse = mean_squared_error(y_true, y_pred)

    metrics_dict = {
        'mse': mse,
        'rmse': np.sqrt(mse),
        'mae': mean_absolute_error(y_true, y_pred),
        'r2': r2_score(y_true, y_pred),
        'mape': np.mean(percentage_errors) * 100
        # Add more metrics here as needed
    }

    metrics_df = pd.DataFrame(list(metrics_dict.items()), columns=['Métrica', 'Valor'])
    return metrics_df

# Data Load

In [36]:
# Load the raw dataset through the project's DataLoad helper (src/data/data_load.py).
# NOTE(review): what the 'dados_train' key resolves to is defined inside DataLoad,
# which is not visible in this notebook — confirm against that module.
dl = DataLoad()
df = dl.load_data('dados_train')

c:\Users\thale\Documents\Projetos_DS\projeto_aluno_mlflow\delivery_fast\notebooks\../src\data
2023-12-24 08:27:26 [info     ] Iniciando o carregamento
c:\Users\thale\Documents\Projetos_DS\projeto_aluno_mlflow\delivery_fast\src\data
c:\Users\thale\Documents\Projetos_DS\projeto_aluno_mlflow\delivery_fast\data\raw\dados.csv


# 1.0 Data Description

In [37]:
df1 = df.copy()

## 1.1 Data Dimensions

In [38]:
df1.shape

(197428, 16)

## 1.2 Data Types

In [39]:
df1.dtypes

market_id                                       float64
created_at                                       object
actual_delivery_time                             object
store_id                                          int64
store_primary_category                           object
order_protocol                                  float64
total_items                                       int64
subtotal                                          int64
num_distinct_items                                int64
min_item_price                                    int64
max_item_price                                    int64
total_onshift_dashers                           float64
total_busy_dashers                              float64
total_outstanding_orders                        float64
estimated_order_place_duration                    int64
estimated_store_to_consumer_driving_duration    float64
dtype: object

## 1.3 Check Na

In [40]:
df1.isna().sum()

market_id                                         987
created_at                                          0
actual_delivery_time                                7
store_id                                            0
store_primary_category                           4760
order_protocol                                    995
total_items                                         0
subtotal                                            0
num_distinct_items                                  0
min_item_price                                      0
max_item_price                                      0
total_onshift_dashers                           16262
total_busy_dashers                              16262
total_outstanding_orders                        16262
estimated_order_place_duration                      0
estimated_store_to_consumer_driving_duration      526
dtype: int64

# 2.0 Drop Na

In [41]:
df2 = df1.copy()

In [42]:
# Drop every row that has at least one missing value; no imputation is done.
df2 = df2.dropna()

In [43]:
# Alternative kept for reference (disabled): impute 0 for the missing
# dasher/order counts instead of dropping the rows that contain them.
# # total_onshift_dashers
# df2['total_onshift_dashers'] = df2['total_onshift_dashers'].apply(lambda x: 0 if math.isnan(x) else x)

# # total_busy_dashers
# df2['total_busy_dashers'] = df2['total_busy_dashers'].apply(lambda x: 0 if math.isnan(x) else x)

# # total_outstanding_orders
# df2['total_outstanding_orders'] = df2['total_outstanding_orders'].apply(lambda x: 0 if math.isnan(x) else x)

In [44]:
# df2[['store_id','store_primary_category']].groupby('store_id').count().reset_index().sort_values('store_primary_category', ascending=False)

In [45]:
# df2.loc[df2['store_id'] == 6865]['store_primary_category'].value_counts()

# 3.0 Change Data Type

In [46]:
df3 = df2.copy()

In [47]:
df3.dtypes

market_id                                       float64
created_at                                       object
actual_delivery_time                             object
store_id                                          int64
store_primary_category                           object
order_protocol                                  float64
total_items                                       int64
subtotal                                          int64
num_distinct_items                                int64
min_item_price                                    int64
max_item_price                                    int64
total_onshift_dashers                           float64
total_busy_dashers                              float64
total_outstanding_orders                        float64
estimated_order_place_duration                    int64
estimated_store_to_consumer_driving_duration    float64
dtype: object

In [48]:
# Parse both timestamp columns (loaded as plain strings) into datetime64.
df3 = df3.assign(
    created_at=pd.to_datetime(df3['created_at']),
    actual_delivery_time=pd.to_datetime(df3['actual_delivery_time']),
)

In [49]:
df3.head()

Unnamed: 0,market_id,created_at,actual_delivery_time,store_id,store_primary_category,order_protocol,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,total_onshift_dashers,total_busy_dashers,total_outstanding_orders,estimated_order_place_duration,estimated_store_to_consumer_driving_duration
0,1.0,2015-02-06 22:24:17,2015-02-06 23:27:16,1845,american,1.0,4,3441,4,557,1239,33.0,14.0,21.0,446,861.0
1,2.0,2015-02-10 21:49:25,2015-02-10 22:56:29,5477,mexican,2.0,1,1900,1,1400,1400,1.0,2.0,2.0,446,690.0
8,2.0,2015-02-16 00:11:35,2015-02-16 00:38:01,5477,indian,3.0,4,4771,3,820,1604,8.0,6.0,18.0,446,289.0
14,1.0,2015-02-12 03:36:46,2015-02-12 04:14:39,2841,italian,1.0,1,1525,1,1525,1525,5.0,6.0,8.0,446,795.0
15,1.0,2015-01-27 02:12:36,2015-01-27 03:02:24,2841,italian,1.0,2,3620,2,1425,2195,5.0,5.0,7.0,446,205.0


# 4.0 Feature Engineering

In [50]:
df4 = df3.copy()

In [51]:
# Target: elapsed time between order creation and actual delivery, in seconds.
df4['target'] = (df4['actual_delivery_time'] - df4['created_at']).dt.total_seconds()

# 5.0 Data Preparation

In [52]:
df5 = df4.copy()

In [53]:
# Keep only the model inputs plus the target; the two raw timestamp columns
# (created_at, actual_delivery_time) are left out.
df5 = df5.loc[:, [
    'market_id',
    'store_id',
    'store_primary_category',
    'order_protocol',
    'total_items',
    'subtotal',
    'num_distinct_items',
    'min_item_price',
    'max_item_price',
    'total_onshift_dashers',
    'total_busy_dashers',
    'total_outstanding_orders',
    'estimated_order_place_duration',
    'estimated_store_to_consumer_driving_duration',
    'target',
]]

In [54]:
# Separate the regression target from the feature matrix.
y = df5['target']
X = df5.drop(columns='target')

In [55]:
# Hold out 20% of the rows for validation; the fixed random_state makes the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# 6.0 Data Preprocess

In [56]:
le = LabelEncoder()

## 6.1 Encoding

In [57]:
# Build the encoder vocabulary from the categories of BOTH splits, so that
# transform() never meets a label it was not fitted on.
# NOTE(review): the encoder is therefore fitted with validation labels as well —
# confirm this is acceptable, or fit on X_train only and handle unseen labels.
all_categories = set(X_train['store_primary_category']).union(set(X_valid['store_primary_category']))
le.fit(list(all_categories))
# Training data
X_train['store_primary_category'] = le.transform(X_train['store_primary_category'])



In [58]:
# Encode the validation categories with the already-fitted LabelEncoder.
# The encoder was fitted on the union of the train and validation categories,
# so every value here is a known class and no fallback label is required.
# (The replace() step that used to live here only targeted categories that are
# absent from X_valid, so it could never match a value, and its 'desconhecido'
# placeholder was not an encoder class anyway.)
X_valid['store_primary_category'] = le.transform(X_valid['store_primary_category'])

In [59]:
X_train

Unnamed: 0,market_id,store_id,store_primary_category,order_protocol,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,total_onshift_dashers,total_busy_dashers,total_outstanding_orders,estimated_order_place_duration,estimated_store_to_consumer_driving_duration
114965,2.0000,2179,45,2.0000,1,820,1,745,745,35.0000,39.0000,56.0000,251,758.0000
106226,4.0000,4234,20,3.0000,1,995,1,995,995,110.0000,72.0000,116.0000,251,779.0000
56155,1.0000,1585,36,3.0000,4,2040,4,150,895,6.0000,3.0000,7.0000,251,454.0000
42322,3.0000,1874,39,1.0000,2,2376,2,579,1399,16.0000,17.0000,19.0000,446,836.0000
103126,2.0000,3937,6,1.0000,1,1430,1,1320,1320,64.0000,47.0000,52.0000,251,789.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134700,2.0000,3019,58,5.0000,1,1111,1,1111,1111,31.0000,31.0000,34.0000,251,926.0000
116075,2.0000,915,24,1.0000,4,2880,3,495,795,95.0000,80.0000,111.0000,446,693.0000
148324,6.0000,3853,46,5.0000,2,1799,2,300,899,18.0000,18.0000,15.0000,446,548.0000
164942,5.0000,5923,36,2.0000,10,4015,4,195,850,26.0000,24.0000,28.0000,251,590.0000


# 7.0 Machine Learning

## 7.1 Linear Regression

In [60]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares baseline with default settings. fit() returns the
# estimator itself, so construction and training are chained.
lr_model = LinearRegression().fit(X_train, y_train)

# Predict on the held-out validation split and collect the metrics table.
y_pred_lr = lr_model.predict(X_valid)
df_result_lr = regression_metrics(y_valid, y_pred_lr)

In [61]:
df_result_lr

Unnamed: 0,Métrica,Valor
0,mse,1083249.1141
1,rmse,1040.7925
2,mae,703.1217
3,r2,0.2078
4,mape,26.6467


## 7.2 Ridge

In [63]:
from sklearn.linear_model import Ridge

# Ridge regression with the library's default settings, trained on the training split.
rd_model = Ridge().fit(X_train, y_train)

# Predict on the held-out validation split and collect the metrics table.
y_pred_rd = rd_model.predict(X_valid)
df_result_rd = regression_metrics(y_valid, y_pred_rd)

# Last expression of the cell: render the metrics table.
df_result_rd

Unnamed: 0,Métrica,Valor
0,mse,1083249.1071
1,rmse,1040.7925
2,mae,703.1217
3,r2,0.2078
4,mape,26.6467


## Lasso

In [64]:
from sklearn.linear_model import Lasso

# Lasso regression with the library's default settings, trained on the training split.
ls_model = Lasso().fit(X_train, y_train)

# Predict on the held-out validation split and collect the metrics table.
y_pred_ls = ls_model.predict(X_valid)
df_result_ls = regression_metrics(y_valid, y_pred_ls)

# Last expression of the cell: render the metrics table.
df_result_ls

Unnamed: 0,Métrica,Valor
0,mse,1083203.264
1,rmse,1040.7705
2,mae,703.0797
3,r2,0.2079
4,mape,26.6452


## KNeighbors

In [65]:
from sklearn.neighbors import KNeighborsRegressor

# k-nearest-neighbours regressor with default settings. Note that the features
# are used as-is: no scaling step is applied anywhere before this cell.
kn_model = KNeighborsRegressor().fit(X_train, y_train)

# Predict on the held-out validation split and collect the metrics table.
y_pred_kn = kn_model.predict(X_valid)
df_result_kn = regression_metrics(y_valid, y_pred_kn)

# Last expression of the cell: render the metrics table.
df_result_kn

Unnamed: 0,Métrica,Valor
0,mse,2096562.1359
1,rmse,1447.951
2,mae,827.3715
3,r2,-0.5332
4,mape,30.9511


## Decision Tree

In [66]:
from sklearn.tree import DecisionTreeRegressor

# Model definition. random_state is fixed (same value as the train/validation
# split) so the fitted tree, and therefore the metrics below, are reproducible
# under Restart & Run All.
dt_model = DecisionTreeRegressor(random_state=42)

# Model training
dt_model.fit(X_train, y_train)

# Prediction on the held-out validation split
y_pred_dt = dt_model.predict(X_valid)

# Model evaluation
df_result_dt = regression_metrics(y_valid, y_pred_dt)
df_result_dt

Unnamed: 0,Métrica,Valor
0,mse,6112730.7685
1,rmse,2472.3937
2,mae,984.5045
3,r2,-3.4701
4,mape,36.5735


## Gradient Boosting

In [68]:
from sklearn.ensemble import GradientBoostingRegressor

# Model definition. random_state is fixed (same value as the train/validation
# split) so the fitted ensemble, and therefore the metrics below, are
# reproducible under Restart & Run All.
gb_model = GradientBoostingRegressor(random_state=42)

# Model training
gb_model.fit(X_train, y_train)

# Prediction on the held-out validation split
y_pred_gb = gb_model.predict(X_valid)

# Model evaluation
df_result_gb = regression_metrics(y_valid, y_pred_gb)
df_result_gb

Unnamed: 0,Métrica,Valor
0,mse,1109174.2631
1,rmse,1053.1734
2,mae,687.5481
3,r2,0.1889
4,mape,26.1191


## 7.3 Random Forest Regression

In [62]:
from sklearn.ensemble import RandomForestRegressor

# Model definition. random_state is fixed (same value as the train/validation
# split) so the bootstrap samples and feature draws, and therefore the metrics,
# are reproducible under Restart & Run All.
rf_model = RandomForestRegressor(random_state=42)

# Model training
rf_model.fit(X_train, y_train)

# Prediction on the held-out validation split
y_pred_rf = rf_model.predict(X_valid)

# Model evaluation
df_result_rf = regression_metrics(y_valid, y_pred_rf)

In [67]:
df_result_rf

Unnamed: 0,Métrica,Valor
0,mse,1135146.6711
1,rmse,1065.4326
2,mae,683.9934
3,r2,0.1699
4,mape,26.0092
