# Imports

In [82]:
import pandas as pd
import sys 
import os
import math
import numpy as np 
import joblib
pd.set_option('display.float_format', lambda x: '%.4f' % x)


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor




sys.path.insert(0, '../src/')
from utils.utils import load_config_file
from data.data_load import DataLoad
from data.data_preprocess import DataPreprocess
from data.data_transformation import DataTransformation

# Helper Functions

In [83]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def regression_metrics(y_true, y_pred):
    """
    Compute common regression metrics and return them as a DataFrame.

    Both inputs are converted to NumPy float arrays before any arithmetic, so
    errors are always computed element by element in positional order. This
    prevents pandas from aligning two Series on their index labels (which
    would pair the wrong rows or introduce NaN) and allows plain Python
    sequences to be passed.

    Parameters:
    - y_true: array-like with the observed values.
    - y_pred: array-like with the predicted values, in the same order as
      y_true.

    Returns:
    - DataFrame with one row per metric and the columns 'Métrica' (metric
      name: mse, rmse, mae, r2, mape) and 'Valor' (metric value).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    absolute_errors = np.abs(y_true - y_pred)
    # Floor the denominator at 1e-8 so a zero target cannot divide by zero.
    percentage_errors = absolute_errors / np.maximum(np.abs(y_true), 1e-8)

    # Computed once and reused for both 'mse' and 'rmse'.
    mse = mean_squared_error(y_true, y_pred)

    metrics_dict = {
        'mse': mse,
        'rmse': np.sqrt(mse),
        'mae': mean_absolute_error(y_true, y_pred),
        'r2': r2_score(y_true, y_pred),
        'mape': np.mean(percentage_errors) * 100
        # Add more metrics here as needed
    }

    metrics_df = pd.DataFrame(list(metrics_dict.items()), columns=['Métrica', 'Valor'])
    return metrics_df


from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
import pandas as pd

class CustomLabelEncoder(BaseEstimator, TransformerMixin):
    """
    Integer-encode one DataFrame column, tolerating categories unseen in fit.

    During fit the distinct values of the column are collected together with
    a placeholder category. During transform every value that was not seen in
    fit is replaced by that placeholder before encoding, so the underlying
    LabelEncoder never receives an unknown label.

    Parameters:
    - column: name of the DataFrame column to encode.
    - default_label: placeholder category used for values unseen during fit.

    Attributes:
    - le: the wrapped LabelEncoder.
    - categories_: set of known categories (training values plus
      default_label). It only exists after fit, which is how scikit-learn
      recognises a fitted estimator.
    """

    def __init__(self, column, default_label='desconhecido'):
        self.column = column
        self.default_label = default_label
        self.le = LabelEncoder()

    def fit(self, X, y=None):
        """Learn the categories of `column` in X; y is ignored. Returns self."""
        known_categories = set(X[self.column])
        # Register the placeholder so it receives an integer code of its own.
        known_categories.add(self.default_label)
        self.le.fit(list(known_categories))
        self.categories_ = known_categories
        return self

    def transform(self, X):
        """Return a copy of X with `column` replaced by integer codes.

        Raises sklearn.exceptions.NotFittedError when called before fit.
        """
        # Imported locally so the class does not depend on a new file-level import.
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, 'categories_')

        X_encoded = X.copy()
        # Cast to object so the placeholder can be written into any column dtype.
        values = X_encoded[self.column].astype(object)
        is_known = values.isin(list(self.categories_))
        # Replace unseen categories with the placeholder in one vectorised step.
        X_encoded[self.column] = self.le.transform(values.where(is_known, self.default_label))
        return X_encoded

# Data Load

In [84]:
# Load the training dataset through the project's DataLoad helper.
# 'dados_train' is a dataset key resolved inside DataLoad (presumably via the
# project config — TODO confirm); the log printed below shows the file read
# is data/raw/dados.csv.
dl = DataLoad()
df = dl.load_data('dados_train')

c:\Users\thale\Documents\Projetos_DS\projeto_aluno_mlflow\delivery_fast\notebooks\../src\data
2023-12-26 14:47:21 [info     ] Iniciando o carregamento
c:\Users\thale\Documents\Projetos_DS\projeto_aluno_mlflow\delivery_fast\src\data
c:\Users\thale\Documents\Projetos_DS\projeto_aluno_mlflow\delivery_fast\data\raw\dados.csv


# 1.0 Descrição dos Dados

In [85]:
df1 = df.copy()

## 1.1 Dimensão dos Dados

In [86]:
df1.shape

(197428, 16)

## 1.2 Tipo dos Dados

In [87]:
df1.dtypes

market_id                                       float64
created_at                                       object
actual_delivery_time                             object
store_id                                          int64
store_primary_category                           object
order_protocol                                  float64
total_items                                       int64
subtotal                                          int64
num_distinct_items                                int64
min_item_price                                    int64
max_item_price                                    int64
total_onshift_dashers                           float64
total_busy_dashers                              float64
total_outstanding_orders                        float64
estimated_order_place_duration                    int64
estimated_store_to_consumer_driving_duration    float64
dtype: object

## 1.3 Check Na

In [88]:
df1.isna().sum()

market_id                                         987
created_at                                          0
actual_delivery_time                                7
store_id                                            0
store_primary_category                           4760
order_protocol                                    995
total_items                                         0
subtotal                                            0
num_distinct_items                                  0
min_item_price                                      0
max_item_price                                      0
total_onshift_dashers                           16262
total_busy_dashers                              16262
total_outstanding_orders                        16262
estimated_order_place_duration                      0
estimated_store_to_consumer_driving_duration      526
dtype: int64

# 2.0 Handle NA (rows with missing values are dropped)

In [89]:
df2 = df1.copy()

In [90]:
# Drop every row that has a missing value in any column.
# NOTE(review): per the NA counts printed in section 1.3, the three dasher /
# outstanding-order columns are each missing 16262 of the 197428 rows, so
# this removes a sizeable share of the data.
df2 = df2.dropna()

In [91]:
# # total_onshift_dashers
# df2['total_onshift_dashers'] = df2['total_onshift_dashers'].apply(lambda x: 0 if math.isnan(x) else x)

# # total_busy_dashers
# df2['total_busy_dashers'] = df2['total_busy_dashers'].apply(lambda x: 0 if math.isnan(x) else x)

# # total_outstanding_orders
# df2['total_outstanding_orders'] = df2['total_outstanding_orders'].apply(lambda x: 0 if math.isnan(x) else x)

In [92]:
# df2[['store_id','store_primary_category']].groupby('store_id').count().reset_index().sort_values('store_primary_category', ascending=False)

In [93]:
# df2.loc[df2['store_id'] == 6865]['store_primary_category'].value_counts()

# 3.0 Change Data Type

In [94]:
df3 = df2.copy()

In [95]:
df3.dtypes

market_id                                       float64
created_at                                       object
actual_delivery_time                             object
store_id                                          int64
store_primary_category                           object
order_protocol                                  float64
total_items                                       int64
subtotal                                          int64
num_distinct_items                                int64
min_item_price                                    int64
max_item_price                                    int64
total_onshift_dashers                           float64
total_busy_dashers                              float64
total_outstanding_orders                        float64
estimated_order_place_duration                    int64
estimated_store_to_consumer_driving_duration    float64
dtype: object

In [96]:
# Parse the two timestamp columns (loaded as object dtype) into datetimes.
for timestamp_col in ('created_at', 'actual_delivery_time'):
    df3[timestamp_col] = pd.to_datetime(df3[timestamp_col])

In [97]:
df3.head()

Unnamed: 0,market_id,created_at,actual_delivery_time,store_id,store_primary_category,order_protocol,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,total_onshift_dashers,total_busy_dashers,total_outstanding_orders,estimated_order_place_duration,estimated_store_to_consumer_driving_duration
0,1.0,2015-02-06 22:24:17,2015-02-06 23:27:16,1845,american,1.0,4,3441,4,557,1239,33.0,14.0,21.0,446,861.0
1,2.0,2015-02-10 21:49:25,2015-02-10 22:56:29,5477,mexican,2.0,1,1900,1,1400,1400,1.0,2.0,2.0,446,690.0
8,2.0,2015-02-16 00:11:35,2015-02-16 00:38:01,5477,indian,3.0,4,4771,3,820,1604,8.0,6.0,18.0,446,289.0
14,1.0,2015-02-12 03:36:46,2015-02-12 04:14:39,2841,italian,1.0,1,1525,1,1525,1525,5.0,6.0,8.0,446,795.0
15,1.0,2015-01-27 02:12:36,2015-01-27 03:02:24,2841,italian,1.0,2,3620,2,1425,2195,5.0,5.0,7.0,446,205.0


# 4.0 Feature Engineering

In [98]:
df4 = df3.copy()

In [99]:
# Target: elapsed time from order creation to actual delivery, in seconds.
delivery_duration = df4['actual_delivery_time'] - df4['created_at']
df4['target'] = delivery_duration.dt.total_seconds()

# 5.0 Data Preparation

In [100]:
# Keep only the columns listed under 'columns_to_use' in the project config,
# working on an independent copy so df4 is left untouched.
columns_to_use = load_config_file().get('columns_to_use')
df5 = df4[columns_to_use].copy()

In [101]:
# Wrap the prepared frame in the project's DataTransformation helper; its
# train_test_split method is what produces the train/validation sets.
dt = DataTransformation(df5)

In [102]:
# Split into training and validation features/targets. This is the project's
# DataTransformation.train_test_split method, not the scikit-learn function
# imported at the top; the split ratio, seed and target column are defined
# inside that method and are not visible here.
X_train, X_valid, y_train, y_valid = dt.train_test_split()

# 6.0 Data Preprocess

## 6.1 Encoding

In [103]:
# le = LabelEncoder()

# all_categories = set(X_train['store_primary_category']).union(set(X_valid['store_primary_category']))
# le.fit(list(all_categories))

# # Dados de Treino
# X_train['store_primary_category'] = le.transform(X_train['store_primary_category'])



In [104]:
# default_label = 'desconhecido'
# X_valid['store_primary_category'] = X_valid['store_primary_category'].replace(to_replace={cat: default_label for cat in all_categories.difference(X_valid['store_primary_category'])})

# # Aplicar o LabelEncoder para transformar as categorias
# X_valid['store_primary_category'] = le.transform(X_valid['store_primary_category'])

In [105]:
# # Criar pipeline
# pipeline = Pipeline([
#     ('label_encoder', CustomLabelEncoder(column='store_primary_category')),
# ])

In [106]:
# Build the preprocessing pipeline: integer-encode the store category first,
# then min-max scale the whole feature frame returned by the encoder.
preprocessing_steps = [
    ('label_encoder', CustomLabelEncoder(column='store_primary_category')),
    ('scaler', MinMaxScaler()),
]
pipeline = Pipeline(steps=preprocessing_steps)

# 7.0 Machine Learning

In [107]:
import mlflow
from mlflow.tracking import MlflowClient

# Point MLflow at the local tracking server and select the experiment that
# all runs below are recorded under.
tracking_uri = 'http://127.0.0.1:5000'
experiment_name = 'predict_delivery_time'
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1703597813329, experiment_id='1', last_update_time=1703597813329, lifecycle_stage='active', name='predict_delivery_time', tags={}>

In [109]:
# Candidate regressors, all trained with their default hyper-parameters.
models = [LinearRegression(), Ridge(), Lasso(), KNeighborsRegressor(), DecisionTreeRegressor(), GradientBoostingRegressor()]

# Location where the fitted preprocessor is persisted before being logged.
preprocess_path = '../models/preprocess.joblib'

# Parent run: fits and logs the shared preprocessor; each model gets a nested run.
with mlflow.start_run(run_name='min_max_scaler'):

    # Fit the preprocessing pipeline on the training features only.
    preprocessador = DataPreprocess(pipeline)
    preprocessador.train(X_train)

    X_train_processed = pd.DataFrame(preprocessador.transform(X_train))
    X_valid_processed = pd.DataFrame(preprocessador.transform(X_valid))

    # Restore the original column names on the transformed frames.
    X_train_processed.columns = X_train.columns
    X_valid_processed.columns = X_valid.columns

    # Make sure the target directory exists so the dump cannot fail on a
    # fresh checkout.
    os.makedirs(os.path.dirname(preprocess_path), exist_ok=True)
    joblib.dump(preprocessador, preprocess_path)

    # Log the preprocessor as an artifact of the parent run.
    mlflow.log_artifact(preprocess_path)

    # Log the preprocessing steps as parameters of the parent run.
    mlflow.log_params(params={'encoder': pipeline['label_encoder'],
                            'scaler': pipeline['scaler']})

    for model in models:
        model_name = model.__class__.__name__
        print(model_name)
        with mlflow.start_run(nested=True, run_name=model_name):
            mlflow.set_tag('model_name', model_name)

            mlflow.log_param('model_name', model_name)
            md = model

            # Train the model.
            md.fit(X_train_processed, y_train)

            # Predict on the validation set.
            y_pred = md.predict(X_valid_processed)

            # Log all model hyper-parameters in a single batched call instead
            # of one tracking-server request per parameter.
            params = md.get_params()
            mlflow.log_params(params)

            # Compute the performance metrics.
            df_result_lr = regression_metrics(y_valid, y_pred)

            # Log every metric from df_result_lr in a single batched call.
            mlflow.log_metrics({
                metric_name: float(metric_value)
                for metric_name, metric_value in zip(df_result_lr['Métrica'], df_result_lr['Valor'])
            })

            mlflow.sklearn.log_model(md, model_name)

2023-12-26 14:47:59 [info     ] Iniciando o processamento


2023-12-26 14:47:59 [info     ] Iniciando a Transformação
2023-12-26 14:48:00 [info     ] Iniciando a Transformação
LinearRegression




Ridge




Lasso




KNeighborsRegressor




DecisionTreeRegressor




GradientBoostingRegressor


