# Exploratory Data Analysis

In this notebook we dive deeper into the data, focusing on the delivery time and forming hypotheses about its main drivers.

We will use the training set for this analysis.


In [1]:
import pandas as pd
import plotly.express as px
# from ydata_profiling import ProfileReport
from geopy.distance import great_circle


# Timestamp columns that pandas should parse as datetimes while reading the CSV.
date_cols = [
    'order_purchase_timestamp',
    'order_approved_at',
    'order_delivered_carrier_date',
    'order_delivered_customer_date',
    'order_estimated_delivery_date',
]

# Training orders and the city coordinates file, both read relative to the notebook.
train_df = pd.read_csv('../data/train_df.csv', parse_dates=date_cols)
city_coords_df = pd.read_csv('../data/city_coords.csv')


In [2]:
# profile = ProfileReport(train_df, title="Profiling Report")
# profile

# Target Variable

Our target variable is the delivery time **after approval**.  
Hence, unapproved orders will not be considered; a prediction can only be made once the order is approved.

$$
\text{Prediction} = t_{\text{Approval} \to \text{Carrier}} + t_{\text{Carrier} \to \text{Customer}} \quad [\text{days}]
$$


We need to verify with the product manager that this makes sense.
For now, let's remove the rows where the delivery date or the approval date is missing.

In [3]:
# Keep only orders that have both an approval timestamp and a customer delivery
# timestamp; the rest are dropped from the training frame.
original_len = len(train_df)
has_delivery_date = train_df['order_delivered_customer_date'].notna()
has_approval_date = train_df['order_approved_at'].notna()
train_df = train_df[has_delivery_date & has_approval_date]
new_len = len(train_df)
print(f"Removed {original_len - new_len} rows") # around 3% of the data was removed


Removed 2822 rows


# Evaluation Framework

In order to evaluate our model, we will do several things:
1. Split the data into train and evaluation sets, randomly.
2. Train a benchmark model on the train set.
3. Evaluate performance for:
   1. Benchmark model
   2. Estimated delivery (given by the system)

**Repeat:**

- Hypothesis creation
- Model training
- Model evaluation
- Comparison with benchmark model & estimated delivery



In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, median_absolute_error
from sklearn.impute import SimpleImputer

# stage 1: prepare features, then split the data into train and evaluation sets randomly

# Flag orders whose customer and seller sit in different states. The comparison
# runs on object dtype so this cell can be re-run: once the state columns have
# been converted to categorical (below), pandas refuses to compare two
# categoricals whose category sets differ.
train_df['inter_state'] = (train_df['customer_state'].astype(object)
                           != train_df['seller_state'].astype(object))

# store the state columns as categorical variables
to_categorical = ['seller_state', 'customer_state']
train_df[to_categorical] = train_df[to_categorical].astype('category')

# 80/20 random split with a fixed seed. Each part is copied so that columns
# added to train_set / eval_set later are written to independent frames rather
# than to slices of train_df (which can trigger SettingWithCopyWarning).
train_set, eval_set = (split.copy() for split in
                       train_test_split(train_df, test_size=0.2, random_state=850))

# stage 2: train a benchmark model on the train set
TARGET = 'approval_to_customer'  # total time from approval to acknowledged customer delivery
# NOTE(review): 'approved_to_carrier' looks like one leg of the target
# (approval -> carrier -> customer). Confirm it is known at prediction time;
# otherwise it leaks target information into the benchmark.
FEATURES = ['approval_time', 'approved_to_carrier',
            'price', 'freight_value', 'freight_component',
            'product_name_lenght', 'product_weight_g', 'product_length_cm',
            'product_height_cm', 'product_width_cm',
            'total_size', 'inter_state']
X_train = train_set[FEATURES]
y_train = train_set[TARGET]

# Mean-impute missing feature values; the means are learned on the train split
# only and then reused for the evaluation split.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_eval_imputed = imputer.transform(eval_set[FEATURES])
y_eval = eval_set[TARGET]

def evaluate_model(ml_model, X_eval: pd.DataFrame, y_eval: pd.Series, model_name: str) -> pd.DataFrame:
    """
    Score a fitted model on the evaluation set and collect its predictions.

    The mean and median absolute errors are printed, not returned.

    Args:
        ml_model: Fitted estimator exposing a ``predict`` method.
        X_eval: Evaluation features, passed straight to ``ml_model.predict``.
        y_eval: True target values for the evaluation rows.
        model_name: Label used to name the prediction column.

    Returns:
        A DataFrame with the columns ``y_pred_<model_name>`` (predictions)
        and ``y_eval`` (true values).
    """
    predictions = ml_model.predict(X_eval)

    mean_abs_err = mean_absolute_error(y_eval, predictions)
    median_abs_err = median_absolute_error(y_eval, predictions)
    print(f"Mean Absolute Error: {mean_abs_err}, Median Absolute Error: {median_abs_err}")

    prediction_column = f'y_pred_{model_name}'
    return pd.DataFrame({prediction_column: predictions, 'y_eval': y_eval})

def evaluate_default_delivery_estimation(y_pred: pd.Series,
                                         y_eval: pd.Series,
                                         model_name: str = "default_estimation") -> pd.DataFrame:
    """
    Score ready-made delivery-time estimates against the true values.

    Unlike ``evaluate_model`` there is no model to call: the estimates are
    passed in directly. The mean and median absolute errors are printed,
    not returned.

    Args:
        y_pred: Estimated delivery times for the evaluation rows.
        y_eval: True target values for the same rows.
        model_name: Label used to name the prediction column.

    Returns:
        A DataFrame with the columns ``y_pred_<model_name>`` (estimates)
        and ``y_eval`` (true values).
    """
    mean_abs_err = mean_absolute_error(y_eval, y_pred)
    median_abs_err = median_absolute_error(y_eval, y_pred)
    print(f"Mean Absolute Error: {mean_abs_err}, Median Absolute Error: {median_abs_err}")

    prediction_column = f'y_pred_{model_name}'
    return pd.DataFrame({prediction_column: y_pred, 'y_eval': y_eval})



In [5]:
# Sanity check: the split, the imputed feature matrix and the target must all have the same number of rows.
assert len({len(train_set), len(X_train_imputed), len(y_train)}) == 1, "Rows were removed during preprocessing"

In [6]:
# time to prepare the default estimation for evaluation
seconds_in_day = 86400
train_set['default_delivery_estimation'] = train_set['order_estimated_delivery_date'] - train_set['order_approved_at']
train_set['default_delivery_estimation'] = train_set['default_delivery_estimation'].dt.total_seconds() / seconds_in_day

delivery_vis_df = train_set[['order_approved_at', 'approval_to_customer', 
                             'default_delivery_estimation', 'order_delivered_customer_date']].copy()
delivery_vis_df['difference'] = delivery_vis_df['approval_to_customer'] - delivery_vis_df['default_delivery_estimation']
delivery_vis_df['direction'] = delivery_vis_df['difference'].apply(lambda x: 'undershoot' if x > 0 else 'overshoot')

fig = px.scatter(delivery_vis_df, x='approval_to_customer', y='default_delivery_estimation',
                 color = 'direction', trendline="ols", trendline_scope="overall",
                 title="Default Delivery Estimation vs Actual Delivery Times")

# add a 45 degree line trace
fig.update_layout(shapes = [{'type': 'line', 'yref': 'paper', 'xref': 'paper', 
                             'y0': 0, 'y1': 1, 'x0': 0, 'x1': 1}],
                    xaxis_range = [0, 150], yaxis_range = [0, 150])

# pearson correlation between the two variables
corr_default = train_set['approval_to_customer'].corr(train_set['default_delivery_estimation'])
print(f"Pearson correlation between delivery time and default estimation: {corr_default}")

fig.show()

Pearson correlation between delivery time and default estimation: 0.3705719726508003


In [7]:
# Step 3: Evaluate performance for the benchmark model
benchmark_model = LinearRegression()
benchmark_model.fit(X_train_imputed, y_train)

In [8]:
# Step 3: evaluate the benchmark model on the held-out evaluation set
benchmark_eval_df = evaluate_model(benchmark_model, X_eval_imputed, y_eval, 'benchmark')
benchmark_eval_df.sample(3)

Mean Absolute Error: 5.139411071338724, Median Absolute Error: 3.773945850610962


Unnamed: 0,y_pred_benchmark,y_eval
69419,13.353126,22.011944
52574,15.90561,48.963426
32482,14.105725,10.129236


In [13]:
# Step 3: evaluate the system's own estimate, expressed in days from approval
eval_estimate_delta = eval_set['order_estimated_delivery_date'] - eval_set['order_approved_at']
eval_set['default_delivery_estimation'] = eval_estimate_delta.dt.total_seconds() / seconds_in_day

default_eval_df = evaluate_default_delivery_estimation(
    eval_set['default_delivery_estimation'],
    y_eval,
    'default_estimation',
)
default_eval_df.sample(3)

Mean Absolute Error: 12.685320224167384, Median Absolute Error: 12.223755787037035


Unnamed: 0,y_pred_default_estimation,y_eval
23869,12.649132,1.42309
63374,22.461227,15.084525
5158,17.146574,2.826088


In [20]:
# Put both sets of predictions side by side, aligned on the index. The
# benchmark frame's own y_eval column is left out so it appears only once.
benchmark_predictions_only = benchmark_eval_df.drop(columns=['y_eval'])
merged_eval_df = pd.concat([default_eval_df, benchmark_predictions_only], axis=1)
merged_eval_df.sample(3)

Unnamed: 0,y_pred_default_estimation,y_eval,y_pred_benchmark
17284,23.371424,24.047477,14.847206
54324,27.891852,34.556123,13.035725
1018,28.065845,12.477014,14.298076


# Hypothesis 1

Physical distance between the warehouse and the customer is a major driver of the delivery time.

In [None]:
# get all the unique combinations of customer and seller cities
unique_cities = set(train_df.customer_city.unique())
unique_cities.update(train_df.seller_city.unique())


In [56]:
# One row per distinct city, ready to receive geocoded coordinates.
# NOTE(review): this replaces the city_coords_df that the first cell read from
# ../data/city_coords.csv — confirm that is intended.
city_coords_df = pd.DataFrame({'city': list(unique_cities)})


In [None]:
from geopy.geocoders import Nominatim
from geopy.distance import great_circle
from geopy.extra.rate_limiter import RateLimiter

geolocator = Nominatim(user_agent="E-Commerce Forecasts")
# The public Nominatim service asks for at most one request per second, so every
# lookup goes through a rate limiter. swallow_exceptions=False lets a failure
# that survives the limiter's retries reach the except branch in get_coordinates.
rate_limited_geocode = RateLimiter(geolocator.geocode,
                                   min_delay_seconds=1,
                                   swallow_exceptions=False)

def get_coordinates(city):
    """Get the coordinates (latitude, longitude) of a city

    The query sent to the geocoder is the city name with ', Brazil' appended.
    Lookups are throttled by the module-level rate limiter.

    Args:
        city (str): The name of the city to get the coordinates for.

    Returns:
        tuple: A tuple containing the latitude and longitude of the city.
        If the city is not found, or the lookup raises, the function returns
        (None, None); in the latter case the error is printed.
    """
    try:
        location = rate_limited_geocode(city + ', Brazil')
        return (location.latitude, location.longitude) if location else (None, None)
    except Exception as e:
        # Best effort: one failing city must not abort the whole geocoding run.
        print(f"Error getting coordinates for {city}: {e}")
        return (None, None)


In [None]:
# Geocode every city once (one network lookup per city), then attach the
# coordinates to each order twice: for the customer's city and the seller's city.
city_coords_df['coords'] = city_coords_df['city'].apply(get_coordinates)

customer_lookup = city_coords_df.rename(columns={'city': 'customer_city', 'coords': 'customer_coords'})
seller_lookup = city_coords_df.rename(columns={'city': 'seller_city', 'coords': 'seller_coords'})

# Left joins keep every order, including those whose city has no match.
train_df = (
    train_df
    .merge(customer_lookup, how='left', on='customer_city')
    .merge(seller_lookup, how='left', on='seller_city')
)


def _has_valid_coords(coords) -> bool:
    """Return True only for a (lat, lon) pair with both components present."""
    return (
        isinstance(coords, (tuple, list))
        and len(coords) == 2
        and not any(pd.isna(component) for component in coords)
    )


def calculate_distance(row):
    """Great-circle distance in kilometres between an order's customer and seller.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with 'customer_coords'
            and 'seller_coords' entries, each expected to be a (lat, lon) pair.

    Returns:
        The distance in kilometres, or None when either location is missing.
        get_coordinates reports an unknown city as (None, None), which is a
        truthy tuple, and a failed merge leaves NaN, which is also truthy, so
        each coordinate pair is validated explicitly instead of by truthiness.
    """
    if _has_valid_coords(row['customer_coords']) and _has_valid_coords(row['seller_coords']):
        return great_circle(row['customer_coords'], row['seller_coords']).kilometers
    return None

# Row-wise apply: each order has its own customer/seller coordinate pair.
distance_km = train_df.apply(calculate_distance, axis=1)
train_df['distance_km'] = distance_km

Now we can visualize the effect of physical distance on the delivery time.

In [89]:
# Columns needed to plot the carrier-to-customer leg against distance.
distance_plot_columns = ['carrier_to_customer', 'distance_km', 'customer_state', 'seller_state', 'inter_state']
deliver_vs_distance = train_df[distance_plot_columns]

In [None]:
# Carrier-to-customer time against distance, coloured by inter_state, with a
# single overall OLS regression line drawn in black.
distance_fig = px.scatter(
    deliver_vs_distance,
    x='distance_km',
    y='carrier_to_customer',
    color="inter_state",
    trendline="ols",
    trendline_scope="overall",
    trendline_color_override="black",
)
distance_fig
