# Feature Engineering

In [1]:
import pandas as pd
import numpy as np

## Load Data

In [2]:
# Location of the cleaned e-commerce parquet file, relative to this notebook.
cleaned_data_path = "../dataset/processed/olist_ecom_cleaned.parquet"
ecom_data_df = pd.read_parquet(cleaned_data_path)

In [3]:
# Overview of the frame: index range, per-column non-null counts and dtypes.
ecom_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7475 entries, 0 to 7474
Data columns (total 30 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   customer_id                    7475 non-null   object        
 1   customer_unique_id             7475 non-null   object        
 2   customer_zip_code_prefix       6877 non-null   float64       
 3   customer_city                  7475 non-null   object        
 4   customer_state                 7475 non-null   object        
 5   order_id                       7475 non-null   object        
 6   order_status                   7475 non-null   object        
 7   order_purchase_timestamp       7475 non-null   datetime64[ns]
 8   order_approved_at              7475 non-null   datetime64[ns]
 9   order_delivered_carrier_date   7409 non-null   datetime64[ns]
 10  order_delivered_customer_date  7405 non-null   datetime64[ns]
 11  order_estimated_d

In [4]:
# Print the dtype of every column in the loaded frame.
print(ecom_data_df.dtypes)

customer_id                              object
customer_unique_id                       object
customer_zip_code_prefix                float64
customer_city                            object
customer_state                           object
order_id                                 object
order_status                             object
order_purchase_timestamp         datetime64[ns]
order_approved_at                datetime64[ns]
order_delivered_carrier_date     datetime64[ns]
order_delivered_customer_date    datetime64[ns]
order_estimated_delivery_date    datetime64[ns]
payment_sequential                      float64
payment_type                             object
payment_installments                    float64
payment_value                           float64
order_item_id                           float64
product_id                               object
seller_id                                object
shipping_limit_date              datetime64[ns]
price                                   

Change the data type of `customer_zip_code_prefix` from `float64` to `object`.

In [5]:
zip_col = "customer_zip_code_prefix"

print(f"data type before replace the data type: {ecom_data_df[zip_col].dtype}")

# Cast the column to object dtype.
# NOTE(review): astype(object) keeps the underlying float values (and NaN for
# the rows that are missing a prefix) — confirm downstream code does not
# expect strings here.
ecom_data_df[zip_col] = ecom_data_df[zip_col].astype(object)

print(f"data type after replace the data type: {ecom_data_df[zip_col].dtype}")

data type before replace the data type: float64
data type after replace the data type: object


## Create time-based features

In [6]:
def calculate_duration(
    df: pd.DataFrame, start_date_col: str, end_date_col: str, unit: str = "days"
) -> pd.Series:
    """
    Calculate the duration between two datetime columns in the preferred unit.

    Args:
        df (pd.DataFrame): Input DataFrame.
        start_date_col (str): Name of the column holding the start datetime.
        end_date_col (str): Name of the column holding the end datetime.
        unit (str): Unit of the result: 'days', 'hours' or 'seconds'.

    Returns:
        pd.Series: Duration (end - start) as floats in the requested unit.
                   The value is negative when the end precedes the start and
                   NaN when either of the two dates is NaT.

    Raises:
        ValueError: If ``unit`` is not one of the supported units.
    """
    # number of seconds contained in one of each supported unit
    seconds_per_unit = {"seconds": 1, "hours": 3600, "days": 24 * 3600}

    # validate the unit first so a bad argument fails before any computation
    # is done on the frame (and is not masked by an unrelated column error)
    if unit not in seconds_per_unit:
        raise ValueError(
            f"unit is not supported: {unit}. choose 'days', 'hours', 'seconds'."
        )

    # calculate timedelta (time gap)
    duration_timedelta = df[end_date_col] - df[start_date_col]

    # convert timedelta to total seconds, then scale to the preferred unit
    duration_seconds = duration_timedelta.dt.total_seconds()
    return duration_seconds / seconds_per_unit[unit]

In [8]:
# Specifications for each duration feature.
# Each tuple contains: (new_column_name, start_date_column, end_date_column)
duration_specs = [
    ("time_to_approve", "order_purchase_timestamp", "order_approved_at"),
    ("time_to_carrier", "order_approved_at", "order_delivered_carrier_date"),
    ("shipping_time", "order_delivered_carrier_date", "order_delivered_customer_date"),
    (
        "total_delivery_time",
        "order_purchase_timestamp",
        "order_delivered_customer_date",
    ),
    (
        "delivery_vs_estimate",
        "order_estimated_delivery_date",
        "order_delivered_customer_date",
    ),
]

# Unit shared by all duration features (assuming it's the same for now)
calculation_unit = "days"

print(f"--- Creating Time-Based Features (Unit: {calculation_unit}) ---")

# Track what is actually created in this run, so the summary below cannot
# report a feature that was skipped but already existed in the frame from an
# earlier execution of this cell.
created_columns = []

# Loop through the specifications and calculate/assign the duration for each
for new_col_name, start_col, end_col in duration_specs:
    if start_col in ecom_data_df.columns and end_col in ecom_data_df.columns:
        ecom_data_df[new_col_name] = calculate_duration(
            ecom_data_df, start_col, end_col, calculation_unit
        )
        created_columns.append(new_col_name)
        print(f"Created column: '{new_col_name}'")
    else:
        # Best-effort: a missing source column skips only that feature.
        print(
            f"Warning: Skipping '{new_col_name}'. Start column '{start_col}' or end column '{end_col}' not found."
        )

print(f"Columns created:\n{created_columns}")
print("Time-based features creation complete!")

--- Creating Time-Based Features (Unit: days) ---
Created column: 'time_to_approve'
Created column: 'time_to_carrier'
Created column: 'shipping_time'
Created column: 'total_delivery_time'
Created column: 'delivery_vs_estimate'
Columns created:
['time_to_approve', 'time_to_carrier', 'shipping_time', 'total_delivery_time', 'delivery_vs_estimate']
Time-based features creation complete!
