# Feature Engineering

In [1]:
import pandas as pd
import numpy as np

## Load Data

In [2]:
# Load the cleaned parquet file into a DataFrame (path is relative to the working directory).
cleaned_parquet_path = "../dataset/processed/olist_ecom_cleaned.parquet"
ecom_data_df = pd.read_parquet(cleaned_parquet_path)

In [3]:
# Print the frame summary: index range, column names, non-null counts and dtypes.
ecom_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7475 entries, 0 to 7474
Data columns (total 30 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   customer_id                    7475 non-null   object        
 1   customer_unique_id             7475 non-null   object        
 2   customer_zip_code_prefix       6877 non-null   float64       
 3   customer_city                  7475 non-null   object        
 4   customer_state                 7475 non-null   object        
 5   order_id                       7475 non-null   object        
 6   order_status                   7475 non-null   object        
 7   order_purchase_timestamp       7475 non-null   datetime64[ns]
 8   order_approved_at              7475 non-null   datetime64[ns]
 9   order_delivered_carrier_date   7409 non-null   datetime64[ns]
 10  order_delivered_customer_date  7405 non-null   datetime64[ns]
 11  order_estimated_d

In [4]:
# Print the dtype of every column in the loaded frame.
print(ecom_data_df.dtypes)

customer_id                              object
customer_unique_id                       object
customer_zip_code_prefix                float64
customer_city                            object
customer_state                           object
order_id                                 object
order_status                             object
order_purchase_timestamp         datetime64[ns]
order_approved_at                datetime64[ns]
order_delivered_carrier_date     datetime64[ns]
order_delivered_customer_date    datetime64[ns]
order_estimated_delivery_date    datetime64[ns]
payment_sequential                      float64
payment_type                             object
payment_installments                    float64
payment_value                           float64
order_item_id                           float64
product_id                               object
seller_id                                object
shipping_limit_date              datetime64[ns]
price                                   

Change data type 'customer_zip_code_prefix' to object

In [5]:
zip_prefix_col = "customer_zip_code_prefix"

print(f"data type before replace the data type: {ecom_data_df[zip_prefix_col].dtype}")

# Re-label the column as object so it is no longer treated as a numeric feature.
# NOTE(review): astype(object) keeps the stored values as floats (NaN for missing),
# it does not turn them into strings - TODO confirm this is what later steps expect.
ecom_data_df[zip_prefix_col] = ecom_data_df[zip_prefix_col].astype(object)

print(f"data type after replace the data type: {ecom_data_df[zip_prefix_col].dtype}")

data type before replace the data type: float64
data type after replace the data type: object


## Create Time-Based Features

In [6]:
def calculate_duration(df, start_date_col, end_date_col, unit="days"):
    """
    Calculate the duration between two datetime columns in the preferred unit.

    Args:
        df (pd.DataFrame): Input DataFrame.
        start_date_col (str): Name of the column holding the start datetime.
        end_date_col (str): Name of the column holding the end datetime.
        unit (str): Unit of the result: 'days', 'hours' or 'seconds'.

    Returns:
        pd.Series: Duration (end - start) as floats in the requested unit.
                   A row is NaN when either of its two dates is NaT.

    Raises:
        ValueError: If `unit` is not one of the supported units.
    """
    seconds_per_unit = {"seconds": 1, "hours": 3600, "days": 24 * 3600}

    # Validate the unit before touching the data, so a bad argument is reported
    # as such instead of after (or hidden behind) the column arithmetic.
    # tuple membership compares with ==, so unhashable values still reach the ValueError.
    if unit not in tuple(seconds_per_unit):
        raise ValueError(
            f"unit is not supported: {unit}. choose 'days', 'hours', 'seconds'."
        )

    # Timedelta between the two columns (NaT wherever either side is missing)
    duration_timedelta = df[end_date_col] - df[start_date_col]

    # Total seconds as float; NaT becomes NaN
    duration_seconds = duration_timedelta.dt.total_seconds()

    # Scale seconds to the requested unit
    return duration_seconds / seconds_per_unit[unit]

In [7]:
# Each spec is (feature column to create, start datetime column, end datetime column).
duration_specs = [
    ("time_to_approve", "order_purchase_timestamp", "order_approved_at"),
    ("time_to_carrier", "order_approved_at", "order_delivered_carrier_date"),
    ("shipping_time", "order_delivered_carrier_date", "order_delivered_customer_date"),
    ("total_delivery_time", "order_purchase_timestamp", "order_delivered_customer_date"),
    ("delivery_vs_estimate", "order_estimated_delivery_date", "order_delivered_customer_date"),
]

# One unit is shared by every duration feature.
calculation_unit = "days"

print(f"--- Creating Time-Based Features (Unit: {calculation_unit}) ---")

for feature_name, start_name, end_name in duration_specs:
    # Guard: report and skip a spec whose source columns are not both present.
    if start_name not in ecom_data_df.columns or end_name not in ecom_data_df.columns:
        print(
            f"Warning: Skipping '{feature_name}'. Start column '{start_name}' or end column '{end_name}' not found."
        )
        continue

    ecom_data_df[feature_name] = calculate_duration(
        ecom_data_df, start_name, end_name, calculation_unit
    )
    print(f"Created column: '{feature_name}'")

# Report, in frame column order, which of the requested features now exist.
requested_features = [spec[0] for spec in duration_specs]
created_features = [col for col in ecom_data_df.columns if col in requested_features]
print(f"Columns created:\n{created_features}")
print("Time-based features creation complete!")

--- Creating Time-Based Features (Unit: days) ---
Created column: 'time_to_approve'
Created column: 'time_to_carrier'
Created column: 'shipping_time'
Created column: 'total_delivery_time'
Created column: 'delivery_vs_estimate'
Columns created:
['time_to_approve', 'time_to_carrier', 'shipping_time', 'total_delivery_time', 'delivery_vs_estimate']
Time-based features creation complete!


## Investigate Order-Payment Relationship & Create Order Level (Non-Payment) Feature

In [8]:
print("-- Investigate relations order-payment --")

# Number of rows per order_id
order_id_counts = ecom_data_df["order_id"].value_counts()

# order_ids that appear on more than one row (orders with multi-item or multi-payment)
multiple_rows_order_ids = order_id_counts[order_id_counts > 1].index.tolist()

print(f"unique number of order_id: {len(order_id_counts)}")
print(f"total order_id with multiple rows: {len(multiple_rows_order_ids)}")

if multiple_rows_order_ids:
    # value_counts() sorts descending, so the first id is the order with the most rows
    sample_order_id = multiple_rows_order_ids[0]

    # Filter once and reuse the result for both the preview and the row count
    sample_order_rows = ecom_data_df[ecom_data_df["order_id"] == sample_order_id]

    print(
        f"Payment details for sample order_id with multiple rows ({sample_order_id}):"
    )
    display(
        sample_order_rows[
            [
                "order_id",
                "payment_sequential",
                "payment_type",
                "payment_installments",
                "payment_value",
                "price",
                "freight_value",
            ]
        ].head()
    )
    # The count is taken from a variable rather than an expression with nested double
    # quotes inside the f-string, which is a SyntaxError on Python versions before 3.12.
    print(f"total number of multiple rows: {len(sample_order_rows)}")
else:
    print(
        "No order_id with multiple rows was found in the DataFrame (after this cleanup)."
    )

-- Investigate relations order-payment --
unique number of order_id: 6817
total order_id with multiple rows: 465
Payment details for sample order_id with multiple rows (465c2e1bee4561cb39e0db8c5993aafc):


Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value,price,freight_value
2095,465c2e1bee4561cb39e0db8c5993aafc,8.0,voucher,1.0,41.03,412.0,27.72
2096,465c2e1bee4561cb39e0db8c5993aafc,8.0,voucher,1.0,41.03,412.0,27.72
2097,465c2e1bee4561cb39e0db8c5993aafc,4.0,voucher,1.0,59.4,412.0,27.72
2098,465c2e1bee4561cb39e0db8c5993aafc,4.0,voucher,1.0,59.4,412.0,27.72
2099,465c2e1bee4561cb39e0db8c5993aafc,2.0,voucher,1.0,26.18,412.0,27.72


total number of multiple rows: 12


## Create Comprehensive Order-Level Features

In [9]:
# Aggregate data from level item/payment to level order_id
order_level_features_df = (
    ecom_data_df.groupby("order_id")
    .agg(
        # moneter feature
        total_payment_value=("payment_value", "sum"),
        total_item_price=("price", "sum"),
        total_freight_value=("freight_value", "sum"),
        # item/product feature
        number_of_rows_in_orders=("order_item_id", "count"),
        number_of_unique_products=("product_id", lambda x: x.nunique()),
        # payment feature
        max_payment_sequential=("payment_sequential", "max"),
        number_of_unique_payment_types=("payment_type", lambda x: x.nunique()),
        max_payment_installments=("payment_installments", "max"),
    )
    .reset_index()
)

In [10]:
if isinstance(order_level_features_df, pd.DataFrame):
    print("Verification: The created object is indeed a pandas DataFrame.")
    display(order_level_features_df.head())
    # DataFrame.info() prints its own report and returns None; wrapping it in print()
    # appended a stray "None" line to the output.
    order_level_features_df.info()
else:
    print("Verification Warning: The created object is NOT a pandas DataFrame.")

Verification: The created object is indeed a pandas DataFrame.


Unnamed: 0,order_id,total_payment_value,total_item_price,total_freight_value,number_of_rows_in_orders,number_of_unique_products,max_payment_sequential,number_of_unique_payment_types,max_payment_installments
0,0011d82c4b53e22e84023405fb467e57,315.33,289.0,26.33,1,1,1.0,1,2.0
1,00229e4e43f7a7e0b9dd819ad43268d3,91.39,74.9,16.49,1,1,1.0,1,1.0
2,002b4e6fa42cd4a22cc86abc18fe9c05,119.57,99.9,19.67,1,1,1.0,1,2.0
3,00345f338696283410b7977d2e3efc89,67.34,48.9,18.44,1,1,1.0,1,1.0
4,003d0634280ff3d1d3a54459349a6899,228.89,220.0,8.89,1,1,1.0,1,1.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6817 entries, 0 to 6816
Data columns (total 9 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   order_id                        6817 non-null   object 
 1   total_payment_value             6817 non-null   float64
 2   total_item_price                6817 non-null   float64
 3   total_freight_value             6817 non-null   float64
 4   number_of_rows_in_orders        6817 non-null   int64  
 5   number_of_unique_products       6817 non-null   int64  
 6   max_payment_sequential          6817 non-null   float64
 7   number_of_unique_payment_types  6817 non-null   int64  
 8   max_payment_installments        6817 non-null   float64
dtypes: float64(5), int64(3), object(1)
memory usage: 479.4+ KB
None


## RFM Analysis

In [11]:
# One row per distinct (order_id, customer_unique_id) pair
order_customer_mapping = ecom_data_df[
    ["order_id", "customer_unique_id"]
].drop_duplicates()

# Attach the customer to every order using 'order_id'.
# - Dropping a previously merged 'customer_unique_id' first makes the cell safe to
#   re-run (otherwise a second run yields customer_unique_id_x / _y columns).
# - validate="one_to_one" raises pandas.errors.MergeError if an order_id is repeated
#   on either side, instead of silently duplicating order rows.
order_level_features_df = pd.merge(
    order_level_features_df.drop(columns="customer_unique_id", errors="ignore"),
    order_customer_mapping,
    how="left",
    on="order_id",
    validate="one_to_one",
)

if isinstance(order_level_features_df, pd.DataFrame):
    print("Verification: The created object is indeed a pandas DataFrame.")
    display(
        order_level_features_df[
            ["order_id", "customer_unique_id", "total_payment_value"]
        ].head()
    )
else:
    print("Verification Warning: The created object is NOT a pandas DataFrame.")

Verification: The created object is indeed a pandas DataFrame.


Unnamed: 0,order_id,customer_unique_id,total_payment_value
0,0011d82c4b53e22e84023405fb467e57,02daaa0e021d624d1784c4ea5cc84ace,315.33
1,00229e4e43f7a7e0b9dd819ad43268d3,12d280dc9e8bf7888a3d2a8a8ae74f8d,91.39
2,002b4e6fa42cd4a22cc86abc18fe9c05,248d18ee4a4b0977fd21ef3273eb4352,119.57
3,00345f338696283410b7977d2e3efc89,93b592cbb89abbda7dc554c39add344e,67.34
4,003d0634280ff3d1d3a54459349a6899,2fe9477037604b2d6f60ffc998eb0812,228.89


Calculate Frequency (F) and Monetary (M) per customer

In [12]:
# aggregate dataframe 'order_level_features_df' base 'customer_unique_id'
customer_rfm_fm_df = (
    order_level_features_df.groupby("customer_unique_id")
    .agg(
        frequency=("order_id", "count"),
        monetary=("total_payment_value", "sum"),
    )
    .reset_index()
)

In [13]:
if isinstance(customer_rfm_fm_df, pd.DataFrame):
    print("Verification: The created object is indeed a pandas DataFrame.")
    display(customer_rfm_fm_df.head())
    # DataFrame.info() prints its own report and returns None; wrapping it in print()
    # appended a stray "None" line to the output.
    customer_rfm_fm_df.info()
else:
    print("Verification Warning: The created object is NOT a pandas DataFrame.")

Verification: The created object is indeed a pandas DataFrame.


Unnamed: 0,customer_unique_id,frequency,monetary
0,001147e649a7b1afd577e873841632dd,1,0.0
1,0015752e079902b12cd00b9b7596276b,1,0.0
2,00293787e2aec37a9c253b63fdfe2049,1,257.87
3,002d3bd901608f67c3fc11eaaa842b13,1,28.76
4,003fb95e849e71e732629d94bb92762f,1,77.59


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6798 entries, 0 to 6797
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   customer_unique_id  6798 non-null   object 
 1   frequency           6798 non-null   int64  
 2   monetary            6798 non-null   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 159.5+ KB
None


Identify the latest order date (basis for the Recency snapshot date)

In [14]:
# Most recent purchase timestamp found in the whole frame
latest_order_date = ecom_data_df.loc[:, "order_purchase_timestamp"].max()
print(f"latest order date: {latest_order_date}")

latest order date: 2018-09-09 14:42:05


Set the snapshot date used to compute Recency (one day after the latest order)

In [15]:
# Snapshot = latest order + 1 day.
# The variable name is spelled `snaphot_date`; it is kept because the Recency cell uses it.
one_day = pd.Timedelta(1, unit="D")
snaphot_date = latest_order_date + one_day
print(f"snapshot date recency: {snaphot_date}")

snapshot date recency: 2018-09-10 14:42:05


Identify Last Order Date per Customer

In [16]:
# One row per customer holding the timestamp of their most recent purchase
latest_order_per_customer = ecom_data_df.groupby(
    "customer_unique_id", as_index=False
)["order_purchase_timestamp"].max()

Calculate Recency (R)

Calculate the number of days between the snapshot date (`snaphot_date`) and each customer's last order date (stored in `latest_order_per_customer`).

In [17]:
# Whole days elapsed between the snapshot date and each customer's last purchase
last_purchase = latest_order_per_customer["order_purchase_timestamp"]
days_since_last_purchase = (snaphot_date - last_purchase).dt.days

# Store the day count as a plain integer column
latest_order_per_customer["Recency"] = days_since_last_purchase.astype(int)

In [18]:
display(latest_order_per_customer.head())
# DataFrame.info() prints its own report and returns None; wrapping it in print()
# appended a stray "None" line to the output.
latest_order_per_customer.info()

Unnamed: 0,customer_unique_id,order_purchase_timestamp,Recency
0,001147e649a7b1afd577e873841632dd,2017-08-30 14:02:49,376
1,0015752e079902b12cd00b9b7596276b,2018-08-02 18:23:51,38
2,00293787e2aec37a9c253b63fdfe2049,2017-10-25 16:44:50,319
3,002d3bd901608f67c3fc11eaaa842b13,2018-01-17 14:09:01,236
4,003fb95e849e71e732629d94bb92762f,2018-07-29 07:39:37,43


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6798 entries, 0 to 6797
Data columns (total 3 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   customer_unique_id        6798 non-null   object        
 1   order_purchase_timestamp  6798 non-null   datetime64[ns]
 2   Recency                   6798 non-null   int64         
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 159.5+ KB
None


Merge Recency to RFM DataFrame

In [19]:
# Keep only the key and the Recency value, then left-join them onto the F/M table
recency_by_customer = latest_order_per_customer[["customer_unique_id", "Recency"]]
customer_rfm_final_df = customer_rfm_fm_df.merge(
    recency_by_customer, how="left", on="customer_unique_id"
)

In [20]:
# Preview the first rows of the merged frequency / monetary / Recency table.
customer_rfm_final_df.head()

Unnamed: 0,customer_unique_id,frequency,monetary,Recency
0,001147e649a7b1afd577e873841632dd,1,0.0,376
1,0015752e079902b12cd00b9b7596276b,1,0.0,38
2,00293787e2aec37a9c253b63fdfe2049,1,257.87,319
3,002d3bd901608f67c3fc11eaaa842b13,1,28.76,236
4,003fb95e849e71e732629d94bb92762f,1,77.59,43


Rename the RFM columns to the standard names 'Recency', 'Monetary', 'Frequency'

In [None]:
# Capitalise the F and M column names to match the existing 'Recency' column.
# Reassigning the result (instead of inplace=True) keeps the step an explicit,
# chainable transformation.
customer_rfm_final_df = customer_rfm_final_df.rename(
    columns={"frequency": "Frequency", "monetary": "Monetary"}
)

In [None]:
display(customer_rfm_final_df[["Frequency", "Monetary", "Recency"]].head())
display(customer_rfm_final_df.describe())
# DataFrame.info() prints its own report and returns None; passing it to display()
# rendered an extra "None" after the report.
customer_rfm_final_df.info()

Unnamed: 0,Frequency,Monetary,Recency
0,1,0.0,376
1,1,0.0,38
2,1,257.87,319
3,1,28.76,236
4,1,77.59,43


Unnamed: 0,Frequency,Monetary,Recency
count,6798.0,6798.0,6798.0
mean,1.002795,189.250038,250.852751
std,0.052797,373.730931,153.443822
min,1.0,0.0,1.0
25%,1.0,58.3875,126.0
50%,1.0,106.04,231.0
75%,1.0,189.145,359.0
max,2.0,11200.2,705.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6798 entries, 0 to 6797
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   customer_unique_id  6798 non-null   object 
 1   Frequency           6798 non-null   int64  
 2   Monetary            6798 non-null   float64
 3   Recency             6798 non-null   int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 212.6+ KB


None