# Feature Engineering

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Load Data

In [None]:
ecom_data_df = pd.read_parquet("../dataset/processed/olist_ecom_cleaned.parquet")

In [None]:
ecom_data_df.info()

In [None]:
print(ecom_data_df.dtypes)

Change data type 'customer_zip_code_prefix' to object

In [None]:
print(
    f"data type before replace the data type: {ecom_data_df['customer_zip_code_prefix'].dtype}"
)

# change the data type
ecom_data_df["customer_zip_code_prefix"] = ecom_data_df[
    "customer_zip_code_prefix"
].astype(object)

print(
    f"data type after replace the data type: {ecom_data_df['customer_zip_code_prefix'].dtype}"
)

## Make feature base time

In [None]:
def calculate_duration(df, start_date_col, end_date_col, unit="days"):
    """
    calculate two column date durration in preferred unit
    Args:
        df (pd.DataFrame): DataFrame input.
        start_date_col (str): start column name.
        end_date_col (str): end column name.
        unit (str): unit preferred('days', 'hours', 'seconds').

    Returns:
        pd.Series: result durration that contains calculations.
                   return NaN if wrong or onf of NaT input dates.
    """
    # calculate timedelta (time gap)
    duration_timedelta = df[end_date_col] - df[start_date_col]

    # convert timedelta to total second time
    duration_seconds = duration_timedelta.dt.total_seconds()

    # convert total second time to preferred unit
    if unit == "seconds":
        return duration_seconds
    elif unit == "hours":
        return duration_seconds / 3600
    elif unit == "days":
        return duration_seconds / (24 * 3600)

    else:
        raise ValueError(
            f"unit is not supported: {unit}. choose 'days', 'hours', 'seconds'."
        )

In [None]:
# Define the specifications for each duration calculation
# Each tuple contains: (new_column_name, start_date_column, end_date_column)
duration_specs = [
    ("time_to_approve", "order_purchase_timestamp", "order_approved_at"),
    ("time_to_carrier", "order_approved_at", "order_delivered_carrier_date"),
    ("shipping_time", "order_delivered_carrier_date", "order_delivered_customer_date"),
    (
        "total_delivery_time",
        "order_purchase_timestamp",
        "order_delivered_customer_date",
    ),
    (
        "delivery_vs_estimate",
        "order_estimated_delivery_date",
        "order_delivered_customer_date",
    ),
]

# Define the unit for all calculations (assuming it's the same for now)
calculation_unit = "days"

print(f"--- Creating Time-Based Features (Unit: {calculation_unit}) ---")

# Loop through the specifications and calculate/assign the duration for each
for new_col_name, start_col, end_col in duration_specs:
    if start_col in ecom_data_df.columns and end_col in ecom_data_df.columns:
        ecom_data_df[new_col_name] = calculate_duration(
            ecom_data_df, start_col, end_col, calculation_unit
        )
        print(f"Created column: '{new_col_name}'")
    else:
        print(
            f"Warning: Skipping '{new_col_name}'. Start column '{start_col}' or end column '{end_col}' not found."
        )

print(
    f"Columns created:\n{[col for col in ecom_data_df.columns if col in [spec[0] for spec in duration_specs]]}"
)
print("Time-based features creation complete!")

## Investigate Order-Payment Relationship & Create Order Level (Non-Payment) Feature

In [None]:
print("-- Investigate relations order-payment --")

# Count how many rows for each order_id
order_id_counts = ecom_data_df["order_id"].value_counts()

# identification order_id that appears more than once (orders with multi-item or multi-payment)
multiple_rows_order_ids = order_id_counts[order_id_counts > 1].index.tolist()

print(f"unique number of order_id: {len(order_id_counts)}")
print(f"total order_id with multiple rows: {len(multiple_rows_order_ids)}")

if multiple_rows_order_ids:
    sample_order_id = multiple_rows_order_ids[0]
    print(
        f"Payment details for sample order_id with multiple rows ({sample_order_id}):"
    )
    display(
        ecom_data_df[ecom_data_df["order_id"] == sample_order_id][
            [
                "order_id",
                "payment_sequential",
                "payment_type",
                "payment_installments",
                "payment_value",
                "price",
                "freight_value",
            ]
        ].head()
    )
    print(f"total number of multiple rows: {len(ecom_data_df[ecom_data_df["order_id"] == sample_order_id])}")
else:
    print(
        "No order_id with multiple rows was found in the DataFrame (after this cleanup)."
    )

## Make feature level Comprehensif Order

In [None]:
# Aggregate data from level item/payment to level order_id
order_level_features_df = (
    ecom_data_df.groupby("order_id")
    .agg(
        # moneter feature
        total_payment_value=("payment_value", "sum"),
        total_item_price=("price", "sum"),
        total_freight_value=("freight_value", "sum"),
        # item/product feature
        number_of_rows_in_orders=("order_item_id", "count"),
        number_of_unique_products=("product_id", lambda x: x.nunique()),
        # payment feature
        max_payment_sequential=("payment_sequential", "max"),
        number_of_unique_payment_types=("payment_type", lambda x: x.nunique()),
        max_payment_installments=("payment_installments", "max"),
    )
    .reset_index()
)

In [None]:
if isinstance(order_level_features_df, pd.DataFrame):
    print("Verification: The created object is indeed a pandas DataFrame.")
    display(order_level_features_df.head())
    print(order_level_features_df.info())
else:
    print("Verification Warning: The created object is NOT a pandas DataFrame.")

## RFM Analysis

In [None]:
order_customer_mapping = ecom_data_df[
    ["order_id", "customer_unique_id"]
].drop_duplicates()

# merge mapping using 'order_id'
order_level_features_df = pd.merge(
    order_level_features_df, order_customer_mapping, how="left", on="order_id"
)

if isinstance(order_level_features_df, pd.DataFrame):
    print("Verification: The created object is indeed a pandas DataFrame.")
    display(
        order_level_features_df[
            ["order_id", "customer_unique_id", "total_payment_value"]
        ].head()
    )
else:
    print("Verification Warning: The created object is NOT a pandas DataFrame.")

Calculate Frequency (F) and Monetary (M) per customer

In [None]:
# aggregate dataframe 'order_level_features_df' base 'customer_unique_id'
customer_rfm_fm_df = (
    order_level_features_df.groupby("customer_unique_id")
    .agg(
        frequency=("order_id", "count"),
        monetary=("total_payment_value", "sum"),
    )
    .reset_index()
)

In [None]:
if isinstance(customer_rfm_fm_df, pd.DataFrame):
    print("Verification: The created object is indeed a pandas DataFrame.")
    display(customer_rfm_fm_df.head())
    print(customer_rfm_fm_df.info())
else:
    print("Verification Warning: The created object is NOT a pandas DataFrame.")

identify Snapshot Date Recency

In [None]:
latest_order_date = ecom_data_df["order_purchase_timestamp"].max()
print(f"latest order date: {latest_order_date}")

Set Final Date Recency

In [None]:
snaphot_date = latest_order_date + pd.Timedelta(days=1)
print(f"snapshot date recency: {snaphot_date}")

Identify Last Order Date per Customer

In [None]:
latest_order_per_customer = (
    ecom_data_df.groupby("customer_unique_id")["order_purchase_timestamp"]
    .max()
    .reset_index()
)

Calculate Recency (R)

calculate the difference between days `snapshot_date` and the date of their last order `latest_order_per_customer`.

In [None]:
latest_order_per_customer["Recency"] = (
    snaphot_date - latest_order_per_customer["order_purchase_timestamp"]
).dt.days

# change data type 'Recency' from object to number
latest_order_per_customer["Recency"] = latest_order_per_customer["Recency"].astype(int)

In [None]:
display(latest_order_per_customer.head())
print(latest_order_per_customer.info())

Merge Recency to RFM DataFrame

In [None]:
customer_rfm_final_df = pd.merge(
    customer_rfm_fm_df,
    latest_order_per_customer[["customer_unique_id", "Recency"]],
    on="customer_unique_id",
    how="left",
)

In [None]:
customer_rfm_final_df.head()

Replace name rfm with standard naming 'Rencency', 'Monetary', 'Frequency'

In [None]:
customer_rfm_final_df.rename(
    columns={"frequency": "Frequency", "monetary": "Monetary"}, inplace=True
)

In [None]:
display(customer_rfm_final_df[["Frequency", "Monetary", "Recency"]].head())
display(customer_rfm_final_df.describe())
display(customer_rfm_final_df.info())

Define number quantile **RFM** score & calculate score **Recency**

In [None]:
num_quantiles = 5

customer_rfm_final_df["R_score"] = pd.qcut(
    customer_rfm_final_df["Recency"], q=num_quantiles, labels=False, duplicates="drop"
)

customer_rfm_final_df["R_score"] = num_quantiles - customer_rfm_final_df["R_score"]

In [None]:
print(f"Recency score: {num_quantiles}")
display(customer_rfm_final_df[["customer_unique_id", "Recency", "R_score"]].head())

# check distribution
print(customer_rfm_final_df["R_score"].value_counts().sort_index())

Calculate limit quantile & give score for **Frequency**

In [None]:
print(f"Attempting to assign Frequency score using {num_quantiles} quantiles...")

# --- Attempt scoring using pd.qcut ---
customer_rfm_final_df["F_score"] = pd.qcut(
    customer_rfm_final_df["Frequency"],
    q=num_quantiles,
    labels=False,
    duplicates="drop",
)

# Add 1 to shift the scores from 1 to q.
customer_rfm_final_df["F_score"] = customer_rfm_final_df["F_score"] + 1

# --- Step 2: Check if the quantile scoring resulted in a "skewed" distribution ---
# We check if the number of unique scores produced is less than the requested number of quantiles.
# If it is, it indicates that pd.qcut couldn't create distinct quantiles due to skewness/duplicates.

if customer_rfm_final_df["F_score"].nunique() < num_quantiles:
    print(
        f"\nWarning: Quantile scoring for Frequency resulted in only {customer_rfm_final_df['F_score'].nunique()} unique scores (less than {num_quantiles})."
    )
    print("This indicates a highly skewed distribution preventing distinct quantiles.")
    print("Proceeding with alternative manual scoring based on unique values.")

    # --- Step 3: If skewed, apply manual scoring based on unique values ---
    # This is the logic that was originally in the 'except' block.
    # It assigns score 1 if Frequency is exactly 1, otherwise assigns score 2.
    # Make sure this logic fits your definition of manual scoring based on skewness.

    customer_rfm_final_df["F_score"] = customer_rfm_final_df["Frequency"].apply(
        lambda x: 1 if x == 1 else 2
    )
    print("Manual scoring applied successfully.")

else:
    # --- Step 4: If not skewed (unique scores == num_quantiles), the quantile scoring is considered good ---
    print("\nQuantile scoring resulted in the expected number of unique scores.")
    print("Quantile scoring is considered successful.")

In [None]:
print(f"Recency score: {num_quantiles}")
display(customer_rfm_final_df[["customer_unique_id", "Recency", "F_score"]].head())

# check distribution
print(customer_rfm_final_df["F_score"].value_counts().sort_index())

Calculate limit quantile & give score for Monetary

In [None]:
print(f"Attempting to assign Monetary score using {num_quantiles} quantiles...")

# --- Attempt scoring using pd.qcut ---
customer_rfm_final_df["M_score"] = pd.qcut(
    customer_rfm_final_df["Monetary"],
    q=num_quantiles,
    labels=False,
    duplicates="drop",
)

# Add 1 to shift the scores from 1 to q.
customer_rfm_final_df["M_score"] = customer_rfm_final_df["M_score"] + 1

# --- Step 2: Check if the quantile scoring resulted in a "skewed" distribution ---
# We check if the number of unique scores produced is less than the requested number of quantiles.
# If it is, it indicates that pd.qcut couldn't create distinct quantiles due to skewness/duplicates.

if customer_rfm_final_df["M_score"].nunique() < num_quantiles:
    print(
        f"\nWarning: Quantile scoring for Monetary resulted in only {customer_rfm_final_df['M_score'].nunique()} unique scores (less than {num_quantiles})."
    )
    print("This indicates a highly skewed distribution preventing distinct quantiles.")
    print("Proceeding with alternative manual scoring based on unique values.")

    # --- Step 3: If skewed, apply manual scoring based on unique values ---
    # This is the logic that was originally in the 'except' block.
    # It assigns score 1 if Monetary is exactly 1, otherwise assigns score 2.
    # Make sure this logic fits your definition of manual scoring based on skewness.

    customer_rfm_final_df["M_score"] = customer_rfm_final_df["Monetary"].apply(
        lambda x: 1 if x == 1 else 2
    )
    print("Manual scoring applied successfully.")

else:
    # --- Step 4: If not skewed (unique scores == num_quantiles), the quantile scoring is considered good ---
    print("\nQuantile scoring resulted in the expected number of unique scores.")
    print("Quantile scoring is considered successful.")

In [None]:
print(f"Monetary score: {num_quantiles}")
display(customer_rfm_final_df[["customer_unique_id", "Monetary", "M_score"]].head())

# check distribution
print(customer_rfm_final_df["M_score"].value_counts().sort_index())

In [None]:
# combine RFM score

customer_rfm_final_df["RFM_score_string"] = (
    customer_rfm_final_df["R_score"].astype(int).astype(str)
    + customer_rfm_final_df["F_score"].astype(int).astype(str)
    + customer_rfm_final_df["M_score"].astype(int).astype(str)
)

# make RFM score composite
customer_rfm_final_df["RFM_score_composite"] = (
    customer_rfm_final_df["R_score"]
    + customer_rfm_final_df["F_score"]
    + customer_rfm_final_df["M_score"]
)

In [None]:
display(
    customer_rfm_final_df[
        ["customer_unique_id", "R_score", "F_score", "M_score", "RFM_score_string"]
    ].head()
)

In [None]:
display(
    customer_rfm_final_df[
        [
            "customer_unique_id",
            "R_score",
            "F_score",
            "M_score",
            "RFM_score_string",
            "RFM_score_composite",
        ]
    ].head()
)
display(
    customer_rfm_final_df[
        [
            "customer_unique_id",
            "R_score",
            "F_score",
            "M_score",
            "RFM_score_string",
            "RFM_score_composite",
        ]
    ].describe()
)
print(customer_rfm_final_df.info())

Define RFM Segmentation Strategy

In [None]:
def assign_rfm_segment(row):
    """
    Assigns an RFM customer segment based on R_score, F_score, and M_score
    using a rule-based approach for better readability and maintainability.

    Args:
        row (pd.Series): A row from the DataFrame containing 'R_score',
                         'F_score', and 'M_score'.

    Returns:
        str: The assigned RFM segment label.
             Returns "Undefined Segment" if no rule matches (should not happen
             if rules cover all score combinations).
    """
    # Extract scores for easier use
    R = row["R_score"]
    F = row["F_score"]
    M = row["M_score"]

    # Define the segmentation rules as a list of (condition, segment_label) tuples.
    segmentation_rules = [
        # F == 2 (Repeat Buyers)
        (lambda r, f, m: f == 2 and r >= 4 and m >= 4, "Repeat Buyers (Champions)"),
        (
            lambda r, f, m: f == 2 and r >= 4 and m < 4,
            "Repeat Buyers (Recent & Moderate/Low M)",
        ),
        (
            lambda r, f, m: f == 2 and r < 4 and m >= 4,
            "Repeat Buyers (Less Recent & High M)",
        ),
        (
            lambda r, f, m: f == 2 and r < 4 and m < 4,
            "Repeat Buyers (Less Recent & Moderate/Low M)",
        ),
        # F == 1 (Single Buyers)
        (
            lambda r, f, m: f == 1 and r >= 4 and m >= 4,
            "Single Buyers (High Value & Recent)",
        ),
        (
            lambda r, f, m: f == 1 and r >= 4 and m < 4,
            "Single Buyers (Recent & Moderate/Low M)",
        ),
        (
            lambda r, f, m: f == 1 and r >= 2 and r < 4 and m >= 4,
            "Single Buyers (Moderate R & High M)",
        ),
        (
            lambda r, f, m: f == 1 and r >= 2 and r < 4 and m < 4,
            "Single Buyers (Moderate R & Moderate/Low M)",
        ),
        (
            lambda r, f, m: f == 1 and r < 2 and m >= 4,
            "Single Buyers (Hibernating & High M)",
        ),
        (
            lambda r, f, m: f == 1 and r < 2 and m < 4,
            "Single Buyers (Hibernating & Moderate/Low M)",
        ),
    ]

    # Iterate through the rules and return the segment label for the first matching condition
    for condition, segment_label in segmentation_rules:
        if condition(R, F, M):
            return segment_label

    # Return a default label if no condition matches (should ideally not happen
    # if all score combinations are covered by the rules based on the original logic)
    return "Undefined Segment"

In [None]:
# apply function
customer_rfm_final_df["RFM_segment"] = customer_rfm_final_df.apply(
    assign_rfm_segment, axis=1
)

In [None]:
display(
    customer_rfm_final_df[
        [
            "customer_unique_id",
            "R_score",
            "F_score",
            "M_score",
            "RFM_score_string",
            "RFM_segment",
        ]
    ].head()
)

Calculate & make a Visualization for `RFM_segment` size

In [None]:
segment_size = customer_rfm_final_df["RFM_segment"].value_counts().reset_index()

# make columns
segment_size.columns = ["RFM_segment", "customer_count"]

In [None]:
plt.figure(figsize=(14, 7))
sns.barplot(
    x="customer_count",
    y="RFM_segment",
    data=segment_size.sort_values("customer_count", ascending=False),
    palette="rocket",
)

plt.title("Distribution customer per RFM Segment", fontsize=18)
plt.xlabel("Number of Customer", fontsize=12)
plt.ylabel("RFM Segment", fontsize=12)
plt.tight_layout()
plt.show()

Characterization of RFM Segments (Average Score & Original Value)

In [None]:
segment_characteristics = (
    customer_rfm_final_df.groupby("RFM_segment")
    .agg(
        Avg_R_score=("R_score", "mean"),
        Avg_F_score=("F_score", "mean"),
        Avg_M_score=("M_score", "mean"),
        Avg_Recency=("Recency", "mean"),
        Avg_Frequency=("Frequency", "mean"),
        Avg_Monetary=("Monetary", "mean"),
        Num_Customers=("customer_unique_id", "nunique"),
    )
    .round(2)
)

In [None]:
# sort the result base number of customer (use DESC)
segment_characteristics = segment_characteristics.sort_values(
    "Num_Customers", ascending=False
)

In [None]:
print("Result after characterization RFM segments\n")
display(segment_characteristics)

Average RFM Score visualization per Segment

In [None]:
scores_for_heatmap = segment_characteristics[
    [
        "Avg_R_score",
        "Avg_F_score",
        "Avg_M_score",
    ]
]

plt.figure(figsize=(10, 7))
sns.heatmap(
    scores_for_heatmap,
    annot=True,
    cmap="rocket",
    fmt=".2f",
    linewidths=0.5,
    linecolor="black",
    cbar_kws={"label": "Rata-rata Skor"},
)

plt.title("Avarage Score R, F, M per Segment", fontsize=16)
plt.xlabel("RFM Score", fontsize=12)
plt.ylabel("RFM Segments", fontsize=12)
plt.xticks(rotation=45, ha="right", fontsize=10)
plt.yticks(rotation=0, fontsize=10)
plt.tight_layout()
plt.show()

## Further Data Exploration & RFM Cross-Segment Analysis

Identify product or category products that most bought by customer

In [None]:
# merge data to get product categories per customer_unique_id and RFM segment

product_info_df = ecom_data_df[
    ["customer_unique_id", "order_id", "product_id", "product_category_name"]
].copy()

# drop duplicate
product_info_df.drop_duplicates(inplace=True)

# merge data
df_segment_products = pd.merge(
    customer_rfm_final_df, product_info_df, how="left", on="customer_unique_id"
)

# drop rows without product categries
df_segment_products.dropna(subset=["product_category_name"], inplace=True)

In [None]:
# count the number of times a product category appears per segment
product_preference_per_segment = (
    df_segment_products.groupby(["RFM_segment", "product_category_name"])
    .size()
    .reset_index(name="order_count")
)

In [None]:
# for every segment, get 5 higher category product
top_n = 5
top_products_per_segment = (
    product_preference_per_segment.sort_values(
        ["RFM_segment", "order_count"], ascending=[False, False]
    )
    .groupby("RFM_segment")
    .head(top_n)
    .reset_index(drop=True)
)

Identify most city/state per segment

In [None]:
df_segment_geo = pd.merge(
    customer_rfm_final_df,
    ecom_data_df[["customer_unique_id", "customer_city", "customer_state"]],
    on="customer_unique_id",
    how="left",
)

df_segment_geo.dropna(subset=["customer_city", "customer_state"], inplace=True)

In [None]:
# calculate number of unique customer per segment
geo_distribution_per_segment_city = (
    df_segment_geo.groupby(["RFM_segment", "customer_city"])["customer_unique_id"]
    .nunique()
    .reset_index(name="customer_count")
)
geo_distribution_per_segment_state = (
    df_segment_geo.groupby(["RFM_segment", "customer_state"])["customer_unique_id"]
    .nunique()
    .reset_index(name="customer_count")
)

In [None]:
# Take the top 5 cities/states per segment
top_n = 5
top_cities_per_segment = (
    geo_distribution_per_segment_city.sort_values(
        ["RFM_segment", "customer_count"], ascending=[False, False]
    )
    .groupby("RFM_segment")
    .head(top_n)
    .reset_index(drop=True)
)

top_states_per_segment = (
    geo_distribution_per_segment_state.sort_values(
        ["RFM_segment", "customer_count"], ascending=[False, False]
    )
    .groupby("RFM_segment")
    .head(top_n)
    .reset_index(drop=True)
)

Analysis of payment method per RFM segment

In [None]:
payment_info_df = ecom_data_df[
    ["customer_unique_id", "order_id", "payment_type"]
].copy()

# drop duplicate
payment_info_df.drop_duplicates(inplace=True)

# merge data
df_segment_payments = pd.merge(
    customer_rfm_final_df, payment_info_df, how="left", on="customer_unique_id"
)

# drop rows without product categries
df_segment_payments.dropna(subset=["payment_type"], inplace=True)

# count amount transaction per payment method
payment_method_per_segment = (
    df_segment_payments.groupby(["RFM_segment", "payment_type"])
    .size()
    .reset_index(name="transaction_count")
)

In [None]:
# get the top 5 payment method
top_payments_per_segment = (
    payment_method_per_segment.sort_values(
        ["RFM_segment", "transaction_count"], ascending=[False, False]
    )
    .groupby("RFM_segment")
    .head(top_n)
    .reset_index(drop=True)
)

display all all result

In [None]:
print("top category products:\n")
display(top_products_per_segment)
print("top cities for every segments customer:\n")
display(top_cities_per_segment)
print("top states for every segments customer:\n")
display(top_states_per_segment)
print("top payment method for every segments customer:\n")
display(top_payments_per_segment)

Analyze time delivery per segment RFM

In [None]:
segment_delivery_time = pd.merge(
    customer_rfm_final_df,
    ecom_data_df[
        [
            "customer_unique_id",
            "order_id",
            "order_purchase_timestamp",
            "order_delivered_customer_date",
        ]
    ],
    on="customer_unique_id",
    how="left",
)

# check if date column is datetime
if pd.api.types.is_datetime64_any_dtype(
    ecom_data_df["order_purchase_timestamp"]
) and pd.api.types.is_datetime64_any_dtype(
    ecom_data_df["order_delivered_customer_date"]
):
    print(
        "Both columns (order_purchase_timestamp & order_delivered_customer_date) are already of datetime type"
    )
else:
    print("One or both columns are NOT of datetime type. Convert now...")
    ecom_data_df[["order_purchase_timestamp", "order_delivered_customer_date"]] = (
        pd.to_datetime(
            ecom_data_df[["order_purchase_timestamp", "order_delivered_customer_date"]],
            errors="coerce",
        )
    )
    print("The columns are successfully converted to datetime.")

In [None]:
# calculate delivery time in day
segment_delivery_time["delivery_time_days"] = (
    segment_delivery_time["order_delivered_customer_date"]
    - segment_delivery_time["order_purchase_timestamp"]
).dt.days.astype("Int64")

In [None]:
display(segment_delivery_time[["delivery_time_days"]].head(1))

In [None]:
# calculate avarage time delivery per segment RFM
avg_time_delivery = (
    segment_delivery_time.groupby("RFM_segment")["delivery_time_days"]
    .mean()
    .reset_index()
)

# rename column
avg_time_delivery.rename(
    columns={"delivery_time_days": "avg_delivery_time_days"}, inplace=True
)

# sort data time delivery
avg_time_delivery = avg_time_delivery.sort_values(
    by="avg_delivery_time_days", ascending=False
)

In [80]:
display(avg_time_delivery)

Unnamed: 0,RFM_segment,avg_delivery_time_days
0,Repeat Buyers (Champions),13.4
2,Repeat Buyers (Less Recent & Moderate/Low M),13.375
7,Single Buyers (Moderate R & High M),12.808605
1,Repeat Buyers (Less Recent & High M),12.5
8,Single Buyers (Moderate R & Moderate/Low M),12.174123
4,Single Buyers (Hibernating & High M),11.181219
5,Single Buyers (Hibernating & Moderate/Low M),10.565882
3,Repeat Buyers (Recent & Moderate/Low M),10.333333
6,Single Buyers (High Value & Recent),10.306224
9,Single Buyers (Recent & Moderate/Low M),9.472075


Analyze price product per segment

In [77]:
# merge data to get price product per customer_unique_id and RFM segment
price_product_info = ecom_data_df[["customer_unique_id", "order_id", "price"]].copy()

# drop duplicate
price_product_info.drop_duplicates(inplace=True)

# merge data
price_products_df = pd.merge(
    customer_rfm_final_df, price_product_info, how="left", on="customer_unique_id"
)

# drop NaN price column
price_products_df.dropna(subset=["price"], inplace=True)

In [78]:
# calculate avarage time delivery per segment RFM
avg_price_product = (
    price_products_df.groupby("RFM_segment")["price"].mean().reset_index()
)

# rename column
avg_price_product.rename(columns={"price": "avg_price_product"}, inplace=True)

# sort data time delivery
avg_price_product = avg_price_product.sort_values(
    by="avg_price_product", ascending=False
)

In [79]:
display(avg_price_product)

Unnamed: 0,RFM_segment,avg_price_product
4,Single Buyers (Hibernating & High M),227.851764
6,Single Buyers (High Value & Recent),225.155718
7,Single Buyers (Moderate R & High M),210.467118
1,Repeat Buyers (Less Recent & High M),161.734286
0,Repeat Buyers (Champions),87.485
5,Single Buyers (Hibernating & Moderate/Low M),58.491981
9,Single Buyers (Recent & Moderate/Low M),58.025036
8,Single Buyers (Moderate R & Moderate/Low M),58.010429
3,Repeat Buyers (Recent & Moderate/Low M),50.455
2,Repeat Buyers (Less Recent & Moderate/Low M),45.15
