In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

In [2]:
# Read the cleaned Olist e-commerce table from the processed-dataset folder.
cleaned_parquet_path = "../dataset/processed/olist_ecom_cleaned.parquet"
ecom_data_df = pd.read_parquet(cleaned_parquet_path)

In [3]:
# Map one customer's R/F/M scores to a named RFM segment
def assign_rfm_segment(row):
    """Return the RFM segment label for a single scored customer.

    Parameters
    ----------
    row : mapping-like (dict, pandas Series, ...)
        Must expose the keys "R_Score", "F_Score" and "M_Score".

    Returns
    -------
    str
        The label of the first matching rule below, or "Other" when no
        rule matches. Rule order matters: several rules overlap, and the
        earliest one wins.
    """
    recency, frequency, monetary = row["R_Score"], row["F_Score"], row["M_Score"]

    # (label, condition) pairs, listed from highest to lowest precedence.
    ordered_rules = (
        ("Champions", recency >= 4 and frequency >= 4 and monetary >= 4),
        ("Loyal Customers", recency >= 3 and frequency >= 3 and monetary >= 3),
        ("Potential Loyalist", recency >= 4 and frequency >= 2 and monetary >= 2),
        ("New Customers (High Value)", recency >= 4 and frequency == 1 and monetary >= 3),
        (
            "New Customers (Moderate/Low Value)",
            recency >= 4 and frequency == 1 and monetary < 3,
        ),
        ("At Risk", recency <= 2 and frequency >= 3 and monetary >= 3),
        ("Cant Lose Them", recency <= 2 and frequency <= 2 and monetary >= 3),
        ("Hibernating", recency <= 2 and frequency <= 2 and monetary < 3),
        # Recency >= 4 with frequency == 1 is already claimed by the two
        # "New Customers" rules, so in practice this catches recency == 3.
        ("About To Sleep", recency >= 3 and frequency == 1),
    )

    for segment_label, rule_matches in ordered_rules:
        if rule_matches:
            return segment_label
    return "Other"

In [4]:
# Snapshot (observation) date: the reference point for Recency and churn.
# It is placed 60 days after the most recent purchase found in the data.
observation_gap = timedelta(days=60)

latest_purchase_date_in_data = ecom_data_df["order_purchase_timestamp"].max()
observation_date = latest_purchase_date_in_data + observation_gap

print(
    f"Last date purchased in the dataset: {latest_purchase_date_in_data}",
    f"Date of Observation (to calculate Recency & Churn): {observation_date}",
    sep="\n",
)

Last date purchased in the dataset: 2018-09-09 14:42:05
Date of Observation (to calculate Recency & Churn): 2018-11-08 14:42:05


In [5]:
# Per-customer Recency / Frequency / Monetary inputs for churn prediction.
#
# - last_purchase_date: latest order_purchase_timestamp of the customer
# - Frequency: number of distinct order_id values of the customer
# - Monetary: sum of `price` over all of the customer's rows
# - Recency: whole days between observation_date and last_purchase_date
#
# NOTE(review): the table also carries `payment_type`, which suggests item rows
# may have been joined with payments. If an order item can appear on more than
# one row, the plain sum of `price` over-counts Monetary — confirm row grain.
rfm_aggregations = {
    "last_purchase_date": pd.NamedAgg(
        column="order_purchase_timestamp", aggfunc="max"
    ),
    "Frequency": pd.NamedAgg(column="order_id", aggfunc="nunique"),
    "Monetary": pd.NamedAgg(column="price", aggfunc="sum"),
}

customer_data_for_rfm = (
    ecom_data_df.groupby("customer_unique_id").agg(**rfm_aggregations).reset_index()
)

days_since_last_purchase = (
    observation_date - customer_data_for_rfm["last_purchase_date"]
).dt.days
customer_data_for_rfm["Recency"] = days_since_last_purchase

In [6]:
num_quantiles = 5
rfm_scores_for_churn = customer_data_for_rfm.copy()


def _zero_based_quantile_bin(metric):
    """Return the 0-indexed quantile bin of each value in ``metric``.

    Duplicate bin edges are dropped, so fewer than ``num_quantiles`` bins may
    come back when many values are tied.
    """
    return pd.qcut(metric, q=num_quantiles, labels=False, duplicates="drop")


# NOTE(review): with duplicates="drop", a column dominated by one value (e.g.
# Frequency when most customers ordered once) can collapse to very few bins,
# so its score may not span 1..num_quantiles even though assign_rfm_segment
# compares against thresholds up to 4 — verify with F_Score.value_counts().
frequency_bin = _zero_based_quantile_bin(rfm_scores_for_churn["Frequency"])
monetary_bin = _zero_based_quantile_bin(rfm_scores_for_churn["Monetary"])
recency_bin = _zero_based_quantile_bin(rfm_scores_for_churn["Recency"])

# Frequency and Monetary: higher is better, so shift the 0-based bin
# (0, 1, 2, ...) to a 1-based score (1, 2, 3, ...).
rfm_scores_for_churn["F_Score"] = (frequency_bin + 1).astype(int)
rfm_scores_for_churn["M_Score"] = (monetary_bin + 1).astype(int)

# Recency: lower is better, so the scale is flipped. The flip is anchored on
# the largest bin actually produced, i.e. score = (max_bin - bin) + 1; with
# five bins that maps 0 -> 5 and 4 -> 1.
max_score_value = recency_bin.max()
rfm_scores_for_churn["R_Score"] = (max_score_value - recency_bin + 1).astype(int)

# Concatenate the three scores into a single string code in R, F, M order.
rfm_scores_for_churn["RFM_Score"] = (
    rfm_scores_for_churn["R_Score"]
    .astype(str)
    .str.cat(
        [
            rfm_scores_for_churn["F_Score"].astype(str),
            rfm_scores_for_churn["M_Score"].astype(str),
        ]
    )
)

In [7]:
# Label every customer row with its RFM segment (row-wise apply).
segment_labels = rfm_scores_for_churn.apply(assign_rfm_segment, axis="columns")
rfm_scores_for_churn["RFM_Segment"] = segment_labels

In [8]:
# Churn label: 1 when Recency is strictly greater than the threshold, else 0.
churn_threshold_days = 270

recency_exceeds_threshold = rfm_scores_for_churn["Recency"].gt(churn_threshold_days)
rfm_scores_for_churn["is_churned"] = recency_exceeds_threshold.astype(int)

In [9]:
# Favourite product category per customer: the most frequent
# product_category_name across the customer's rows.
def _favourite_category(categories):
    """Return the first mode of ``categories``, or "unknown_category" if none.

    ``Series.mode()`` ignores missing values by default, so a customer whose
    categories are all missing yields an empty result and gets the fallback.
    The mode is computed once per customer.
    """
    modes = categories.mode()
    return "unknown_category" if modes.empty else modes.iloc[0]


customer_product_category = (
    ecom_data_df.groupby("customer_unique_id")["product_category_name"]
    .apply(_favourite_category)
    .reset_index()
    .rename(columns={"product_category_name": "fav_product_category"})
)

In [10]:
# Dominant payment method per customer: the most frequent payment_type
# across the customer's rows.
def _dominant_payment(payment_types):
    """Return the first mode of ``payment_types``, or "unknown_payment" if none.

    ``Series.mode()`` ignores missing values by default, so a customer whose
    payment types are all missing yields an empty result and gets the fallback.
    The mode is computed once per customer.
    """
    modes = payment_types.mode()
    return "unknown_payment" if modes.empty else modes.iloc[0]


customer_dominant_payment = (
    ecom_data_df.groupby("customer_unique_id")["payment_type"]
    .apply(_dominant_payment)
    .reset_index()
    .rename(columns={"payment_type": "dominant_payment_type"})
)

In [11]:
# Mean delivery time (in whole days) per customer.
# If the table has no delivery_duration_days column yet, derive it as
# delivered-to-customer date minus purchase timestamp and store it on
# ecom_data_df.
if "delivery_duration_days" not in ecom_data_df.columns:
    purchase_to_delivery = (
        ecom_data_df["order_delivered_customer_date"]
        - ecom_data_df["order_purchase_timestamp"]
    )
    ecom_data_df["delivery_duration_days"] = purchase_to_delivery.dt.days

avg_delivery_time_per_customer = (
    ecom_data_df.groupby("customer_unique_id")["delivery_duration_days"]
    .mean()
    .reset_index()
    .rename(columns={"delivery_duration_days": "avg_delivery_time_days"})
)

In [12]:
# Left-join each per-customer feature table onto the scored RFM frame,
# keyed on customer_unique_id, so every scored customer is kept.
churn_prediction_df = rfm_scores_for_churn
for per_customer_features in (
    customer_product_category,
    customer_dominant_payment,
    avg_delivery_time_per_customer,
):
    churn_prediction_df = churn_prediction_df.merge(
        per_customer_features, on="customer_unique_id", how="left"
    )

In [13]:
# Fill remaining NaN values in the merged features:
# - categorical features get an explicit "unknown" label
# - delivery time gets the column mean

churn_prediction_df["fav_product_category"] = churn_prediction_df[
    "fav_product_category"
].fillna("unknown_category")

# Use the same sentinel that the dominant-payment aggregation emits
# ("unknown_payment"), so "unknown" is a single category rather than two.
churn_prediction_df["dominant_payment_type"] = churn_prediction_df[
    "dominant_payment_type"
].fillna("unknown_payment")

churn_prediction_df["avg_delivery_time_days"] = churn_prediction_df[
    "avg_delivery_time_days"
].fillna(churn_prediction_df["avg_delivery_time_days"].mean())

In [14]:
# Final churn-prediction dataset that will be used for modeling.
#
# NOTE(review): is_churned is defined as Recency > churn_threshold_days, so
# Recency (and R_Score / RFM_Segment, which are derived from it) reveal the
# label. They should not be fed to the model as predictors — confirm against
# the modeling code.
modeling_columns = [
    "customer_unique_id",
    "Recency",
    "Frequency",
    "Monetary",
    "R_Score",
    "F_Score",
    "M_Score",
    "RFM_Segment",
    "fav_product_category",
    "dominant_payment_type",
    "avg_delivery_time_days",
    "is_churned",
]

# Explicit copy: the modeling frame is independent of churn_prediction_df, so
# assigning columns on it later does not raise SettingWithCopyWarning.
final_churn_dataset = churn_prediction_df[modeling_columns].copy()

In [None]:
# Coerce the delivery-time feature to a numeric dtype (unparseable values
# become NaN).
churn_prediction_df["avg_delivery_time_days"] = pd.to_numeric(
    churn_prediction_df["avg_delivery_time_days"], errors="coerce"
)

# final_churn_dataset was already selected from churn_prediction_df in the
# previous cell, so the coercion above does not reach it. Apply the same
# conversion to the modeling frame so both stay in agreement.
final_churn_dataset = final_churn_dataset.assign(
    avg_delivery_time_days=pd.to_numeric(
        final_churn_dataset["avg_delivery_time_days"], errors="coerce"
    )
)

In [16]:
# Summarise the churn label: absolute counts, then shares formatted as
# percentages with two decimals, followed by a preview and schema of the frame.
churn_counts = final_churn_dataset["is_churned"].value_counts()
churn_share_pct = final_churn_dataset["is_churned"].value_counts(normalize=True).mul(100)

print("Distribution churn status:")
print(churn_counts)
print("\nPercentage churn:")
print(churn_share_pct.apply(lambda share: f"{share:.2f}%"))

display(final_churn_dataset.head())
final_churn_dataset.info()

Distribution churn status:
is_churned
1    3700
0    3098
Name: count, dtype: int64

Percentage churn:
is_churned
1    54.43%
0    45.57%
Name: proportion, dtype: object


Unnamed: 0,customer_unique_id,Recency,Frequency,Monetary,R_Score,F_Score,M_Score,RFM_Segment,fav_product_category,dominant_payment_type,avg_delivery_time_days,is_churned
0,001147e649a7b1afd577e873841632dd,435,1,170.0,2,1,5,Cant Lose Them,utilidades_domesticas,credit_card,9.0,1
1,0015752e079902b12cd00b9b7596276b,97,1,59.8,5,1,2,New Customers (Moderate/Low Value),malas_acessorios,credit_card,3.0,0
2,00293787e2aec37a9c253b63fdfe2049,378,1,118.9,2,1,4,Cant Lose Them,unknown_category,boleto,14.0,1
3,002d3bd901608f67c3fc11eaaa842b13,295,1,19.49,3,1,1,About To Sleep,bebidas,boleto,9.0,1
4,003fb95e849e71e732629d94bb92762f,102,1,57.99,5,1,2,New Customers (Moderate/Low Value),cama_mesa_banho,credit_card,6.0,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6798 entries, 0 to 6797
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   customer_unique_id      6798 non-null   object 
 1   Recency                 6798 non-null   int64  
 2   Frequency               6798 non-null   int64  
 3   Monetary                6798 non-null   float64
 4   R_Score                 6798 non-null   int64  
 5   F_Score                 6798 non-null   int64  
 6   M_Score                 6798 non-null   int64  
 7   RFM_Segment             6798 non-null   object 
 8   fav_product_category    6798 non-null   object 
 9   dominant_payment_type   6798 non-null   object 
 10  avg_delivery_time_days  6798 non-null   float64
 11  is_churned              6798 non-null   int64  
dtypes: float64(2), int64(6), object(4)
memory usage: 637.4+ KB
