In [2]:
import pandas as pd
import numpy as np



In [52]:
enriched_final_rfm_df=pd.read_parquet('../csv_export/RFM.parquet')

In [4]:
enriched_final_rfm_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 223393 entries, 0 to 223392
Data columns (total 30 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   CUSTOMER_ID                     223393 non-null  object 
 1   Recency                         223393 non-null  int64  
 2   Frequency                       223393 non-null  int64  
 3   Monetary                        223393 non-null  float64
 4   Total_Units                     223393 non-null  int64  
 5   Total_Organic_Discount          223393 non-null  float64
 6   Total_Extra_Discount            223393 non-null  float64
 7   Total_Discounts                 223393 non-null  float64
 8   AVG_ORGANIC_DISCOUNT_PER_ORDER  223393 non-null  float64
 9   USED_ORGANIC_DISCOUNT_RATIO     223393 non-null  float64
 10  AVG_EXTRA_DISCOUNT_PER_ORDER    223393 non-null  float64
 11  USED_EXTRA_DISCOUNT_RATIO       223393 non-null  float64
 12  AVG_TOTAL_DISCOU

In [53]:
enriched_final_rfm_df.dropna(inplace=True)

## KNN

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

categorical_cols = [
    'FAVOURITE_STORE',
    'FAVOURITE_STORE_TYPE',
    'FAVOURITE_PAYMENT_METHOD',
    'FAVORITE_CATEGORY',
    'FAVORITE_SUB_CATEGORY',
    'FAVORITE_WEEKDAY'
]

numerical_cols = [
    col for col in enriched_final_rfm_df.select_dtypes(include=['int64', 'float64']).columns
    if col != 'CUSTOMER_ID' and col != 'FAVORITE_HOUR'  
]

categorical_cols.append('FAVORITE_HOUR')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
    ]
)

X_processed = preprocessor.fit_transform(enriched_final_rfm_df)

num_features = numerical_cols
cat_features = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols)
all_features = list(num_features) + list(cat_features)

X_df = pd.DataFrame(X_processed, columns=all_features, index=enriched_final_rfm_df.index)


In [75]:
from sklearn.ensemble import IsolationForest

iso_forest = IsolationForest(contamination=0.05, random_state=42)  
enriched_final_rfm_df['Anomaly_Score'] = iso_forest.fit_predict(X_df)

enriched_final_rfm_df['Churn_Label_IsoForest'] = (enriched_final_rfm_df['Anomaly_Score'] == -1).astype(int)


## Treatment


In [80]:
enriched_final_rfm_df['Treatment'] = (enriched_final_rfm_df['USED_EXTRA_DISCOUNT_RATIO'] > 0.05).astype(bool)


## Modelling 

In [81]:
y = enriched_final_rfm_df.loc[X_df.index, 'Churn_Label_IsoForest']
treatment = enriched_final_rfm_df.loc[X_df.index, 'Treatment'].astype(int)


In [82]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test, treat_train, treat_test = train_test_split(
    X_df, y, treatment, test_size=0.3, stratify=treatment, random_state=42
)


In [11]:
y.shape

(222910,)

In [12]:
X_df.shape

(222910, 181)

In [13]:
treatment.shape

(222910,)

In [83]:
from sklift.models import TwoModels
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

uplift_model = TwoModels(
    estimator_trmnt=XGBClassifier(
        n_estimators=1000,
        learning_rate=0.05,
        max_depth=6,
        random_state=42,
        eval_metric='logloss'
    ),
    estimator_ctrl=RandomForestClassifier(
        n_estimators=1000,
        max_depth=None,
        random_state=42
    ),
    method='vanilla'
)

uplift_model.fit(X_train, y_train, treat_train)


## Evaluation 

In [84]:
from sklift.metrics import uplift_at_k, qini_auc_score

uplift_preds = uplift_model.predict(X_test)

qini_score = qini_auc_score(y_test, uplift_preds, treat_test)
print(f"Qini AUC Score: {qini_score:.4f}")



Qini AUC Score: 0.5479


In [85]:
enriched_final_rfm_df.loc[X_test.index, 'Uplift_Score'] = uplift_model.predict(X_test)


In [87]:
churned_customers = enriched_final_rfm_df[enriched_final_rfm_df['Churn_Label_IsoForest'] == 1]

top_churned_customers = churned_customers.sort_values('Uplift_Score', ascending=False).head(10000)


In [88]:
def assign_discount(uplift):
    if uplift > 0.05:        
        return 0.25
    elif uplift > 0.03:      
        return 0.15
    elif uplift > 0.015:     
        return 0.05
    else:
        return 0.00          


In [89]:
top_churned_customers['Assigned_Discount'] = top_churned_customers['Uplift_Score'].apply(assign_discount)
top_churned_customers['Expected_Revenue_Saved'] = top_churned_customers['Monetary'] * top_churned_customers['Uplift_Score']
top_churned_customers['Discount_Cost'] = top_churned_customers['Monetary'] * top_churned_customers['Assigned_Discount']
top_churned_customers['ROI'] = top_churned_customers['Expected_Revenue_Saved'] / top_churned_customers['Discount_Cost'].replace(0, 1)


In [92]:
top_targets = top_churned_customers.sort_values(by='Uplift_Score', ascending=False).head(10000)


In [93]:
top_targets

Unnamed: 0,CUSTOMER_ID,Recency,Frequency,Monetary,Total_Units,Total_Organic_Discount,Total_Extra_Discount,Total_Discounts,AVG_ORGANIC_DISCOUNT_PER_ORDER,USED_ORGANIC_DISCOUNT_RATIO,...,FAVORITE_WEEKDAY,FAVORITE_HOUR,Anomaly_Score,Churn_Label_IsoForest,Treatment,Uplift_Score,Assigned_Discount,Expected_Revenue_Saved,Discount_Cost,ROI
63407,48ed646e0699a87167a8224b18077963244d322065233a...,914,1,166.400,6,0.00,55.600,55.600,0.000000,0.000000,...,Thursday,10,-1,1,True,0.857953,0.25,142.763433,41.6000,3.431813
109542,7d5f51cb1895ab4fcff4b771211770add50bb34b17269c...,1187,1,39.840,2,0.00,15.960,15.960,0.000000,0.000000,...,Thursday,10,-1,1,True,0.810166,0.25,32.277000,9.9600,3.240663
180855,ced3e9c8bc612f5d396b4ac9c758d423bdd87034744c03...,19,3,254.200,6,46.49,0.000,46.490,15.496667,1.000000,...,Tuesday,13,-1,1,False,0.789775,0.25,200.760718,63.5500,3.159099
133163,985c846424d7923dac71e552da01737f1dc1f8b141611d...,280,38,1625.906,99,166.50,902.794,1069.294,4.381579,0.473684,...,Thursday,13,-1,1,True,0.766993,0.25,1247.057769,406.4765,3.067970
156549,b321185dac0c231c974269deb77a1c79714c6709324004...,626,44,2286.520,119,0.00,674.980,674.980,0.000000,0.000000,...,Sunday,13,-1,1,True,0.765090,0.25,1749.392860,571.6300,3.060359
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189980,d93ab2da789bbe16ad050a23837bc21857457278c34bc1...,674,4,142.390,27,0.00,0.000,0.000,0.000000,0.000000,...,Monday,13,-1,1,False,,0.00,,0.0000,
189989,d93d4b3be783aa12771ac9ed082dfb2745eaabae9518b5...,569,69,5588.360,231,0.00,1602.740,1602.740,0.000000,0.000000,...,Saturday,19,-1,1,True,,0.00,,0.0000,
190023,d9485697292d3c867afbcf9abbdc454cb8f4843eb8928d...,93,1,153.400,5,45.50,0.000,45.500,45.500000,1.000000,...,Saturday,12,-1,1,False,,0.00,,0.0000,
190064,d9574d22e464f5d8cff06f9433402bd84bae24cfb1fce8...,812,1,138.000,20,0.00,12.000,12.000,0.000000,0.000000,...,Monday,10,-1,1,True,,0.00,,0.0000,
