In [1]:
import sys
from pathlib import Path

PROJECT_ROOT = Path.cwd().parent
sys.path.append(str(PROJECT_ROOT))

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

import mlflow
import mlflow.sklearn

from src.data_pipeline.load_data import DataLoader
from src.data_pipeline.preprocess import Feature_Engineer, Preprocessor

In [2]:
loader = DataLoader(PROJECT_ROOT/"data")

customers = loader.load_customers()
tx = loader.load_transactions()
prods = loader.load_products()


In [3]:
customers.head()

Unnamed: 0,customer_id,name,email,phone_number,country,city,age,gender,interests,signup_date,last_purchase_date,total_spent,purchase_frequency,churn
0,CUST0001,Laura Hill,patrickcooke@hamilton.com,+1 (830) 982-9806,CA,South Lisa,61,F,"Beauty, Fashion",2022-08-17,2025-02-13,225.82,1.9,0
1,CUST0002,Kimberly Burnett,stonebrenda@hotmail.com,(938) 909-3033,US,Lake Priscilla,33,M,"Books, Sports, Electronics",2022-08-15,2024-04-17,3001.28,4.2,1
2,CUST0003,Abigail West,williamsbailey@hotmail.com,+1 (265) 189-5206,CA,North Tinahaven,45,Other,"Electronics, Sports",2022-12-18,2025-07-05,3934.91,2.1,0
3,CUST0004,Carolyn Rowland,kevinshah@conway.com,+234 690 602 6127,NG,Catherinehaven,49,M,"Books, Home Appliances",2023-03-19,2024-10-03,162.0,2.4,1
4,CUST0005,Amanda Johnson,melissa32@arnold.org,+44 7914 177971,UK,Jorgeburgh,59,F,"Sports, Home Appliances, Beauty",2023-01-09,2023-06-11,2233.89,3.8,1


In [4]:
prods.head()

Unnamed: 0,product_id,product_name,category,description,price,stock_status
0,PROD0001,Sharable bifurcated algorithm,Beauty,Development say quality throughout beautiful. ...,60.11,Out of Stock
1,PROD0002,Reactive explicit product,Fashion,Young catch management sense technology. Physi...,115.49,In Stock
2,PROD0003,Sharable motivating portal,Beauty,Hear claim well two. Out major born.,371.63,In Stock
3,PROD0004,Universal fault-tolerant initiative,Sports,Best issue interest level. Pull worker better.,213.85,In Stock
4,PROD0005,Down-sized incremental matrix,Electronics,Ten total clearly able hospital. Expect recent...,113.23,In Stock


In [5]:
tx.head()

Unnamed: 0,transaction_id,customer_id,product_id,purchase_date,quantity,total_price
0,TX00001,CUST0001,PROD0131,2022-09-14,2,124.18
1,TX00002,CUST0001,PROD0100,2023-11-03,5,2460.05
2,TX00003,CUST0001,PROD0041,2022-09-17,3,249.87
3,TX00004,CUST0002,PROD0086,2023-09-28,3,116.76
4,TX00005,CUST0002,PROD0051,2022-12-13,4,1536.36


In [6]:
prep = Preprocessor()
clean_cutomers = prep.clean_customer_data(customers)
clean_tx = prep.clean_tx_data(tx)
clean_prods = prep.clean_product_data(prods)

In [7]:
feat_eng = Feature_Engineer()
customers_eng = feat_eng.engineer_customer_features(customers=clean_cutomers, products=prods, transactions=clean_tx)
products_eng = feat_eng.engineer_product_features(transactions=clean_tx, products=clean_prods)
tx_eng = feat_eng.engineer_tx_features(clean_tx)

In [8]:
customers.columns

Index(['customer_id', 'name', 'email', 'phone_number', 'country', 'city',
       'age', 'gender', 'interests', 'signup_date', 'last_purchase_date',
       'total_spent', 'purchase_frequency', 'churn'],
      dtype='object')

In [9]:
# Merge datasets

merged = feat_eng.merge_all_datasets(tx_eng, customers_eng, products_eng)
merged.head(5)

INFO:src.utils.logger:datasets merged...


Unnamed: 0,transaction_id,customer_id,product_id,purchase_date,quantity,total_price,days_since_last_purchase,product_name,category,description,...,signup_date,last_purchase_date,total_spent,purchase_frequency,churn,num_purchases,avg_purchase_value,recency_days,days_since_signup,top_category
0,TX00001,CUST0001,PROD0131,2022-09-14 00:00:00+00:00,2,124.18,143.5,extended radical productivity,Electronics,Beat painting heavy shake. Sort continue book ...,...,2022-08-17 00:00:00+00:00,2025-02-13,2834.1,1.9,0,3,944.7,773,1216,Electronics
1,TX00003,CUST0001,PROD0041,2022-09-17 00:00:00+00:00,3,249.87,3.0,future-proofed client-driven benchmark,Home Appliances,Small control see the face also.\nOwner up fri...,...,2022-08-17 00:00:00+00:00,2025-02-13,2834.1,1.9,0,3,944.7,773,1216,Electronics
2,TX00002,CUST0001,PROD0100,2023-11-03 00:00:00+00:00,5,2460.05,412.0,re-contextualized mission-critical throughput,Sports,Which learn leave only agree break. Him small ...,...,2022-08-17 00:00:00+00:00,2025-02-13,2834.1,1.9,0,3,944.7,773,1216,Electronics
3,TX00006,CUST0002,PROD0084,2022-08-27 00:00:00+00:00,1,241.01,143.5,multi-lateral transitional knowledge user,Sports,Service develop game stop. Natural hear blood ...,...,2022-08-15 00:00:00+00:00,2024-04-17,1894.13,4.2,1,3,631.376667,809,1218,Sports
4,TX00005,CUST0002,PROD0051,2022-12-13 00:00:00+00:00,4,1536.36,108.0,future-proofed coherent implementation,Sports,First blood accept final growth especially. Be...,...,2022-08-15 00:00:00+00:00,2024-04-17,1894.13,4.2,1,3,631.376667,809,1218,Sports


In [10]:
customers_eng.head()

Unnamed: 0_level_0,name,email,phone_number,country,city,age,gender,interests,signup_date,last_purchase_date,total_spent,purchase_frequency,churn,num_purchases,avg_purchase_value,recency_days,days_since_signup,top_category
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
CUST0001,laura hill,patrickcooke@hamilton.com,+1 (830) 982-9806,CA,South Lisa,61,F,"Beauty, Fashion",2022-08-17 00:00:00+00:00,2025-02-13,2834.1,1.9,0,3,944.7,773,1216,Electronics
CUST0002,kimberly burnett,stonebrenda@hotmail.com,(938) 909-3033,US,Lake Priscilla,33,M,"Books, Sports, Electronics",2022-08-15 00:00:00+00:00,2024-04-17,1894.13,4.2,1,3,631.376667,809,1218,Sports
CUST0003,abigail west,williamsbailey@hotmail.com,+1 (265) 189-5206,CA,North Tinahaven,45,Other,"Electronics, Sports",2022-12-18 00:00:00+00:00,2025-07-05,3938.37,2.1,0,3,1312.79,213,1093,Books
CUST0004,carolyn rowland,kevinshah@conway.com,+234 690 602 6127,NG,Catherinehaven,49,M,"Books, Home Appliances",2023-03-19 00:00:00+00:00,2024-10-03,2482.5,2.4,1,3,827.5,568,1002,Fashion
CUST0005,amanda johnson,melissa32@arnold.org,+44 7914 177971,UK,Jorgeburgh,59,F,"Sports, Home Appliances, Beauty",2023-01-09 00:00:00+00:00,2023-06-11,1415.28,3.8,1,3,471.76,363,1071,Fashion


In [11]:
target = "churn"

feature_cols = [
    'total_spent', 'total_price', 'days_since_last_purchase', 'product_name', 'category', 
    'popularity_score', 'category_popularity', 'country', 'city', 'age', 'gender', 'interests', 
    'purchase_frequency', 'num_purchases', 'avg_purchase_value', 'recency_days', 'top_category'
]

X = merged[feature_cols]
y = merged[target]

X


Unnamed: 0,total_spent,total_price,days_since_last_purchase,product_name,category,popularity_score,category_popularity,country,city,age,gender,interests,purchase_frequency,num_purchases,avg_purchase_value,recency_days,top_category
0,2834.10,124.18,143.5,extended radical productivity,Electronics,27,1028,CA,South Lisa,61,F,"Beauty, Fashion",1.9,3,944.700000,773,Electronics
1,2834.10,249.87,3.0,future-proofed client-driven benchmark,Home Appliances,38,929,CA,South Lisa,61,F,"Beauty, Fashion",1.9,3,944.700000,773,Electronics
2,2834.10,2460.05,412.0,re-contextualized mission-critical throughput,Sports,35,875,CA,South Lisa,61,F,"Beauty, Fashion",1.9,3,944.700000,773,Electronics
3,1894.13,241.01,143.5,multi-lateral transitional knowledge user,Sports,37,875,US,Lake Priscilla,33,M,"Books, Sports, Electronics",4.2,3,631.376667,809,Sports
4,1894.13,1536.36,108.0,future-proofed coherent implementation,Sports,30,875,US,Lake Priscilla,33,M,"Books, Sports, Electronics",4.2,3,631.376667,809,Sports
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,5069.95,2318.15,92.0,virtual cohesive matrix,Books,28,819,US,West Andrewfurt,28,F,Fashion,2.2,3,1689.983333,359,Books
5996,5069.95,2441.10,28.0,exclusive 24hour process improvement,Fashion,23,1057,US,West Andrewfurt,28,F,Fashion,2.2,3,1689.983333,359,Books
5997,3341.03,909.44,143.5,customer-focused value-added neural-net,Fashion,30,1057,CA,Lake Willieberg,57,F,"Sports, Books",3.3,3,1113.676667,532,Electronics
5998,3341.03,2047.35,103.0,exclusive dynamic process improvement,Beauty,37,1292,CA,Lake Willieberg,57,F,"Sports, Books",3.3,3,1113.676667,532,Electronics


In [12]:
num_features = [col for col in X.columns if X[col].dtype != "object"]
cat_features = [col for col in X.columns if X[col].dtype == "object"]

In [13]:
transformed = ColumnTransformer([
    ("num", StandardScaler(), num_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)
])

In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

X_train

Unnamed: 0,total_spent,total_price,days_since_last_purchase,product_name,category,popularity_score,category_popularity,country,city,age,gender,interests,purchase_frequency,num_purchases,avg_purchase_value,recency_days,top_category
1591,2668.57,218.49,253.0,optimized multi-tasking infrastructure,Books,34,819,CA,Rogersport,47,Other,"Electronics, Home Appliances, Books",1.8,3,889.523333,198,Sports
5499,2394.21,936.16,143.5,user-centric multimedia ability,Beauty,33,1292,IN,South Josephton,67,Other,"Sports, Beauty",1.0,3,798.070000,260,Beauty
4516,2161.66,842.76,52.0,balanced clear-thinking methodology,Electronics,27,1028,CA,West Kristinmouth,54,F,"Books, Beauty, Fashion",4.2,3,720.553333,541,Sports
1739,3141.53,461.96,153.0,reactive explicit product,Fashion,30,1057,CA,Dixonbury,41,Other,"Home Appliances, Sports",4.3,3,1047.176667,178,Fashion
5662,973.22,50.91,53.0,balanced dynamic leverage,Electronics,43,1028,IN,South Ashley,18,M,"Fashion, Electronics",3.4,3,324.406667,239,Beauty
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3080,3303.89,2452.30,53.0,stand-alone web-enabled migration,Fashion,32,1057,US,West Mark,46,F,"Beauty, Electronics, Home Appliances",4.7,3,1101.296667,249,Books
3344,2465.95,1381.24,179.0,total regional matrix,Fashion,28,1057,UK,Figueroachester,53,M,Electronics,2.2,3,821.983333,266,Sports
4900,3035.34,30.82,151.0,visionary background initiative,Books,31,819,IN,South Laurenberg,38,Other,Sports,4.3,3,1011.780000,249,Books
1877,1260.31,160.23,197.0,organic contextually-based pricing structure,Beauty,29,1292,NG,South Robert,68,M,Home Appliances,3.7,3,420.103333,456,Beauty


In [15]:
models = {
    "logistic_regression": LogisticRegression(max_iter=1000),
    "random_forest": RandomForestClassifier(n_estimators=200),
    "catboost": CatBoostClassifier(verbose=0),
    "lightgbm": LGBMClassifier(n_estimators=300, n_jobs=4, verbose=-1)
}

models.items()

dict_items([('logistic_regression', LogisticRegression(max_iter=1000)), ('random_forest', RandomForestClassifier(n_estimators=200)), ('catboost', <catboost.core.CatBoostClassifier object at 0x000001430F75D2B0>), ('lightgbm', LGBMClassifier(n_estimators=300, n_jobs=4, verbose=-1))])

In [None]:
mlflow.set_experiment("agentic_churn_modeling")

results = {}

for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name):

        if model_name == "catboost":
            cat_features = X_train.select_dtypes(
                include=["object", "category"]
            ).columns.tolist()

            cat_feature_indices = [
                X_train.columns.get_loc(col) for col in cat_features
            ]

            model.fit(
                X_train,
                y_train,
                cat_features=cat_feature_indices
            )

            proba = model.predict_proba(X_test)[:, 1]

            mlflow.log_params(model.get_params())

            auc = roc_auc_score(y_test, proba)
            mlflow.log_metric("auc", auc)

            mlflow.catboost.log_model(model, artifact_path="model")
            results[model_name] = auc
            
        else:

            pipe = Pipeline([
                ("transformed", transformed),
                ("model", model)
            ])

            pipe.fit(X_train, y_train)
            preds = pipe.predict(X_test)
            proba = pipe.predict_proba(X_test)[:, 1]

            mlflow.log_param("train_size", X_train.shape[0])
            mlflow.log_param("test_size", X_test.shape[0])
            mlflow.log_param("num_features", X_train.shape[1])

            mlflow.log_params(pipe.named_steps["model"].get_params())

            auc = roc_auc_score(y_test, proba)
            mlflow.log_metric("auc", auc)

            
            mlflow.sklearn.log_model(pipe, artifact_path="model")

            results[model_name] = auc

2025/12/15 01:48:59 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/12/15 01:48:59 INFO mlflow.store.db.utils: Updating database tables
2025/12/15 01:48:59 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2025/12/15 01:48:59 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2025/12/15 01:48:59 INFO alembic.runtime.migration: Running upgrade  -> 451aebb31d03, add metric step
2025/12/15 01:48:59 INFO alembic.runtime.migration: Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
2025/12/15 01:48:59 INFO alembic.runtime.migration: Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
2025/12/15 01:48:59 INFO alembic.runtime.migration: Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
2025/12/15 01:48:59 INFO alembic.runtime.migration: Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
2025/12/15 01:49:00 INFO alembic.runtime.migration: Running 

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.mlflow-telemetry.io:443
DEBUG:urllib3.connectionpool:https://api.mlflow-telemetry.io:443 "POST /log HTTP/1.1" 200 30


In [17]:
print("AUC Scores:")
for k, v in results.items():
    print(k, v)

best_model = max(results, key=results.get)
print("Best Model:", best_model)

AUC Scores:
logistic_regression 0.9149372653173369
random_forest 0.9703287846872425
catboost 0.9988680282077114
lightgbm 0.9705504167048266
Best Model: catboost


In [23]:

if best_model == "catboost":
    cat_features = X_train.select_dtypes(
        include=["object", "category"]
    ).columns.tolist()

    cat_feature_indices = [
        X_train.columns.get_loc(col) for col in cat_features
    ]

    model = models[best_model]

    model.fit(
        X_train,
        y_train,
        cat_features=cat_feature_indices
    )

    preds = model.predict(X_test)
    proba = model.predict_proba(X_test)[:, 1]

else:

    best_pipeline = Pipeline([
        ("preprocess", transformed),
        ("model", models[best_model])
    ])
    best_pipeline.fit(X_train, y_train)
    preds = best_pipeline.predict(X_test)
    proba = best_pipeline.predict_proba(X_test)[:, 1]

print(classification_report(y_test, preds))
print("AUC:", roc_auc_score(y_test, proba))


              precision    recall  f1-score   support

           0       0.99      0.92      0.95       305
           1       0.97      1.00      0.99       895

    accuracy                           0.98      1200
   macro avg       0.98      0.96      0.97      1200
weighted avg       0.98      0.98      0.98      1200

AUC: 0.9988680282077114


In [25]:
import mlflow
print(mlflow.get_tracking_uri())

mlflow.search_runs()



sqlite:///mlflow.db


Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.auc,params.min_child_weight,params.test_size,params.importance_type,...,params.l1_ratio,params.solver,params.tol,params.C,params.penalty,params.fit_intercept,tags.mlflow.source.type,tags.mlflow.runName,tags.mlflow.user,tags.mlflow.source.name
0,b85084079de944949ad7dcb63d396569,1,FINISHED,file:c:/Users/HP/projects/AI/ai_marketing_agen...,2025-12-15 00:51:33.691000+00:00,2025-12-15 00:51:53.401000+00:00,0.97055,0.001,1200.0,split,...,,,,,,,LOCAL,lightgbm,HP,c:\Users\HP\projects\AI\ai_marketing_agents\ve...
1,8015321d1146435da910daceb42398a9,1,FINISHED,file:c:/Users/HP/projects/AI/ai_marketing_agen...,2025-12-15 00:50:00.625000+00:00,2025-12-15 00:51:33.640000+00:00,0.998868,,,,...,,,,,,,LOCAL,catboost,HP,c:\Users\HP\projects\AI\ai_marketing_agents\ve...
2,ee49d7de572444aaa6252f3705464292,1,FINISHED,file:c:/Users/HP/projects/AI/ai_marketing_agen...,2025-12-15 00:49:30.722000+00:00,2025-12-15 00:50:00.577000+00:00,0.970329,,1200.0,,...,,,,,,,LOCAL,random_forest,HP,c:\Users\HP\projects\AI\ai_marketing_agents\ve...
3,3c84752ccf6747a7abceb0bdced95cf9,1,FINISHED,file:c:/Users/HP/projects/AI/ai_marketing_agen...,2025-12-15 00:49:02.799000+00:00,2025-12-15 00:49:30.687000+00:00,0.914937,,1200.0,,...,0.0,lbfgs,0.0001,1.0,deprecated,True,LOCAL,logistic_regression,HP,c:\Users\HP\projects\AI\ai_marketing_agents\ve...
