In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
import mlflow
import os
import sys

sys.path.append(os.path.abspath(os.path.join('..', 'src')))

from data_processing import load_raw_data, create_feature_engineering_pipeline, create_full_processed_dataframe
from train import run_training_experiment


# Notebook display settings: never truncate columns, cap printed rows at 100.
# set_option accepts several (option, value) pairs in a single call.
pd.set_option(
    'display.max_columns', None,
    'display.max_rows', 100,
)


# MLflow tracking setup: runs are recorded in a SQLite database stored inside
# the ../mlruns directory (resolved against the notebook's working directory).
local_mlruns_path = os.path.abspath("../mlruns")
# exist_ok=True keeps the cell safe to re-run and avoids a check-then-create race.
os.makedirs(local_mlruns_path, exist_ok=True)

# Configure the tracking URI exactly once, pointing straight at the SQLite file;
# setting the bare directory first would simply be overwritten by this call.
database_uri = f"sqlite:///{os.path.join(local_mlruns_path, 'mlruns.db')}"
mlflow.set_tracking_uri(database_uri)

# Echo the effective URI so the reader can confirm where runs will be stored.
print(f"MLflow Tracking URI: {mlflow.get_tracking_uri()}")

MLflow Tracking URI: sqlite:///c:\Tenx-projects\credit-risk-model\mlruns\mlruns.db


In [2]:
# Read the raw transactions file, then build the fully processed DataFrame from it.
main_transactions_df = load_raw_data('data.csv')
final_processed_df = create_full_processed_dataframe(main_transactions_df)

# Report the dimensions of the processed frame.
processed_shape = final_processed_df.shape
print("\nShape of final processed data (DataFrame):", processed_shape)

# Render the leading rows with the notebook's rich display.
print("\nFirst 5 rows of final processed data (DataFrame):")
preview_rows = final_processed_df.head()
display(preview_rows)

Loaded data from: c:\Tenx-projects\credit-risk-model\data\raw\data.csv

Shape of final processed data (DataFrame): (95662, 95)

First 5 rows of final processed data (DataFrame):


Unnamed: 0,num__Amount,num__Value,num__TransactionHour,num__TransactionDay,num__TransactionMonth,num__TransactionYear,num__TransactionDayOfWeek,num__TransactionIsWeekend,num__TotalTransactionAmount,num__AverageTransactionAmount,num__TransactionCount,num__StdTransactionAmount,num__MaxTransactionAmount,num__MinTransactionAmount,num__Recency,num__Frequency,num__Monetary,num__TotalTransactionAmount_AccountId,num__AverageTransactionAmount_AccountId,num__TransactionCount_AccountId,num__StdTransactionAmount_AccountId,num__MaxTransactionAmount_AccountId,num__MinTransactionAmount_AccountId,num__TotalTransactionAmount_SubscriptionId,num__AverageTransactionAmount_SubscriptionId,num__TransactionCount_SubscriptionId,num__StdTransactionAmount_SubscriptionId,num__MaxTransactionAmount_SubscriptionId,num__MinTransactionAmount_SubscriptionId,num__TotalTransactionAmount_CustomerId,num__AverageTransactionAmount_CustomerId,num__TransactionCount_CustomerId,num__StdTransactionAmount_CustomerId,num__MaxTransactionAmount_CustomerId,num__MinTransactionAmount_CustomerId,cat__CurrencyCode_UGX,cat__CountryCode_256,cat__ProviderId_ProviderId_1,cat__ProviderId_ProviderId_2,cat__ProviderId_ProviderId_3,cat__ProviderId_ProviderId_4,cat__ProviderId_ProviderId_5,cat__ProviderId_ProviderId_6,cat__ProductId_ProductId_1,cat__ProductId_ProductId_10,cat__ProductId_ProductId_11,cat__ProductId_ProductId_12,cat__ProductId_ProductId_13,cat__ProductId_ProductId_14,cat__ProductId_ProductId_15,cat__ProductId_ProductId_16,cat__ProductId_ProductId_19,cat__ProductId_ProductId_2,cat__ProductId_ProductId_20,cat__ProductId_ProductId_21,cat__ProductId_ProductId_22,cat__ProductId_ProductId_23,cat__ProductId_ProductId_24,cat__ProductId_ProductId_27,cat__ProductId_ProductId_3,cat__ProductId_ProductId_4,cat__ProductId_ProductId_5,cat__ProductId_ProductId_6,cat__ProductId_ProductId_7,cat__ProductId_ProductId_8,cat__ProductId_ProductId_9,cat__ProductCategory_airtime,cat__ProductCategory_data_bundles,cat__ProductCategory_fina
ncial_services,cat__ProductCategory_movies,cat__ProductCategory_other,cat__ProductCategory_ticket,cat__ProductCategory_transport,cat__ProductCategory_tv,cat__ProductCategory_utility_bill,cat__ChannelId_ChannelId_1,cat__ChannelId_ChannelId_2,cat__ChannelId_ChannelId_3,cat__ChannelId_ChannelId_5,cat__PricingStrategy_0,cat__PricingStrategy_1,cat__PricingStrategy_2,cat__PricingStrategy_4,cat__RFM_Cluster_At Risk,cat__RFM_Cluster_Champions,cat__RFM_Cluster_Loyal Customers,cat__RFM_Cluster_Others,cat__RFM_Cluster_Potential Loyalists,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,FraudResult,is_high_risk
0,-0.046371,-0.072291,-2.15553,-0.100739,0.848684,-0.994246,-0.006389,-0.510703,0.0,-3.637979e-12,0.0,4.365575e-11,0.0,0.0,-0.746427,-0.311831,0.170118,0.532237,-0.050092,-0.718149,-0.145175,-0.138232,0.256104,0.565973,-0.050093,-0.742968,-0.145466,-0.1383,0.25837,0.170118,-0.067623,-0.311831,-0.167016,-0.193027,0.163927,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,0,0
1,-0.054643,-0.080251,-2.15553,-0.100739,0.848684,-0.994246,-0.006389,-0.510703,0.0,-3.637979e-12,0.0,4.365575e-11,0.0,0.0,-0.746427,-0.311831,0.170118,-0.546125,-0.087889,1.444841,-0.159869,-0.179437,0.137118,-0.564696,-0.087953,1.387358,-0.160722,-0.179505,0.139336,0.170118,-0.067623,-0.311831,-0.167016,-0.193027,0.163927,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,0,0
2,-0.050426,-0.076352,-2.15553,-0.100739,0.848684,-0.994246,-0.006389,-0.510703,0.0,-3.637979e-12,0.0,4.365575e-11,0.0,0.0,3.383499,-0.444993,0.165122,0.526214,-0.071753,-0.722639,-0.180726,-0.178508,0.258328,0.560028,-0.071754,-0.747155,-0.181018,-0.178576,0.260596,0.165122,-0.072568,-0.444993,-0.201209,-0.229522,0.28378,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,0,0
3,0.107717,0.096648,-1.949214,-0.100739,0.848684,-0.994246,-0.006389,-0.510703,0.0,-3.637979e-12,0.0,4.365575e-11,0.0,0.0,-0.491494,-0.40402,0.175567,0.535874,0.033881,-0.720955,0.041956,0.027002,0.258328,0.569563,0.033882,-0.745585,0.041672,0.026936,0.260596,0.175567,-0.008155,-0.40402,-0.008243,-0.043301,0.054969,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,0,1
4,-0.059704,-0.075183,-1.949214,-0.100739,0.848684,-0.994246,-0.006389,-0.510703,0.0,-3.637979e-12,0.0,4.365575e-11,0.0,0.0,-0.491494,-0.40402,0.175567,-0.546125,-0.087889,1.444841,-0.159869,-0.179437,0.137118,-0.564696,-0.087953,1.387358,-0.160722,-0.179505,0.139336,0.175567,-0.008155,-0.40402,-0.008243,-0.043301,0.054969,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,0,0


In [3]:
# Columns kept out of the feature matrix: the identifier columns, FraudResult,
# and the is_high_risk column that is used as the target below.
# NOTE(review): a recorded run of this notebook reported ROC-AUC 1.0 for the best
# model; worth confirming that none of the remaining columns encode the target.
non_feature_columns = {
    'TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
    'FraudResult', 'is_high_risk',
}
X_columns = [name for name in final_processed_df.columns if name not in non_feature_columns]

# Feature matrix and target vector consumed by the training cell.
X = final_processed_df[X_columns]
y = final_processed_df['is_high_risk']

print(f"Features (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")

# Relative class frequencies of the target.
print("\nTarget variable distribution:")
class_shares = y.value_counts(normalize=True)
print(class_shares)

Features (X) shape: (95662, 88)
Target (y) shape: (95662,)

Target variable distribution:
is_high_risk
0    0.995787
1    0.004213
Name: proportion, dtype: float64


In [None]:
# Run the training experiment on the prepared feature matrix and target.
# NOTE(review): presumably returns the selected estimator, or None when no
# model could be chosen -- confirm against run_training_experiment in src/train.py.
print("Starting MLflow experiment...")
best_model = run_training_experiment(X, y)

# Compare against None explicitly instead of relying on truthiness: scikit-learn
# Pipelines and ensembles define __len__, so `if best_model:` would consult the
# object's length (and can raise on an unfitted ensemble) rather than checking
# whether a model was actually returned.
if best_model is not None:
    print(f"\nBest model found: {best_model.__class__.__name__}")
else:
    print("\nNo best model identified.")

Starting MLflow experiment...


2025/07/05 15:14:37 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/07/05 15:14:37 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.


Best params for LogisticRegression_tuned: {'C': 10.0}




Best params for RandomForestClassifier_tuned: {'max_depth': None, 'n_estimators': 200}


2025/07/05 15:17:35 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/07/05 15:17:35 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
Registered model 'CreditRiskProxyModel' already exists. Creating a new version of this model...


Best model (DecisionTreeClassifier) registered with ROC-AUC: 1.0

Best model found: DecisionTreeClassifier


Created version '3' of model 'CreditRiskProxyModel'.
