In [1]:
import pandas as pd
import numpy as np
import onnx
import onnxruntime as ort
import skl2onnx
from sklearn.linear_model import LassoCV
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score
import joblib
from sklearn.inspection import permutation_importance


In [2]:

file_path = "../../data/abt_em_transforme_updated.csv"  
df = pd.read_csv(file_path)

In [3]:
target = "cc_clos"
X = df.drop(columns=[target])
y = df[target]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

X_test.info()



<class 'pandas.core.frame.DataFrame'>
Index: 1278 entries, 2025 to 3017
Data columns (total 72 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        1278 non-null   int64  
 1   atm_trn_cnt_1     1278 non-null   int64  
 2   atm_trn_cnt       1278 non-null   int64  
 3   bt_trn_cnt_1      1278 non-null   int64  
 4   bt_trn_cnt        1278 non-null   int64  
 5   chq_trn_cnt_1     1278 non-null   int64  
 6   chq_trn_cnt       1278 non-null   int64  
 7   cc_trn_cnt_1      1278 non-null   int64  
 8   cc_trn_cnt        1278 non-null   int64  
 9   elt_trn_cnt_1     1278 non-null   int64  
 10  elt_trn_cnt       1278 non-null   int64  
 11  ht_trn_cnt_1      1278 non-null   int64  
 12  ht_trn_cnt        1278 non-null   int64  
 13  it_trn_cnt        1278 non-null   int64  
 14  it_trn_cnt_1      1278 non-null   int64  
 15  pos_trn_cnt_1     1278 non-null   int64  
 16  pos_trn_cnt       1278 non-null   int64  
 1

In [5]:
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

In [6]:
#Selecting features by importance from Permutation Importance
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)
model_rf.fit(X_train, y_train)
perm_importance = permutation_importance(model_rf, X_train, y_train, scoring="roc_auc", n_repeats=10, random_state=42)
feature_importances_perm = perm_importance.importances_mean
print("Permutation Importance (топ-20):", np.argsort(feature_importances_perm)[-20:])

Permutation Importance (топ-20): [20 18 41 39 61 67 44 12 11  8 22 25 28 24 38 71  6 52 19 35]


In [7]:
permutation_importance_mass = [20, 18, 41, 39, 61, 67, 44, 12, 11, 8, 22, 25, 28, 24, 38, 71, 6, 52, 19, 35]
top_feature_names = df.columns[permutation_importance_mass].tolist()
print("Permutation Importance.Top-20 features:", top_feature_names)

Permutation Importance.Top-20 features: ['acc_credit', 'trd_trn_cnt', 'sa_la_open_mth', 'reject', 'sav_avg_bal', 'tra_auth_no', 'cust_class', 'ht_trn_cnt', 'ht_trn_cnt_1', 'cc_trn_cnt', 'arrear_ind', 'cc_avg_bal_1', 'cust_age', 'cc_avg_bal', 'fol_l_open_mth', 'call_trn_cnt', 'chq_trn_cnt', 'prod_count', 'cc_clos', 'gender']


In [8]:
#Selecting features by importance from RandomForest
feature_importances_rf = model_rf.feature_importances_
print("RandomForest.Top-20 features:", np.argsort(feature_importances_rf)[-20:])

RandomForest.Top-20 features: [65 61  7 44 28 42 38 33  0 51 58 23 48 25 24 62 20 29 19 35]


In [9]:
rf_importance_mass = [65, 61, 7, 44, 28, 42, 38, 33, 0, 51, 58, 23, 48, 25, 24, 62, 20, 29, 19, 35]
top_feature_names = df.columns[rf_importance_mass].tolist()
print("Permutation Importance.Top-20 features:", top_feature_names)

Permutation Importance.Top-20 features: ['tra_avg_bal', 'sav_avg_bal', 'cc_trn_cnt_1', 'cust_class', 'cust_age', 'tr_la_clos_mth', 'fol_l_open_mth', 'enq_trn_cnt', 'customerID', 'joint_acc_cnt', 'eqity_secur', 'cc_appr_amt', 'npv_savings', 'cc_avg_bal_1', 'cc_avg_bal', 'avg_bal_1', 'acc_credit', 'days_brtday', 'cc_clos', 'gender']


In [10]:
#Selecting features by LASSO
lasso = LassoCV(cv=5).fit(X_train, y_train)
feature_importances_lasso = np.abs(lasso.coef_)
print("Lasso Feature Importance (топ-20):", np.argsort(feature_importances_lasso)[-20:])

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


Lasso Feature Importance (топ-20): [ 1 56 13 44 47 53 50 66 32 65 38  8 26 52 42 22 62 29 51 35]


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


In [11]:
lasso_importance_mass = [1, 56, 13, 44, 47, 53, 50, 66, 32, 65, 38, 8, 26, 52, 42, 22, 62, 29, 51, 35]
top_feature_names = df.columns[rf_importance_mass].tolist()
print("LASSO.Top-20 features:", top_feature_names)

LASSO.Top-20 features: ['wthdr_count', 'cust_class', 'cc_trn_cnt_1', 'npv_total', 'tra_avg_bal', 'cc_appr_amt', 'fol_l_open_mth', 'eqity_secur', 'cust_age', 'sav_avg_bal', 'joint_acc_cnt', 'customerID', 'cc_avg_bal_1', 'cc_avg_bal', 'npv_savings', 'avg_bal_1', 'acc_credit', 'days_brtday', 'cc_clos', 'gender']


In [13]:
feature_scores = feature_importances_perm + feature_importances_rf + feature_importances_lasso


In [14]:
important_features = np.argsort(feature_scores)[-20:]
selected_features = X_train.columns[important_features]
print("Features to use:", selected_features.tolist())

Features to use: ['equity_amt', 'cc_trn_cnt_1', 'rel_age', 'cc_trn_cnt', 'cc_avg_bal', 'dep_max_amt', 'cc_appr_amt', 'customerID', 'tr_min_bal', 'npv_trans', 'reject', 'cc_min_bal', 'cc_avg_bal_1', 'tr_la_open_mth', 'acc_funds', 'npv_total', 'acc_credit', 'prod_count', 'cc_ea_open_mth', 'recency']


In [15]:
X_train = X_train[selected_features]
X_test = X_test[selected_features]

In [16]:
#models
models = {
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "MLPClassifier": MLPClassifier(hidden_layer_sizes=(50, 20), max_iter=500, random_state=42)
}

In [17]:
results = {}

In [18]:
onnx_output_dir = "../src/onnx/"

In [19]:
from sklearn.metrics import f1_score

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_test)[:, 1]
    
    gini = 2 * roc_auc_score(y_test, y_pred) - 1
    roc_auc = roc_auc_score(y_test, y_pred)
    f1 = f1_score(y_test, (y_pred > 0.5).astype(int))
    
    results[name] = {"GINI": gini, "ROC-AUC": roc_auc, "F1-Score": f1}
    print(f"{name}: GINI = {gini:.4f}, ROC-AUC = {roc_auc:.4f}, F1-Score = {f1:.4f}")
    
    # ONNX for inference
    initial_type = [("float_input", FloatTensorType([None, X_train.shape[1]]))]
    onnx_model = convert_sklearn(model, initial_types=initial_type)

    onnx_filename = f"{onnx_output_dir}{name}.onnx"  
    with open(onnx_filename, "wb") as f:
        f.write(onnx_model.SerializeToString())
    print(f"Model {name} saved as {onnx_filename}")

RandomForest: GINI = 0.8751, ROC-AUC = 0.9376, F1-Score = 0.6222
Model RandomForest saved as ../src/onnx/RandomForest.onnx
LogisticRegression: GINI = 0.6422, ROC-AUC = 0.8211, F1-Score = 0.0656
Model LogisticRegression saved as ../src/onnx/LogisticRegression.onnx
MLPClassifier: GINI = 0.6713, ROC-AUC = 0.8357, F1-Score = 0.3093
Model MLPClassifier saved as ../src/onnx/MLPClassifier.onnx


In [20]:
#Because of the class disbalance, the best model is RF
best_model = max(results, key=lambda x: results[x]["GINI"])
print(f"The best model: {best_model} with GINI = {results[best_model]['GINI']:.4f}")

The best model: RandomForest with GINI = 0.8751


In [39]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score,recall_score,precision_score,roc_curve
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

models = {
    "RandomForest": RandomForestClassifier(n_estimators=300, max_depth=15, min_samples_split=2, min_samples_leaf=1, random_state=42 ,class_weight="balanced"),
}
for name, model in models.items():

    cv_scores = cross_val_score(model, X_train_resampled, y_train_resampled, cv=5, scoring='roc_auc')  
    print(f"{name} Cross-Validation ROC-AUC: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")
    
    model.fit(X_train_resampled, y_train_resampled)
    y_pred = model.predict_proba(X_test)[:, 1]

    fpr, tpr, thresholds = roc_curve(y_test, y_pred)
    optimal_idx = np.argmax(tpr - fpr)
    threshold = thresholds[optimal_idx]
    print(f"Optimal threshold: {threshold}")

    gini = 2 * roc_auc_score(y_test, y_pred) - 1
    roc_auc = roc_auc_score(y_test, y_pred)
    f1 = f1_score(y_test, (y_pred > threshold).astype(int))
    acc = accuracy_score(y_test, (y_pred > threshold).astype(int))
    rec = recall_score(y_test,(y_pred> threshold).astype(int))
    prec = precision_score(y_test,(y_pred> threshold).astype(int))
    
    results[name] = {"GINI": gini, "ROC-AUC": roc_auc, "F1-Score": f1 , "Accuracy": acc,"Recall": rec, "Precision" : prec}
    print(f"{name}: GINI = {gini:.4f}, ROC-AUC = {roc_auc:.4f}, F1-Score = {f1:.4f} , Accuracy = {acc:.4f} , Recall = {rec:.4f} , Precision = {prec:.4f}")

    # ONNX for inference
    initial_type = [("float_input", FloatTensorType([None, X_train.shape[1]]))]
    onnx_model = convert_sklearn(model, initial_types=initial_type)

    onnx_filename = f"{onnx_output_dir}{name}.onnx"  
    with open(onnx_filename, "wb") as f:
        f.write(onnx_model.SerializeToString())
    print(f"Model {name} saved as {onnx_filename}")

RandomForest Cross-Validation ROC-AUC: 0.9992 ± 0.0004
Optimal threshold: 0.2752719251990529
RandomForest: GINI = 0.8829, ROC-AUC = 0.9415, F1-Score = 0.5683 , Accuracy = 0.9382 , Recall = 0.8814 , Precision = 0.4194
Model RandomForest saved as ../src/onnx/RandomForest.onnx


In [127]:
"""
    I emphasized recall because this metric shows how well the model detects customers who will actually leave. 
    This is important so that you don't miss out on customers who need retention.
    Even if it requires a little more resources
    
    The high ROC-AUC values in both cross-validation and testing indicate 
    that the metrics are not wrong and the model indeed performs well
"""

'\n    The high ROC-AUC values in both cross-validation and testing indicate \n    that the metrics are not wrong and the model indeed performs well\n'

In [40]:

best_model = max(results, key=lambda x: results[x]["GINI"])
print(f"Лучшая модель: {best_model} с GINI = {results[best_model]['GINI']:.4f}")

Лучшая модель: RandomForest с GINI = 0.8829


In [41]:
import json
with open("../src/onnx/selected_features.json", "w") as f:
    json.dump(selected_features.tolist(), f)


In [131]:
"""
    Let's save the list of features for future use in the inference
    
"""

"\n    Let's save the list of features for future use in the inference\n    \n"