In [1]:
import os
import sys
from pathlib import Path

ROOT_DIR = Path(os.getcwd()).parent.resolve()
sys.path.append(str(ROOT_DIR))

In [28]:
import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBClassifier

from src.config import cfg  # noqa: E402
from src.logger import get_logger  # noqa: E402

log = get_logger(__name__)

In [3]:
RANDOM_STATE = 42

# 1. Load data

In [4]:
features_path = f"{ROOT_DIR}/{cfg['data']['features_path']}"
labels_path = f"{ROOT_DIR}/{cfg['data']['labeled_path']}"
features = pd.read_parquet(features_path)
labels   = pd.read_parquet(labels_path)

df = features.merge(labels, on="userId", how="inner")

log.info(f"Loaded dataset shape: {df.shape}")


2025-10-09 09:16:04,595 | INFO | __main__: Loaded dataset shape: (426, 37)


# 2. Prepare X, y

In [5]:
X = df.drop(columns=["userId", "churn", "auth_fail_ratio"]) # "last_ts_dt"
y = df["churn"].astype(int)

# Handle any non-numeric columns
X = pd.get_dummies(X, drop_first=True)

# Fill NaNs
X = X.fillna(0)

# 3. Split train / test

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 4. Scale numeric features

In [7]:
num_cols = X_train.select_dtypes(include=np.number).columns
cat_cols = X_train.select_dtypes(include=["object", "category"]).columns
bool_cols = X.select_dtypes(include="bool").columns

scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# 5. Handling Class Imbalanced

In [8]:
print(y_train.value_counts(normalize=True))

churn
0    0.838235
1    0.161765
Name: proportion, dtype: float64


In [9]:
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print('Imbalance')
print(X_train.shape)
print(y_train.value_counts())

print('Balance')
print(X_train_smote.shape)
print(y_train_smote.value_counts())

Imbalance
(340, 85)
churn
0    285
1     55
Name: count, dtype: int64
Balance
(570, 85)
churn
0    285
1    285
Name: count, dtype: int64


# 6. Cross Validation

In [10]:
models = {
            "RandomForest": RandomForestClassifier(n_estimators=300, class_weight="balanced", random_state=RANDOM_STATE, n_jobs=-1),
            "GradientBoosting": GradientBoostingClassifier(random_state=RANDOM_STATE),
            "LogisticRegression": LogisticRegression(max_iter=500, class_weight="balanced", solver="lbfgs", random_state=RANDOM_STATE),
            "LightGBM": LGBMClassifier(n_estimators=400, learning_rate=0.05, subsample=0.8, colsample_bytree=0.8, verbose=-1, class_weight="balanced", random_state=RANDOM_STATE),
        }

In [11]:
# dictionary to store the cross validation results
cv_scores = {}

# perform 5-fold cross validation for each model
for model_name, model in models.items():
  print(f"Training {model_name} with default parameters")
  scores = cross_val_score(model, X_train_smote, y_train_smote, cv=5, scoring="accuracy")
  cv_scores[model_name] = scores
  print(f"{model_name} cross-validation accuracy: {np.mean(scores):.2f}")
  print("-"*70)

Training RandomForest with default parameters
RandomForest cross-validation accuracy: 0.93
----------------------------------------------------------------------
Training GradientBoosting with default parameters
GradientBoosting cross-validation accuracy: 0.90
----------------------------------------------------------------------
Training LogisticRegression with default parameters
LogisticRegression cross-validation accuracy: 0.86
----------------------------------------------------------------------
Training LightGBM with default parameters
LightGBM cross-validation accuracy: 0.92
----------------------------------------------------------------------


In [12]:
cv_scores

{'RandomForest': array([0.87719298, 0.96491228, 0.9122807 , 0.97368421, 0.93859649]),
 'GradientBoosting': array([0.78947368, 0.92105263, 0.9122807 , 0.95614035, 0.93859649]),
 'LogisticRegression': array([0.73684211, 0.84210526, 0.89473684, 0.92105263, 0.92105263]),
 'LightGBM': array([0.8245614 , 0.93859649, 0.92982456, 0.97368421, 0.93859649])}

# 6. Train model

In [13]:
model_param = {
    'n_estimators': 200,
    'max_depth': 10,
    'random_state': 42,
    'class_weight': "balanced"
}

In [14]:
model = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    random_state=42,
    class_weight="balanced"
)

model.fit(X_train, y_train)

In [15]:
X_train[bool_cols] = X_train[bool_cols].astype(int)


# Categorical pipeline
cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Numeric pipeline
num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Combine both
preprocessor = ColumnTransformer([
    ("cat", cat_pipe, cat_cols),
    ("num", num_pipe, num_cols),
])


X_train_enc = preprocessor.fit_transform(X_train)
X_test_enc  = preprocessor.transform(X_test)

n_pos = np.sum(y_train == 1)      # number of churners
n_neg = np.sum(y_train == 0)      # number of non-churners



# Ensure it’s a NumPy array
X_train_enc = np.array(X_train_enc)
X_test_enc  = np.array(X_test_enc)


print(f"Churners (pos): {n_pos}, Non-churners (neg): {n_neg}")

scale = n_neg / n_pos   # how much more weight to give to churners

Churners (pos): 55, Non-churners (neg): 285


In [16]:
xgb_model = XGBClassifier(
    n_estimators=400,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    random_state=42,
    scale_pos_weight=scale,
)

xgb_model.fit(X_train_enc, y_train)

# 6. Evaluate

In [17]:
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)

log.info(f"Accuracy: {acc:.3f}")
log.info(f"ROC-AUC: {auc:.3f}")
model_report = classification_report(y_test, y_pred, output_dict=True)
log.info(f"\nClassification report:\n {classification_report(y_test, y_pred)}")
model_report

2025-10-09 09:16:15,496 | INFO | __main__: Accuracy: 0.872
2025-10-09 09:16:15,497 | INFO | __main__: ROC-AUC: 0.827
2025-10-09 09:16:15,501 | INFO | __main__: 
Classification report:
               precision    recall  f1-score   support

           0       0.89      0.97      0.93        72
           1       0.71      0.36      0.48        14

    accuracy                           0.87        86
   macro avg       0.80      0.66      0.70        86
weighted avg       0.86      0.87      0.85        86



{'0': {'precision': 0.8860759493670886,
  'recall': 0.9722222222222222,
  'f1-score': 0.9271523178807947,
  'support': 72.0},
 '1': {'precision': 0.7142857142857143,
  'recall': 0.35714285714285715,
  'f1-score': 0.47619047619047616,
  'support': 14.0},
 'accuracy': 0.872093023255814,
 'macro avg': {'precision': 0.8001808318264014,
  'recall': 0.6646825396825397,
  'f1-score': 0.7016713970356354,
  'support': 86.0},
 'weighted avg': {'precision': 0.8581100971445393,
  'recall': 0.872093023255814,
  'f1-score': 0.853739925047487,
  'support': 86.0}}

In [18]:
y_pred = xgb_model.predict(X_test_enc)
y_prob = xgb_model.predict_proba(X_test_enc)[:, 1]

acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)

log.info(f"Accuracy: {acc:.3f}")
log.info(f"ROC-AUC: {auc:.3f}")
log.info(f"\nClassification report:\n {classification_report(y_test, y_pred)}")

2025-10-09 09:16:15,509 | INFO | __main__: Accuracy: 0.895
2025-10-09 09:16:15,509 | INFO | __main__: ROC-AUC: 0.789
2025-10-09 09:16:15,512 | INFO | __main__: 
Classification report:
               precision    recall  f1-score   support

           0       0.93      0.94      0.94        72
           1       0.69      0.64      0.67        14

    accuracy                           0.90        86
   macro avg       0.81      0.79      0.80        86
weighted avg       0.89      0.90      0.89        86



In [31]:
mlflow.set_experiment("first_experiments")
mlflow.set_tracking_uri('http://127.0.0.1:5000')

with mlflow.start_run():
    mlflow.log_params(model_param)
    mlflow.log_metrics({
        'accuracy': model_report['accuracy'],
        'recall_class_0': model_report['0']['recall'],
        'recall_class_1': model_report['1']['recall'],
        'f1_score_macro': model_report['macro avg']['f1-score']
    })
    mlflow.sklearn.log_model(model, artifact_path="random_forest_classifier")



🏃 View run bold-ape-604 at: http://127.0.0.1:5000/#/experiments/433248090814665400/runs/8be03573fe0e4a36aded256196244a0d
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/433248090814665400


# 7. Feature Importance

In [None]:
importances = pd.Series(model.feature_importances_, index=X_train.columns)
imp_sorted = importances.sort_values(ascending=False)[:20]

plt.figure(figsize=(8,6))
sns.barplot(x=imp_sorted, y=imp_sorted.index, color="royalblue")
plt.title("Top 20 Feature Importances")
plt.tight_layout()
plt.show()

In [None]:
importances.sort_values(ascending=True)