In [1]:
import pandas as pd,numpy as np, joblib, optuna
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.impute import KNNImputer
from sklearn.ensemble import BaggingClassifier,GradientBoostingClassifier, RandomForestClassifier,AdaBoostClassifier,ExtraTreesClassifier
from sklearn import tree
from sklearn.linear_model import SGDClassifier,RidgeClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB,GaussianNB,GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score, confusion_matrix

In [4]:
df = pd.read_csv('../2. Preprocessing/Processed data.csv')

In [5]:
labels = df['Churn']
features = df.drop(columns= ['Churn'],axis=1)

x_train, x_validation, y_train, y_validation = train_test_split(features, labels, random_state = 42, stratify = labels, test_size = 0.1)
train_support= Counter(y_train)
validation_support = Counter(y_validation)

sm = SMOTE(random_state=27)
x_train, y_train = sm.fit_resample(x_train, y_train)

In [31]:
def create_metrics_table(model_name, y_train, y_train_predictions, truth, predictions, train_support, validation_support):
    precision_per_label = precision_score(truth, predictions, average=None)
    recall_per_label = recall_score(truth, predictions, average=None)
    f1_per_label = f1_score(truth, predictions, average=None)

    precision_per_label_train = precision_score(y_train, y_train_predictions, average=None)
    recall_per_label_train = recall_score(y_train, y_train_predictions, average=None)

    metrics = pd.DataFrame(columns=['Model', 'Training Precision', 'Training Recall', 'Train Support',
                                    'Validation Precision', 'Validation Recall', 'Validation F1 Score'],
                           index=range(len(precision_per_label)))

    for index in range(len(precision_per_label)):
        metrics.at[index, 'Training Precision'] = precision_per_label_train[index]
        metrics.at[index, 'Training Recall'] = recall_per_label_train[index]
        metrics.at[index, 'Train Support'] = train_support[index]
        metrics.at[index, 'Validation Support'] = validation_support[index]
        metrics.at[index, 'Validation Precision'] = precision_per_label[index]
        metrics.at[index, 'Validation Recall'] = recall_per_label[index]
        metrics.at[index, 'Validation F1 Score'] = f1_per_label[index]

    metrics['Model'] = model_name
    return metrics

In [32]:
models = {
    "LogisticRegression" : LogisticRegression(),
    "AdaBoost" : AdaBoostClassifier(DecisionTreeClassifier(),n_estimators=10),
    "GradientBoosting"      : GradientBoostingClassifier(),
    "BaggingClassifier"  : BaggingClassifier(tree.DecisionTreeClassifier(random_state=1)),
    "RandomForest"      : RandomForestClassifier(n_estimators=3),
    "eXtremeGradientBoosting"     : XGBClassifier(objective= 'binary:logistic'),
    "KNN"                : KNeighborsClassifier(n_neighbors = 30, weights = 'distance'),
    "DecisionTree"      : DecisionTreeClassifier(),
    "ExtraTreeClassifier"       : ExtraTreesClassifier(n_estimators=3)
}
metrics = pd.DataFrame()
for name, model in models.items():
    

    print(f'Using model: {name}')
    model.fit(x_train, y_train)
    #joblib.dump(model,f'Saved Models/{name}.h5')
    y_train_predictions = model.predict(x_train)
    predictions = model.predict(x_validation)
    metrics = pd.concat([metrics,create_metrics_table(name,y_train,y_train_predictions,y_validation, predictions, train_support,validation_support)])

Using model: LogisticRegression
Using model: AdaBoost
Using model: GradientBoosting
Using model: BaggingClassifier
Using model: RandomForest
Using model: eXtremeGradientBoosting
Using model: KNN
Using model: DecisionTree
Using model: ExtraTreeClassifier


In [33]:
metrics

Unnamed: 0,Model,Training Precision,Training Recall,Train Support,Validation Precision,Validation Recall,Validation F1 Score,Validation Support
0,LogisticRegression,0.58,0.57,26161,0.76,0.57,0.65,2907.0
1,LogisticRegression,0.58,0.58,10592,0.34,0.56,0.43,1177.0
0,AdaBoost,1.0,1.0,26161,0.73,0.72,0.73,2907.0
1,AdaBoost,1.0,1.0,10592,0.34,0.36,0.35,1177.0
0,GradientBoosting,0.72,0.98,26161,0.72,0.97,0.83,2907.0
1,GradientBoosting,0.97,0.63,10592,0.5,0.07,0.12,1177.0
0,BaggingClassifier,0.98,1.0,26161,0.73,0.9,0.81,2907.0
1,BaggingClassifier,1.0,0.98,10592,0.43,0.18,0.26,1177.0
0,RandomForest,0.95,0.96,26161,0.74,0.77,0.75,2907.0
1,RandomForest,0.96,0.94,10592,0.36,0.32,0.34,1177.0


In [109]:
def objective(trial):
    # Suggest hyperparameters controlling complexity
    max_depth = trial.suggest_int('max_depth', 2, 32)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 32)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 32)

    # Create and train model
    model = DecisionTreeClassifier(
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        min_samples_split=min_samples_split,
        random_state=42
    )
    model.fit(x_train, y_train)

    # Predict on validation set
    preds = model.predict(x_validation)

    # Use accuracy or another metric on validation set
    f1 = f1_score(y_validation, preds)

    # Optuna tries to maximize accuracy
    return f1

# Create study and optimize
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=16)

print("Best hyperparameters:", study.best_params)
print("Best validation accuracy:", study.best_value)


[I 2025-06-04 23:13:20,794] A new study created in memory with name: no-name-a251fd25-8c58-4cf2-aa95-40af99a52580
[I 2025-06-04 23:13:21,830] Trial 0 finished with value: 0.26590909090909093 and parameters: {'max_depth': 11, 'min_samples_leaf': 6, 'min_samples_split': 2}. Best is trial 0 with value: 0.26590909090909093.
[I 2025-06-04 23:13:22,991] Trial 1 finished with value: 0.34802158273381295 and parameters: {'max_depth': 20, 'min_samples_leaf': 19, 'min_samples_split': 15}. Best is trial 1 with value: 0.34802158273381295.
[I 2025-06-04 23:13:24,128] Trial 2 finished with value: 0.34679981842941443 and parameters: {'max_depth': 28, 'min_samples_leaf': 22, 'min_samples_split': 22}. Best is trial 1 with value: 0.34802158273381295.
[I 2025-06-04 23:13:24,446] Trial 3 finished with value: 0.43094672548354257 and parameters: {'max_depth': 3, 'min_samples_leaf': 3, 'min_samples_split': 11}. Best is trial 3 with value: 0.43094672548354257.
[I 2025-06-04 23:13:25,673] Trial 4 finished with 

Best hyperparameters: {'max_depth': 3, 'min_samples_leaf': 3, 'min_samples_split': 11}
Best validation accuracy: 0.43094672548354257


In [None]:
clf = LazyClassifier(verbose=0, ignore_warnings=True)

# Fit and evaluate models
models, predictions = clf.fit(x_train, x_validation, y_train, y_validation)

models