## Importing dependencies

In [1]:
import opendatasets as od
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

## Loading the dataset

In [2]:
df = pd.read_excel("data/Telco_customer_churn.xlsx")

In [3]:
df['Total Charges'].dtype

dtype('O')

In [4]:
df['Total Charges'] = pd.to_numeric(df['Total Charges'], errors='coerce')

In [5]:
df['Total Charges'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Total Charges'].fillna(0, inplace=True)


In [6]:
df.isnull().sum()

CustomerID              0
Count                   0
Country                 0
State                   0
City                    0
Zip Code                0
Lat Long                0
Latitude                0
Longitude               0
Gender                  0
Senior Citizen          0
Partner                 0
Dependents              0
Tenure Months           0
Phone Service           0
Multiple Lines          0
Internet Service        0
Online Security         0
Online Backup           0
Device Protection       0
Tech Support            0
Streaming TV            0
Streaming Movies        0
Contract                0
Paperless Billing       0
Payment Method          0
Monthly Charges         0
Total Charges           0
Churn Label             0
Churn Value             0
Churn Score             0
CLTV                    0
Churn Reason         5174
dtype: int64

## Preparing data for the model

### Encoding Categorical features

In [7]:
df['Churn Reason'].unique()

array(['Competitor made better offer', 'Moved',
       'Competitor had better devices',
       'Competitor offered higher download speeds',
       'Competitor offered more data', 'Price too high',
       'Product dissatisfaction', 'Service dissatisfaction',
       'Lack of self-service on Website', 'Network reliability',
       'Limited range of services',
       'Lack of affordable download/upload speed',
       'Long distance charges', 'Extra data charges', "Don't know",
       'Poor expertise of online support',
       'Poor expertise of phone support', 'Attitude of service provider',
       'Attitude of support person', 'Deceased', nan], dtype=object)

In [8]:
df = df.drop(["Country", "State", "Count", "Zip Code",
              "Churn Reason", "City", "Churn Value",
              "Churn Score", "CLTV", "CustomerID", "Latitude",
              "Longitude", "Lat Long"], axis=1)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             7043 non-null   object 
 1   Senior Citizen     7043 non-null   object 
 2   Partner            7043 non-null   object 
 3   Dependents         7043 non-null   object 
 4   Tenure Months      7043 non-null   int64  
 5   Phone Service      7043 non-null   object 
 6   Multiple Lines     7043 non-null   object 
 7   Internet Service   7043 non-null   object 
 8   Online Security    7043 non-null   object 
 9   Online Backup      7043 non-null   object 
 10  Device Protection  7043 non-null   object 
 11  Tech Support       7043 non-null   object 
 12  Streaming TV       7043 non-null   object 
 13  Streaming Movies   7043 non-null   object 
 14  Contract           7043 non-null   object 
 15  Paperless Billing  7043 non-null   object 
 16  Payment Method     7043 

In [10]:
df.to_csv(".\data\modified_data.csv", index=False, header=True)

  df.to_csv(".\data\modified_data.csv", index=False, header=True)


In [11]:
df

Unnamed: 0,Gender,Senior Citizen,Partner,Dependents,Tenure Months,Phone Service,Multiple Lines,Internet Service,Online Security,Online Backup,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label
0,Male,No,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
1,Female,No,No,Yes,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
2,Female,No,No,Yes,8,Yes,Yes,Fiber optic,No,No,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.50,Yes
3,Female,No,Yes,Yes,28,Yes,Yes,Fiber optic,No,No,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,104.80,3046.05,Yes
4,Male,No,No,Yes,49,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,Month-to-month,Yes,Bank transfer (automatic),103.70,5036.30,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Female,No,No,No,72,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,Yes,Bank transfer (automatic),21.15,1419.40,No
7039,Male,No,Yes,Yes,24,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.50,No
7040,Female,No,Yes,Yes,72,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.90,No
7041,Female,No,Yes,Yes,11,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No


In [12]:
numeric_cols = list(df.select_dtypes(include=["int", "float"]).columns)

In [13]:
numeric_cols

['Tenure Months', 'Monthly Charges', 'Total Charges']

In [14]:
categorical_cols = list(df.select_dtypes(include="object").columns)

In [15]:
categorical_cols

['Gender',
 'Senior Citizen',
 'Partner',
 'Dependents',
 'Phone Service',
 'Multiple Lines',
 'Internet Service',
 'Online Security',
 'Online Backup',
 'Device Protection',
 'Tech Support',
 'Streaming TV',
 'Streaming Movies',
 'Contract',
 'Paperless Billing',
 'Payment Method',
 'Churn Label']

In [16]:
df[categorical_cols]

Unnamed: 0,Gender,Senior Citizen,Partner,Dependents,Phone Service,Multiple Lines,Internet Service,Online Security,Online Backup,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Churn Label
0,Male,No,No,No,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,Yes
1,Female,No,No,Yes,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,Yes
2,Female,No,No,Yes,Yes,Yes,Fiber optic,No,No,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,Yes
3,Female,No,Yes,Yes,Yes,Yes,Fiber optic,No,No,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,Yes
4,Male,No,No,Yes,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,Month-to-month,Yes,Bank transfer (automatic),Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Female,No,No,No,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,Yes,Bank transfer (automatic),No
7039,Male,No,Yes,Yes,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,No
7040,Female,No,Yes,Yes,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),No
7041,Female,No,Yes,Yes,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,No


In [17]:
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [18]:
df

Unnamed: 0,Tenure Months,Monthly Charges,Total Charges,Gender_Male,Senior Citizen_Yes,Partner_Yes,Dependents_Yes,Phone Service_Yes,Multiple Lines_No phone service,Multiple Lines_Yes,Internet Service_Fiber optic,Internet Service_No,Online Security_No internet service,Online Security_Yes,Online Backup_No internet service,Online Backup_Yes,Device Protection_No internet service,Device Protection_Yes,Tech Support_No internet service,Tech Support_Yes,Streaming TV_No internet service,Streaming TV_Yes,Streaming Movies_No internet service,Streaming Movies_Yes,Contract_One year,Contract_Two year,Paperless Billing_Yes,Payment Method_Credit card (automatic),Payment Method_Electronic check,Payment Method_Mailed check,Churn Label_Yes
0,2,53.85,108.15,True,False,False,False,True,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,True,True
1,2,70.70,151.65,False,False,False,True,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,True
2,8,99.65,820.50,False,False,False,True,True,False,True,True,False,False,False,False,False,False,True,False,False,False,True,False,True,False,False,True,False,True,False,True
3,28,104.80,3046.05,False,False,True,True,True,False,True,True,False,False,False,False,False,False,True,False,True,False,True,False,True,False,False,True,False,True,False,True
4,49,103.70,5036.30,True,False,False,True,True,False,True,True,False,False,False,False,True,False,True,False,False,False,True,False,True,False,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,72,21.15,1419.40,False,False,False,False,True,False,False,False,True,True,False,True,False,True,False,True,False,True,False,True,False,False,True,True,False,False,False,False
7039,24,84.80,1990.50,True,False,True,True,True,False,True,False,False,False,True,False,False,False,True,False,True,False,True,False,True,True,False,True,False,False,True,False
7040,72,103.20,7362.90,False,False,True,True,True,False,True,True,False,False,False,False,True,False,True,False,False,False,True,False,True,True,False,True,True,False,False,False
7041,11,29.60,346.45,False,False,True,True,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False


### Scaling numerical features

In [19]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
df[numeric_cols] = sc.fit_transform(df[numeric_cols])

In [20]:
df

Unnamed: 0,Tenure Months,Monthly Charges,Total Charges,Gender_Male,Senior Citizen_Yes,Partner_Yes,Dependents_Yes,Phone Service_Yes,Multiple Lines_No phone service,Multiple Lines_Yes,Internet Service_Fiber optic,Internet Service_No,Online Security_No internet service,Online Security_Yes,Online Backup_No internet service,Online Backup_Yes,Device Protection_No internet service,Device Protection_Yes,Tech Support_No internet service,Tech Support_Yes,Streaming TV_No internet service,Streaming TV_Yes,Streaming Movies_No internet service,Streaming Movies_Yes,Contract_One year,Contract_Two year,Paperless Billing_Yes,Payment Method_Credit card (automatic),Payment Method_Electronic check,Payment Method_Mailed check,Churn Label_Yes
0,-1.236724,-0.362660,-0.958066,True,False,False,False,True,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,True,True
1,-1.236724,0.197365,-0.938874,False,False,False,True,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,True
2,-0.992402,1.159546,-0.643789,False,False,False,True,True,False,True,True,False,False,False,False,False,False,True,False,False,False,True,False,True,False,False,True,False,True,False,True
3,-0.177995,1.330711,0.338085,False,False,True,True,True,False,True,True,False,False,False,False,False,False,True,False,True,False,True,False,True,False,False,True,False,True,False,True
4,0.677133,1.294151,1.216150,True,False,False,True,True,False,True,True,False,False,False,False,True,False,True,False,False,False,True,False,True,False,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,1.613701,-1.449476,-0.379565,False,False,False,False,True,False,False,False,True,True,False,True,False,True,False,True,False,True,False,True,False,False,True,True,False,False,False,False
7039,-0.340876,0.665992,-0.127605,True,False,True,True,True,False,True,False,False,False,True,False,False,False,True,False,True,False,True,False,True,True,False,True,False,False,True,False
7040,1.613701,1.277533,2.242606,False,False,True,True,True,False,True,True,False,False,False,False,True,False,True,False,False,False,True,False,True,True,False,True,True,False,False,False
7041,-0.870241,-1.168632,-0.852932,False,False,True,True,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False


### Balancing the dataset

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 31 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Tenure Months                           7043 non-null   float64
 1   Monthly Charges                         7043 non-null   float64
 2   Total Charges                           7043 non-null   float64
 3   Gender_Male                             7043 non-null   bool   
 4   Senior Citizen_Yes                      7043 non-null   bool   
 5   Partner_Yes                             7043 non-null   bool   
 6   Dependents_Yes                          7043 non-null   bool   
 7   Phone Service_Yes                       7043 non-null   bool   
 8   Multiple Lines_No phone service         7043 non-null   bool   
 9   Multiple Lines_Yes                      7043 non-null   bool   
 10  Internet Service_Fiber optic            7043 non-null   bool

In [22]:
df = df.replace({True: 1, False: 0})
df

  df = df.replace({True: 1, False: 0})


Unnamed: 0,Tenure Months,Monthly Charges,Total Charges,Gender_Male,Senior Citizen_Yes,Partner_Yes,Dependents_Yes,Phone Service_Yes,Multiple Lines_No phone service,Multiple Lines_Yes,Internet Service_Fiber optic,Internet Service_No,Online Security_No internet service,Online Security_Yes,Online Backup_No internet service,Online Backup_Yes,Device Protection_No internet service,Device Protection_Yes,Tech Support_No internet service,Tech Support_Yes,Streaming TV_No internet service,Streaming TV_Yes,Streaming Movies_No internet service,Streaming Movies_Yes,Contract_One year,Contract_Two year,Paperless Billing_Yes,Payment Method_Credit card (automatic),Payment Method_Electronic check,Payment Method_Mailed check,Churn Label_Yes
0,-1.236724,-0.362660,-0.958066,1,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1
1,-1.236724,0.197365,-0.938874,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1
2,-0.992402,1.159546,-0.643789,0,0,0,1,1,0,1,1,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,1,0,1,0,1
3,-0.177995,1.330711,0.338085,0,0,1,1,1,0,1,1,0,0,0,0,0,0,1,0,1,0,1,0,1,0,0,1,0,1,0,1
4,0.677133,1.294151,1.216150,1,0,0,1,1,0,1,1,0,0,0,0,1,0,1,0,0,0,1,0,1,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,1.613701,-1.449476,-0.379565,0,0,0,0,1,0,0,0,1,1,0,1,0,1,0,1,0,1,0,1,0,0,1,1,0,0,0,0
7039,-0.340876,0.665992,-0.127605,1,0,1,1,1,0,1,0,0,0,1,0,0,0,1,0,1,0,1,0,1,1,0,1,0,0,1,0
7040,1.613701,1.277533,2.242606,0,0,1,1,1,0,1,1,0,0,0,0,1,0,1,0,0,0,1,0,1,1,0,1,1,0,0,0
7041,-0.870241,-1.168632,-0.852932,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0


In [23]:
df.groupby("Churn Label_Yes")["Churn Label_Yes"].count()

Churn Label_Yes
0    5174
1    1869
Name: Churn Label_Yes, dtype: int64

In [24]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='minority')

X = df.drop("Churn Label_Yes", axis=1).values
y = df["Churn Label_Yes"].values

In [None]:
X, y = smote.fit_resample(X, y)

In [26]:
X

array([[-1.23672422, -0.36266036, -0.9580659 , ...,  0.        ,
         0.        ,  1.        ],
       [-1.23672422,  0.19736523, -0.93887444, ...,  0.        ,
         1.        ,  0.        ],
       [-0.99240204,  1.1595457 , -0.64378925, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.20468438,  1.17091565,  0.16756017, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.31409319,  1.632671  ,  0.93740779, ...,  0.        ,
         0.        ,  0.13077617],
       [-1.1089164 ,  0.502675  , -0.81948901, ...,  0.        ,
         1.        ,  0.        ]])

## Building a model

In [27]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [28]:
X_train.shape, X_test.shape

((8278, 30), (2070, 30))

In [29]:
y_train.shape, y_test.shape

((8278,), (2070,))

In [None]:
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import classification_report, roc_auc_score, make_scorer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

models = {
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Logistic Regression": LogisticRegression(),
    "XGBClassifier": XGBClassifier(),
    "CatBoosting Classifier": CatBoostClassifier(silent=True),
    "AdaBoost Classifier": AdaBoostClassifier(),
    "LightGBM Classifier": LGBMClassifier()
}

params = {
    "Decision Tree": {
        'criterion': ['gini', 'entropy'],
        'splitter': ['best', 'random'],
        'max_features': ['sqrt', 'log2', None],
    },
    "Random Forest": {
        'criterion': ['gini', 'entropy'],
        'max_features': ['sqrt', 'log2', None],
        'n_estimators': [50, 100, 200, 300, 400, 500]
    },
    "Gradient Boosting": {
        'loss': ['log_loss', 'exponential'],
        'learning_rate': [0.1, 0.01, 0.05, 0.001],
        'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9],
        'criterion': ['friedman_mse', 'squared_error'],
        'max_features': ['sqrt', 'log2'],
        'n_estimators': [50, 100, 200, 300, 400, 500]
    },
    "Logistic Regression": {
        'penalty': ['l1', 'l2', 'elasticnet'],
        'C': [0.01, 0.1, 1.0, 10, 100],
        'solver': ['liblinear', 'saga']
    },
    "XGBClassifier": {
        'learning_rate': [0.1, 0.01, 0.05, 0.001],
        'n_estimators': [50, 100, 200, 300, 400, 500],
        'max_depth': [3, 4, 5, 6, 7, 8]
    },
    "CatBoosting Classifier": {
        'depth': [6, 8, 10],
        'learning_rate': [0.01, 0.05, 0.1],
        'iterations': [100, 200, 300, 500]
    },
    "AdaBoost Classifier": {
        'learning_rate': [0.1, 0.01, 0.05, 0.001],
        'n_estimators': [50, 100, 200, 300, 400, 500]
    },
    "LightGBM Classifier": {
        'learning_rate': [0.1, 0.01, 0.05, 0.001],
        'n_estimators': [50, 100, 200, 300, 400, 500],
        'max_depth': [3, 4, 5, 6, 7, 8],
        'num_leaves': [20, 30, 40, 50, 60]
    }
}

def evaluate_models(X_train, y_train, X_test, y_test, models, params):
    report = {}
    for model_name, model in models.items():
        param_grid = params[model_name]

        # Define the Stratified K-Fold Cross-Validator
        stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

        # Define the F1 scorer
        f1_scorer = make_scorer(f1_score, average='weighted')

        rs = RandomizedSearchCV(estimator=model, param_distributions=param_grid, cv=stratified_kfold, scoring=f1_scorer)

        rs.fit(X_train, y_train)

        # Get the best parameters and best F1 score
        best_params = rs.best_params_
        best_score = rs.best_score_

        # Get the best model with best parameters
        best_model = rs.best_estimator_

        # Train the best model on the full training data
        best_model.fit(X_train, y_train)

        # Predict on the testing data
        y_test_pred = best_model.predict(X_test)

        # Check if the model supports probability estimation
        if hasattr(best_model, "predict_proba"):
            y_test_proba = best_model.predict_proba(X_test)[:, 1]
            roc_auc = roc_auc_score(y_test, y_test_proba)
        else:
            roc_auc = None

        # Calculate metrics
        classification_rep = classification_report(y_test, y_test_pred, output_dict=True)
        precision = classification_rep['weighted avg']['precision']
        recall = classification_rep['weighted avg']['recall']
        f1 = classification_rep['weighted avg']['f1-score']

        # Store the results
        report[model_name] = {
            'best_params': best_params,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'roc_auc': roc_auc
        }

    return report

results = evaluate_models(X_train, y_train, X_test, y_test, models, params)

In [None]:
for model_name, result in results.items():
    print(f"Model: {model_name}")
    print(f"Best Parameters: {result['best_params']}")
    print(f"Precision: {result['precision']}")
    print(f"Recall: {result['recall']}")
    print(f"F1 Score: {result['f1_score']}")
    if result['roc_auc'] is not None:
        print(f"ROC AUC: {result['roc_auc']}")
    else:
        print("ROC AUC: Not available")
    print()


Model: Decision Tree
Best Parameters: {'splitter': 'random', 'max_features': None, 'criterion': 'entropy'}
Precision: 0.8174598150884864
Recall: 0.8173913043478261
F1 Score: 0.8173331228475444
ROC AUC: 0.8167646440828333

Model: Random Forest
Best Parameters: {'n_estimators': 400, 'max_features': 'sqrt', 'criterion': 'gini'}
Precision: 0.8686479277556288
Recall: 0.8685990338164251
F1 Score: 0.8685715369096128
ROC AUC: 0.9312492704223378

Model: Gradient Boosting
Best Parameters: {'subsample': 0.7, 'n_estimators': 400, 'max_features': 'log2', 'loss': 'log_loss', 'learning_rate': 0.1, 'criterion': 'friedman_mse'}
Precision: 0.8604584993145062
Recall: 0.8603864734299517
F1 Score: 0.8603499185486647
ROC AUC: 0.9382999089487076

Model: Logistic Regression
Best Parameters: {'solver': 'saga', 'penalty': 'l2', 'C': 0.1}
Precision: 0.7885211298254777
Recall: 0.7879227053140097
F1 Score: 0.787663552774533
ROC AUC: 0.8702201573553102

Model: XGBClassifier
Best Parameters: {'n_estimators': 200, 'm

The Best model here is CatBoostClassifier.

Let's further tune it with optuna

In [34]:
import optuna
from sklearn.model_selection import cross_val_score

In [36]:
from sklearn.metrics import make_scorer, f1_score
f1_scorer = make_scorer(f1_score)

def objective(trial):

  params = {
      "iterations": trial.suggest_int("iterations", 100, 1000),
      "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
      "depth": trial.suggest_int("depth", 4, 10),
      "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-8, 100.0, log=True),
      "subsample": trial.suggest_float("subsample", 0.05, 1.0),
      "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1.0),
      "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
      "od_type": trial.suggest_categorical("od_type", ["IncToDec", "Iter"]),
      "od_wait": trial.suggest_int("od_wait", 10, 50),
  }

  model = CatBoostClassifier(**params, silent=True)
  score = cross_val_score(model, X_train, y_train, cv=5, scoring=f1_scorer, n_jobs=-1).mean()

  return score


In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2024-05-13 03:45:52,257] A new study created in memory with name: no-name-3943ef1d-531d-4248-b60d-3ecea49ed5ae
[I 2024-05-13 03:45:58,035] Trial 0 finished with value: 0.8578381132070954 and parameters: {'iterations': 167, 'learning_rate': 0.04332696373973052, 'depth': 5, 'l2_leaf_reg': 0.08112808125475827, 'subsample': 0.9341198133703117, 'colsample_bylevel': 0.6997474360191243, 'min_data_in_leaf': 71, 'od_type': 'IncToDec', 'od_wait': 39}. Best is trial 0 with value: 0.8578381132070954.
[I 2024-05-13 03:47:06,453] Trial 1 finished with value: 0.8652786000590282 and parameters: {'iterations': 925, 'learning_rate': 0.01015164492512863, 'depth': 10, 'l2_leaf_reg': 0.5429528604183685, 'subsample': 0.45332974395447967, 'colsample_bylevel': 0.24187016653766225, 'min_data_in_leaf': 13, 'od_type': 'Iter', 'od_wait': 33}. Best is trial 1 with value: 0.8652786000590282.
[I 2024-05-13 03:47:14,059] Trial 2 finished with value: 0.8596473652231067 and parameters: {'iterations': 449, 'learning_

In [None]:
study.best_value

0.8753334016479707

In [None]:
study.best_params

{'iterations': 987,
 'learning_rate': 0.00794812040578884,
 'depth': 10,
 'l2_leaf_reg': 1.8797385861143014e-08,
 'subsample': 0.7495072912757781,
 'colsample_bylevel': 0.8787194065417225,
 'min_data_in_leaf': 75,
 'od_type': 'IncToDec',
 'od_wait': 47}

In [38]:
best_params = {'iterations': 987,
 'learning_rate': 0.00794812040578884,
 'depth': 10,
 'l2_leaf_reg': 1.8797385861143014e-08,
 'subsample': 0.7495072912757781,
 'colsample_bylevel': 0.8787194065417225,
 'min_data_in_leaf': 75,
 'od_type': 'IncToDec',
 'od_wait': 47}

model = CatBoostClassifier(**best_params, silent=True)
model.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x1e358594380>

In [39]:
y_pred = model.predict(X_test)

In [40]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.86      0.88      1015
           1       0.87      0.90      0.89      1055

    accuracy                           0.88      2070
   macro avg       0.88      0.88      0.88      2070
weighted avg       0.88      0.88      0.88      2070

