In [1]:
!pip install catboost
!pip install imbalanced-learn



In [2]:
!pip install scikit-learn==1.1.3



In [3]:
import os
import sys
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook

### Feature Transformation Related Methods ###
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler, PowerTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, ClassifierMixin
from imblearn.combine import SMOTEENN, SMOTETomek


### MachineLearning Models ###
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

import warnings
warnings.filterwarnings('ignore')

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [4]:
### Read the Telco customer-churn CSV into a DataFrame ###

data = pd.read_csv(filepath_or_buffer="/content/Telco_Customer_Churn.csv")

### Preview the first five rows ###

data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [5]:
### Convert TotalCharges to a numeric dtype ###
# errors='coerce' turns any value that cannot be parsed into NaN instead of raising.

data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')

### Count missing values per column ###

data.isna().sum()

Unnamed: 0,0
customerID,0
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
MultipleLines,0
InternetService,0
OnlineSecurity,0


In [6]:
### Count fully duplicated rows ###
# NOTE(review): 'Unnamed: 0' and 'customerID' look like per-row identifiers; if they
# are unique, no two rows can ever compare equal here — confirm this check is meaningful.

data.duplicated(keep='first').sum()

0

In [7]:
### Remove the customerID column from the DataFrame ###
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this cell
# safe to re-run: a second execution is a no-op rather than a KeyError.

data = data.drop(columns=['customerID'], errors='ignore')

In [8]:
### Confirm that customerID is no longer among the columns ###
data.head(n=2)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No


In [9]:
### Feature matrix: every column except the target ###

X = data.drop(columns=['Churn'])

# Subset of features used for modelling

X_2 = X[[
    'gender', 'InternetService', 'Contract',
    'tenure', 'MonthlyCharges', 'TotalCharges',
]]

### Target vector ###

y = data['Churn']

In [10]:
### Shapes of the selected features and of the target ###
(X_2.shape, y.shape)

((7043, 6), (7043,))

In [11]:
### Preprocessing: one Pipeline per dtype group, combined in a ColumnTransformer ###

### Partition the selected columns into numeric and categorical ###

numeric = list(X_2.select_dtypes(include=['number']).columns)
categorical = list(X_2.select_dtypes(include=['object']).columns)


# Numeric columns: median imputation -> Yeo-Johnson power transform -> standard scaling.
num_pipline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('transform', PowerTransformer(method='yeo-johnson')),
    ('scaler', StandardScaler()),
])


# Categorical columns: each category is replaced by an ordinal code.
cat_pipline = Pipeline(steps=[('encoder', OrdinalEncoder())])

# Route each column group through its own pipeline; numeric outputs come first.
preprosser = ColumnTransformer(transformers=[
    ('numeric', num_pipline, numeric),
    ('categorical', cat_pipline, categorical),
])
preprosser

In [12]:
# Fit the preprocessor on the selected features and transform them in one step.
# NOTE(review): this fit sees every row, while the train/test split only happens
# later inside evaluate_models, so test rows influence the fitted statistics.
X_transformed_data = preprosser.fit_transform(X=X_2)
X_transformed_data.shape

(7043, 6)

In [13]:
### Encode the target labels as integers ###

le = LabelEncoder()
y_encoded = le.fit_transform(y)

### Display the encoded target array ###
# (The earlier mid-cell `y_encoded` expression was a no-op: only the last
# expression of a cell is displayed.)
y_encoded

array([0, 0, 1, ..., 0, 1, 0])

In [14]:
# Rebalance the classes with SMOTEENN. The resampler is seeded so that
# Restart & Run All reproduces the same resampled set (and the same shape below).
# NOTE(review): resampling happens before the train/test split performed inside
# evaluate_models, so the held-out split is drawn from resampled data rather than
# from the original rows — the reported test metrics should be read with that in mind.
smt = SMOTEENN(random_state=42)
X_resampled, y_resampled = smt.fit_resample(X_transformed_data, y_encoded)
X_resampled.shape, y_resampled.shape

((6114, 6), (6114,))

In [15]:
### Evaluation Metrics ###
def evaluate_clf(true, predicted):
    """Score one set of predictions against the ground truth.

    Parameters
    ----------
    true : array-like
        Ground-truth labels.
    predicted : array-like
        Model output, passed unchanged to every metric (including
        ``roc_auc_score``, which therefore scores whatever the caller supplies).

    Returns
    -------
    tuple
        ``(accuracy, f1, precision, recall, roc_auc)`` in that order.
    """
    # Order matters: callers unpack the result positionally.
    metric_functions = (
        accuracy_score,
        f1_score,
        precision_score,
        recall_score,
        roc_auc_score,
    )
    return tuple(metric(true, predicted) for metric in metric_functions)

In [20]:
### Training models ###

def _print_metrics(heading, metrics):
    """Print a heading followed by the five scores returned by evaluate_clf."""
    labels = ('Accuracy', 'F1 score', 'Precision', 'Recall', 'Roc Auc Score')
    print(heading)
    for label, value in zip(labels, metrics):
        print('- {}: {:.4f}'.format(label, value))


def evaluate_models(X, y, models, params, test_size=0.2, random_state=42, cv=3, resampler=None):
    """Tune, fit and score every model on a single train/test split.

    For each entry of ``models`` a grid search is run on the training split,
    the best parameters are applied to the model object itself, the model is
    fitted once on the training split and then scored on both splits with
    ``evaluate_clf``. Train and test scores are printed; test scores are
    collected into the returned report.

    Parameters
    ----------
    X, y : array-like
        Features and target; split with ``train_test_split``.
    models : dict
        Maps a display name to an estimator. The estimators are modified in
        place (best parameters set, then fitted).
    params : dict
        Maps the same names to a ``GridSearchCV`` parameter grid. An empty
        dict means no parameters are searched for that model.
    test_size : float, default 0.2
        Fraction of rows held out for testing.
    random_state : int, default 42
        Seed for the train/test split.
    cv : int, default 3
        Number of cross-validation folds used by the grid search.
    resampler : object with ``fit_resample(X, y)``, optional
        When given, it is applied to the training split only, so the test
        split keeps the rows it was given. Default ``None`` leaves the
        training split untouched.

    Returns
    -------
    pandas.DataFrame
        One row per model with its test-set scores, sorted in descending
        order by Accuracy, f1_score, Precision, Recall, Roc Auc Score.

    Raises
    ------
    KeyError
        If any model name has no entry in ``params``. Raised before any
        training starts so a typo does not surface only after long fits.
    """
    missing = [name for name in models if name not in params]
    if missing:
        raise KeyError('No parameter grid supplied for: {}'.format(', '.join(map(str, missing))))

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    if resampler is not None:
        # Resample after splitting so only training rows are altered.
        X_train, y_train = resampler.fit_resample(X_train, y_train)

    columns = ['Model Name', 'Accuracy', 'f1_score', 'Precision', 'Recall', 'Roc Auc Score']
    rows = []

    for name, model in tqdm_notebook(models.items(), total=len(models)):
        # refit=False: the search only has to pick parameters; the single fit
        # below trains the model object that callers keep a reference to.
        gs = GridSearchCV(model, params[name], cv=cv, refit=False)
        gs.fit(X_train, y_train)

        model.set_params(**gs.best_params_)
        model.fit(X_train, y_train)

        # NOTE(review): evaluate_clf receives hard class predictions, so the
        # "Roc Auc Score" column is computed from labels, not from scores.
        train_metrics = evaluate_clf(y_train, model.predict(X_train))
        test_metrics = evaluate_clf(y_test, model.predict(X_test))

        print(name)
        _print_metrics('Model performance for Training set', train_metrics)
        print('----------------------------------')
        _print_metrics('Model performance for Test set', test_metrics)
        print('='*35)
        print('\n')

        rows.append((name, *test_metrics))

    # Passing `columns` explicitly keeps the report well-formed even when
    # `models` is empty.
    report = pd.DataFrame(rows, columns=columns).sort_values(by=columns[1:], ascending=False)

    return report

In [21]:
### Define models ###
# Every estimator that accepts a seed is given random_state=42 (the same value
# used for the train/test split) so repeated runs train identical models.
# KNeighborsClassifier and GaussianNB take no random_state.
# Keys are the display names used in the report and must match the keys of
# the parameter-grid dict passed alongside this one.

models = {
    'LogisticRegression': LogisticRegression(random_state=42),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'SVC': SVC(random_state=42),
    'RandomForestClassifier': RandomForestClassifier(random_state=42),
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=42),
    'AdaBoostClassifier': AdaBoostClassifier(random_state=42),
    'XGBClassifier': XGBClassifier(random_state=42),
    'LGBMClassifier': LGBMClassifier(random_state=42),
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=42),
    'GaussianNB': GaussianNB()
}

### Define hyper-parameter grids ###

# One grid per model, keyed by the same names as `models`: evaluate_models
# looks the grid up by the model's key and hands it to GridSearchCV.
# Each inner dict maps a constructor parameter to the list of values to try.
param_grids = {
    'LogisticRegression': {
        # single-value entry: fixed for every candidate rather than searched
        "class_weight":["balanced"],
        'penalty': ['l1', 'l2'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'solver': ['liblinear', 'saga']
    },
    'KNeighborsClassifier': {
        'n_neighbors': [3, 5, 7, 9]
    },
    'SVC': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
        'gamma': ['scale', 'auto']
    },
    'RandomForestClassifier': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    },
    'GradientBoostingClassifier': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5],
        'min_samples_split': [2]
    },
    'AdaBoostClassifier': {
        'n_estimators': [50, 100],
        'learning_rate': [0.01, 0.1]
    },
    'DecisionTreeClassifier': {
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2]
    },
    # empty grid: no hyper-parameters are searched for this model
    'GaussianNB': {},
    'XGBClassifier': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 6]
    },
    'LGBMClassifier': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'num_leaves': [31, 63]
    }
}

In [22]:
# Tune and score every model on the resampled data; `report` holds the test-set scores.
report = evaluate_models(
    X=X_resampled,
    y=y_resampled,
    models=models,
    params=param_grids,
)

  0%|          | 0/10 [00:00<?, ?it/s]

LogisticRegression
Model performance for Training set
- Accuracy: 0.9021
- F1 score: 0.9066
- Precision: 0.8952
- Recall: 0.9182
- Roc Auc Score: 0.9015
----------------------------------
Model performance for Test set
- Accuracy: 0.9019
- F1 score: 0.8998
- Precision: 0.8822
- Recall: 0.9182
- Roc Auc Score: 0.9025


KNeighborsClassifier
Model performance for Training set
- Accuracy: 0.9912
- F1 score: 0.9915
- Precision: 0.9894
- Recall: 0.9937
- Roc Auc Score: 0.9911
----------------------------------
Model performance for Test set
- Accuracy: 0.9771
- F1 score: 0.9764
- Precision: 0.9666
- Recall: 0.9864
- Roc Auc Score: 0.9775


SVC
Model performance for Training set
- Accuracy: 0.9325
- F1 score: 0.9359
- Precision: 0.9202
- Recall: 0.9522
- Roc Auc Score: 0.9318
----------------------------------
Model performance for Test set
- Accuracy: 0.9240
- F1 score: 0.9228
- Precision: 0.8997
- Recall: 0.9472
- Roc Auc Score: 0.9249


RandomForestClassifier
Model performance for Training

In [23]:
# Display the test-set comparison table returned by evaluate_models (already sorted, best model first).
report

Unnamed: 0,Model Name,Accuracy,f1_score,Precision,Recall,Roc Auc Score
3,RandomForestClassifier,0.980376,0.979764,0.96995,0.989779,0.980738
1,KNeighborsClassifier,0.977105,0.976391,0.966611,0.986371,0.977462
7,LGBMClassifier,0.974652,0.973884,0.963333,0.984668,0.975038
4,GradientBoostingClassifier,0.968929,0.968013,0.956739,0.979557,0.969338
8,DecisionTreeClassifier,0.967294,0.966102,0.961214,0.971039,0.967438
6,XGBClassifier,0.960752,0.959732,0.945455,0.974446,0.96128
2,SVC,0.923957,0.922822,0.899676,0.947189,0.924852
5,AdaBoostClassifier,0.908422,0.906822,0.886179,0.92845,0.909193
0,LogisticRegression,0.901881,0.899833,0.88216,0.918228,0.90251
9,GaussianNB,0.883892,0.886218,0.836611,0.942078,0.886134
