In [None]:
import os
import sys
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook


### Feature Transformation Related Methods ###
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, ClassifierMixin
from imblearn.combine import SMOTEENN, SMOTETomek


### MachineLearning Models ###
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

import warnings
warnings.filterwarnings('ignore')

In [None]:
### Read the Telco customer-churn CSV into a DataFrame ###

csv_path = "/content/Telco_Customer_Churn.csv"
data = pd.read_csv(filepath_or_buffer=csv_path)

### Preview the first five rows ###

data.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [None]:
### Convert TotalCharges to a numeric column ###
### (errors="coerce" turns any value that cannot be parsed into NaN) ###

data["TotalCharges"] = pd.to_numeric(
    data["TotalCharges"],
    errors="coerce",
)

### Count missing values per column ###

data.isna().sum()

Unnamed: 0,0
customerID,0
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
MultipleLines,0
InternetService,0
OnlineSecurity,0


In [None]:
### Count fully duplicated rows ###

data.duplicated().sum()

0

In [None]:
### Remove the customerID identifier column from the DataFrame ###
### Reassigning with errors='ignore' (instead of an inplace drop) keeps this cell
### idempotent: re-running it after the column is gone no longer raises KeyError.

data = data.drop(columns = 'customerID', errors = 'ignore')

In [None]:
### Preview two rows to confirm that customerID was removed ###
data.head(2)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No


In [None]:
### Feature matrix: every column except the target ###

X = data.drop(columns = 'Churn')

### Target vector: the Churn column ###

y = data.loc[:, 'Churn']

In [None]:
### Check the shapes of the feature matrix X and the target y ###
X.shape, y.shape

((7043, 19), (7043,))

### **1. Model Training With All Features** ###

In [None]:
### Build the preprocessing ColumnTransformer for all features ###

### Partition the feature columns by dtype ###

numeric = list(X.select_dtypes(include = 'number').columns)
categorical = list(X.select_dtypes(include = 'object').columns)

### Numeric columns: fill missing values with the median, then standardise ###
num_pipline = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'median')),
    ('scaler', StandardScaler()),
])

### Categorical columns: map each category to an integer code ###
cat_pipline = Pipeline(steps = [('encoder', OrdinalEncoder())])

### Route each column group through its own pipeline ###
preprosser = ColumnTransformer(transformers = [
    ('numeric', num_pipline, numeric),
    ('categorical', cat_pipline, categorical),
])
preprosser

In [None]:
### Fit the preprocessor on X and transform X in one step ###
### NOTE(review): the preprocessor is fitted on the full dataset, before
### evaluate_models makes its train/test split, so imputation and scaling
### statistics also reflect rows that later land in the test split.

X_pre_transformed = preprosser.fit_transform(X)

### Check the shape of the transformed feature array ###

X_pre_transformed.shape

(7043, 19)

In [None]:
### Display the transformed feature array ###

X_pre_transformed

array([[-0.43991649, -1.27744458, -1.16032292, ...,  0.        ,
         1.        ,  2.        ],
       [-0.43991649,  0.06632742, -0.25962894, ...,  1.        ,
         0.        ,  3.        ],
       [-0.43991649, -1.23672422, -0.36266036, ...,  0.        ,
         1.        ,  3.        ],
       ...,
       [-0.43991649, -0.87024095, -1.1686319 , ...,  0.        ,
         1.        ,  2.        ],
       [ 2.27315869, -1.15528349,  0.32033821, ...,  0.        ,
         1.        ,  3.        ],
       [-0.43991649,  1.36937906,  1.35896134, ...,  2.        ,
         1.        ,  0.        ]])

In [None]:
### Encode the target labels as integers ###

le = LabelEncoder()
le.fit(y)
y_encoded = le.transform(y)

### Shape of the encoded target ###
y_encoded.shape

(7043,)

In [None]:
### Balance the classes with SMOTEENN (SMOTE over-sampling + ENN cleaning) ###
### random_state is fixed so the resampled set, and every score derived from
### it, is reproducible across kernel restarts.
### NOTE(review): resampling is applied to the full dataset, before
### evaluate_models makes its train/test split, so the held-out split also
### contains resampled rows and test scores are likely optimistic. Consider
### resampling the training split only.
smt = SMOTEENN(random_state = 42)
X_resampled, y_resampled = smt.fit_resample(X_pre_transformed, y_encoded)
X_resampled.shape, y_resampled.shape

((6388, 19), (6388,))

In [None]:
### Evaluation Metrics ###
def evaluate_clf(true, predicted):
    """Score one set of predictions against the ground truth.

    Parameters
    ----------
    true : array-like
        Ground-truth labels.
    predicted : array-like
        Model output aligned with ``true``. Every metric, including ROC AUC,
        is computed from this same argument.

    Returns
    -------
    tuple
        ``(accuracy, f1, precision, recall, roc_auc)`` in that order.
    """
    # Evaluated in the order the tuple is returned in.
    metric_functions = (
        accuracy_score,
        f1_score,
        precision_score,
        recall_score,
        roc_auc_score,
    )
    return tuple(metric(true, predicted) for metric in metric_functions)

In [None]:
### Training models ###

def evaluate_models(X, y, models, params):
    """Tune, fit and score every classifier on one 80/20 hold-out split.

    For each entry in ``models`` a 3-fold ``GridSearchCV`` selects the best
    parameter combination from ``params``. The estimator is then updated in
    place with those parameters, fitted once on the training split, and
    scored on both splits. Per-model metrics are printed as they are computed.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Target labels aligned with ``X``.
    models : dict
        Maps a display name to an estimator instance. Each estimator is
        mutated: after the call it carries its best parameters and is fitted
        on the training split.
    params : dict
        Maps the same display names to a ``GridSearchCV`` parameter grid.
        A name that is in ``models`` but missing here raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Model Name'`` and ``'Accuracy'`` (test-split accuracy),
        sorted by accuracy in descending order.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    models_list = []
    accuracy_list = []

    for name, model in tqdm_notebook(list(models.items())):
        # refit=False: only the best parameters are needed from the search.
        # The estimator itself is fitted exactly once below, instead of being
        # pre-fitted, refitted by the search, and then fitted a third time.
        gs = GridSearchCV(model, params[name], cv=3, refit=False)
        gs.fit(X_train, y_train)

        model.set_params(**gs.best_params_)
        model.fit(X_train, y_train)

        # Make predictions
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        # Training set performance
        (model_train_accuracy, model_train_f1, model_train_precision,
         model_train_recall, model_train_rocauc_score) = evaluate_clf(y_train, y_train_pred)

        # Test set performance
        (model_test_accuracy, model_test_f1, model_test_precision,
         model_test_recall, model_test_rocauc_score) = evaluate_clf(y_test, y_test_pred)

        print(name)
        models_list.append(name)

        print('Model performance for Training set')
        print("- Accuracy: {:.4f}".format(model_train_accuracy))
        print('- F1 score: {:.4f}'.format(model_train_f1))
        print('- Precision: {:.4f}'.format(model_train_precision))
        print('- Recall: {:.4f}'.format(model_train_recall))
        print('- Roc Auc Score: {:.4f}'.format(model_train_rocauc_score))

        print('----------------------------------')

        print('Model performance for Test set')
        print('- Accuracy: {:.4f}'.format(model_test_accuracy))
        accuracy_list.append(model_test_accuracy)
        print('- F1 score: {:.4f}'.format(model_test_f1))
        print('- Precision: {:.4f}'.format(model_test_precision))
        print('- Recall: {:.4f}'.format(model_test_recall))
        print('- Roc Auc Score: {:.4f}'.format(model_test_rocauc_score))
        print('='*35)
        print('\n')

    # Only test accuracy goes into the summary table; the other metrics are printed above.
    report = pd.DataFrame(
        list(zip(models_list, accuracy_list)),
        columns=['Model Name', 'Accuracy'],
    ).sort_values(by=['Accuracy'], ascending=False)

    return report

In [None]:
### Define models ###

### Seed shared by every estimator that uses randomness, so that scores are
### reproducible across kernel restarts.
RANDOM_STATE = 42

models = {
    'LogisticRegression': LogisticRegression(random_state=RANDOM_STATE),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'SVC': SVC(),
    'RandomForestClassifier': RandomForestClassifier(random_state=RANDOM_STATE),
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'AdaBoostClassifier': AdaBoostClassifier(random_state=RANDOM_STATE),
    'XGBClassifier': XGBClassifier(random_state=RANDOM_STATE),
    'LGBMClassifier': LGBMClassifier(random_state=RANDOM_STATE),
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'GaussianNB': GaussianNB()
}

### Define Params ###
### One GridSearchCV grid per model, keyed by the same names as `models`.
### An empty grid means the model is evaluated with its current parameters only.

param_grids = {
    'LogisticRegression': {
        "class_weight":["balanced"],
        'penalty': ['l1', 'l2'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'solver': ['liblinear', 'saga']
    },
    'KNeighborsClassifier': {
        'n_neighbors': [3, 5, 7, 9]
    },
    'SVC': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf', 'poly'],
        'gamma': ['scale', 'auto']
    },
    'RandomForestClassifier': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    },
    'GradientBoostingClassifier': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5],
        'min_samples_split': [2]
    },
    'AdaBoostClassifier': {
        'n_estimators': [50, 100],
        'learning_rate': [0.01, 0.1]
    },
    'DecisionTreeClassifier': {
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2]
    },
    'GaussianNB': {},
    'XGBClassifier': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 6]
    },
    'LGBMClassifier': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'num_leaves': [31, 63]
    }
}

In [None]:
### Tune and evaluate every model on the resampled all-features data ###
report = evaluate_models(X_resampled, y_resampled, models, param_grids)

  0%|          | 0/10 [00:00<?, ?it/s]

LogisticRegression
Model performance for Training set
- Accuracy: 0.9088
- F1 score: 0.9176
- Precision: 0.9261
- Recall: 0.9093
- Roc Auc Score: 0.9087
----------------------------------
Model performance for Test set
- Accuracy: 0.8936
- F1 score: 0.9045
- Precision: 0.9161
- Recall: 0.8932
- Roc Auc Score: 0.8936


KNeighborsClassifier
Model performance for Training set
- Accuracy: 0.9890
- F1 score: 0.9903
- Precision: 0.9817
- Recall: 0.9989
- Roc Auc Score: 0.9877
----------------------------------
Model performance for Test set
- Accuracy: 0.9726
- F1 score: 0.9762
- Precision: 0.9573
- Recall: 0.9958
- Roc Auc Score: 0.9692


SVC
Model performance for Training set
- Accuracy: 0.9834
- F1 score: 0.9851
- Precision: 0.9826
- Recall: 0.9877
- Roc Auc Score: 0.9828
----------------------------------
Model performance for Test set
- Accuracy: 0.9640
- F1 score: 0.9684
- Precision: 0.9579
- Recall: 0.9792
- Roc Auc Score: 0.9618


RandomForestClassifier
Model performance for Training

In [None]:
### Test-split accuracy per model (all features), best first ###
report

Unnamed: 0,Model Name,Accuracy
1,KNeighborsClassifier,0.972613
2,SVC,0.964006
7,LGBMClassifier,0.964006
3,RandomForestClassifier,0.962441
6,XGBClassifier,0.961659
4,GradientBoostingClassifier,0.958529
8,DecisionTreeClassifier,0.938185
5,AdaBoostClassifier,0.913928
0,LogisticRegression,0.893584
9,GaussianNB,0.889671


### **2. Model Training With 6 Features** ###

In [None]:
### Keep only the six selected features; the target is reused unchanged ###
X_2 = X.loc[:, [
    'gender',
    'InternetService',
    'Contract',
    'tenure',
    'MonthlyCharges',
    'TotalCharges',
]]
y_2 = y

In [None]:
### Preview the first three rows of the reduced feature set ###
X_2.head(3)

Unnamed: 0,gender,InternetService,Contract,tenure,MonthlyCharges,TotalCharges
0,Female,DSL,Month-to-month,1,29.85,29.85
1,Male,DSL,One year,34,56.95,1889.5
2,Male,DSL,Month-to-month,2,53.85,108.15


In [None]:
### Preview the first three target values ###
y_2.head(3)

Unnamed: 0,Churn
0,No
1,No
2,Yes


In [None]:
### Build the preprocessing ColumnTransformer for the six-feature subset ###

### Column groups are listed explicitly for this subset ###

numeric_1 = ['tenure', 'MonthlyCharges', 'TotalCharges']
categorical_1 = ['gender', 'InternetService', 'Contract']

### Numeric columns: fill missing values with the median, then standardise ###

num_pipline_1 = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'median')),
    ('scaler', StandardScaler()),
])

### Categorical columns: map each category to an integer code ###

cat_pipline_1 = Pipeline(steps = [('encoder', OrdinalEncoder())])

### Route each column group through its own pipeline ###
preprosser_1 = ColumnTransformer(transformers = [
    ('numeric', num_pipline_1, numeric_1),
    ('categorical', cat_pipline_1, categorical_1),
])
preprosser_1

In [None]:
### Fit the six-feature preprocessor on X_2 and transform X_2 in one step ###
### NOTE(review): as in the first experiment, the preprocessor is fitted on
### the full dataset, before evaluate_models makes its train/test split.

X_pre_transformed_1 = preprosser_1.fit_transform(X_2)

### Check the shape of the transformed feature array ###

X_pre_transformed_1.shape

(7043, 6)

In [None]:
### Display the transformed six-feature array ###

X_pre_transformed_1

array([[-1.27744458, -1.16032292, -0.99424194,  0.        ,  0.        ,
         0.        ],
       [ 0.06632742, -0.25962894, -0.17324413,  1.        ,  0.        ,
         1.        ],
       [-1.23672422, -0.36266036, -0.95967407,  1.        ,  0.        ,
         0.        ],
       ...,
       [-0.87024095, -1.1686319 , -0.85446945,  0.        ,  0.        ,
         0.        ],
       [-1.15528349,  0.32033821, -0.87206242,  1.        ,  1.        ,
         0.        ],
       [ 1.36937906,  1.35896134,  2.01428802,  1.        ,  1.        ,
         2.        ]])

In [None]:
### Encode the target labels as integers for the six-feature experiment ###
### (this rebinds the `le` name to a fresh encoder) ###

le = LabelEncoder()
y_encoded_1 = le.fit_transform(y_2)
y_encoded_1

array([0, 0, 1, ..., 0, 1, 0])

In [None]:
### Balance the classes with SMOTEENN (SMOTE over-sampling + ENN cleaning) ###
### random_state is fixed so the resampled set, and every score derived from
### it, is reproducible across kernel restarts.
### NOTE(review): resampling is applied to the full dataset, before
### evaluate_models makes its train/test split, so the held-out split also
### contains resampled rows and test scores are likely optimistic. Consider
### resampling the training split only.
smt_1 = SMOTEENN(random_state = 42)
X_resampled_1, y_resampled_1 = smt_1.fit_resample(X_pre_transformed_1, y_encoded_1)
X_resampled_1.shape, y_resampled_1.shape

((6171, 6), (6171,))

In [None]:
### Tune and evaluate every model on the resampled six-feature data ###
### (the same `models` instances as in the first experiment are reused) ###
report_2 = evaluate_models(X_resampled_1, y_resampled_1, models, param_grids)

  0%|          | 0/10 [00:00<?, ?it/s]

LogisticRegression
Model performance for Training set
- Accuracy: 0.8991
- F1 score: 0.9039
- Precision: 0.8818
- Recall: 0.9272
- Roc Auc Score: 0.8984
----------------------------------
Model performance for Test set
- Accuracy: 0.8777
- F1 score: 0.8812
- Precision: 0.8550
- Recall: 0.9091
- Roc Auc Score: 0.8778


KNeighborsClassifier
Model performance for Training set
- Accuracy: 0.9917
- F1 score: 0.9919
- Precision: 0.9909
- Recall: 0.9929
- Roc Auc Score: 0.9917
----------------------------------
Model performance for Test set
- Accuracy: 0.9741
- F1 score: 0.9741
- Precision: 0.9725
- Recall: 0.9756
- Roc Auc Score: 0.9741


SVC
Model performance for Training set
- Accuracy: 0.9216
- F1 score: 0.9249
- Precision: 0.9068
- Recall: 0.9438
- Roc Auc Score: 0.9211
----------------------------------
Model performance for Test set
- Accuracy: 0.9045
- F1 score: 0.9062
- Precision: 0.8879
- Recall: 0.9253
- Roc Auc Score: 0.9045


RandomForestClassifier
Model performance for Training

In [None]:
### Test-split accuracy per model (six features), best first ###
report_2

Unnamed: 0,Model Name,Accuracy
1,KNeighborsClassifier,0.974089
3,RandomForestClassifier,0.973279
7,LGBMClassifier,0.966802
8,DecisionTreeClassifier,0.961943
6,XGBClassifier,0.958704
4,GradientBoostingClassifier,0.954656
2,SVC,0.904453
5,AdaBoostClassifier,0.883401
0,LogisticRegression,0.877733
9,GaussianNB,0.872874


In [None]:
### Re-display the all-features report for comparison with report_2 ###
report

Unnamed: 0,Model Name,Accuracy
1,KNeighborsClassifier,0.972613
2,SVC,0.964006
7,LGBMClassifier,0.964006
3,RandomForestClassifier,0.962441
6,XGBClassifier,0.961659
4,GradientBoostingClassifier,0.958529
8,DecisionTreeClassifier,0.938185
5,AdaBoostClassifier,0.913928
0,LogisticRegression,0.893584
9,GaussianNB,0.889671
