In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
%matplotlib inline
import seaborn as sns

In [2]:
# Read the churn CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='../input/churn-dataset/churn2.csv')
df.head(n=5)

In [3]:
# Drop the row/customer identifier columns before analysis.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution no longer raises KeyError.
df = df.drop(columns=['RowNumber', 'CustomerId', 'Surname'], errors='ignore')
df.head()

In [389]:
# Number of missing values in each column.
df.isnull().sum(axis="index")

In [390]:
# Same per-column missing-value count, via the isna spelling.
df.isna().sum(axis="index")

In [391]:
# Pie chart of row counts per Geography value (non-null Age used as the counter).
(
    df.groupby("Geography")["Age"]
    .count()
    .sort_values(ascending=False)
    .plot.pie()
)

In [392]:
# Pie chart of row counts per Gender value (non-null Age used as the counter).
(
    df.groupby("Gender")["Age"]
    .count()
    .sort_values(ascending=False)
    .plot.pie()
)

In [393]:
# Pie chart of row counts per HasCrCard value (non-null Age used as the counter).
(
    df.groupby("HasCrCard")["Age"]
    .count()
    .sort_values(ascending=False)
    .plot.pie()
)

In [394]:
# Pie chart of row counts per IsActiveMember value (non-null Age used as the counter).
(
    df.groupby("IsActiveMember")["Age"]
    .count()
    .sort_values(ascending=False)
    .plot.pie()
)

In [4]:
# Row counts per Exited value, largest group first (shows the class imbalance).
(
    df.groupby("Exited")["Age"]
    .count()
    .sort_values(ascending=False)
)

In [396]:
# Distribution of CreditScore (50 bins with a KDE overlay) and its sample skewness.
sns.histplot(df, x="CreditScore", bins=50, kde=True)
print(f"Skewness {df['CreditScore'].skew()}")

In [397]:
# Distribution of Balance (50 bins with a KDE overlay) and its sample skewness.
sns.histplot(df, x="Balance", bins=50, kde=True)
print(f"Skewness {df['Balance'].skew()}")

In [398]:
# Distribution of EstimatedSalary (50 bins with a KDE overlay) and its sample skewness.
sns.histplot(df, x="EstimatedSalary", bins=50, kde=True)
print(f"Skewness {df['EstimatedSalary'].skew()}")

In [399]:
# Compare the Age distribution before and after a log transform.
# Each histogram gets its own axes; drawing both on the implicit current axes
# overlaid two differently-scaled distributions in a single plot.
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(12, 4))

sns.histplot(data=df, x="Age", kde=True, bins=10, ax=ax_raw)
ax_raw.set_title("Age (raw)")
print(f"Skewness {df.Age.skew()}")

# NOTE: this overwrites the Age column, so re-running the cell applies the log
# a second time; reload `df` before re-executing.
df["Age"] = np.log(df["Age"])
print(f"Skewness {df.Age.skew()}")
sns.histplot(data=df, x="Age", kde=True, bins=10, ax=ax_log)
ax_log.set_title("Age (log-transformed)")
plt.show()

In [400]:
# One-hot encode the categorical columns, dropping the first level of each.
# dtype=int keeps the indicator columns numeric: pandas >= 2.0 defaults to bool,
# which breaks the quantile arithmetic and the conversion to a float array
# performed further down the notebook.
df1 = pd.get_dummies(data=df, columns=['Geography', 'Gender'], drop_first=True, dtype=int)
df1.columns

In [401]:
# Separate the label column name from the feature column names.
target_col_name = "Exited"
col_name = df1.columns.tolist()
col_name.remove(target_col_name)
# feat_col_name refers to the same list object as col_name.
feat_col_name = col_name

In [402]:
# Remove rows lying more than 2 * IQR outside the quartiles, one feature at a time.
# Each filter is applied to the rows that survived the previous features.
for names in feat_col_name:
    # Binary indicator columns are skipped: when the rarer value covers less
    # than a quarter of the rows both quartiles coincide, the IQR is 0 and the
    # filter would silently delete every row holding that value.
    if df1[names].nunique() <= 2:
        continue

    first_quant = df1[names].quantile(0.25)
    third_quant = df1[names].quantile(0.75)
    IQR = third_quant - first_quant

    lower_bnd = first_quant - 2 * IQR
    uppr_bnd = third_quant + 2 * IQR

    # between() is inclusive on both ends, matching the original >= / <= pair.
    df1 = df1[df1[names].between(lower_bnd, uppr_bnd)]

In [403]:
# Box plot of Age on the filtered frame.
sns.boxplot(x="Age", data=df1)

In [404]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 train/test split of the filtered data.
X, y = df1[feat_col_name], df1[target_col_name]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=10,
    test_size=0.2,
)

In [405]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay , auc
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from xgboost import XGBClassifier

import tensorflow as tf
from tensorflow import keras

In [406]:
def random_forest(X_train, y_train, X_test, y_test, n_estimators=500, max_depth=10, random_state=42):
    """Fit a random forest on the training data and score it on the test data.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``RandomForestClassifier.fit``.
    X_test, y_test
        Evaluation features and labels.
    n_estimators, max_depth
        Forwarded to ``RandomForestClassifier``; defaults match the values
        that were previously hard-coded.
    random_state
        Seed forwarded to the classifier so repeated runs give the same scores.

    Returns
    -------
    tuple
        ``(f1, accuracy, pr_auc)`` in that order: the F1 score and accuracy of
        the predicted labels, and the area under the precision-recall curve
        built from the probability column at index 1 of ``predict_proba``.
    """
    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
    )
    forest.fit(X_train, y_train)

    y_pred = forest.predict(X_test)
    y_score = forest.predict_proba(X_test)[:, 1]

    # The PR curve is built from scores, not hard labels.
    precision, recall, _ = precision_recall_curve(y_test, y_score)
    area = auc(recall, precision)

    return f1_score(y_test, y_pred), accuracy_score(y_test, y_pred), area

In [407]:
def extra_tree(X_train, y_train, X_test, y_test, n_estimators=500, max_depth=10, random_state=42):
    """Fit an extra-trees classifier (entropy criterion) and score it on the test data.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``ExtraTreesClassifier.fit``.
    X_test, y_test
        Evaluation features and labels.
    n_estimators, max_depth
        Forwarded to ``ExtraTreesClassifier``; defaults match the values that
        were previously hard-coded.
    random_state
        Seed forwarded to the classifier so repeated runs give the same scores.

    Returns
    -------
    tuple
        ``(f1, accuracy, pr_auc)`` in that order: the F1 score and accuracy of
        the predicted labels, and the area under the precision-recall curve
        built from the probability column at index 1 of ``predict_proba``.
    """
    extraTrees = ExtraTreesClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        criterion='entropy',
        random_state=random_state,
    )
    extraTrees.fit(X_train, y_train)

    y_pred = extraTrees.predict(X_test)
    y_score = extraTrees.predict_proba(X_test)[:, 1]

    # The PR curve is built from scores, not hard labels.
    precision, recall, _ = precision_recall_curve(y_test, y_score)
    area = auc(recall, precision)

    return f1_score(y_test, y_pred), accuracy_score(y_test, y_pred), area

In [408]:
def xgboost(X_train, y_train, X_test, y_test, n_estimators=500, max_depth=10, random_state=42):
    """Fit an ``XGBClassifier`` on the training data and score it on the test data.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``XGBClassifier.fit``.
    X_test, y_test
        Evaluation features and labels.
    n_estimators, max_depth
        Forwarded to ``XGBClassifier``; defaults match the values that were
        previously hard-coded.
    random_state
        Seed forwarded to the classifier so repeated runs give the same scores.

    Returns
    -------
    tuple
        ``(f1, accuracy, pr_auc)`` in that order: the F1 score and accuracy of
        the predicted labels, and the area under the precision-recall curve
        built from the probability column at index 1 of ``predict_proba``.
    """
    xgb = XGBClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
    )
    xgb.fit(X_train, y_train)

    y_pred = xgb.predict(X_test)
    y_score = xgb.predict_proba(X_test)[:, 1]

    # The PR curve is built from scores, not hard labels.
    precision, recall, _ = precision_recall_curve(y_test, y_score)
    area = auc(recall, precision)

    return f1_score(y_test, y_pred), accuracy_score(y_test, y_pred), area
    
    


In [409]:
def ANN(X_train, y_train, X_test, y_test, loss='binary_crossentropy'):
    """Train a small dense network and score it on the test data.

    The network has one hidden layer as wide as the input, one hidden layer of
    10 units (both ReLU) and a single sigmoid output. It is trained for 100
    epochs with the Adam optimizer.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Evaluation features and labels.
    loss
        Loss passed to ``model.compile``.

    Returns
    -------
    tuple
        ``(f1, accuracy, pr_auc)`` in that order: F1 score of the rounded
        predictions, the accuracy metric reported by ``model.evaluate``, and
        the area under the precision-recall curve of the sigmoid outputs.

    Notes
    -----
    Weight initialisation and batch shuffling are not seeded here, so scores
    vary between runs.
    """
    # Convert to plain float32 arrays up front: a DataFrame mixing bool/int/float
    # columns can otherwise turn into an object array that cannot become a tensor.
    X_train_arr = np.asarray(X_train, dtype="float32")
    X_test_arr = np.asarray(X_test, dtype="float32")
    y_train_arr = np.asarray(y_train, dtype="float32")
    y_test_arr = np.asarray(y_test, dtype="float32")

    n_features = X_train_arr.shape[1]
    model = keras.Sequential([
        keras.layers.Dense(n_features, input_dim=n_features, activation='relu'),
        keras.layers.Dense(10, activation='relu'),
        keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])
    model.fit(X_train_arr, y_train_arr, epochs=100, verbose=0)

    # predict() returns shape (n, 1); flatten so the metric functions receive
    # the 1-D vectors they expect.
    y_preds = model.predict(X_test_arr, verbose=0).ravel()
    y_pred = np.round(y_preds)

    precision, recall, _ = precision_recall_curve(y_test, y_preds)
    area = auc(recall, precision)

    # evaluate() returns [loss, accuracy] given metrics=['accuracy'].
    accuracy = model.evaluate(X_test_arr, y_test_arr, verbose=0)[1]

    # The former `del model` sat after the return statement and never ran; the
    # local reference is released when the function returns anyway.
    return f1_score(y_test, y_pred), accuracy, area

In [410]:
# Empty results table; one row is added per (model, sampling technique) run.
score_df = pd.DataFrame(
    columns=[
        "Model",
        "Sampling_Technique",
        "Accuracy",
        "AUC",
        "AP",
    ]
)

In [411]:
# random_forest returns (F1, accuracy, area under the PR curve) in that order.
f1, acc, pr_auc = random_forest(X_train, y_train, X_test, y_test)
# NOTE(review): the existing "AUC" column therefore holds the F1 score and "AP"
# the PR-curve area; the keys are kept so the results table keeps its columns.
score_row = {"Model":"Random_Forest","Sampling_Technique":"None","Accuracy":acc,"AUC":f1,"AP":pr_auc}
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
score_df = pd.concat([score_df, pd.DataFrame([score_row])], ignore_index=True)

In [412]:
# extra_tree returns (F1, accuracy, area under the PR curve) in that order.
f1, acc, pr_auc = extra_tree(X_train, y_train, X_test, y_test)
# NOTE(review): the existing "AUC" column therefore holds the F1 score and "AP"
# the PR-curve area; the keys are kept so the results table keeps its columns.
score_row = {"Model":"Extra_Tree","Sampling_Technique":"None","Accuracy":acc,"AUC":f1,"AP":pr_auc}
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
score_df = pd.concat([score_df, pd.DataFrame([score_row])], ignore_index=True)

In [413]:
# xgboost returns (F1, accuracy, area under the PR curve) in that order.
f1, acc, pr_auc = xgboost(X_train, y_train, X_test, y_test)
# NOTE(review): the existing "AUC" column therefore holds the F1 score and "AP"
# the PR-curve area; the keys are kept so the results table keeps its columns.
score_row = {"Model":"XGBOOST","Sampling_Technique":"None","Accuracy":acc,"AUC":f1,"AP":pr_auc}
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
score_df = pd.concat([score_df, pd.DataFrame([score_row])], ignore_index=True)

In [414]:
# ANN returns (F1, accuracy, area under the PR curve) in that order.
f1, acc, pr_auc = ANN(X_train, y_train, X_test, y_test)
# NOTE(review): the existing "AUC" column therefore holds the F1 score and "AP"
# the PR-curve area; the keys are kept so the results table keeps its columns.
score_row = {"Model":"ANN","Sampling_Technique":"None","Accuracy":acc,"AUC":f1,"AP":pr_auc}
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
score_df = pd.concat([score_df, pd.DataFrame([score_row])], ignore_index=True)

## Undersampling

In [415]:
# Divide by class
df_class_0 = df1[df1['Exited'] == 0]
df_class_1 = df1[df1['Exited'] == 1]

# Class count
# Taken from the per-class frames: unpacking value_counts() relies on its
# frequency ordering and would swap the two numbers if class 1 were the larger one.
count_class_0 = len(df_class_0)
count_class_1 = len(df_class_1)

In [416]:
# Randomly keep as many class-0 rows as there are class-1 rows.
# A fixed random_state makes the drawn subset (and every score derived from it)
# reproducible across runs.
df_class_0_under = df_class_0.sample(count_class_1, random_state=15)
df_test_under = pd.concat([df_class_0_under, df_class_1], axis=0)

print('Random under-sampling:')
print(df_test_under.Exited.value_counts())

In [417]:
# Split the original (imbalanced) data first so the test set keeps the real
# class distribution, then under-sample the majority class in the training
# part only. Splitting an already re-balanced frame scores the models on an
# artificially balanced test set, which is not comparable with the baseline.
X = df1[feat_col_name]
y = df1[target_col_name]

X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=15, stratify=y)

train_df = pd.concat([X_train, y_train], axis=1)
train_majority = train_df[train_df[target_col_name] == 0]
train_minority = train_df[train_df[target_col_name] == 1]
train_under = pd.concat(
    [train_majority.sample(len(train_minority), random_state=15), train_minority],
    axis=0,
)

X_train = train_under[feat_col_name]
y_train = train_under[target_col_name]

In [418]:
# random_forest returns (F1, accuracy, area under the PR curve) in that order.
f1, acc, pr_auc = random_forest(X_train, y_train, X_test, y_test)
# NOTE(review): the existing "AUC" column therefore holds the F1 score and "AP"
# the PR-curve area; the keys are kept so the results table keeps its columns.
score_row = {"Model":"Random_Forest","Sampling_Technique":"UnderSampling","Accuracy":acc,"AUC":f1,"AP":pr_auc}
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
score_df = pd.concat([score_df, pd.DataFrame([score_row])], ignore_index=True)

In [419]:
# extra_tree returns (F1, accuracy, area under the PR curve) in that order.
f1, acc, pr_auc = extra_tree(X_train, y_train, X_test, y_test)
# NOTE(review): the existing "AUC" column therefore holds the F1 score and "AP"
# the PR-curve area; the keys are kept so the results table keeps its columns.
score_row = {"Model":"Extra_Tree","Sampling_Technique":"UnderSampling","Accuracy":acc,"AUC":f1,"AP":pr_auc}
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
score_df = pd.concat([score_df, pd.DataFrame([score_row])], ignore_index=True)


In [420]:
# xgboost returns (F1, accuracy, area under the PR curve) in that order.
f1, acc, pr_auc = xgboost(X_train, y_train, X_test, y_test)
# NOTE(review): the existing "AUC" column therefore holds the F1 score and "AP"
# the PR-curve area; the keys are kept so the results table keeps its columns.
score_row = {"Model":"XGBOOST","Sampling_Technique":"UnderSampling","Accuracy":acc,"AUC":f1,"AP":pr_auc}
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
score_df = pd.concat([score_df, pd.DataFrame([score_row])], ignore_index=True)

In [421]:
# ANN returns (F1, accuracy, area under the PR curve) in that order.
f1, acc, pr_auc = ANN(X_train, y_train, X_test, y_test)
# NOTE(review): the existing "AUC" column therefore holds the F1 score and "AP"
# the PR-curve area; the keys are kept so the results table keeps its columns.
score_row = {"Model":"ANN","Sampling_Technique":"UnderSampling","Accuracy":acc,"AUC":f1,"AP":pr_auc}
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
score_df = pd.concat([score_df, pd.DataFrame([score_row])], ignore_index=True)

## OverSampling

In [422]:
# Oversample 1-class and concat the DataFrames of both classes
# A fixed random_state makes the duplicated rows (and every score derived from
# them) reproducible across runs.
df_class_1_over = df_class_1.sample(count_class_0, replace=True, random_state=15)
df_test_over = pd.concat([df_class_0, df_class_1_over], axis=0)

print('Random over-sampling:')
print(df_test_over.Exited.value_counts())

In [423]:
# Split the original data first, then over-sample the minority class inside the
# training part only. Splitting a frame that already contains duplicated
# class-1 rows puts copies of the same customer in both train and test, which
# leaks information and inflates every test metric.
X = df1.drop('Exited', axis='columns')
y = df1['Exited']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15, stratify=y)

train_df = pd.concat([X_train, y_train], axis=1)
train_majority = train_df[train_df['Exited'] == 0]
train_minority = train_df[train_df['Exited'] == 1]
train_minority_over = train_minority.sample(len(train_majority), replace=True, random_state=15)
train_over = pd.concat([train_majority, train_minority_over], axis=0)

X_train = train_over.drop('Exited', axis='columns')
y_train = train_over['Exited']

In [424]:
# random_forest returns (F1, accuracy, area under the PR curve) in that order.
f1, acc, pr_auc = random_forest(X_train, y_train, X_test, y_test)
# NOTE(review): the existing "AUC" column therefore holds the F1 score and "AP"
# the PR-curve area; the keys are kept so the results table keeps its columns.
score_row = {"Model":"Random_Forest","Sampling_Technique":"OverSampling","Accuracy":acc,"AUC":f1,"AP":pr_auc}
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
score_df = pd.concat([score_df, pd.DataFrame([score_row])], ignore_index=True)

In [425]:
# extra_tree returns (F1, accuracy, area under the PR curve) in that order.
f1, acc, pr_auc = extra_tree(X_train, y_train, X_test, y_test)
# NOTE(review): the existing "AUC" column therefore holds the F1 score and "AP"
# the PR-curve area; the keys are kept so the results table keeps its columns.
score_row = {"Model":"Extra_Tree","Sampling_Technique":"OverSampling","Accuracy":acc,"AUC":f1,"AP":pr_auc}
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
score_df = pd.concat([score_df, pd.DataFrame([score_row])], ignore_index=True)

In [426]:
# xgboost returns (F1, accuracy, area under the PR curve) in that order.
f1, acc, pr_auc = xgboost(X_train, y_train, X_test, y_test)
# NOTE(review): the existing "AUC" column therefore holds the F1 score and "AP"
# the PR-curve area; the keys are kept so the results table keeps its columns.
score_row = {"Model":"XGBOOST","Sampling_Technique":"OverSampling","Accuracy":acc,"AUC":f1,"AP":pr_auc}
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
score_df = pd.concat([score_df, pd.DataFrame([score_row])], ignore_index=True)

In [427]:
# ANN returns (F1, accuracy, area under the PR curve) in that order.
f1, acc, pr_auc = ANN(X_train, y_train, X_test, y_test)
# NOTE(review): the existing "AUC" column therefore holds the F1 score and "AP"
# the PR-curve area; the keys are kept so the results table keeps its columns.
score_row = {"Model":"ANN","Sampling_Technique":"OverSampling","Accuracy":acc,"AUC":f1,"AP":pr_auc}
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
score_df = pd.concat([score_df, pd.DataFrame([score_row])], ignore_index=True)

## SMOTE

In [428]:
# Features / label of the filtered, encoded frame.
X = df1.drop('Exited', axis='columns')
y = df1['Exited']

from imblearn.over_sampling import SMOTE

# Synthesise minority-class samples until both classes are the same size.
# A fixed random_state makes the synthetic samples reproducible across runs.
smote = SMOTE(sampling_strategy='minority', random_state=15)
X_sm, y_sm = smote.fit_resample(X, y)

y_sm.value_counts()

In [429]:
# Split the original data first and apply SMOTE to the training part only.
# Splitting the already-resampled X_sm / y_sm places synthetic points (each
# interpolated from real minority rows) into the test set, which leaks training
# information and inflates the test metrics.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15, stratify=y)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [430]:
# random_forest returns (F1, accuracy, area under the PR curve) in that order.
f1, acc, pr_auc = random_forest(X_train, y_train, X_test, y_test)
# NOTE(review): the existing "AUC" column therefore holds the F1 score and "AP"
# the PR-curve area; the keys are kept so the results table keeps its columns.
score_row = {"Model":"Random_Forest","Sampling_Technique":"SMOTE","Accuracy":acc,"AUC":f1,"AP":pr_auc}
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
score_df = pd.concat([score_df, pd.DataFrame([score_row])], ignore_index=True)

In [431]:
# extra_tree returns (F1, accuracy, area under the PR curve) in that order.
f1, acc, pr_auc = extra_tree(X_train, y_train, X_test, y_test)
# NOTE(review): the existing "AUC" column therefore holds the F1 score and "AP"
# the PR-curve area; the keys are kept so the results table keeps its columns.
score_row = {"Model":"Extra_Tree","Sampling_Technique":"SMOTE","Accuracy":acc,"AUC":f1,"AP":pr_auc}
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
score_df = pd.concat([score_df, pd.DataFrame([score_row])], ignore_index=True)

In [432]:
# xgboost returns (F1, accuracy, area under the PR curve) in that order.
f1, acc, pr_auc = xgboost(X_train, y_train, X_test, y_test)
# NOTE(review): the existing "AUC" column therefore holds the F1 score and "AP"
# the PR-curve area; the keys are kept so the results table keeps its columns.
score_row = {"Model":"XGBOOST","Sampling_Technique":"SMOTE","Accuracy":acc,"AUC":f1,"AP":pr_auc}
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
score_df = pd.concat([score_df, pd.DataFrame([score_row])], ignore_index=True)

In [433]:
# ANN returns (F1, accuracy, area under the PR curve) in that order.
f1, acc, pr_auc = ANN(X_train, y_train, X_test, y_test)
# NOTE(review): the existing "AUC" column therefore holds the F1 score and "AP"
# the PR-curve area; the keys are kept so the results table keeps its columns.
score_row = {"Model":"ANN","Sampling_Technique":"SMOTE","Accuracy":acc,"AUC":f1,"AP":pr_auc}
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
score_df = pd.concat([score_df, pd.DataFrame([score_row])], ignore_index=True)

In [442]:
# Summary table indexed by sampling technique and model.
# The metric columns are selected explicitly: the previous `.min("AP")` passed
# "AP" as the `numeric_only` argument rather than choosing a column. Casting
# to float first ensures the metric columns are numeric whatever dtype the
# row-by-row construction left them with.
metric_cols = ["Accuracy", "AUC", "AP"]
(
    score_df.astype({col: float for col in metric_cols})
    .groupby(["Sampling_Technique", "Model"])[metric_cols]
    .min()
)