In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('Churn_Modelling.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [4]:
def preprocess_data(df):
    
    # Feature selection:-
    columns_to_drop =  ['RowNumber','CustomerId','Surname','NumOfProducts']
    df = df.drop(columns=columns_to_drop)
    
    # One-hot encoding categorial columns:-
    geography = pd.get_dummies(df['Geography'],drop_first=True)
    gender = pd.get_dummies(df['Gender'],drop_first=True)
    temp_df = df.drop(['Geography','Gender'],axis=1)
    data = pd.concat([temp_df,geography,gender],axis=1)
    
    return data

In [5]:
data = preprocess_data(data)

In [6]:
data.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Germany,Spain,Male
0,619,42,2,0.0,1,1,101348.88,1,0,0,0
1,608,41,1,83807.86,0,1,112542.58,0,0,1,0
2,502,42,8,159660.8,1,0,113931.57,1,0,0,0
3,699,39,1,0.0,0,0,93826.63,0,0,0,0
4,850,43,2,125510.82,1,1,79084.1,0,0,1,0


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [8]:
# Splitting and Feature scaling:

X = data.drop(columns=['Exited'])
y = data['Exited']

X_train, X_test, y_train, y_test= train_test_split(X,y,test_size=0.2,random_state=42)

In [9]:
# Neural Network:-
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.utils import to_categorical
# Ensemble mwthods:-
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV

In [10]:
# Scaling:-
scaler= StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert X_test to categorical:-
nn_y_test = to_categorical(y_test)
rounded_labels=np.argmax(nn_y_test, axis=1)

In [11]:
rounded_labels

array([0, 0, 0, ..., 1, 1, 1], dtype=int64)

In [12]:
X_train.shape

(8000, 10)

# Model:-

In [13]:
model = Sequential()
model.add(Dense(units=8,kernel_initializer='he_uniform',activation='relu',input_dim=10))
model.add(Dense(units=6,kernel_initializer='he_uniform',activation='relu'))
model.add(Dense(units=6,kernel_initializer='he_uniform',activation='relu'))
model.add(Dense(units=1,kernel_initializer='glorot_uniform',activation='sigmoid'))
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 8)                 88        
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 54        
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 42        
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 7         
Total params: 191
Trainable params: 191
Non-trainable params: 0
_________________________________________________________________


In [14]:
model.fit(X_train,y_train,epochs=150,batch_size=128)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

Epoch 82/150
Epoch 83/150
Epoch 84/150
Epoch 85/150
Epoch 86/150
Epoch 87/150
Epoch 88/150
Epoch 89/150
Epoch 90/150
Epoch 91/150
Epoch 92/150
Epoch 93/150
Epoch 94/150
Epoch 95/150
Epoch 96/150
Epoch 97/150
Epoch 98/150
Epoch 99/150
Epoch 100/150
Epoch 101/150
Epoch 102/150
Epoch 103/150
Epoch 104/150
Epoch 105/150
Epoch 106/150
Epoch 107/150
Epoch 108/150
Epoch 109/150
Epoch 110/150
Epoch 111/150
Epoch 112/150
Epoch 113/150
Epoch 114/150
Epoch 115/150
Epoch 116/150
Epoch 117/150
Epoch 118/150
Epoch 119/150
Epoch 120/150
Epoch 121/150
Epoch 122/150
Epoch 123/150
Epoch 124/150
Epoch 125/150
Epoch 126/150
Epoch 127/150
Epoch 128/150
Epoch 129/150
Epoch 130/150
Epoch 131/150
Epoch 132/150
Epoch 133/150
Epoch 134/150
Epoch 135/150
Epoch 136/150
Epoch 137/150
Epoch 138/150
Epoch 139/150
Epoch 140/150
Epoch 141/150
Epoch 142/150
Epoch 143/150
Epoch 144/150
Epoch 145/150
Epoch 146/150
Epoch 147/150
Epoch 148/150
Epoch 149/150
Epoch 150/150


<tensorflow.python.keras.callbacks.History at 0x27f0559d7c0>

# Random Forest:-

In [15]:
param_dist = {'n_estimators':[x for x in np.arange(100,1200,100)],
             'max_features':['log2','sqrt'],
             'max_depth':[8,16,24,32],
             'criterion':['gini','entropy']}

rfc = RandomForestClassifier()

r_search = RandomizedSearchCV(estimator=rfc,param_distributions=param_dist,scoring='accuracy',n_jobs=-1)

In [16]:
r_search.fit(X_train,y_train)

RandomizedSearchCV(estimator=RandomForestClassifier(), n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [8, 16, 24, 32],
                                        'max_features': ['log2', 'sqrt'],
                                        'n_estimators': [100, 200, 300, 400,
                                                         500, 600, 700, 800,
                                                         900, 1000, 1100]},
                   scoring='accuracy')

In [17]:
r_search.best_params_

{'n_estimators': 400,
 'max_features': 'log2',
 'max_depth': 8,
 'criterion': 'gini'}

# Gradient Boosting classifer:-

In [18]:
gbc = GradientBoostingClassifier()

gb_param_dist = {'n_estimators':[x for x in np.arange(100,600,50)],
                'learning_rate':[y for y in np.arange(0.25,1.0,0.25)],
                'max_features':['sqrt','log2']}

gb_search = RandomizedSearchCV(estimator=gbc,param_distributions=gb_param_dist,scoring='accuracy',n_jobs=-1)

In [19]:
gb_search.fit(X_train,y_train)

RandomizedSearchCV(estimator=GradientBoostingClassifier(), n_jobs=-1,
                   param_distributions={'learning_rate': [0.25, 0.5, 0.75],
                                        'max_features': ['sqrt', 'log2'],
                                        'n_estimators': [100, 150, 200, 250,
                                                         300, 350, 400, 450,
                                                         500, 550]},
                   scoring='accuracy')

In [20]:
gb_search.best_score_

0.83375

# Saving models:-

In [21]:
import pickle
from keras.models import load_model 
from sklearn.metrics import classification_report, accuracy_score

In [22]:
pickle.dump(gb_search.best_estimator_,open('Gradient_Boost_Model.pkl','wb'))
pickle.dump(r_search.best_estimator_,open('Random_Forest_Model.pkl','wb'))

In [23]:
# Saving the Neural Network:
model.save('ANN.h5')

## Predictions:-

In [24]:
# Random Forest Classifier:-
rf_model = pickle.load(open('Random_Forest_Model.pkl','rb'))
rf_pred = rf_model.predict(X_test)

# Gradient Boost Classifier:-
gb_model = pickle.load(open('Gradient_Boost_Model.pkl','rb'))
gb_pred = gb_model.predict(X_test)

# Neural Network:-
nn_model = load_model('ANN.h5')
nn_pred = (nn_model.predict(X_test)>0.5).astype(int)

In [25]:
print('The classification report for Random Forest model is: ','\n\n',classification_report(y_test,rf_pred))

The classification report for Random Forest model is:  

               precision    recall  f1-score   support

           0       0.85      0.97      0.91      1607
           1       0.74      0.32      0.45       393

    accuracy                           0.84      2000
   macro avg       0.80      0.65      0.68      2000
weighted avg       0.83      0.84      0.82      2000



In [26]:
print('The classification report for Gradient Boost model is: ','\n\n',classification_report(y_test,gb_pred))

The classification report for Gradient Boost model is:  

               precision    recall  f1-score   support

           0       0.86      0.96      0.91      1607
           1       0.68      0.38      0.49       393

    accuracy                           0.84      2000
   macro avg       0.77      0.67      0.70      2000
weighted avg       0.83      0.84      0.83      2000



In [27]:
print('The accuracy score for the neural network is: ','\n\n',classification_report(rounded_labels,nn_pred))

The accuracy score for the neural network is:  

               precision    recall  f1-score   support

           0       0.86      0.95      0.90      1607
           1       0.66      0.36      0.47       393

    accuracy                           0.84      2000
   macro avg       0.76      0.66      0.69      2000
weighted avg       0.82      0.84      0.82      2000

