In [117]:
import numpy as np 
import matplotlib.pyplot as plt 
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
import pickle

import warnings
warnings.filterwarnings('ignore')

In [128]:
data = pd.read_csv("/kaggle/input/churn-data/Churn_Modelling.csv")
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [119]:
data.shape

(10000, 14)

## Data Preprocessing 

In [129]:
data = data.drop(['RowNumber', 'CustomerId','Surname'], axis=1)

In [130]:
#encoding the categorical data 
# Geography ==> One Hot encoding 
# Gender ==> Label Encoding 

LB_gender = LabelEncoder()
data['Gender'] = LB_gender.fit_transform(data[['Gender']])

In [136]:
OHE_geo = OneHotEncoder(sparse=False)
geo_encoder = OHE_geo.fit_transform(data[['Geography']])

geoencoded_df = pd.DataFrame(
    geo_encoder,
    columns= OHE_geo.get_feature_names_out(['Geography'])
)
geoencoded_df.head()

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,0.0,0.0,1.0


In [137]:
data = pd.concat([data.drop('Geography', axis=1), geoencoded_df], axis=1)
data.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0.0,0.0,1.0


In [138]:
# divide the dataset 

X = data.drop('Exited', axis=1)
y = data['Exited']

# split the dat a
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# scale the features 
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [139]:
# save the LB_gender, OHE_geo and scaler into a pickle files 

with open('LB_gender.pkl', 'wb') as file:
    pickle.dump(LB_gender, file)

with open('OHE_geo.pkl', 'wb') as file:
    pickle.dump(OHE_geo, file)

with open('scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

## ANN Implementation 

In [31]:
import tensorflow as tf 
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard

In [35]:
input_shape = X_train.shape[1]
(input_shape,)

(12,)

In [37]:
model = Sequential(
    [
        Dense(64, activation='relu', input_shape=(input_shape,)), # first hidden layer connected with the input layer
        Dense(32, activation='relu'), # second hidden layer
        Dense(1, activation='sigmoid') # output layer
    ]
)

In [38]:
model.summary()

In [43]:
from tensorflow.keras.optimizers import Adam

optimizer = Adam(learning_rate=0.01)
optimizer

<keras.src.optimizers.adam.Adam at 0x7e59fe585390>

In [45]:
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [58]:
import datetime
log_dir="logs/fit/" + datetime.datetime.now().strftime("%Y_%m_%d-%H_%M_%S")
tensorflow_callback = TensorBoard(log_dir = log_dir, histogram_freq = 1)

In [51]:
# setUp Early Stopping 

early_stopping = EarlyStopping(
    monitor='val_loss',
    patience = 10,
    restore_best_weights = True
)

In [59]:
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs = 100,
    callbacks=[tensorflow_callback,early_stopping]
)

Epoch 1/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8543 - loss: 0.3515 - val_accuracy: 0.8565 - val_loss: 0.3447
Epoch 2/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8697 - loss: 0.3170 - val_accuracy: 0.8550 - val_loss: 0.3464
Epoch 3/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8646 - loss: 0.3259 - val_accuracy: 0.8585 - val_loss: 0.3446
Epoch 4/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8726 - loss: 0.3162 - val_accuracy: 0.8550 - val_loss: 0.3501
Epoch 5/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8711 - loss: 0.3240 - val_accuracy: 0.8560 - val_loss: 0.3561
Epoch 6/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8803 - loss: 0.3047 - val_accuracy: 0.8515 - val_loss: 0.3527
Epoch 7/100
[1m250/25

In [53]:
# save the model 
model.save('model.h5')

In [55]:
# load tensorboard exetension

%load_ext tensorboard

In [61]:
%tensorboard --logdir logs/fit 

Reusing TensorBoard on port 6006 (pid 783), started 0:02:19 ago. (Use '!kill 783' to kill it.)

## Prediction

In [140]:
# load the pickle file
from tensorflow.keras.models import load_model

model = load_model('/kaggle/working/model.h5')
with open('/kaggle/working/LB_gender.pkl', 'rb') as file:
    LB_gender = pickle.load(file)

with open('/kaggle/working/OHE_geo.pkl', 'rb') as file:
    OHE_geo = pickle.load(file)

with open('/kaggle/working/scaler.pkl', 'rb') as file:
    scaler = pickle.load(file)

In [146]:
# example of input data 

input_data = {
    'CreditScore': 600,
    'Geography': 'France',
    'Gender': 'Male',
    'Age': 40,
    'Tenure': 3,
    'Balance': 60000,
    'NumOfProducts': 2,
    'HasCrCard': 1,
    'IsActiveMember': 1,
    'EstimatedSalary': 50000
}
input_data = pd.DataFrame([input_data])
input_data

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,600,France,Male,40,3,60000,2,1,1,50000


In [148]:
input_data['Gender'] = LB_gender.transform([input_data['Gender']])[0]

ValueError: y contains previously unseen labels: 1

In [157]:
!pip show scikit-learn


Name: scikit-learn
Version: 1.2.2
Summary: A set of python modules for machine learning and data mining
Home-page: http://scikit-learn.org
Author: 
Author-email: 
License: new BSD
Location: /opt/conda/lib/python3.10/site-packages
Requires: joblib, numpy, scipy, threadpoolctl
Required-by: bayesian-optimization, bigframes, Boruta, category-encoders, cesium, eli5, fastai, hep-ml, imbalanced-learn, librosa, lime, mlxtend, nilearn, pyLDAvis, rgf-python, scikit-learn-intelex, scikit-optimize, scikit-plot, shap, sklearn-pandas, TPOT, tsfresh, woodwork, yellowbrick


In [159]:
OHE_geo.get_feature_names_out(['Geography'])

array(['Geography_France', 'Geography_Germany', 'Geography_Spain'],
      dtype=object)

In [149]:
geo_enceded = OHE_geo.transform([input_data['Geography']])
name_columns = OHE_geo.get_feature_names_out(['Geography'])
geoencoded_df = pd.DataFrame(
    geo_enceded,
    columns= OHE_geo.get_feature_names_out(['Geography'])
)
geoencoded_df.head()

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain
0,1.0,0.0,0.0


In [150]:
input_data = pd.concat([input_data.drop('Geography', axis=1), geoencoded_df], axis=1)
input_data.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain
0,600,1,40,3,60000,2,1,1,50000,1.0,0.0,0.0


In [160]:
input_data.columns

Index(['CreditScore', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
       'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Geography_France',
       'Geography_Germany', 'Geography_Spain'],
      dtype='object')

In [151]:
# scaling data

input_scaler = scaler.transform(input_data)
input_scaler

array([[-0.53598516,  0.91324755,  0.10479359, -0.69539349, -0.25781119,
         0.80843615,  0.64920267,  0.97481699, -0.87683221,  1.00150113,
        -0.57946723, -0.57638802]])

In [154]:
prediction = model.predict(input_scaler)
prediction_proba = prediction[0][0]
prediction_proba

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step


0.34738734

In [155]:
if prediction_proba > 0.5:
    print('The costomer is likely to churn.')
else:
    print('The costomer is not likely to churn.')
    

The costomer is not likely to churn.


In [165]:
!pip install scikeras

Collecting scikeras
  Downloading scikeras-0.13.0-py3-none-any.whl.metadata (3.1 kB)
Collecting scikit-learn>=1.4.2 (from scikeras)
  Downloading scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading scikeras-0.13.0-py3-none-any.whl (26 kB)
Downloading scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m88.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m:01[0m
[?25hInstalling collected packages: scikit-learn, scikeras
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 0.22.0 requires google-cloud-bigquery[bqstor

In [167]:
# hyper parameter ==> how to find the best parameters 


from sklearn.model_selection import GridSearchCV
from scikeras.wrappers import KerasClassifier

def create_ann_model(input_shape, optimizer='adam', activation='relu', units1=64, units2=32):
    model = Sequential()
    
    model.add(Dense(units=units1, activation=activation, input_shape=(input_shape,)))
    
    model.add(Dense(units=units2, activation=activation))
    
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

ImportError: cannot import name '_fit_context' from 'sklearn.base' (/opt/conda/lib/python3.10/site-packages/sklearn/base.py)