In [6]:
## This is being done in nltk-env (Python 3.11.4)
## ANN implementation
## libraries
## tensorflow ! pip install tensorflow
## pandas ! pip install pandas
## numpy ! pip install numpy
## scikit-learn ! pip install scikit-learn
## tensorboard ! pip install tensorboard
## matplotlib ! pip install matplotlib
## streamlit ! pip install streamlit

## ! pip install tensorflow pandas numpy scikit-learn tensorboard matplotlib streamlit

## Load the dataset --> Drop irrelevant columns --> Encode categorical variables (Gender --> LabelEncoder, Geography --> OHE) --> Combine OHE with original data -->
## --> Save encoders as pickle files --> Divide the dataset into independent and dependent features --> Split the data into training and testing sets -->
## --> Scale these features

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pickle

## Read the raw churn CSV into a DataFrame.
## NOTE(review): absolute, machine-specific path -- this cell only runs where that file exists.
data = pd.read_csv('/Users/puneetch/Desktop/Python/basics/churn_modelling.csv')
data.head()  # result is discarded here: only a cell's final expression is displayed

## Pre-processing, step 1: discard the columns treated as irrelevant for modelling.
data = data.drop(columns=['RowNumber', 'CustomerId', 'Surname'])

from sklearn.preprocessing import OneHotEncoder

## Gender: fit a label encoder on the column, then overwrite the column with its integer codes.
## The fitted encoder object is kept because it is pickled further down.
label_encoder_gender = LabelEncoder()
label_encoder_gender.fit(data['Gender'])
data['Gender'] = label_encoder_gender.transform(data['Gender'])

## Geography: one indicator column per category, produced as a dense array.
onehot_encoder_geo = OneHotEncoder(sparse_output=False)
onehot_encoder_geo.fit(data[['Geography']])
geo_encoder = onehot_encoder_geo.transform(data[['Geography']])

## Wrap the array in a DataFrame whose column names come from the encoder.
geo_columns = onehot_encoder_geo.get_feature_names_out(['Geography'])
geo_encoded_df = pd.DataFrame(geo_encoder, columns=geo_columns)

## Replace the raw Geography column with the indicator columns (frames joined side by side).
geo_rem = data.drop(columns=['Geography'])
data = pd.concat([geo_rem, geo_encoded_df], axis=1)
data.head()


## Persist both fitted encoders, one pickle file per encoder.
for pkl_name, fitted_encoder in (
    ('label_encoder_gender.pkl', label_encoder_gender),
    ('onehot_encoder_geo.pkl', onehot_encoder_geo),
):
    with open(pkl_name, 'wb') as pkl_file:
        pickle.dump(fitted_encoder, pkl_file)


## 'Exited' is the target; every remaining column is used as a feature.
y = data['Exited']
x = data.drop(columns='Exited')

## Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
(x_train, x_test,
 y_train, y_test) = train_test_split(x, y, test_size=0.2, random_state=42)

## Standardise the features: the scaler is fitted on the training rows only,
## and that same fitted scaler is then applied to both splits.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

## Pickle the fitted scaler.
## NOTE(review): the file name is 'sscaler.pkl' (double "s") -- confirm that whatever loads it expects this spelling.
with open('sscaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)


## Build our ANN model
## Sequential network --> Dense layers for neurons --> Activation function (sigmoid, tanh, relu, leaky relu) --> Optimizer (used in backpropagation to update the weights) -->
## --> Loss function --> Metrics (accuracy) --> Training (store logs in a folder) --> TensorBoard for visualization

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
import datetime

## Feed-forward binary classifier, assembled layer by layer:
## the input width equals the number of feature columns, followed by two ReLU
## hidden layers (64 then 32 units) and a single sigmoid output unit.
model = Sequential()
model.add(Input(shape=(x_train.shape[1],)))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

## Compile: attach the optimizer, the loss and the reported metric so the model can be trained.
opt = tf.keras.optimizers.Adam(learning_rate=0.01)
losses = tf.keras.losses.BinaryCrossentropy()
model.compile(loss=losses, optimizer=opt, metrics=['accuracy'])


## Setup the tensorboard
logs="logs/fit" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorflow_callback=TensorBoard(log_dir=logs,histogram_freq=1)

## Setup Early Stopping
early_stopping_callback=EarlyStopping(monitor='val_loss',patience=10,restore_best_weights=True)

## Training the model
history=model.fit(

    x_train,y_train,validation_data=(x_test,y_test),epochs=100,
    callbacks=[tensorflow_callback,early_stopping_callback]
)

model.save('model.h5')

## Laod tensorflow extension
%load_ext tensorboard
%tensorboard --logdir logs/fit20250606-223315

Epoch 1/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8153 - loss: 0.4428 - val_accuracy: 0.8590 - val_loss: 0.3511
Epoch 2/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8533 - loss: 0.3556 - val_accuracy: 0.8590 - val_loss: 0.3423
Epoch 3/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8623 - loss: 0.3362 - val_accuracy: 0.8470 - val_loss: 0.3731
Epoch 4/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8635 - loss: 0.3386 - val_accuracy: 0.8595 - val_loss: 0.3442
Epoch 5/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8618 - loss: 0.3410 - val_accuracy: 0.8520 - val_loss: 0.3498
Epoch 6/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8605 - loss: 0.3399 - val_accuracy: 0.8570 - val_loss: 0.3496
Epoch 7/100
[1m250/25



The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 61170), started 1 day, 0:47:39 ago. (Use '!kill 61170' to kill it.)

In [105]:
## ANN 1st step practice

##! pip install tensorflow pandas numpy scikit-learn tensorboard matplotlib streamlit

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pickle

## Load the churn CSV and discard the unwanted columns in a single chained expression.
## NOTE(review): absolute, machine-specific path -- this cell only runs where that file exists.
data = (
    pd.read_csv("/Users/puneetch/Desktop/Python/basics/churn_modelling.csv")
    .drop(columns=['RowNumber', 'CustomerId', 'Surname'])
)

from sklearn.preprocessing import OneHotEncoder

## Gender -> integer label codes (the column is overwritten with the codes).
label_encoder_gender = LabelEncoder()
gender_codes = label_encoder_gender.fit_transform(data['Gender'])
data['Gender'] = gender_codes

## Geography -> dense one-hot indicator columns, named by the encoder.
onehot_encoder_geo = OneHotEncoder(sparse_output=False)
geo_encoder = onehot_encoder_geo.fit_transform(data[['Geography']])
geo_encoded_df = pd.DataFrame(
    geo_encoder,
    columns=onehot_encoder_geo.get_feature_names_out(['Geography']),
)

## Replace the raw Geography column with its indicator columns.
data_without_geo = data.drop(columns=['Geography'])
data = pd.concat((data_without_geo, geo_encoded_df), axis=1)

## Save the fitted encoders as pickle files, each under its own file name.
with open('label_encoder_gender.pkl', 'wb') as file:
    pickle.dump(label_encoder_gender, file)

## This file must hold the Geography one-hot encoder itself; dumping the gender
## label encoder here would leave the one-hot encoder unsaved.
with open('onehot_encoder_geo.pkl', 'wb') as file:
    pickle.dump(onehot_encoder_geo, file)

## Separate the label column ('Exited') from the feature columns.
x = data.drop(columns=['Exited'])
y = data['Exited']

## 80/20 train/test split, seeded so the same rows land in each split on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=42, test_size=0.2
)

## Scale the features. The scaler is fitted on the training split only and is
## then reused, unchanged, to transform the test split.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
## transform (not fit): fit() would re-estimate the statistics from the test rows
## and return the scaler object instead of the scaled array.
x_test = scaler.transform(x_test)

## Save the fitted scaler as a pickle file.
with open('scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

array([[1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])