In [None]:
# Given a bank customer, build a neural network-based classifier that can determine whether 
# they will leave or not in the next 6 months.
# Dataset Description: The case study is from an open-source dataset from Kaggle.
# The dataset contains 10,000 sample points with 14 distinct features such as
# CustomerId, CreditScore, Geography, Gender, Age, Tenure, Balance, etc.
# Link to the Kaggle project:
# https://www.kaggle.com/barelydedicated/bank-customer-churn-modeling
# Perform following steps:
# 1. Read the dataset.
# 2. Distinguish the feature and target set and divide the data set into training and test sets.
# 3.  the train and test data. 
# 4. Initialize and build the model. Identify the points of improvement and implement the same. 
# 5. Print the accuracy score and confusion matrix (5 points)

In [1]:
import pandas as pd

dataset=pd.read_csv("Churn_Modelling.csv")
print(dataset)

      RowNumber  CustomerId    Surname  CreditScore Geography  Gender  Age  \
0             1    15634602   Hargrave          619    France  Female   42   
1             2    15647311       Hill          608     Spain  Female   41   
2             3    15619304       Onio          502    France  Female   42   
3             4    15701354       Boni          699    France  Female   39   
4             5    15737888   Mitchell          850     Spain  Female   43   
...         ...         ...        ...          ...       ...     ...  ...   
9995       9996    15606229   Obijiaku          771    France    Male   39   
9996       9997    15569892  Johnstone          516    France    Male   35   
9997       9998    15584532        Liu          709    France  Female   36   
9998       9999    15682355  Sabbatini          772   Germany    Male   42   
9999      10000    15628319     Walker          792    France  Female   28   

      Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMemb

In [2]:
dataset.isnull().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [3]:
dataset.keys()

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [4]:
x=dataset[['RowNumber', 'CustomerId',  'CreditScore', 
        'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary']]
y=dataset['Exited']
print(x)

      RowNumber  CustomerId  CreditScore  Age  Tenure    Balance  \
0             1    15634602          619   42       2       0.00   
1             2    15647311          608   41       1   83807.86   
2             3    15619304          502   42       8  159660.80   
3             4    15701354          699   39       1       0.00   
4             5    15737888          850   43       2  125510.82   
...         ...         ...          ...  ...     ...        ...   
9995       9996    15606229          771   39       5       0.00   
9996       9997    15569892          516   35      10   57369.61   
9997       9998    15584532          709   36       7       0.00   
9998       9999    15682355          772   42       3   75075.31   
9999      10000    15628319          792   28       4  130142.79   

      NumOfProducts  HasCrCard  IsActiveMember  EstimatedSalary  
0                 1          1               1        101348.88  
1                 1          0               1     

In [5]:
y.value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

In [6]:
#here data is imbalanced


#normalize data

from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
x=sc.fit_transform(x)

#split data

from sklearn.model_selection import train_test_split


x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=0)

In [7]:
#artificial neural network model

from sklearn.neural_network import MLPClassifier

nn=MLPClassifier()
nn.fit(x_train,y_train)




In [8]:

y_pred=nn.predict(x_test)
print(y_pred)

[0 0 0 ... 0 0 0]


In [9]:

from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

accuracy=accuracy_score(y_test,y_pred)
conf_mat=classification_report(y_test,y_pred)
print(accuracy,"\n",conf_mat)

0.852 
               precision    recall  f1-score   support

           0       0.87      0.96      0.91      1595
           1       0.72      0.44      0.55       405

    accuracy                           0.85      2000
   macro avg       0.79      0.70      0.73      2000
weighted avg       0.84      0.85      0.84      2000



In [10]:
#here the diif of accuracy of classes is high this is due to imbalance data  now balane the data

In [11]:
#to balance the  data we will use imbalance_learn library
!pip install scikit-learn==1.2.2 imbalanced-learn==0.10.1



In [12]:
from imblearn.over_sampling import RandomOverSampler

ros=RandomOverSampler(random_state=0)
x_res,y_res=ros.fit_resample(x,y)


In [16]:
y_res.value_counts()

1    7963
0    7963
Name: Exited, dtype: int64

In [17]:


x_train,x_test,y_train,y_test=train_test_split(x_res,y_res,test_size=0.2,random_state=0)
nn.fit(x_train,y_train)

y_pred=nn.predict(x_test)
print(y_pred)

[0 1 1 ... 0 0 0]




In [18]:
conf_mat=classification_report(y_test,y_pred)
print(accuracy,"\n",conf_mat)

0.852 
               precision    recall  f1-score   support

           0       0.79      0.77      0.78      1590
           1       0.78      0.80      0.79      1596

    accuracy                           0.78      3186
   macro avg       0.79      0.78      0.78      3186
weighted avg       0.79      0.78      0.78      3186

