In [2]:
#import libraries
import pandas as pd
import io
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [3]:
#Read the dataset from drive
data = pd.read_csv("Churn_Modelling.csv")
print(data.head())
print(data.tail())

   RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   

   Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0       2       0.00              1          1               1   
1       1   83807.86              1          0               1   
2       8  159660.80              3          1               0   
3       1       0.00              2          0               0   
4       2  125510.82              1          1               1   

   EstimatedSalary  Exited  
0        101348.88       1  
1        112542.58       0  
2        113931.57       1  
3         93826.63       0  
4         790

In [4]:
# Finding Missing Values
print("Missing Values: \n ",data.isnull().sum())

Missing Values: 
  RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64


In [7]:
#Handling Missing values
data.fillna(data.mean(numeric_only=True), inplace=True)

In [8]:
#Check for Duplicates
print("Duplicate values:\n ")
print(data.duplicated())

Duplicate values:
 
0       False
1       False
2       False
3       False
4       False
        ...  
9995    False
9996    False
9997    False
9998    False
9999    False
Length: 10000, dtype: bool


In [9]:
#Detect Outliers
data.describe()

data = data.drop(['Surname', 'Geography','Gender'], axis=1)
data.head()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,619,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,608,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,502,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,699,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,850,43,2,125510.82,1,1,1,79084.1,0


In [10]:
#Normalize the dataset
scaler=MinMaxScaler()
df1=pd.DataFrame(scaler.fit_transform(data))
print("Normalized data \n" , df1)

Normalized data 
           0         1      2         3    4         5         6    7    8   \
0     0.0000  0.275616  0.538  0.324324  0.2  0.000000  0.000000  1.0  1.0   
1     0.0001  0.326454  0.516  0.310811  0.1  0.334031  0.000000  0.0  1.0   
2     0.0002  0.214421  0.304  0.324324  0.8  0.636357  0.666667  1.0  0.0   
3     0.0003  0.542636  0.698  0.283784  0.1  0.000000  0.333333  0.0  0.0   
4     0.0004  0.688778  1.000  0.337838  0.2  0.500246  0.000000  1.0  1.0   
...      ...       ...    ...       ...  ...       ...       ...  ...  ...   
9995  0.9996  0.162119  0.842  0.283784  0.5  0.000000  0.333333  1.0  0.0   
9996  0.9997  0.016765  0.332  0.229730  1.0  0.228657  0.000000  1.0  1.0   
9997  0.9998  0.075327  0.718  0.243243  0.7  0.000000  0.000000  0.0  1.0   
9998  0.9999  0.466637  0.844  0.324324  0.3  0.299226  0.333333  1.0  0.0   
9999  1.0000  0.250483  0.884  0.135135  0.4  0.518708  0.000000  1.0  0.0   

            9    10  
0     0.506735  1.0  
1

In [11]:
#split the dataset into input and output
X = data.drop('Exited', axis=1)  
y = data['Exited'] 

In [12]:
#splitting the data for training & Testing
X_train ,X_test ,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [13]:
#Print the training data and testing data
print("Training data")
print(X_train)
print(y_train)

print("Testing data")
print(X_test)
print(y_test)
print("Length of X_test: ", len(X_test))

Training data
      RowNumber  CustomerId  CreditScore  Age  Tenure    Balance  \
9254       9255    15601116          686   32       6       0.00   
1561       1562    15766374          632   42       4  119624.60   
1670       1671    15716994          559   24       3  114739.92   
6087       6088    15730759          561   27       9  135637.00   
6669       6670    15797900          517   56       9  142147.32   
...         ...         ...          ...  ...     ...        ...   
5734       5735    15596647          768   54       8   69712.74   
5191       5192    15681075          682   58       1       0.00   
5390       5391    15573851          735   38       1       0.00   
860         861    15807663          667   43       8  190227.46   
7270       7271    15706268          697   51       1  147910.30   

      NumOfProducts  HasCrCard  IsActiveMember  EstimatedSalary  
9254              2          1               1        179093.26  
1561              2          1       