# importing libaries

In [1]:
import pandas as pd
import numpy as np

In [2]:
#read csv file
df=pd.read_csv('/kaggle/input/bank-customer-dataset/Bank_customers_data.csv')

In [3]:
#gives first five rows of the csv file
df.head()

Unnamed: 0.1,Unnamed: 0,Customer ID,Name,Surname,Gender,Age,Region,Job Classification,Date Joined,Balance,loan,score,loan_eligibility
0,0,100000001,Simon,Walsh,Male,21,England,White Collar,05.Jan.15,113810.15,5252.0,644.0,2276203.0
1,1,400000002,Jasmine,Miller,Female,34,Northern Ireland,Blue Collar,06.Jan.15,36919.73,8907.0,697.0,0.0
2,2,100000003,Liam,Brown,Male,46,England,White Collar,07.Jan.15,101536.83,2162.0,617.0,2030736.6
3,3,300000004,Trevor,Parr,Male,32,Wales,White Collar,08.Jan.15,1421.52,7277.0,672.0,0.0
4,4,100000005,Deirdre,Pullman,Female,38,England,Blue Collar,09.Jan.15,35639.79,2586.0,405.0,178198.95


In [4]:
#gives the sum of total number of null values present in each feature
df.isnull().sum()

Unnamed: 0            0
Customer ID           0
Name                  0
Surname               0
Gender                0
Age                   0
Region                0
Job Classification    0
Date Joined           0
Balance               0
loan                  0
score                 0
loan_eligibility      0
dtype: int64

In [5]:
#drop unwanted column
df=df.drop(['Unnamed: 0'],axis=1)

In [6]:
#first 5 rows
df.head()

Unnamed: 0,Customer ID,Name,Surname,Gender,Age,Region,Job Classification,Date Joined,Balance,loan,score,loan_eligibility
0,100000001,Simon,Walsh,Male,21,England,White Collar,05.Jan.15,113810.15,5252.0,644.0,2276203.0
1,400000002,Jasmine,Miller,Female,34,Northern Ireland,Blue Collar,06.Jan.15,36919.73,8907.0,697.0,0.0
2,100000003,Liam,Brown,Male,46,England,White Collar,07.Jan.15,101536.83,2162.0,617.0,2030736.6
3,300000004,Trevor,Parr,Male,32,Wales,White Collar,08.Jan.15,1421.52,7277.0,672.0,0.0
4,100000005,Deirdre,Pullman,Female,38,England,Blue Collar,09.Jan.15,35639.79,2586.0,405.0,178198.95


In [7]:
# Convert data into a DataFrame
df = pd.DataFrame(df)
# Filter out only numerical columns
numerical_cols = df.select_dtypes(include=[np.number])
# Calculate the first quartile (Q1) for each numerical column
q1 = numerical_cols.quantile(0.25)
# Calculate the third quartile (Q3) for each numerical column
q3 = numerical_cols.quantile(0.75)
# Calculate the interquartile range (IQR) for each numerical column
iqr = q3 - q1
print("First Quartile (Q1):")
print(q1)
print("\nThird Quartile (Q3):")
print(q3)
print("\nInterquartile Range (IQR):")
print(iqr)

First Quartile (Q1):

Customer ID         1.000020e+08

Age                 3.100000e+01

Balance             1.611537e+04

loan                3.366000e+03

score               4.040000e+02

loan_eligibility    0.000000e+00

Name: 0.25, dtype: float64



Third Quartile (Q3):

Customer ID         2.000031e+08

Age                 4.500000e+01

Balance             5.753393e+04

loan                7.768750e+03

score               5.967500e+02

loan_eligibility    2.787510e+05

Name: 0.75, dtype: float64



Interquartile Range (IQR):

Customer ID         1.000011e+08

Age                 1.400000e+01

Balance             4.141856e+04

loan                4.402750e+03

score               1.927500e+02

loan_eligibility    2.787510e+05

dtype: float64


# using IQR method for removing outliers

In [8]:
#Define the lower and upper bounds for outlier detection
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

#Identify outliers for each numerical column
outliers = (numerical_cols < lower_bound) | (numerical_cols > upper_bound)

#Count the number of outliers in each numerical column
num_outliers = outliers.sum()

print("Number of outliers in each numerical column:")
print(num_outliers)

#Remove rows containing outliers
df_no_outliers = df[~outliers.any(axis=1)]

print("\nDataFrame after removing outliers:")
df_no_outliers

Number of outliers in each numerical column:

Customer ID         211

Age                   0

Balance              67

loan                  0

score                 0

loan_eligibility    335

dtype: int64



DataFrame after removing outliers:


Unnamed: 0,Customer ID,Name,Surname,Gender,Age,Region,Job Classification,Date Joined,Balance,loan,score,loan_eligibility
3,300000004,Trevor,Parr,Male,32,Wales,White Collar,08.Jan.15,1421.52,7277.0,672.0,0.00
4,100000005,Deirdre,Pullman,Female,38,England,Blue Collar,09.Jan.15,35639.79,2586.0,405.0,178198.95
6,100000007,Dorothy,Thomson,Female,34,England,Blue Collar,11.Jan.15,42879.84,2644.0,467.0,214399.20
7,200000008,Lisa,Knox,Female,48,Scotland,Other,11.Jan.15,36680.17,7824.0,314.0,0.00
8,300000009,Ruth,Campbell,Female,33,Wales,White Collar,11.Jan.15,74284.35,2566.0,498.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...
4009,200004010,Sam,Lewis,Male,64,Scotland,Other,30.Dec.15,19711.66,9957.0,586.0,0.00
4010,200004011,Keith,Hughes,Male,52,Scotland,Blue Collar,30.Dec.15,56069.72,6701.0,319.0,0.00
4011,200004012,Hannah,Springer,Female,50,Scotland,Other,30.Dec.15,59477.82,7650.0,682.0,0.00
4012,200004013,Christian,Reid,Male,51,Scotland,Blue Collar,30.Dec.15,239.45,4682.0,369.0,0.00


# using scaling to normalise the features

In [9]:

from sklearn.preprocessing import MinMaxScaler
numeric_cols = df_no_outliers.select_dtypes(include=['number'])
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(numeric_cols)
num = pd.DataFrame(scaled_data,columns = numeric_cols.columns)

num

Unnamed: 0,Customer ID,Age,Balance,loan,score,loan_eligibility
0,9.999800e-01,0.304348,0.011838,0.697755,0.9300,0.000000
1,0.000000e+00,0.434783,0.299129,0.176301,0.2625,0.256582
2,9.999800e-09,0.347826,0.359915,0.182748,0.4175,0.308705
3,4.999900e-01,0.652174,0.307863,0.758559,0.0350,0.000000
4,9.999800e-01,0.326087,0.623581,0.174077,0.4950,0.000000
...,...,...,...,...,...,...
3424,5.000100e-01,1.000000,0.165399,0.995665,0.7150,0.000000
3425,5.000100e-01,0.739130,0.470655,0.633726,0.0475,0.000000
3426,5.000100e-01,0.695652,0.499268,0.739217,0.9550,0.000000
3427,5.000100e-01,0.717391,0.001914,0.409293,0.1725,0.000000


# dividing the date joined column into Month,Day,Day of the week

In [10]:
df['Date Joined'] = pd.to_datetime(df['Date Joined'])
df['Month'] = df['Date Joined'].dt.month
df['Day'] = df['Date Joined'].dt.day
df['Day of Week'] = df['Date Joined'].dt.strftime('%A')
df['Day of Week'] = df['Date Joined'].dt.dayofweek


  df['Date Joined'] = pd.to_datetime(df['Date Joined'])


In [11]:
df_no_outliers

Unnamed: 0,Customer ID,Name,Surname,Gender,Age,Region,Job Classification,Date Joined,Balance,loan,score,loan_eligibility
3,300000004,Trevor,Parr,Male,32,Wales,White Collar,08.Jan.15,1421.52,7277.0,672.0,0.00
4,100000005,Deirdre,Pullman,Female,38,England,Blue Collar,09.Jan.15,35639.79,2586.0,405.0,178198.95
6,100000007,Dorothy,Thomson,Female,34,England,Blue Collar,11.Jan.15,42879.84,2644.0,467.0,214399.20
7,200000008,Lisa,Knox,Female,48,Scotland,Other,11.Jan.15,36680.17,7824.0,314.0,0.00
8,300000009,Ruth,Campbell,Female,33,Wales,White Collar,11.Jan.15,74284.35,2566.0,498.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...
4009,200004010,Sam,Lewis,Male,64,Scotland,Other,30.Dec.15,19711.66,9957.0,586.0,0.00
4010,200004011,Keith,Hughes,Male,52,Scotland,Blue Collar,30.Dec.15,56069.72,6701.0,319.0,0.00
4011,200004012,Hannah,Springer,Female,50,Scotland,Other,30.Dec.15,59477.82,7650.0,682.0,0.00
4012,200004013,Christian,Reid,Male,51,Scotland,Blue Collar,30.Dec.15,239.45,4682.0,369.0,0.00


In [12]:
#encoding the data to convert the categorical columns into numerical columns
from sklearn.preprocessing import LabelEncoder

le=LabelEncoder()
cat=df_no_outliers.select_dtypes(include=['object'])
# #categorical_columns

Categorcial_Final=cat.apply(le.fit_transform).reset_index()
Categorcial_Final

Unnamed: 0,index,Name,Surname,Gender,Region,Job Classification,Date Joined
0,3,160,101,1,2,2,67
1,4,41,110,0,0,0,77
2,6,46,134,0,0,0,98
3,7,104,68,0,1,1,98
4,8,137,21,0,2,2,98
...,...,...,...,...,...,...,...
3424,4009,140,73,1,1,1,288
3425,4010,94,58,1,1,0,288
3426,4011,63,129,0,1,1,288
3427,4012,34,115,1,1,0,288


# conactinating numerical and categorical columns

In [13]:
final = pd.concat([Categorcial_Final,num],axis=1)

In [14]:
final.isnull().sum()

index                 0
Name                  0
Surname               0
Gender                0
Region                0
Job Classification    0
Date Joined           0
Customer ID           0
Age                   0
Balance               0
loan                  0
score                 0
loan_eligibility      0
dtype: int64

In [15]:
x= final.drop(['loan_eligibility'],axis=1)
y= final[['loan_eligibility']]

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# splitting the data 

In [17]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.30,random_state=42)

In [18]:
x_train

Unnamed: 0,index,Name,Surname,Gender,Region,Job Classification,Date Joined,Customer ID,Age,Balance,loan,score
3347,3926,131,32,0,0,2,237,0.000020,0.478261,0.263841,0.951979,0.8000
3026,3567,151,28,0,1,1,39,0.500008,0.500000,0.074715,0.538350,0.5150
1221,1455,44,61,1,2,2,290,0.999987,0.478261,0.030262,0.089818,0.7400
2890,3410,40,133,1,2,1,262,0.999997,0.608696,0.322813,0.271565,0.8875
994,1192,171,149,0,0,2,119,0.000006,0.086957,0.236576,0.779902,0.6975
...,...,...,...,...,...,...,...,...,...,...,...,...
1095,1310,116,62,0,0,0,210,0.000007,0.456522,0.537170,0.490996,0.3700
1130,1352,109,124,0,2,2,219,0.999987,0.347826,0.129856,0.191196,0.9700
1294,1551,113,17,0,0,0,75,0.000008,0.391304,0.195082,0.278235,0.3200
860,1036,158,140,1,0,1,291,0.000005,0.500000,0.517109,0.274789,0.2725


In [19]:
lr = LinearRegression()

In [20]:
lr.fit(x_train,y_train)

In [21]:
y_pred = lr.predict(x_test)
y_pred

array([[-0.04207456],
       [ 0.23138917],
       [ 0.21434183],
       ...,
       [ 0.31389573],
       [-0.18054065],
       [ 0.1400871 ]])

In [22]:
mean_squared_error(y_pred,y_test)

0.02540148908534934

In [23]:
lr.score(x_train,y_train)

0.5904708986585607

In [24]:
lr.score(x_test,y_test)

0.5802158688212076

In [25]:
scores =  cross_val_score(lr,x_train,y_train,cv=7,scoring ='r2')
print('Cross-validation scores:{}'.format(scores))

Cross-validation scores:[0.57057736 0.59272342 0.60231791 0.56898971 0.58117898 0.56499838

 0.6130055 ]


In [26]:
print('Average cross-validation score: {:.4f}'.format(scores.mean()))

Average cross-validation score: 0.5848


# using gradient boosting to enhancing the accuracy

In [27]:
from sklearn.ensemble import GradientBoostingRegressor
model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)

In [28]:
model.fit(x_train,y_train)


  y = column_or_1d(y, warn=True)


In [29]:
y_train_pred = model.predict(x_train)

In [30]:
y_pred = model.predict(x_test)
y_pred

array([ 0.00191536,  0.25587039, -0.0043124 , ...,  0.35095021,
       -0.00135968,  0.21008646])

In [31]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
r2_score(y_train, y_train_pred)

0.9949389656681266

In [32]:
model.score(x_test,y_test)

0.990452831456595

In [33]:
r2_score(y_test,y_pred)

0.990452831456595

In [34]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# To performing cross validation

In [35]:
scores =  cross_val_score(model,x_train,y_train,cv=15,scoring ='r2')
print('Cross-validation scores:{}'.format(scores))


  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)


Cross-validation scores:[0.98914199 0.99300555 0.98620411 0.97762356 0.98672532 0.99157652

 0.97890105 0.97810311 0.98537466 0.99272528 0.99117377 0.99140036

 0.98648195 0.98762403 0.98677564]


In [36]:
print('Average cross-validation score: {:.4f}'.format(scores.mean()))

Average cross-validation score: 0.9869


# Throughout our implementation of algorithms like linear regression and gradient boosting, alongside the use of cross-validation techniques, we meticulously evaluated individual accuracies. By leveraging metrics such as mean squared error, R2 score, and mean absolute error, we achieved substantial accuracy levels. This detailed approach has allowed for comprehensive assessment of algorithmic performance, highlighting the effectiveness of our analytical methodology. Our efforts culminated in notable improvements in predictive accuracy and model refinement