# Goal
In an earlier part of the loan prediction model, we saw that the credit score heavily influenced the loan decision. Here, we want to see whether scaling the data improves the model.

## Import the libraries

In [17]:
# Data handling and numerics.
import numpy as np
import pandas as pd

# Plotting. pyplot is the supported matplotlib interface; the pylab module
# that was imported here before is discouraged by the matplotlib project.
import matplotlib.pyplot as plt
import seaborn as sns

# Apply the ggplot look to every figure drawn in this notebook.
plt.style.use('ggplot')

## Import the Dataset

In [18]:
# Read the loan applications table; the path is relative to the working directory.
df = pd.read_csv("loan.csv")

# Peek at the first five rows to confirm the columns parsed as expected.
df.head(n=5)

Unnamed: 0,age,gender,occupation,education_level,marital_status,income,credit_score,loan_status
0,32,Male,Engineer,Bachelor's,Married,85000,720,Approved
1,45,Female,Teacher,Master's,Single,62000,680,Approved
2,28,Male,Student,High School,Single,25000,590,Denied
3,51,Female,Manager,Bachelor's,Married,105000,780,Approved
4,36,Male,Accountant,Bachelor's,Married,75000,710,Approved


In [19]:
# Summary statistics (count, mean, std, min/max, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,age,income,credit_score
count,61.0,61.0,61.0
mean,37.081967,78983.606557,709.836066
std,8.424755,33772.025802,72.674888
min,24.0,25000.0,560.0
25%,30.0,52000.0,650.0
50%,36.0,78000.0,720.0
75%,43.0,98000.0,770.0
max,55.0,180000.0,830.0


In [20]:
# Column dtypes and non-null counts, to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61 entries, 0 to 60
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              61 non-null     int64 
 1   gender           61 non-null     object
 2   occupation       61 non-null     object
 3   education_level  61 non-null     object
 4   marital_status   61 non-null     object
 5   income           61 non-null     int64 
 6   credit_score     61 non-null     int64 
 7   loan_status      61 non-null     object
dtypes: int64(3), object(5)
memory usage: 3.9+ KB


## Split the dataset

In [21]:
from sklearn.model_selection import train_test_split
# Separate the target column from the predictors.
y = df["loan_status"]
X = df.drop(columns=["loan_status"])

In [22]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Encoding and scaling of data

In [25]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Integer-valued columns that will be standardized.
numeric_features = ["age", "income", "credit_score"]

# String-valued (object dtype) columns that will be one-hot encoded.
categorical_features = [
    "gender",
    "occupation",
    "education_level",
    "marital_status",
]

In [26]:
# Both transformers are fitted on the training split only and then applied
# unchanged to the test split, so no test-set statistics reach the model.

# Numeric columns: standardize.
scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train[numeric_features])
X_test_num = scaler.transform(X_test[numeric_features])

# Categorical columns: one-hot encode. handle_unknown='ignore' keeps
# categories that were absent from the training split from raising at
# transform time.
encoder = OneHotEncoder(handle_unknown='ignore')
X_train_cat = encoder.fit_transform(X_train[categorical_features])
X_test_cat = encoder.transform(X_test[categorical_features])

# Densify the encoded blocks and place them to the right of the scaled
# numeric columns (numeric first, then one-hot columns).
X_train_final = np.concatenate([X_train_num, X_train_cat.toarray()], axis=1)
X_test_final = np.concatenate([X_test_num, X_test_cat.toarray()], axis=1)

X_train_final

array([[ 1.67457623,  0.72108107,  0.93810952, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.80493541,  0.13194249,  0.37347934, ...,  1.        ,
         1.        ,  0.        ],
       [ 0.4322322 ,  1.31021965,  1.22042461, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.81011184, -0.75176537, -0.61462348, ...,  0.        ,
         0.        ,  1.        ],
       [-1.05858065, -0.95796387, -1.03809611, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.30799779,  0.13194249,  0.37347934, ...,  0.        ,
         1.        ,  0.        ]])

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

# Feature columns grouped by type.
# 'loan_status' is the prediction target and is dropped from X before the
# train/test split, so it must not be listed as a feature: selecting it from
# X_train / X_test would fail, and encoding it as an input would leak the label.
numeric_features = ['age', 'income', 'credit_score']
categorical_features = ['gender', 'occupation', 'education_level', 'marital_status']