In [6]:
import pandas as pd
import numpy as np

In [7]:
# Load the loan-approval dataset from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="loanapproval.csv")

In [8]:
# Preview the first five rows to sanity-check the parsed columns.
data.head(n=5)

Unnamed: 0,applicant_id,age,gender,marital_status,annual_income,loan_amount,credit_score,num_dependents,existing_loans_count,employment_status,loan_approved
0,1,59,Male,Divorced,100073,7169,793,1,1,Unemployed,1
1,2,49,Male,Married,112197,23556,789,0,2,Employed,1
2,3,35,Male,Divorced,84429,27052,372,1,4,Unemployed,0
3,4,63,Female,Single,124195,11313,808,3,4,Self-employed,1
4,5,28,Female,Married,81627,13315,689,0,1,Unemployed,1


In [10]:
# Print the column dtypes and non-null counts of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   applicant_id          1000 non-null   int64 
 1   age                   1000 non-null   int64 
 2   gender                1000 non-null   object
 3   marital_status        1000 non-null   object
 4   annual_income         1000 non-null   int64 
 5   loan_amount           1000 non-null   int64 
 6   credit_score          1000 non-null   int64 
 7   num_dependents        1000 non-null   int64 
 8   existing_loans_count  1000 non-null   int64 
 9   employment_status     1000 non-null   object
 10  loan_approved         1000 non-null   int64 
dtypes: int64(8), object(3)
memory usage: 86.1+ KB


In [11]:
# Remove applicant_id before modelling; it appears to be a plain row
# identifier (1, 2, 3, ... in the preview) -- confirm against the data source.
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: re-running it no longer raises KeyError once the
# column is already gone.
data = data.drop(columns=['applicant_id'], errors='ignore')

In [12]:
# List the distinct marital_status categories that will be one-hot encoded.
data.loc[:, 'marital_status'].unique()

array(['Divorced', 'Married', 'Single'], dtype=object)

In [13]:
# List the distinct employment_status categories that will be one-hot encoded.
data.loc[:, 'employment_status'].unique()

array(['Unemployed', 'Employed', 'Self-employed'], dtype=object)

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer,make_column_transformer

In [15]:
# Column groups consumed by the preprocessing step.
# `cat`: string-valued columns to be one-hot encoded.
cat = [
    'gender',
    'marital_status',
    'employment_status',
]
# `num`: numeric columns to be standardised.
num = [
    'age',
    'annual_income',
    'loan_amount',
    'credit_score',
]

In [16]:
# Column-wise preprocessing:
#   * 'num' -- standardise the columns listed in `num`
#   * 'cat' -- one-hot encode the columns listed in `cat`, dropping the first
#              level of each and ignoring categories unseen during fit
# Any column not named in `num` or `cat` is passed through unchanged.
numeric_step = ('num', StandardScaler(), num)
categorical_step = (
    'cat',
    OneHotEncoder(drop='first', handle_unknown='ignore'),
    cat,
)
preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step],
    remainder='passthrough',
)

In [18]:
from sklearn.linear_model import LogisticRegression

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so the split (and every score computed from it)
# is reproducible across kernel restarts; stratify keeps the loan_approved
# class ratio the same in the train and test partitions.
x_train, x_test, y_train, y_test = train_test_split(
    data.drop(columns=['loan_approved']),
    data['loan_approved'],
    test_size=0.2,
    random_state=42,
    stratify=data['loan_approved'],
)

In [21]:
# Baseline model: shared preprocessing followed by logistic regression with
# its default settings.
logistic_steps = [
    ('preprocessor', preprocessor),
    ('logistic', LogisticRegression()),
]
pipeline = Pipeline(steps=logistic_steps)

In [23]:
# Fit the preprocessing and the logistic-regression step on the training split.
pipeline.fit(X=x_train, y=y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [24]:
# Show the names of the columns produced by the fitted preprocessing step.
fitted_preprocessor = pipeline.named_steps['preprocessor']
feature_names = fitted_preprocessor.get_feature_names_out()
print(feature_names)

['num__age' 'num__annual_income' 'num__loan_amount' 'num__credit_score'
 'cat__gender_Male' 'cat__marital_status_Married'
 'cat__marital_status_Single' 'cat__employment_status_Self-employed'
 'cat__employment_status_Unemployed' 'remainder__num_dependents'
 'remainder__existing_loans_count']


In [23]:
# Predict loan approval for the held-out rows with the logistic pipeline.
y_pred = pipeline.predict(X=x_test)

In [25]:
from sklearn.metrics import *


In [26]:
# Test-set accuracy of the current predictions.
# accuracy_score expects (y_true, y_pred); the arguments were previously
# swapped. Accuracy itself is symmetric, so the value is unchanged, but the
# correct order matters as soon as an asymmetric metric is used the same way.
accuracy_score(y_test, y_pred)

0.89

In [27]:
from sklearn.ensemble import RandomForestClassifier

In [29]:
from sklearn.base import clone

# Random-forest pipeline.
# clone() gives this pipeline its own unfitted copy of the preprocessing
# configuration; Pipeline does not copy its steps, so passing `preprocessor`
# directly would make fitting p2 refit the very transformer object that the
# logistic-regression pipeline holds.
# random_state pins the forest's randomness so the fitted model and its score
# are reproducible.
p2 = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
])

In [30]:
# Fit the preprocessing and the random-forest step on the training split.
p2.fit(X=x_train, y=y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [31]:
# Predict loan approval for the held-out rows with the random-forest pipeline.
# The result is stored under the same name, y_pred, used for the earlier
# logistic-regression predictions, which it replaces.
y_pred = p2.predict(X=x_test)

In [32]:
# Test-set accuracy of the current predictions.
# accuracy_score expects (y_true, y_pred); the arguments were previously
# swapped. Accuracy itself is symmetric, so the value is unchanged, but the
# correct order matters as soon as an asymmetric metric is used the same way.
accuracy_score(y_test, y_pred)

0.97

In [33]:
import pickle

In [36]:
# Serialise the fitted random-forest pipeline (preprocessing + model) to disk.
# Only unpickle this file from a trusted source: loading a pickle can execute
# arbitrary code.
with open("p2.pkl", mode="wb") as model_file:
    pickle.dump(p2, model_file)
