In [1]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn import ensemble

import pandas as pd
import numpy as np

In [2]:
# Read the training CSV (expected in the working directory) into a DataFrame.
train_data = pd.read_csv('Train.csv')

In [3]:
# Preview the first few rows of the training data.
train_data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
# (rows, columns) of the training frame.
train_data.shape

(614, 13)

In [5]:
# List the column labels; the feature/target split below is derived from these.
train_data.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [6]:
# Pick the target and feature columns by name rather than by position, so the
# split does not silently change if the CSV's column order changes or extra
# columns are added. Index.drop raises KeyError if either label is missing.
columnsY = 'Loan_Status'
# Features = every column except the row identifier and the target.
columnsX = train_data.columns.drop(['Loan_ID', columnsY])

In [7]:
# Show the feature column labels selected above.
columnsX

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area'],
      dtype='object')

In [8]:
# Show the target column label selected above.
columnsY

'Loan_Status'

In [9]:
# Count missing values per column to decide which columns need imputation.
train_data.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [10]:
# Feature columns grouped by the preprocessing each group receives.

# String-valued features: imputed with a placeholder and one-hot encoded.
categorical_col = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'Property_Area',
]

# Numeric features that are median-imputed and standardised.
numWithScaling_col = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
]

# Numeric feature that is imputed with a constant and left unscaled.
numerical_col = ['Credit_History']

In [11]:
# Scaled numeric features: fill gaps with the column median, then standardise.
numeric_transformer_scaling = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Unscaled numeric features: missing values are replaced by the constant 99,
# which keeps "missing" distinguishable from the observed values.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value=99)),
    ]
)

# Categorical features: missing values become the literal category 'missing',
# then every category is one-hot encoded. Categories not seen during fit are
# ignored at transform time instead of raising.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each column group to its transformer. Columns not listed in any group
# are not passed through to the model.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer_scaling, numWithScaling_col),
        ('num2', numeric_transformer, numerical_col),
        ('cat', categorical_transformer, categorical_col),
    ]
)

In [12]:
# Full model: preprocessing followed by a random forest classifier.
# random_state is pinned because the forest uses random bootstrapping and
# feature sub-sampling; without a fixed seed every re-run of the notebook
# produces a different model, different predictions and a different pickle.
modelPipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', ensemble.RandomForestClassifier(random_state=42)),
    ]
)

In [13]:
# Fit the preprocessing steps and the classifier on the full training set.
modelPipeline.fit(X=train_data[columnsX], y=train_data[columnsY])

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['ApplicantIncome',
                                                   'CoapplicantIncome',
                                                   'LoanAmount',
                                                   'Loan_Amount_Term']),
                                                 ('num2',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value=99,
                                                     

In [16]:
# Accuracy on the same rows the pipeline was fitted on. This is a
# training-set score, not an estimate of performance on unseen data.
modelPipeline.score(X=train_data[columnsX], y=train_data[columnsY])

1.0

In [17]:
# Read the CSV of rows to predict (expected in the working directory).
test_data = pd.read_csv('Test.csv')

In [18]:
# Predict Loan_Status for the test rows, using the same feature columns
# the pipeline was trained on.
predicted = modelPipeline.predict(X=test_data[columnsX])

In [20]:
# Print the array of predicted labels, one per test row.
print(predicted)

['Y' 'Y' 'Y' 'Y' 'N' 'Y' 'Y' 'N' 'Y' 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'Y' 'Y' 'Y'
 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'Y' 'N' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'N'
 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y'
 'Y' 'N' 'Y' 'Y' 'N' 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'Y' 'N' 'N' 'Y' 'N' 'Y' 'Y'
 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'N' 'Y' 'N' 'Y' 'Y' 'Y' 'Y' 'N'
 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'N' 'Y' 'Y' 'N' 'Y'
 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'N' 'N' 'N' 'Y' 'Y' 'Y' 'N' 'N' 'Y'
 'N' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'N' 'Y'
 'Y' 'Y' 'Y' 'N' 'Y' 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'Y' 'Y' 'N' 'Y' 'N' 'Y' 'N'
 'Y' 'Y' 'Y' 'N' 'N' 'Y' 'N' 'Y' 'Y' 'Y' 'Y' 'N' 'N' 'Y' 'Y' 'Y' 'Y' 'Y'
 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'Y' 'Y' 'N' 'N' 'Y' 'Y' 'N' 'Y'
 'N' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'Y' 'Y' 'Y'
 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'Y' 'Y' 'Y'
 'Y' 'N' 'N' 'Y' 'Y' 'N' 'Y' 'N' 'Y' 'N' 'Y' 'N' 'Y

In [22]:
# Map each test Loan_ID to its predicted label. If a Loan_ID repeats,
# the last occurrence wins.
prediction = dict(zip(test_data['Loan_ID'], predicted))

In [26]:
# Print the full Loan_ID -> predicted label mapping.
print(prediction)

{'LP001015': 'Y', 'LP001022': 'Y', 'LP001031': 'Y', 'LP001035': 'Y', 'LP001051': 'N', 'LP001054': 'Y', 'LP001055': 'Y', 'LP001056': 'N', 'LP001059': 'Y', 'LP001067': 'Y', 'LP001078': 'Y', 'LP001082': 'Y', 'LP001083': 'Y', 'LP001094': 'N', 'LP001096': 'Y', 'LP001099': 'Y', 'LP001105': 'Y', 'LP001107': 'Y', 'LP001108': 'Y', 'LP001115': 'Y', 'LP001121': 'Y', 'LP001124': 'Y', 'LP001128': 'N', 'LP001135': 'Y', 'LP001149': 'Y', 'LP001153': 'N', 'LP001163': 'Y', 'LP001169': 'Y', 'LP001174': 'Y', 'LP001176': 'Y', 'LP001177': 'Y', 'LP001183': 'Y', 'LP001185': 'Y', 'LP001187': 'Y', 'LP001190': 'Y', 'LP001203': 'N', 'LP001208': 'Y', 'LP001210': 'Y', 'LP001211': 'Y', 'LP001219': 'Y', 'LP001220': 'Y', 'LP001221': 'Y', 'LP001226': 'Y', 'LP001230': 'Y', 'LP001231': 'Y', 'LP001232': 'Y', 'LP001237': 'Y', 'LP001242': 'Y', 'LP001268': 'Y', 'LP001270': 'Y', 'LP001284': 'Y', 'LP001287': 'Y', 'LP001291': 'Y', 'LP001298': 'Y', 'LP001312': 'Y', 'LP001313': 'N', 'LP001317': 'Y', 'LP001321': 'Y', 'LP001323': '

In [28]:
import joblib

In [None]:
# Persist the fitted pipeline (preprocessing + classifier) to disk so it can
# be reloaded later with joblib.load.
joblib.dump(value=modelPipeline, filename='modelpipeline.pkl')