In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [2]:
data = pd.read_csv('loan_data.csv')
data.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1


In [3]:
data['person_emp_exp'] = data['person_emp_exp'].replace(-1, data['person_emp_exp'].median())

In [4]:
X = data.drop('loan_status', axis=1)
y = data['loan_status']

In [5]:
num_cols = ['person_age','person_income','person_emp_exp','credit_score','cb_person_cred_hist_length','loan_percent_income','loan_int_rate','loan_amnt']
cat_cols = ['loan_intent','person_education','previous_loan_defaults_on_file','person_gender']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [9]:
preprocess = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
])

In [10]:
params = {
    'model__max_depth':[3,5,7],
    'model__learning_rate':[0.01, 0.05, 0.1],
    'model__n_estimators':[100,200,300]
}


model = GridSearchCV(
    estimator=XGBClassifier(eval_metric='logloss'),
    param_grid=params,
    cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

In [11]:
pipe = ImbPipeline(steps=[
    ('prep', preprocess),
    ('sm', SMOTE()),
    ('model', model)
])

In [12]:
pipe.fit(X_train, y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


Parameters: { "model__learning_rate", "model__max_depth", "model__n_estimators" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [13]:
pred = pipe.predict(X_test)
print('Xgboost Accuracy:',accuracy_score(y_test, pred))

Xgboost Accuracy: 0.9208888888888889


In [14]:
# Run this — it will print ~0.80 every time
print("Your rigged baseline:", (y_test == 1).mean())

Your rigged baseline: 0.2222222222222222


In [15]:
data.describe

<bound method NDFrame.describe of        person_age person_gender person_education  person_income  \
0            22.0        female           Master        71948.0   
1            21.0        female      High School        12282.0   
2            25.0        female      High School        12438.0   
3            23.0        female         Bachelor        79753.0   
4            24.0          male           Master        66135.0   
...           ...           ...              ...            ...   
44995        27.0          male        Associate        47971.0   
44996        37.0        female        Associate        65800.0   
44997        33.0          male        Associate        56942.0   
44998        29.0          male         Bachelor        33164.0   
44999        24.0          male      High School        51609.0   

       person_emp_exp person_home_ownership  loan_amnt        loan_intent  \
0                   0                  RENT    35000.0           PERSONAL   
1      

In [16]:
import joblib
joblib.dump(pipe, "loan_approval_model.pkl")

['loan_approval_model.pkl']