In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [3]:
df_train = df_train.drop('id',axis = 1)
df_test = df_test.drop('id',axis = 1)

In [4]:
df_train.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0
1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0
2,29,28800,OWN,8.0,PERSONAL,A,6000,8.9,0.21,N,10,0
3,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5,0
4,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.1,N,3,0


In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58645 entries, 0 to 58644
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  58645 non-null  int64  
 1   person_income               58645 non-null  int64  
 2   person_home_ownership       58645 non-null  object 
 3   person_emp_length           58645 non-null  float64
 4   loan_intent                 58645 non-null  object 
 5   loan_grade                  58645 non-null  object 
 6   loan_amnt                   58645 non-null  int64  
 7   loan_int_rate               58645 non-null  float64
 8   loan_percent_income         58645 non-null  float64
 9   cb_person_default_on_file   58645 non-null  object 
 10  cb_person_cred_hist_length  58645 non-null  int64  
 11  loan_status                 58645 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 5.4+ MB


In [6]:
df_train.isna().sum()

person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
loan_status                   0
dtype: int64

In [7]:
label = LabelEncoder()

In [8]:
df_train['person_home_ownership'] = label.fit_transform(df_train['person_home_ownership'])
df_test['person_home_ownership'] = label.transform(df_test['person_home_ownership'])

In [9]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58645 entries, 0 to 58644
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  58645 non-null  int64  
 1   person_income               58645 non-null  int64  
 2   person_home_ownership       58645 non-null  int64  
 3   person_emp_length           58645 non-null  float64
 4   loan_intent                 58645 non-null  object 
 5   loan_grade                  58645 non-null  object 
 6   loan_amnt                   58645 non-null  int64  
 7   loan_int_rate               58645 non-null  float64
 8   loan_percent_income         58645 non-null  float64
 9   cb_person_default_on_file   58645 non-null  object 
 10  cb_person_cred_hist_length  58645 non-null  int64  
 11  loan_status                 58645 non-null  int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 5.4+ MB


In [11]:
df_train['loan_grade'] = label.fit_transform(df_train['loan_grade'])
df_test['loan_grade'] = label.transform(df_test['loan_grade'])

In [12]:
df_train['loan_intent'] = label.fit_transform(df_train['loan_intent'])
df_test['loan_intent'] = label.transform(df_test['loan_intent'])

In [13]:
df_train['cb_person_default_on_file'] = label.fit_transform(df_train['cb_person_default_on_file'])
df_test['cb_person_default_on_file'] = label.transform(df_test['cb_person_default_on_file'])

In [14]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58645 entries, 0 to 58644
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  58645 non-null  int64  
 1   person_income               58645 non-null  int64  
 2   person_home_ownership       58645 non-null  int64  
 3   person_emp_length           58645 non-null  float64
 4   loan_intent                 58645 non-null  int64  
 5   loan_grade                  58645 non-null  int64  
 6   loan_amnt                   58645 non-null  int64  
 7   loan_int_rate               58645 non-null  float64
 8   loan_percent_income         58645 non-null  float64
 9   cb_person_default_on_file   58645 non-null  int64  
 10  cb_person_cred_hist_length  58645 non-null  int64  
 11  loan_status                 58645 non-null  int64  
dtypes: float64(3), int64(9)
memory usage: 5.4 MB


In [19]:
X_train = df_train.drop('loan_status', axis = 1)
y_train = df_train['loan_status']
x_test = df_test.copy()

In [20]:
model = RandomForestClassifier()

In [21]:
model_params = {
    'n_estimators':[60,70,80,90],
    'criterion':['gini','entropy','log_loss'],
    'max_depth':[None,1,3,5,7],
    'max_features':[None,'sqrt','log2']
}

In [22]:
final_model = GridSearchCV(model,param_grid = model_params, cv = 3, verbose = 2)

In [23]:
final_model.fit(X_train,y_train)

Fitting 3 folds for each of 180 candidates, totalling 540 fits
[CV] END criterion=gini, max_depth=None, max_features=None, n_estimators=60; total time=  10.1s
[CV] END criterion=gini, max_depth=None, max_features=None, n_estimators=60; total time=  10.3s
[CV] END criterion=gini, max_depth=None, max_features=None, n_estimators=60; total time=   9.7s
[CV] END criterion=gini, max_depth=None, max_features=None, n_estimators=70; total time=  11.1s
[CV] END criterion=gini, max_depth=None, max_features=None, n_estimators=70; total time=  11.5s
[CV] END criterion=gini, max_depth=None, max_features=None, n_estimators=70; total time=  11.2s
[CV] END criterion=gini, max_depth=None, max_features=None, n_estimators=80; total time=  12.5s
[CV] END criterion=gini, max_depth=None, max_features=None, n_estimators=80; total time=  13.4s
[CV] END criterion=gini, max_depth=None, max_features=None, n_estimators=80; total time=  13.3s
[CV] END criterion=gini, max_depth=None, max_features=None, n_estimators=

In [24]:
best_model = final_model.best_estimator_

In [25]:
y_pred = best_model.predict(x_test)

In [27]:
submission = pd.read_csv('sample_submission.csv')

In [28]:
submission['loan_status'] = y_pred

In [29]:
submission.to_csv('sub.csv', index=False, index_label=False)