In [8]:
import pandas as pd
import warnings
# Silence every warning for the rest of the session. This also hides
# deprecation notices raised by pandas / scikit-learn calls further down.
warnings.filterwarnings("ignore")

In [9]:
# Read both CSV files (paths are relative to the notebook's working directory).
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ("train_ctrUa4K.csv", "test_lAUu6dG.csv")
)

In [10]:
# !pip install imblearn
# !pip install sklearn-pandas==1.5.0
from sklearn_pandas import CategoricalImputer
ci = CategoricalImputer()
# ``sklearn.preprocessing.Imputer`` was deprecated in scikit-learn 0.20 and
# removed in 0.22. ``SimpleImputer`` is its replacement and shares the default
# mean strategy, so it is bound to the old name to keep ``Imputer()`` working.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:  # scikit-learn < 0.20 only ships the legacy class
    from sklearn.preprocessing import Imputer

In [11]:
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder, StandardScaler
# Module-level transformers shared by preprocess(), which re-fits both of
# them on every call (and re-fits ``le`` once per categorical column).
scaler = StandardScaler()
le = LabelEncoder()

In [12]:
# Seeded SMOTE over-sampler. NOTE(review): ``sm`` is not referenced by any of
# the cells shown here — confirm whether it is still needed.
sm = SMOTE(random_state = 42)

In [13]:
# Default-configured imputer; preprocess() fits it on the numeric columns.
imputer = Imputer()

In [14]:

def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Impute, scale and label-encode a raw loan DataFrame.

    Drops ``Loan_ID``, then imputes and standard-scales the numeric columns,
    and imputes and label-encodes every non-numeric column.

    Relies on the module-level ``imputer``, ``scaler``, ``ci`` and ``le``
    objects. All of them are re-fitted on every call, so two frames passed
    through this function separately are scaled and encoded independently
    of each other.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing ``Loan_ID`` and ``Credit_History`` columns.
        The caller's object is left unmodified.

    Returns
    -------
    pandas.DataFrame
        Numeric columns first, then the encoded categorical columns,
        carrying the same index as ``df``.

    Raises
    ------
    KeyError
        If ``Loan_ID`` or ``Credit_History`` is missing from ``df``.
    """
    # Non-in-place drop: returns a new frame, so the caller's data is untouched.
    # Keyword ``columns=`` is used because pandas 2.x rejects a positional axis.
    df = df.drop(columns=["Loan_ID"])
    # Cast to object so select_dtypes routes this column to the categorical branch.
    df["Credit_History"] = df["Credit_History"].astype(object)

    numerical = df.select_dtypes(include=np.number)
    # Explicit copy: the encoded values are written back column by column below.
    categorical = df.select_dtypes(exclude=np.number).copy()

    # The transformers return bare arrays; re-attach the original index so the
    # final concat aligns row-for-row even when ``df`` has a non-default index.
    numeric_columns = list(numerical)
    numerical = pd.DataFrame(
        imputer.fit_transform(numerical), columns=numeric_columns, index=df.index
    )
    numerical = pd.DataFrame(
        scaler.fit_transform(numerical), columns=numeric_columns, index=df.index
    )

    for x in list(categorical):
        # Fill missing labels first, then map the labels to integer codes.
        categorical[x] = le.fit_transform(
            ci.fit_transform(categorical[x].astype("category"))
        )

    return pd.concat([numerical, categorical], axis=1)

In [15]:
# preprocess() looks up ``np`` at call time, so this import has to run before
# the two calls below even though the function is defined in an earlier cell.
import numpy as np
# Rebinding train/test makes this cell non-idempotent: re-running it passes
# frames that no longer contain ``Loan_ID`` back into preprocess().
train = preprocess(train)
test = preprocess(test)

In [16]:
# (rows, columns) of the processed training frame.
train.shape

(614, 12)

In [17]:
# Preview the first rows of the processed training frame.
train.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Gender,Married,Dependents,Education,Self_Employed,Credit_History,Property_Area,Loan_Status
0,0.072991,-0.554487,0.0,0.279851,1,0,0,0,0,1,2,1
1,-0.134412,-0.038732,-0.219273,0.279851,1,1,1,0,0,1,0,0
2,-0.393747,-0.554487,-0.957641,0.279851,1,1,0,0,1,1,2,1
3,-0.462062,0.25198,-0.314547,0.279851,1,1,0,1,0,1,2,1
4,0.097728,-0.554487,-0.064454,0.279851,1,0,0,0,0,1,2,1


In [18]:
# (rows, columns) of the processed test frame.
test.shape

(367, 11)

In [19]:
# Separate predictors from the target. ``columns=`` replaces the positional
# axis argument, which pandas 2.x no longer accepts in DataFrame.drop.
X = train.drop(columns=["Loan_Status"])
y = train["Loan_Status"]
# Alias (not a copy) of the processed test frame.
X_validation = test

In [20]:
from sklearn.model_selection import GridSearchCV, train_test_split as tts

In [21]:
# Seeded 75/25 hold-out split; no ``stratify`` argument is passed.
X_train, X_test, y_train, y_test = tts(X,y, random_state = 42, test_size = 0.25)

In [22]:
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier, BaggingClassifier

In [23]:
from sklearn.neighbors import KNeighborsClassifier

### Model 1: Logistic Regression without hyperparameter tuning and without balanced class weights


In [24]:
# Baseline: LogisticRegression with default settings and no class weighting.
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train, y_train)
# The former bare ``log_reg.score(...)`` call was dropped: its result was
# discarded and accuracy is printed below anyway.
y_pred = log_reg.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
# NOTE: this AUC is computed from hard class predictions, not from
# predicted probabilities.
print(roc_auc_score(y_test, y_pred))

0.7727272727272727
              precision    recall  f1-score   support

           0       0.91      0.39      0.55        54
           1       0.75      0.98      0.85       100

    accuracy                           0.77       154
   macro avg       0.83      0.68      0.70       154
weighted avg       0.81      0.77      0.74       154

0.6844444444444444


### Model 2: Logistic Regression without hyperparameter tuning, with balanced class weights


In [25]:
# Same model as the baseline, but with class_weight="balanced".
log_reg = LogisticRegression(random_state=42, class_weight="balanced")
log_reg.fit(X_train, y_train)
# The former bare ``log_reg.score(...)`` call was dropped: its result was
# discarded and accuracy is printed below anyway.
y_pred = log_reg.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
# NOTE: this AUC is computed from hard class predictions, not from
# predicted probabilities.
print(roc_auc_score(y_test, y_pred))

0.7597402597402597
              precision    recall  f1-score   support

           0       0.77      0.44      0.56        54
           1       0.76      0.93      0.83       100

    accuracy                           0.76       154
   macro avg       0.77      0.69      0.70       154
weighted avg       0.76      0.76      0.74       154

0.6872222222222222


### Model 3: Logistic Regression with hyperparameters optimized and class weights balanced

In [28]:
# 10-fold grid search over C and the penalty type for a class-balanced
# logistic regression.
# The estimator is built here rather than reused from an earlier cell, so this
# cell does not depend on which ``log_reg`` happens to be in the kernel.
# ``liblinear`` is set explicitly because it supports both "l1" and "l2";
# the default solver in scikit-learn >= 0.22 (lbfgs) rejects "l1".
log_reg_base = LogisticRegression(
    random_state=42, class_weight="balanced", solver="liblinear"
)
params = {"C": np.arange(0.01, 5, 0.1),
          "penalty": ["l2", "l1"]}
log_reg_cv = GridSearchCV(log_reg_base, param_grid=params, cv=10)
log_reg_cv.fit(X_train, y_train)
y_pred = log_reg_cv.best_estimator_.predict(X_test)
print(log_reg_cv.best_params_)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
# NOTE: this AUC is computed from hard class predictions, not from
# predicted probabilities.
print(roc_auc_score(y_test, y_pred))

{'C': 0.11, 'penalty': 'l1'}
0.7727272727272727
              precision    recall  f1-score   support

           0       0.91      0.39      0.55        54
           1       0.75      0.98      0.85       100

    accuracy                           0.77       154
   macro avg       0.83      0.68      0.70       154
weighted avg       0.81      0.77      0.74       154

0.6844444444444444


### Model 4: Decision tree with hyperparameters optimized (class weights not balanced)

In [29]:
# Seeded decision tree; no class_weight is passed, so it stays at the default.
dtc = DecisionTreeClassifier(random_state=42)

In [30]:
# Display the estimator's full parameter repr.
dtc

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')

In [31]:

# Exhaustive 10-fold search over the split criterion, the depth cap and a
# fractional min_samples_split.
params = dict(
    criterion=["gini", "entropy"],
    max_depth=np.arange(4, 16),
    min_samples_split=np.arange(0.05, 0.16, 0.01),
)
dtc_cv = GridSearchCV(dtc, param_grid=params, cv=10).fit(X_train, y_train)
# With the default refit=True, predict() delegates to best_estimator_.
y_pred = dtc_cv.predict(X_test)
print(dtc_cv.best_params_)
# Accuracy, per-class report, then AUC on the hard predictions.
for metric in (accuracy_score, classification_report, roc_auc_score):
    print(metric(y_test, y_pred))

{'criterion': 'gini', 'max_depth': 8, 'min_samples_split': 0.14}
0.7532467532467533
              precision    recall  f1-score   support

           0       0.71      0.50      0.59        54
           1       0.77      0.89      0.82       100

    accuracy                           0.75       154
   macro avg       0.74      0.70      0.71       154
weighted avg       0.75      0.75      0.74       154

0.6950000000000001


### Model 5: Random Forest with no hyperparameters optimized

In [32]:
from sklearn.ensemble import RandomForestClassifier

In [33]:
# Seeded forest; oob_score=True so the out-of-bag score is available
# after fitting (it is printed in the next cell).
rfc = RandomForestClassifier(random_state=42, oob_score=True)

In [34]:
# Fit the default forest and report on the hold-out split.
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
# Accuracy, per-class report, then AUC on the hard predictions.
for metric in (accuracy_score, classification_report, roc_auc_score):
    print(metric(y_test, y_pred))
# Out-of-bag score estimated on the training data.
print(rfc.oob_score_)

0.7532467532467533
              precision    recall  f1-score   support

           0       0.75      0.44      0.56        54
           1       0.75      0.92      0.83       100

    accuracy                           0.75       154
   macro avg       0.75      0.68      0.69       154
weighted avg       0.75      0.75      0.73       154

0.6822222222222223
0.7195652173913043


### Model 5b: Random Forest with the split criterion tuned by grid search

In [35]:
# Re-create the forest and let a 10-fold grid search choose the split criterion.
rfc = RandomForestClassifier(oob_score=True, random_state=42)
params = dict(criterion=["gini", "entropy"])
rfc_cv = GridSearchCV(rfc, param_grid=params, cv=10).fit(X_train, y_train)
# With the default refit=True, predict() delegates to best_estimator_.
y_pred = rfc_cv.predict(X_test)
print(rfc_cv.best_params_)
# Per-class report, then AUC on the hard predictions.
for metric in (classification_report, roc_auc_score):
    print(metric(y_test, y_pred))

{'criterion': 'gini'}
              precision    recall  f1-score   support

           0       0.75      0.44      0.56        54
           1       0.75      0.92      0.83       100

    accuracy                           0.75       154
   macro avg       0.75      0.68      0.69       154
weighted avg       0.75      0.75      0.73       154

0.6822222222222223


### Model 6: AdaBoost with a Random Forest base estimator

In [47]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
# Default-configured boosters. ``adb`` is rebuilt with explicit arguments in
# the next cell. NOTE(review): ``gdb`` is not used in the cells shown here.
adb = AdaBoostClassifier()
gdb = GradientBoostingClassifier()

In [51]:

# Boost 100 rounds over ``rfc`` (the random forest defined earlier), not over
# a single decision tree.
# NOTE(review): newer scikit-learn releases rename ``base_estimator`` to
# ``estimator`` — confirm against the installed version.
adb = AdaBoostClassifier(base_estimator=rfc, n_estimators=100)

adb.fit(X_train,y_train)
y_pred = adb.predict(X_test)
print (accuracy_score(y_test,y_pred))
print (classification_report(y_test,y_pred))
# AUC is computed from hard class predictions, not predicted probabilities.
print (roc_auc_score(y_test,y_pred))

0.7337662337662337
              precision    recall  f1-score   support

           0       0.69      0.44      0.54        54
           1       0.75      0.89      0.81       100

    accuracy                           0.73       154
   macro avg       0.72      0.67      0.68       154
weighted avg       0.73      0.73      0.72       154

0.6672222222222222
