In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

import warnings
warnings.filterwarnings("ignore")

# Model Building 

### Logistic Regression
Logistic Regression disarankan memakai fitur-fitur hasil chi2

In [160]:
import time
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

log_reg_model = LogisticRegression()

start_time = time.time()
log_reg_model.fit(X_train_upsampled_chi2, y_train_upsampled_chi2)
print("Execution time: " + str((time.time() - start_time)) + 's')

Execution time: 1.3088648319244385s


In [161]:
# Evaluasi hasil
y_pred = log_reg_model.predict(X_test_chi2)
y_proba = log_reg_model.predict_proba(X_test_chi2)[:, 1]

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nROC-AUC on Test Set:", roc_auc_score(y_test, y_proba))


Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.68      0.80     56536
           1       0.15      0.66      0.25      4965

    accuracy                           0.68     61501
   macro avg       0.56      0.67      0.52     61501
weighted avg       0.89      0.68      0.75     61501


Confusion Matrix:
[[38530 18006]
 [ 1682  3283]]

ROC-AUC on Test Set: 0.732658070908415


### LightGBM

In [162]:
from lightgbm import LGBMClassifier

count_class_0 = (y == 0).sum()
count_class_1 = (y == 1).sum()
scale_pos_weight = count_class_0 / count_class_1
print(f"Nilai scale_pos_weight yang disarankan: {scale_pos_weight}")

lgbm_model = LGBMClassifier(random_state=42, n_jobs=-1, verbose=1, scale_pos_weight=scale_pos_weight)

start_time = time.time()
lgbm_model.fit(X_train_mi, y_train)
print('Execution time: '+ str(time.time() - start_time) + 's')

Nilai scale_pos_weight yang disarankan: 11.386908358509567
[LightGBM] [Info] Number of positive: 19860, number of negative: 226144
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015116 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1905
[LightGBM] [Info] Number of data points in the train set: 246004, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080730 -> initscore=-2.432464
[LightGBM] [Info] Start training from score -2.432464
Execution time: 0.7726054191589355s


In [163]:
# Evaluasi hasil
y_pred = lgbm_model.predict(X_test_mi)
y_proba = lgbm_model.predict_proba(X_test_mi)[:, 1]

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nROC-AUC on Test Set:", roc_auc_score(y_test, y_proba))


Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.69      0.80     56536
           1       0.16      0.65      0.25      4965

    accuracy                           0.69     61501
   macro avg       0.56      0.67      0.53     61501
weighted avg       0.89      0.69      0.76     61501


Confusion Matrix:
[[39000 17536]
 [ 1723  3242]]

ROC-AUC on Test Set: 0.7384396894719811


### Random Forest

In [164]:
from sklearn.ensemble import RandomForestClassifier

rfc_model = RandomForestClassifier()

start_time = time.time()
rfc_model.fit(X_train_upsampled_mi, y_train_upsampled_mi)
print('Execution time: '+ str(time.time() - start_time) + 's')

Execution time: 99.08204436302185s


In [165]:
# Evaluasi hasil
y_pred = rfc_model.predict(X_test_mi)
y_proba = rfc_model.predict_proba(X_test_mi)[:, 1]

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nROC-AUC on Test Set:", roc_auc_score(y_test, y_proba))


Classification Report:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     56536
           1       0.39      0.02      0.05      4965

    accuracy                           0.92     61501
   macro avg       0.65      0.51      0.50     61501
weighted avg       0.88      0.92      0.88     61501


Confusion Matrix:
[[56347   189]
 [ 4845   120]]

ROC-AUC on Test Set: 0.708859864673202


## Hyperparameter Tuning
#### Logistic Regression

In [166]:
from sklearn.model_selection import GridSearchCV

# Setup grid search logistic regression
param_grid = {
    'C': [0.01, 0.1, 1],
    'penalty': ['l2'],
    'solver': ['liblinear']
}

grid_lr = GridSearchCV(log_reg_model, param_grid, scoring='roc_auc', cv=3, n_jobs=-1, verbose=1)

start_time = time.time()
grid_lr.fit(X_train_upsampled_chi2, y_train_upsampled_chi2)
print("Execution time: " + str((time.time() - start_time)) + 's')

Fitting 3 folds for each of 3 candidates, totalling 9 fits
Execution time: 8.457830429077148s


In [167]:
# Evaluasi hasil
y_pred = grid_lr.predict(X_test_chi2)
y_proba = grid_lr.predict_proba(X_test_chi2)[:, 1]

print("Best Parameters:", grid_lr.best_params_)
print("Best ROC-AUC (CV):", grid_lr.best_score_)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nROC-AUC on Test Set:", roc_auc_score(y_test, y_proba))

Best Parameters: {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}
Best ROC-AUC (CV): 0.7339280009873494

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.68      0.80     56536
           1       0.15      0.66      0.25      4965

    accuracy                           0.68     61501
   macro avg       0.56      0.67      0.52     61501
weighted avg       0.89      0.68      0.75     61501


Confusion Matrix:
[[38529 18007]
 [ 1680  3285]]

ROC-AUC on Test Set: 0.7326576042200598


### LightGBM

In [168]:
param_grid_lgbm = {
    'n_estimators': [100],                  # Jumlah pohon dalam forest
    'learning_rate': [0.1, 0.05],           # Laju pembelajaran
    'num_leaves': [31, 50],                 # Jumlah daun maksimum per pohon
}

grid_lgbm = GridSearchCV(lgbm_model, param_grid_lgbm, scoring='roc_auc', cv=3, n_jobs=-1, verbose=1)

start_time = time.time()
grid_lgbm.fit(X_train_mi, y_train)
print('Execution time: '+ str(time.time() - start_time) + 's')

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[LightGBM] [Info] Number of positive: 19860, number of negative: 226144
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012534 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1905
[LightGBM] [Info] Number of data points in the train set: 246004, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080730 -> initscore=-2.432464
[LightGBM] [Info] Start training from score -2.432464
Execution time: 14.139281511306763s


In [169]:
# Evaluasi
y_pred_lgbm = grid_lgbm.predict(X_test_mi)
y_proba_lgbm = grid_lgbm.predict_proba(X_test_mi)[:, 1]

print('Best Parameters: ', grid_lgbm.best_params_)
print('Best ROC-AUC (CV): ', grid_lgbm.best_score_)

print('\nClassification Report: ')
print(classification_report(y_test, y_pred_lgbm))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_lgbm))

print("\nROC-AUC on Test Set:", roc_auc_score(y_test, y_proba_lgbm))

Best Parameters:  {'learning_rate': 0.05, 'n_estimators': 100, 'num_leaves': 31}
Best ROC-AUC (CV):  0.7396492964141705

Classification Report: 
              precision    recall  f1-score   support

           0       0.96      0.69      0.80     56536
           1       0.16      0.66      0.25      4965

    accuracy                           0.68     61501
   macro avg       0.56      0.67      0.53     61501
weighted avg       0.89      0.68      0.76     61501


Confusion Matrix:
[[38821 17715]
 [ 1674  3291]]

ROC-AUC on Test Set: 0.739809523107201


### Random Forest

In [170]:
from sklearn.ensemble import RandomForestClassifier

rfc_model = RandomForestClassifier()
param_grid_rfc = {
    'n_estimators': [100],
    'max_depth': [10, 20],
    'min_samples_split': [5, 10],
    'max_features': ['sqrt']
}

grid_rfc = GridSearchCV(rfc_model, param_grid_rfc, scoring='roc_auc', cv=3, n_jobs=-1, verbose=1)

start_time = time.time()
grid_rfc.fit(X_train_upsampled_mi, y_train_upsampled_mi)
print('Execution time: '+ str(time.time() - start_time) + 's')

Fitting 3 folds for each of 4 candidates, totalling 12 fits
Execution time: 228.7957808971405s


In [171]:
# Evaluasi
y_pred_rfc = grid_rfc.predict(X_test_mi)
y_proba_rfc = grid_rfc.predict_proba(X_test_mi) [:, 1]

print('Best Parameters: ', grid_rfc.best_params_)
print('Best ROC-AUC (CV): ', grid_rfc.best_score_)

print('\nClassification Report: ')
print(classification_report(y_test, y_pred_rfc))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_rfc))

print("\nROC-AUC on Test Set:", roc_auc_score(y_test, y_proba_rfc))

Best Parameters:  {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_split': 5, 'n_estimators': 100}
Best ROC-AUC (CV):  0.9939254999422186

Classification Report: 
              precision    recall  f1-score   support

           0       0.93      0.92      0.93     56536
           1       0.23      0.26      0.24      4965

    accuracy                           0.87     61501
   macro avg       0.58      0.59      0.59     61501
weighted avg       0.88      0.87      0.87     61501


Confusion Matrix:
[[52227  4309]
 [ 3675  1290]]

ROC-AUC on Test Set: 0.710411297078702


# Prediction on Test Set ('application_test.csv')

In [172]:
final_appts_chi2.head()

Unnamed: 0,FLAG_WORK_PHONE,FLAG_PHONE,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,FLAG_DOCUMENT_3,DAYS_BIRTH,DAYS_EMPLOYED,REGION_RATING_CLIENT_W_CITY,EXT_SOURCE_2,EXT_SOURCE_3,DAYS_LAST_PHONE_CHANGE,NAME_CONTRACT_TYPE,CODE_GENDER,NAME_INCOME_TYPE_Pensioner,NAME_INCOME_TYPE_State servant,NAME_INCOME_TYPE_Working,NAME_EDUCATION_TYPE_Higher education,NAME_FAMILY_STATUS_Single / not married,NAME_FAMILY_STATUS_Widow,NAME_HOUSING_TYPE_With parents,EMERGENCYSTATE_MODE_Unknown,OCCUPATION_TYPE_Core staff,OCCUPATION_TYPE_Drivers,OCCUPATION_TYPE_Laborers,OCCUPATION_TYPE_Low-skill Laborers,OCCUPATION_TYPE_Sales staff,OCCUPATION_TYPE_Unknown,ORGANIZATION_TYPE_Business Entity Type 3,ORGANIZATION_TYPE_Self-employed
0,0.0,0.0,0.0,0.0,0.0,1.0,0.663505,0.130025,0.5,0.923573,0.177549,0.405405,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.597053,0.249498,0.5,0.341118,0.482907,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.708503,0.248883,0.5,0.818464,0.681715,0.199441,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,1.0,0.366249,0.104176,0.5,0.596114,0.683628,0.42055,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,1.0,0.0,0.0,1.0,1.0,1.0,0.313403,0.12232,0.5,0.49788,0.597163,0.191286,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [173]:
predict = pd.Series(grid_lgbm.predict(final_appts_mi), name = "TARGET").astype(int)
results = pd.concat([appts_id, predict],axis = 1)
results.to_csv("predict_application.csv", index = False)
results.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0
1,100005,1
2,100013,0
3,100028,0
4,100038,0
