In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report
)
from imblearn.over_sampling import SMOTE
import joblib
from imblearn.combine import SMOTEENN
from sklearn.model_selection import GridSearchCV

In [17]:
data = pd.read_csv('withoutScaling.csv')

In [3]:
data.columns

Index(['loan_amnt', 'term', 'int_rate', 'installment', 'grade', 'sub_grade',
       'emp_title', 'emp_length', 'home_ownership', 'annual_inc',
       'verification_status', 'loan_status', 'purpose', 'dti',
       'earliest_cr_line', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util',
       'total_acc', 'initial_list_status', 'application_type', 'mort_acc',
       'pub_rec_bankruptcies', 'city'],
      dtype='object')

In [4]:
data['earliest_cr_line'].value_counts()

earliest_cr_line
2000-10-01    3017
2000-08-01    2935
2001-10-01    2896
2001-08-01    2884
2000-11-01    2736
              ... 
1958-07-01       1
1957-11-01       1
1953-01-01       1
1955-07-01       1
1959-08-01       1
Name: count, Length: 684, dtype: int64

In [5]:
data.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,...,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,application_type,mort_acc,pub_rec_bankruptcies,city
0,10000.0,0,11.44,329.48,1,8,80956,1,5,117000.0,...,16.0,0.0,36369.0,41.8,25.0,1,1,0.0,0.0,28584
1,8000.0,0,11.99,265.68,1,9,33317,4,1,65000.0,...,17.0,0.0,20131.0,53.3,27.0,0,1,3.0,0.0,26621
2,15600.0,0,10.49,506.97,1,7,127182,10,5,43057.0,...,13.0,0.0,11987.0,92.2,26.0,0,1,0.0,0.0,34634
3,7200.0,0,6.49,220.65,0,1,27760,6,5,54000.0,...,6.0,0.0,5472.0,21.5,13.0,0,1,0.0,0.0,6299
4,24375.0,1,17.27,609.33,2,14,38300,9,1,55000.0,...,13.0,0.0,24584.0,69.8,43.0,0,1,1.0,0.0,15388


In [18]:
X = data.drop(['loan_status','earliest_cr_line'],axis=1)
y = data['loan_status']

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### Random Forest Classifier

In [12]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [13]:
def print_score(true, pred, train=True):
    if train:
        clf_report = pd.DataFrame(classification_report(true, pred, output_dict=True))
        print("Train Result:\n================================================")
        print(f"Accuracy Score: {accuracy_score(true, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(true, pred)}\n")
        
    elif train==False:
        clf_report = pd.DataFrame(classification_report(true, pred, output_dict=True))
        print("Test Result:\n================================================")        
        print(f"Accuracy Score: {accuracy_score(true, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(true, pred)}\n")

In [14]:
rf_clf = RandomForestClassifier(class_weight='balanced',n_estimators=200, random_state=42)
rf_clf.fit(X_resampled, y_resampled)

y_train_pred = rf_clf.predict(X_resampled)
y_test_pred = rf_clf.predict(X_test)

print_score(y_resampled, y_train_pred, train=True)
print_score(y_test, y_test_pred, train=False)

Train Result:
Accuracy Score: 100.00%
_______________________________________________
CLASSIFICATION REPORT:
                  0         1  accuracy  macro avg  weighted avg
precision       1.0       1.0       1.0        1.0           1.0
recall          1.0       1.0       1.0        1.0           1.0
f1-score        1.0       1.0       1.0        1.0           1.0
support    222918.0  222918.0       1.0   445836.0      445836.0
_______________________________________________
Confusion Matrix: 
 [[222918      0]
 [     0 222918]]

Test Result:
Accuracy Score: 78.32%
_______________________________________________
CLASSIFICATION REPORT:
                      0             1  accuracy      macro avg   weighted avg
precision      0.400034      0.825981  0.783232       0.613007       0.742196
recall         0.204108      0.925041  0.783232       0.564574       0.783232
f1-score       0.270301      0.872709  0.783232       0.571505       0.754214
support    23370.000000  95439.000000  0.78

In [20]:
# Apply SMOTEENN
smoteenn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smoteenn.fit_resample(X_train, y_train)

# Train Random Forest Classifier
rf_clf = RandomForestClassifier(class_weight='balanced', n_estimators=200, random_state=42)
rf_clf.fit(X_resampled, y_resampled)

# Predict on train and test sets
y_train_pred = rf_clf.predict(X_resampled)
y_test_pred = rf_clf.predict(X_test)

# Evaluate using custom print_score function
print_score(y_resampled, y_train_pred, train=True)
print_score(y_test, y_test_pred, train=False)


Train Result:
Accuracy Score: 100.00%
_______________________________________________
CLASSIFICATION REPORT:
                  0         1  accuracy  macro avg  weighted avg
precision       1.0       1.0       1.0        1.0           1.0
recall          1.0       1.0       1.0        1.0           1.0
f1-score        1.0       1.0       1.0        1.0           1.0
support    175600.0  100734.0       1.0   276334.0      276334.0
_______________________________________________
Confusion Matrix: 
 [[175600      0]
 [     0 100734]]

Test Result:
Accuracy Score: 69.10%
_______________________________________________
CLASSIFICATION REPORT:
                      0             1  accuracy      macro avg   weighted avg
precision      0.327048      0.865922  0.691042       0.596485       0.759925
recall         0.539581      0.728130  0.691042       0.633855       0.691042
f1-score       0.407254      0.791071  0.691042       0.599162       0.715573
support    23370.000000  95439.000000  0.69

In [21]:
# Apply SMOTEENN
smoteenn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smoteenn.fit_resample(X_train, y_train)

# Train Random Forest Classifier
rf_clf = RandomForestClassifier(class_weight={0: 2, 1: 1}, n_estimators=200, random_state=42)
rf_clf.fit(X_resampled, y_resampled)

# Predict on train and test sets
y_train_pred = rf_clf.predict(X_resampled)
y_test_pred = rf_clf.predict(X_test)

# Evaluate using custom print_score function
print_score(y_resampled, y_train_pred, train=True)
print_score(y_test, y_test_pred, train=False)

Train Result:
Accuracy Score: 100.00%
_______________________________________________
CLASSIFICATION REPORT:
                  0         1  accuracy  macro avg  weighted avg
precision       1.0       1.0       1.0        1.0           1.0
recall          1.0       1.0       1.0        1.0           1.0
f1-score        1.0       1.0       1.0        1.0           1.0
support    175600.0  100734.0       1.0   276334.0      276334.0
_______________________________________________
Confusion Matrix: 
 [[175600      0]
 [     0 100734]]

Test Result:
Accuracy Score: 70.84%
_______________________________________________
CLASSIFICATION REPORT:
                      0             1  accuracy      macro avg   weighted avg
precision      0.332988      0.857378  0.708406       0.595183       0.754229
recall         0.480916      0.764111  0.708406       0.622513       0.708406
f1-score       0.393509      0.808062  0.708406       0.600785       0.726519
support    23370.000000  95439.000000  0.70

In [None]:
# Step 1: Apply SMOTEENN to handle class imbalance
# smoteenn = SMOTEENN(random_state=42)
# X_resampled, y_resampled = smoteenn.fit_resample(X_train, y_train)

# Step 2: Define hyperparameter grid for Random Forest
param_grid = {
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'class_weight': ['balanced', 'balanced_subsample']
}

# Step 3: Initialize Random Forest and GridSearchCV
rf = RandomForestClassifier(random_state=42, n_estimators=200)
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1)
grid_search.fit(X_resampled, y_resampled)

# Step 4: Get the best model and hyperparameters
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# Step 5: Train and evaluate the best model
# Train Predictions
y_train_pred = best_model.predict(X_resampled)

# Test Predictions
y_test_pred = best_model.predict(X_test)

# Custom evaluation function (print_score assumed to be defined)
print_score(y_resampled, y_train_pred, train=True)
print_score(y_test, y_test_pred, train=False)

# Optional: Detailed classification report for the test set
print("\nClassification Report on Test Set:")
print(classification_report(y_test, y_test_pred))
