In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the loan table and discard the columns that are not used as model inputs.
unused_columns = ['Loan_ID', 'effective_date', 'due_date', 'paid_off_time']
df = pd.read_csv("Loan.csv").drop(columns=unused_columns)

# Fill gaps in numeric columns with that column's median.
# Non-numeric columns are not imputed here.
numeric_medians = df.select_dtypes(include=[np.number]).median()
df = df.fillna(numeric_medians)

# One LabelEncoder per object-dtype column, keyed by column name so the
# integer codes can be mapped back to the original values later.
object_columns = df.select_dtypes(include=['object']).columns
label_encoders = {col: LabelEncoder() for col in object_columns}

# Replace each object column with its integer codes
# (this includes the target column if it is stored as object).
for col, le in label_encoders.items():
    df[col] = le.fit_transform(df[col])

# Separate the target column from the feature matrix.
target_column = 'loan_status'
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Baseline: a random forest with default hyperparameters (seeded), trained on
# the scaled training set and scored on the held-out rows.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = rf.predict(X_test)

baseline_accuracy = accuracy_score(y_test, y_pred)
print("Baseline Accuracy:", baseline_accuracy)

# Candidate hyperparameter values that the randomized search samples from.
param_dist = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# Evaluate 10 sampled combinations with 5-fold cross-validation on the
# training data. random_state fixes which combinations are drawn; without it
# each run can sample a different subset of the grid, so the selected model
# and the accuracy printed below would not be reproducible even though the
# forest itself is seeded.
random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)

# Use the best configuration found by the search to predict the held-out rows.
rf_optimized = random_search.best_estimator_
y_pred_optimized = rf_optimized.predict(X_test)

print("Optimized Accuracy:", accuracy_score(y_test, y_pred_optimized))

# Narrower grid that is searched exhaustively (every combination is tried).
grid_params = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt'],
}

# 5-fold cross-validated search over the full grid, using all available cores.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=grid_params,
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)

# Score the best configuration from the grid on the held-out rows.
rf_final = grid_search.best_estimator_
y_pred_final = rf_final.predict(X_test)

final_accuracy = accuracy_score(y_test, y_pred_final)
print("Final Optimized Accuracy:", final_accuracy)

Baseline Accuracy: 0.96
Optimized Accuracy: 0.96
Final Optimized Accuracy: 0.96
