# Basic Random Forest

In [43]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# 1. Load data
df = pd.read_csv("train_cleaned.csv")

# 3. Preprocessing
# Check and encode categorical variables
categorical_columns = df.select_dtypes(include=["object"]).columns
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Separate features and target label
X = df_encoded.drop(columns=["Loan_Status_Y"], errors="ignore")
y = df_encoded["Loan_Status_Y"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Build the model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# 5. Evaluate the model
y_pred = rf_model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.2f}")

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(conf_matrix)

# Classification Report
class_report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(class_report)

# 6. Feature Importance
importances = rf_model.feature_importances_
feature_importance = pd.DataFrame({
    "Feature": X.columns,
    "Importance": importances
}).sort_values(by="Importance", ascending=False)

print("\nTop 10 most important features:")
print(feature_importance.head(10))


Accuracy: 0.79

Confusion Matrix:
[[18 25]
 [ 1 79]]

Classification Report:
              precision    recall  f1-score   support

       False       0.95      0.42      0.58        43
        True       0.76      0.99      0.86        80

    accuracy                           0.79       123
   macro avg       0.85      0.70      0.72       123
weighted avg       0.83      0.79      0.76       123


Top 10 most important features:
                     Feature  Importance
4             Credit_History    0.189253
0            ApplicantIncome    0.078834
2                 LoanAmount    0.073886
1          CoapplicantIncome    0.048769
3           Loan_Amount_Term    0.035518
625  Property_Area_Semiurban    0.019232
619              Married_Yes    0.016902
623   Education_Not Graduate    0.013505
626      Property_Area_Urban    0.012581
620             Dependents_1    0.012068


# Advanced Random Forest

In [12]:
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import RandomizedSearchCV, train_test_split, cross_val_score
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv("train_cleaned.csv")

# Split features and labels
X = df.drop(columns=["Loan_Status"], errors="ignore")
y = df["Loan_Status"]

# Drop unnecessary columns
X = X.drop(columns=["Loan_ID"], errors="ignore")

# Identify column types
categorical_columns = X.select_dtypes(include=["object"]).columns
numerical_columns = X.select_dtypes(include=["int64", "float64"]).columns

# Preprocessing pipelines
numerical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=0.95))
])

categorical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_pipeline, numerical_columns),
        ("cat", categorical_pipeline, categorical_columns)
    ]
)

# Full pipeline
pipeline = ImbPipeline(steps=[
    ("preprocessor", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("classifier", RandomForestClassifier(random_state=42, n_jobs=-1))
])

# Hyperparameter tuning with RandomizedSearchCV
param_dist = {
    'classifier__n_estimators': [50, 100, 150, 200],
    'classifier__max_depth': [10, 20, None],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__max_features': ['sqrt', 'log2', None],
    'classifier__bootstrap': [True, False]
}

random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    n_iter=10,
    cv=3,
    scoring="accuracy",
    random_state=42,
    n_jobs=-1
)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
random_search.fit(X_train, y_train)

# Make predictions
y_pred = random_search.best_estimator_.predict(X_test)

# Evaluation metrics
# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.2f}")

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(conf_matrix)

# Classification Report
class_report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(class_report)

# Cross-validation
cv_scores = cross_val_score(random_search.best_estimator_, X, y, cv=5, scoring="accuracy")
print("\nCross-Validation Scores:", cv_scores)
print(f"Mean CV Accuracy: {cv_scores.mean():.2f}")
print(f"Standard Deviation of CV Accuracy: {cv_scores.std():.2f}")

# Feature Importance
rf_model = random_search.best_estimator_.named_steps["classifier"]
importances = rf_model.feature_importances_
feature_names = (
    numerical_columns.tolist() + 
    list(random_search.best_estimator_.named_steps["preprocessor"].transformers_[1][1]
         .named_steps["encoder"].get_feature_names_out(categorical_columns))
)
feature_importance = pd.DataFrame({
    "Feature": feature_names,
    "Importance": importances
}).sort_values(by="Importance", ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))


Accuracy: 0.73

Confusion Matrix:
[[19 24]
 [ 9 71]]

Classification Report:
              precision    recall  f1-score   support

           N       0.68      0.44      0.54        43
           Y       0.75      0.89      0.81        80

    accuracy                           0.73       123
   macro avg       0.71      0.66      0.67       123
weighted avg       0.72      0.73      0.71       123


Cross-Validation Scores: [0.78861789 0.7398374  0.79674797 0.76422764 0.7295082 ]
Mean CV Accuracy: 0.76
Standard Deviation of CV Accuracy: 0.03

Top 10 Most Important Features:
                    Feature  Importance
2                LoanAmount    0.260163
3          Loan_Amount_Term    0.155894
1         CoapplicantIncome    0.109444
4            Credit_History    0.092998
0           ApplicantIncome    0.082510
18  Property_Area_Semiurban    0.050343
17      Property_Area_Rural    0.048479
8               Married_Yes    0.024688
19      Property_Area_Urban    0.023666
7               