In [1]:
import warnings

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
def load_and_preprocess(file_path):
    """Read the loan CSV and build an unfitted preprocessing transformer.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; handed straight to ``pd.read_csv``.

    Returns
    -------
    X : pandas.DataFrame
        Every column of the file except ``loan_status``.
    y : pandas.Series
        The ``loan_status`` column.
    preprocessor : ColumnTransformer
        Unfitted transformer. Numeric columns are median-imputed and then
        standardised; categorical columns are imputed with the most frequent
        value and then one-hot encoded with ``handle_unknown="ignore"``.

    Raises
    ------
    KeyError
        If the file has no ``loan_status`` column.

    Warns
    -----
    UserWarning
        If some feature columns are neither numeric nor categorical. Such
        columns are not listed in the ColumnTransformer, whose default
        ``remainder="drop"`` discards them.
    """
    data = pd.read_csv(file_path)

    X = data.drop("loan_status", axis=1)
    y = data["loan_status"]

    # Select by dtype *family* rather than exact dtype names so that columns
    # such as int32/float32 or category/string are routed to a pipeline
    # instead of being left out of the transformer altogether.
    numerical_cols = X.select_dtypes(include="number").columns
    categorical_cols = X.select_dtypes(
        include=["object", "category", "string"]
    ).columns

    # Surface anything that still matches neither group (e.g. bool or
    # datetime columns) so the omission is visible rather than silent.
    unhandled = [
        col for col in X.columns
        if col not in numerical_cols and col not in categorical_cols
    ]
    if unhandled:
        warnings.warn(
            "Columns with unsupported dtypes will be dropped by the "
            f"preprocessor: {unhandled}",
            UserWarning,
            stacklevel=2,
        )

    numeric_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

    categorical_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ])

    preprocessor = ColumnTransformer([
        ("num", numeric_pipeline, numerical_cols),
        ("cat", categorical_pipeline, categorical_cols)
    ])

    return X, y, preprocessor


def train_model(X_train, y_train, preprocessor):
    """Fit a preprocessing + logistic-regression pipeline on the training data.

    Parameters
    ----------
    X_train : feature table used for fitting.
    y_train : target values aligned with ``X_train``.
    preprocessor : transformer placed in front of the classifier.

    Returns
    -------
    Pipeline
        The fitted two-step pipeline with steps named ``"preprocessor"`` and
        ``"classifier"``.
    """
    steps = [
        ("preprocessor", preprocessor),
        # max_iter is raised to 1000 to give the solver more iterations.
        ("classifier", LogisticRegression(max_iter=1000)),
    ]
    pipeline = Pipeline(steps=steps)
    pipeline.fit(X_train, y_train)
    return pipeline


def evaluate_model(model, X_test, y_test):
    """Print accuracy, a classification report and a confusion matrix.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_test : held-out features passed to ``model.predict``.
    y_test : true labels compared against the predictions.

    Returns
    -------
    None
        All results are written to standard output.
    """
    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    print("\nAccuracy:", accuracy)

    print("\nClassification Report:\n")
    report = classification_report(y_test, predictions)
    print(report)

    print("Confusion Matrix:")
    matrix = confusion_matrix(y_test, predictions)
    print(matrix)


def show_feature_importance(model):
    """Print the ten largest and ten smallest classifier coefficients.

    Parameters
    ----------
    model : object with a ``named_steps`` mapping
        Must contain a ``"preprocessor"`` step providing
        ``get_feature_names_out()`` and a ``"classifier"`` step providing
        ``coef_``; only the first row of ``coef_`` is used.

    Returns
    -------
    None
        Two tables are printed, both taken from one ranking sorted by
        coefficient in descending order.

    Notes
    -----
    The printed headings treat large positive coefficients as favouring
    approval -- this presumes the positive class means "approved"; verify
    against the dataset's label encoding.
    """
    steps = model.named_steps

    ranked = pd.DataFrame(
        {
            "Feature": steps["preprocessor"].get_feature_names_out(),
            "Coefficient": steps["classifier"].coef_[0],
        }
    ).sort_values(by="Coefficient", ascending=False)

    sections = (
        ("\nTop Factors Increasing Approval:", ranked.head(10)),
        ("\nTop Factors Leading to Rejection:", ranked.tail(10)),
    )
    for heading, rows in sections:
        print(heading)
        print(rows)


def main(file_path="data.csv", test_size=0.2, random_state=42):
    """Run the whole workflow: load, split, train, evaluate, explain.

    Parameters
    ----------
    file_path : str or path-like, default "data.csv"
        CSV file handed to ``load_and_preprocess``.
    test_size : float, default 0.2
        Passed to ``train_test_split`` as ``test_size``.
    random_state : int, default 42
        Seed passed to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    None
        Metrics and coefficient tables are printed by the helper functions.

    Notes
    -----
    The defaults reproduce the previously hard-coded values, so calling
    ``main()`` with no arguments behaves as before.
    """
    X, y, preprocessor = load_and_preprocess(file_path)

    # Stratify on the target so both splits keep the same class proportions.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    model = train_model(X_train, y_train, preprocessor)

    evaluate_model(model, X_test, y_test)

    show_feature_importance(model)


# Run the workflow only when this code executes as the main program;
# importing these definitions from another module does not trigger it.
if __name__ == "__main__":
    main()



Accuracy: 0.8993333333333333

Classification Report:

              precision    recall  f1-score   support

           0       0.93      0.94      0.94      7000
           1       0.79      0.75      0.77      2000

    accuracy                           0.90      9000
   macro avg       0.86      0.84      0.85      9000
weighted avg       0.90      0.90      0.90      9000

Confusion Matrix:
[[6600  400]
 [ 506 1494]]

Top Factors Increasing Approval:
                                   Feature  Coefficient
25  cat__previous_loan_defaults_on_file_No     3.136223
5                 num__loan_percent_income     1.348589
4                       num__loan_int_rate     0.990910
18         cat__person_home_ownership_RENT     0.312553
19      cat__loan_intent_DEBTCONSOLIDATION     0.184524
21        cat__loan_intent_HOMEIMPROVEMENT     0.176892
0                          num__person_age     0.124881
1                       num__person_income     0.047273
16        cat__person_home_ownershi