In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np

# Load and copy dataset
df = pd.read_csv("loan_data.csv").copy()

# Replace '3+' with 3 in Dependents
df['Dependents'] = df['Dependents'].replace('3+', 3).astype(float)

# Fill missing values
df['Gender'].fillna(df['Gender'].mode()[0], inplace=True)
df['Dependents'].fillna(df['Dependents'].mode()[0], inplace=True)
df['Self_Employed'].fillna(df['Self_Employed'].mode()[0], inplace=True)
df['LoanAmount'].fillna(df['LoanAmount'].median(), inplace=True)
df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mode()[0], inplace=True)
df['Credit_History'].fillna(df['Credit_History'].mode()[0], inplace=True)

# Drop Loan_ID
df.drop("Loan_ID", axis=1, inplace=True)

# Features and target
X = df.drop("Loan_Status", axis=1)
y = df["Loan_Status"]

# Define column types
numeric_features = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Dependents']
categorical_features = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']
binary_features = ['Credit_History']

# Pipelines for preprocessing
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median"))
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
)
])

binary_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent"))
])

# Combine transformers
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
        ("bin", binary_transformer, binary_features)
    ]
)

# Fit and transform
X_processed = preprocessor.fit_transform(X)

# Get feature names
ohe = preprocessor.named_transformers_['cat']['onehot']
cat_feature_names = ohe.get_feature_names_out(categorical_features)

# Combine all feature names
all_feature_names = numeric_features + list(cat_feature_names) + binary_features

# Create DataFrame
X_encoded = pd.DataFrame(X_processed, columns=all_feature_names)

# Combine with target
df_cleaned = pd.concat([X_encoded, y.reset_index(drop=True)], axis=1)

# Confirm
print(df_cleaned.head())
print(df_cleaned.isnull().sum())


   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0           5720.0                0.0       110.0             360.0   
1           3076.0             1500.0       126.0             360.0   
2           5000.0             1800.0       208.0             360.0   
3           2340.0             2546.0       100.0             360.0   
4           3276.0                0.0        78.0             360.0   

   Dependents  Gender_Male  Married_Yes  Education_Not Graduate  \
0         0.0          1.0          1.0                     0.0   
1         1.0          1.0          1.0                     0.0   
2         2.0          1.0          1.0                     0.0   
3         2.0          1.0          1.0                     0.0   
4         0.0          1.0          0.0                     1.0   

   Self_Employed_Yes  Property_Area_Semiurban  Property_Area_Urban  \
0                0.0                      0.0                  1.0   
1                0.0          

In [8]:
from sklearn.model_selection import train_test_split

X = df_cleaned.drop("Loan_Status", axis=1)
y = df_cleaned["Loan_Status"].map({'Y': 1, 'N': 0})  # Convert to binary 0/1 if not done already

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)

print("Logistic Regression Results:")
print(confusion_matrix(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))
print("Accuracy:", accuracy_score(y_test, y_pred_lr))


Logistic Regression Results:
[[19 13]
 [ 4 38]]
              precision    recall  f1-score   support

           0       0.83      0.59      0.69        32
           1       0.75      0.90      0.82        42

    accuracy                           0.77        74
   macro avg       0.79      0.75      0.75        74
weighted avg       0.78      0.77      0.76        74

Accuracy: 0.7702702702702703


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

print("Random Forest Results:")
print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))
print("Accuracy:", accuracy_score(y_test, y_pred_rf))


Random Forest Results:
[[25  7]
 [ 6 36]]
              precision    recall  f1-score   support

           0       0.81      0.78      0.79        32
           1       0.84      0.86      0.85        42

    accuracy                           0.82        74
   macro avg       0.82      0.82      0.82        74
weighted avg       0.82      0.82      0.82        74

Accuracy: 0.8243243243243243


In [13]:
import joblib

joblib.dump(rf, "rf_model.pkl")
joblib.dump(lr, "logistic_model.pkl")
joblib.dump(preprocessor, "preprocessor.pkl")


['preprocessor.pkl']