In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

In [2]:
!pip uninstall -y scikit-learn
!pip install scikit-learn==1.5.2

Found existing installation: scikit-learn 1.6.1
Uninstalling scikit-learn-1.6.1:
  Successfully uninstalled scikit-learn-1.6.1
Collecting scikit-learn==1.5.2
  Downloading scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m77.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
Successfully installed scikit-learn-1.5.2


In [2]:
import xgboost
import sklearn
print(xgboost.__version__)
print(sklearn.__version__)


2.1.3
1.5.2


In [3]:
# Load data
data = pd.read_csv('/content/loan_data_set.csv')

In [4]:
# Step 1: Data Cleaning
# Drop irrelevant column
if 'Loan_ID' in data.columns:
    data = data.drop('Loan_ID', axis=1)

# Fill missing values
categorical_cols = ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History']
numerical_cols = ['LoanAmount', 'Loan_Amount_Term']
for col in categorical_cols:
    data[col] = data[col].fillna(data[col].mode()[0])
for col in numerical_cols:
    data[col] = data[col].fillna(data[col].median())

In [5]:
# Step 2: Data Preprocessing
# Encode categorical variables
label_encoders = {}
for col in data.select_dtypes(include='object').columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Step 3: Feature Engineering
# Combine incomes and calculate Debt-to-Income ratio
data['Total_Income'] = data['ApplicantIncome'] + data['CoapplicantIncome']
data['Debt_To_Income_Ratio'] = data['LoanAmount'] / (data['Total_Income'] + 1e-6)
data = data.drop(['ApplicantIncome', 'CoapplicantIncome'], axis=1)

In [6]:
# Step 4: Outlier Removal
for col in ['LoanAmount', 'Total_Income']:
    q1 = data[col].quantile(0.25)
    q3 = data[col].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    data = data[(data[col] >= lower_bound) & (data[col] <= upper_bound)]

# Step 5: Normalize the dataset
scaler = StandardScaler()
data[['LoanAmount', 'Loan_Amount_Term', 'Total_Income', 'Debt_To_Income_Ratio']] = scaler.fit_transform(
    data[['LoanAmount', 'Loan_Amount_Term', 'Total_Income', 'Debt_To_Income_Ratio']]
)

In [7]:
# Step 6: Data Transformation
X = data.drop('Loan_Status', axis=1)
y = data['Loan_Status']

# Address Class Imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Step 7: Model Building with Hyperparameter Tuning
# XGBoost Classifier
# Example parameter grid for grid search
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [3, 6],
    'learning_rate': [0.01, 0.1],
}

xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=StratifiedKFold(n_splits=5), scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [8]:
# Best model
best_model = grid_search.best_estimator_

In [9]:
# Cross-validation scores
cv_scores = cross_val_score(best_model, X_train, y_train, cv=StratifiedKFold(n_splits=5), scoring='accuracy')
print("Cross-validation Accuracy:", np.mean(cv_scores))

# Predictions and evaluation
y_pred = best_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Cross-validation Accuracy: 0.8050847457627119
Accuracy: 0.8108108108108109
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.68      0.76        66
           1       0.78      0.91      0.84        82

    accuracy                           0.81       148
   macro avg       0.82      0.80      0.80       148
weighted avg       0.82      0.81      0.81       148



Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



In [10]:
# Function for predicting loan eligibility based on user input
def predict_loan_eligibility():
    print("Enter the following details:")
    user_input = {
        'Gender': input("Gender (Male/Female): "),
        'Married': input("Married (Yes/No): "),
        'Dependents': input("Dependents (0/1/2/3+): "),
        'Education': input("Education (Graduate/Not Graduate): "),
        'Self_Employed': input("Self Employed (Yes/No): "),
        'ApplicantIncome': float(input("Applicant Income: ")),
        'CoapplicantIncome': float(input("Coapplicant Income: ")),
        'LoanAmount': float(input("Loan Amount: ")),
        'Loan_Amount_Term': float(input("Loan Amount Term: ")),
        'Credit_History': float(input("Credit History (1.0/0.0): ")),
        'Property_Area': input("Property Area (Urban/Semiurban/Rural): ")
    }

    input_df = pd.DataFrame([user_input])

    # Encode categorical variables
    for col, le in label_encoders.items():
        if col in input_df.columns:
            input_df[col] = le.transform(input_df[col])

    # Combine incomes and calculate Debt-to-Income Ratio
    input_df['Total_Income'] = input_df['ApplicantIncome'] + input_df['CoapplicantIncome']
    input_df['Debt_To_Income_Ratio'] = input_df['LoanAmount'] / (input_df['Total_Income'] + 1e-6)
    input_df = input_df.drop(['ApplicantIncome', 'CoapplicantIncome'], axis=1)

    # Ensure all features are present in the input and in the correct order
    input_df = input_df.reindex(columns=X.columns, fill_value=0)

    # Normalize numerical features using the original column order
    num_features = ['LoanAmount', 'Loan_Amount_Term', 'Total_Income', 'Debt_To_Income_Ratio']
    input_df[num_features] = scaler.transform(input_df[num_features])

    # Make prediction
    prediction = best_model.predict(input_df)
    return "Loan Approved" if prediction[0] == 1 else "Loan Denied"

In [12]:
# Prompt user for input and predict eligibility
result = predict_loan_eligibility()
print("Prediction:", result)

Enter the following details:
Gender (Male/Female): Male
Married (Yes/No): Yes
Dependents (0/1/2/3+): 2
Education (Graduate/Not Graduate): Not Graduate
Self Employed (Yes/No): Yes
Applicant Income: 13000
Coapplicant Income: 400
Loan Amount: 400000
Loan Amount Term: 250
Credit History (1.0/0.0): 0.0
Property Area (Urban/Semiurban/Rural): Urban
Prediction: Loan Denied
