In [1]:
import pandas as pd
df = pd.read_csv('loan_default_prediction_project.csv') 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Age                    1000 non-null   int64  
 1   Gender                 792 non-null    object 
 2   Income                 1000 non-null   float64
 3   Employment_Status      906 non-null    object 
 4   Location               1000 non-null   object 
 5   Credit_Score           1000 non-null   int64  
 6   Debt_to_Income_Ratio   1000 non-null   float64
 7   Existing_Loan_Balance  1000 non-null   float64
 8   Loan_Status            1000 non-null   object 
 9   Loan_Amount            1000 non-null   float64
 10  Interest_Rate          1000 non-null   float64
 11  Loan_Duration_Months   1000 non-null   int64  
dtypes: float64(5), int64(3), object(4)
memory usage: 93.9+ KB


### 1) Data Preprocessing

##### Identified the missed values, Handle the missing values

In [8]:
import pandas as pd

# Load the dataset
df = pd.read_csv('loan_default_prediction_project.csv')

# Handling missing values in the 'Gender' column (fill with mode)
df['Gender'] = df['Gender'].fillna(df['Gender'].mode()[0])

# Handling missing values in the 'Employment_Status' column (fill with mode)
df['Employment_Status'] = df['Employment_Status'].fillna(df['Employment_Status'].mode()[0])

# Save the updated DataFrame to a new CSV file
df.to_csv('loan_default_prediction_project_cleaned.csv', index=False)

##### Used One hot encoding for categorical variables

In [21]:
import pandas as pd

# Load your dataset
df = pd.read_csv('loan_default_prediction_project_cleaned.csv')

# List of categorical columns to encode
categorical_columns = ['Gender', 'Employment_Status', 'Location', 'Loan_Status']

# Apply One-Hot Encoding while keeping the original column names
for col in categorical_columns:
    # Perform One-Hot Encoding
    encoded_cols = pd.get_dummies(df[col], prefix='', prefix_sep='')  # No prefix
    # Keep only the relevant binary column
    df[col] = encoded_cols.iloc[:, 1]  # Assuming `drop_first=True` behavior

# Save the updated DataFrame to a new CSV file
df.to_csv('loan_default_prediction_project_onehot_encoded.csv', index=False)

print("One-Hot Encoding applied and saved to 'loan_default_prediction_project_onehot_encoded.csv'.")

One-Hot Encoding applied and saved to 'loan_default_prediction_project_onehot_encoded.csv'.


##### Used Min Max Scaling for numerical columns

In [22]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# Load your cleaned dataset
df = pd.read_csv('loan_default_prediction_project_onehot_encoded.csv')

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Numerical columns to scale
numerical_cols = [
    'Age',
    'Income',
    'Credit_Score',
    'Debt_to_Income_Ratio',
    'Existing_Loan_Balance',
    'Loan_Amount',
    'Interest_Rate',
    'Loan_Duration_Months'
]

# Apply Min-Max Scaling
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Save the scaled DataFrame to a new CSV file
df.to_csv('loan_price_prediction_scaled.csv', index=False)

### 2) Model Building

In [23]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Load the cleaned and scaled dataset
df = pd.read_csv('loan_price_prediction_scaled.csv')

# Define features (X) and target (y)
X = df.drop(columns=['Loan_Status'])  # Replace 'Loan_Status' with your target column
y = df['Loan_Status']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize a Random Forest Classifier (can replace with Logistic Regression, Gradient Boosting, etc.)
model = RandomForestClassifier(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Calculate ROC-AUC Score
y_pred_proba = model.predict_proba(X_test)[:, 1]  # Use probabilities for the positive class
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC-AUC Score: {roc_auc:.2f}")

Classification Report:
              precision    recall  f1-score   support

       False       0.00      0.00      0.00        43
        True       0.78      0.99      0.88       157

    accuracy                           0.78       200
   macro avg       0.39      0.50      0.44       200
weighted avg       0.62      0.78      0.69       200

ROC-AUC Score: 0.53


In [37]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

# Load the cleaned and preprocessed dataset
data = pd.read_csv('loan_price_prediction_scaled.csv')

# Split the dataset into features and target variable
X = data.drop('Loan_Status', axis=1)  # Replace 'Loan_Status' with the actual target column name
y = data['Loan_Status']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Dictionary to store model results
results = {}

# 1. Logistic Regression
logreg = LogisticRegression(max_iter=1000, class_weight='balanced')
logreg.fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_test)
results['Logistic Regression'] = {
    'classification_report': classification_report(y_test, y_pred_logreg, zero_division=0),
    'roc_auc_score': roc_auc_score(y_test, logreg.predict_proba(X_test)[:, 1])
}

# 2. Decision Tree
dt = DecisionTreeClassifier(random_state=42, class_weight='balanced')
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
results['Decision Tree'] = {
    'classification_report': classification_report(y_test, y_pred_dt, zero_division=0),
    'roc_auc_score': roc_auc_score(y_test, dt.predict_proba(X_test)[:, 1])
}

# 3. Random Forest
rf = RandomForestClassifier(random_state=42, n_estimators=100, class_weight='balanced')
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
results['Random Forest'] = {
    'classification_report': classification_report(y_test, y_pred_rf, zero_division=0),
    'roc_auc_score': roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1])
}

# 4. Gradient Boosting (XGBoost or GradientBoostingClassifier from sklearn)
gb = GradientBoostingClassifier(random_state=42, n_estimators=80)
gb.fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)
results['Gradient Boosting'] = {
    'classification_report': classification_report(y_test, y_pred_gb, zero_division=0),
    'roc_auc_score': roc_auc_score(y_test, gb.predict_proba(X_test)[:, 1])
}

# 5. Support Vector Machine (SVM)
svm = SVC(random_state=42, probability=True, class_weight='balanced')
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
results['Support Vector Machine'] = {
    'classification_report': classification_report(y_test, y_pred_svm, zero_division=0),
    'roc_auc_score': roc_auc_score(y_test, svm.predict_proba(X_test)[:, 1])
}

# Print results
for model, metrics in results.items():
    print(f"Model: {model}")
    print(metrics['classification_report'])
    print(f"ROC-AUC Score: {metrics['roc_auc_score']:.2f}\n")

Model: Logistic Regression
              precision    recall  f1-score   support

       False       0.29      0.60      0.39        43
        True       0.85      0.60      0.70       157

    accuracy                           0.60       200
   macro avg       0.57      0.60      0.55       200
weighted avg       0.73      0.60      0.64       200

ROC-AUC Score: 0.59

Model: Decision Tree
              precision    recall  f1-score   support

       False       0.25      0.23      0.24        43
        True       0.79      0.81      0.80       157

    accuracy                           0.69       200
   macro avg       0.52      0.52      0.52       200
weighted avg       0.68      0.69      0.68       200

ROC-AUC Score: 0.52

Model: Random Forest
              precision    recall  f1-score   support

       False       0.00      0.00      0.00        43
        True       0.79      1.00      0.88       157

    accuracy                           0.79       200
   macro avg     