## Loan Status Classification

Here is a step-by-step Python implementation for **Credit Risk Scoring**. This involves building a classification model and performing clustering to categorize borrowers into low-risk, medium-risk, and high-risk groups.

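We will compare several classifiers (Logistic Regression, Random Forest, Support Vector Machine, MLP neural network, and XGBoost) for classification, and use KMeans clustering for segmentation.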



In [None]:
# Data Manipulation and Analysis
import pandas as pd
import numpy as np

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning Models
import sklearn # Import sklearn before using it.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score

In [None]:
# Load the raw loan table from disk
loan_data = pd.read_csv("/content/loan_data.csv")

# Identify numeric and categorical columns by dtype
numeric_columns = loan_data.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = loan_data.select_dtypes(include=['object']).columns

# Impute missing numeric values with each column's mean
loan_data[numeric_columns] = loan_data[numeric_columns].fillna(loan_data[numeric_columns].mean())

# Impute missing categorical values with each column's most frequent value.
# A column with no observed values has no mode; leave it untouched instead of
# failing on the `[0]` lookup.
for col in categorical_columns:
    observed_modes = loan_data[col].mode()
    if not observed_modes.empty:
        loan_data[col] = loan_data[col].fillna(observed_modes[0])

# Encode categorical variables as integer codes.
# One encoder is kept per column: refitting a single shared LabelEncoder would
# overwrite the mapping of every column but the last, making it impossible to
# decode predictions or to encode a new applicant consistently.
# `label_encoders[col].classes_` lists the labels; a label's position is its code.
ENCODED_COLUMNS = [
    'person_gender',
    'person_education',
    'person_home_ownership',
    'loan_intent',
    'previous_loan_defaults_on_file',
]
label_encoders = {}
for col in ENCODED_COLUMNS:
    column_encoder = LabelEncoder()
    loan_data[col] = column_encoder.fit_transform(loan_data[col])
    label_encoders[col] = column_encoder

# Kept for backward compatibility: previously `label_encoder` ended up fitted
# on the last encoded column.
label_encoder = label_encoders['previous_loan_defaults_on_file']

# Feature selection: the 13 model inputs, in the order the models see them
FEATURE_COLUMNS = [
    'person_age', 'person_gender', 'person_education', 'person_income',
    'person_emp_exp', 'person_home_ownership', 'loan_amnt', 'loan_intent',
    'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length',
    'credit_score', 'previous_loan_defaults_on_file',
]
X = loan_data[FEATURE_COLUMNS]
y = loan_data['loan_status']  # Target variable

# Standardize the features (zero mean, unit variance per column).
# NOTE(review): this scaler is fitted on every row of X; if X_scaled is later
# split into train/test, the test rows have influenced the scaling statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [None]:
from sklearn.model_selection import train_test_split

# Split the *unscaled* features into training and testing sets (80% / 20%).
# The row partition depends only on the number of rows and random_state, so it
# matches the partition obtained by splitting the pre-scaled matrix.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply it to both splits.
# Fitting on all rows would let test-set statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Fit a logistic regression on the training split (fit() returns the estimator)
logreg_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Class predictions for the held-out rows
y_pred_logreg = logreg_model.predict(X_test)

# Compute the three evaluation summaries, then print them
logreg_accuracy = accuracy_score(y_test, y_pred_logreg)
logreg_confusion = confusion_matrix(y_test, y_pred_logreg)
logreg_report = classification_report(y_test, y_pred_logreg)

print("Logistic Regression Accuracy:", logreg_accuracy)
print("Confusion Matrix:\n", logreg_confusion)
print("Classification Report:\n", logreg_report)

Logistic Regression Accuracy: 0.8901111111111111
Confusion Matrix:
 [[6542  448]
 [ 541 1469]]
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.94      0.93      6990
           1       0.77      0.73      0.75      2010

    accuracy                           0.89      9000
   macro avg       0.84      0.83      0.84      9000
weighted avg       0.89      0.89      0.89      9000



# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seeded random forest, fitted on the training split
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Class predictions for the held-out rows
y_pred_rf = rf_model.predict(X_test)

# Each entry pairs a printed heading with the metric computed under it
rf_metrics = (
    ("Random Forest Accuracy:", accuracy_score),
    ("Confusion Matrix:\n", confusion_matrix),
    ("Classification Report:\n", classification_report),
)
for heading, metric in rf_metrics:
    print(heading, metric(y_test, y_pred_rf))


Random Forest Accuracy: 0.9281111111111111
Confusion Matrix:
 [[6793  197]
 [ 450 1560]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.97      0.95      6990
           1       0.89      0.78      0.83      2010

    accuracy                           0.93      9000
   macro avg       0.91      0.87      0.89      9000
weighted avg       0.93      0.93      0.93      9000



# Support Vector Machine

In [None]:
from sklearn.svm import SVC

# RBF-kernel support vector classifier with a fixed seed
svm_settings = {"kernel": 'rbf', "random_state": 42}
svm_model = SVC(**svm_settings)
svm_model.fit(X_train, y_train)

# Class predictions for the held-out rows
y_pred_svm = svm_model.predict(X_test)

# Evaluation summaries: accuracy, confusion matrix, per-class report
print("SVM Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_svm))
print("Confusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred_svm))
print("Classification Report:\n", classification_report(y_true=y_test, y_pred=y_pred_svm))

SVM Accuracy: 0.9084444444444445
Confusion Matrix:
 [[6684  306]
 [ 518 1492]]
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.96      0.94      6990
           1       0.83      0.74      0.78      2010

    accuracy                           0.91      9000
   macro avg       0.88      0.85      0.86      9000
weighted avg       0.91      0.91      0.91      9000



# MLP Classifier (Neural Network)

In [None]:
from sklearn.neural_network import MLPClassifier

# Two hidden layers of 50 units each; seeded, fitted on the training split
mlp_model = MLPClassifier(
    hidden_layer_sizes=(50, 50),
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)

# Class predictions for the held-out rows
y_pred_mlp = mlp_model.predict(X_test)

# Compute the evaluation summaries first, then print them in order
mlp_accuracy = accuracy_score(y_test, y_pred_mlp)
mlp_confusion = confusion_matrix(y_test, y_pred_mlp)
mlp_report = classification_report(y_test, y_pred_mlp)

print("MLP Accuracy:", mlp_accuracy)
print("Confusion Matrix:\n", mlp_confusion)
print("Classification Report:\n", mlp_report)


MLP Accuracy: 0.9111111111111111
Confusion Matrix:
 [[6664  326]
 [ 474 1536]]
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.95      0.94      6990
           1       0.82      0.76      0.79      2010

    accuracy                           0.91      9000
   macro avg       0.88      0.86      0.87      9000
weighted avg       0.91      0.91      0.91      9000



# XGBoost Classifier

In [None]:
# Seeded gradient-boosted tree classifier, fitted on the training split
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train, y_train)

# Class predictions for the held-out rows
y_pred_xgb = xgb_model.predict(X_test)

# Heading -> computed summary; dict insertion order fixes the print order
xgb_evaluation = {
    "XGBoost Accuracy:": accuracy_score(y_test, y_pred_xgb),
    "Confusion Matrix:\n": confusion_matrix(y_test, y_pred_xgb),
    "Classification Report:\n": classification_report(y_test, y_pred_xgb),
}
for heading, summary in xgb_evaluation.items():
    print(heading, summary)

XGBoost Accuracy: 0.9321111111111111
Confusion Matrix:
 [[6774  216]
 [ 395 1615]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.97      0.96      6990
           1       0.88      0.80      0.84      2010

    accuracy                           0.93      9000
   macro avg       0.91      0.89      0.90      9000
weighted avg       0.93      0.93      0.93      9000



In [None]:
# Example input features for one applicant, keyed by model feature name.
# Categorical fields hold LabelEncoder integer codes. LabelEncoder numbers the
# categories in sorted order of their labels (not by any ordinal meaning), so
# the code-to-label mapping must be read from the fitted encoder's `classes_`
# rather than assumed -- TODO confirm each code below against the dataset.
input_data = {
    'person_age': 22,  # Age of the person in years
    'person_gender': 0,  # Encoded gender (LabelEncoder code; verify mapping)
    'person_education': 3,  # Encoded education level (LabelEncoder code; verify mapping)
    'person_income': 71948.0,  # Annual income of the person
    'person_emp_exp': 0,  # Employment experience in years
    'person_home_ownership': 0,  # Encoded home-ownership status (LabelEncoder code; verify mapping)
    'loan_amnt': 35000.0,  # Requested loan amount; chosen so loan_amnt / person_income is about 0.49
    'loan_intent': 3,  # Encoded loan purpose (LabelEncoder code; verify mapping)
    'loan_int_rate': 16.02,  # Interest rate for the loan in percentage
    'loan_percent_income': 0.49,  # Loan amount as a fraction of income
    'cb_person_cred_hist_length': 3.0,  # Credit bureau history length in years
    'credit_score': 561,  # Credit score of the person
    'previous_loan_defaults_on_file': 0  # Encoded previous-default flag (LabelEncoder code; verify mapping)
}

In [None]:
# Example: predict loan_status for one new applicant.
# Values are keyed by feature name and then laid out in the training column
# order. A hand-typed positional list is easy to misalign: the previous vector
# omitted loan_amnt and carried a stray extra value, which shifted loan_intent,
# loan_int_rate, loan_percent_income, cb_person_cred_hist_length and
# credit_score into the wrong columns.
# Categorical entries are LabelEncoder codes -- TODO confirm them against the
# encoder mapping used during preprocessing.
new_applicant = {
    'person_age': 22,
    'person_gender': 1,
    'person_education': 0,
    'person_income': 71948.0,
    'person_emp_exp': 0,
    'person_home_ownership': 0,
    'loan_amnt': 35000.0,  # consistent with loan_percent_income of about 0.49
    'loan_intent': 3,
    'loan_int_rate': 16.02,
    'loan_percent_income': 0.49,
    'cb_person_cred_hist_length': 3.0,
    'credit_score': 561,
    'previous_loan_defaults_on_file': 0,
}

# Fail loudly if the example and the training features ever drift apart
assert set(new_applicant) == set(X.columns), "applicant fields must match the training features"

# Single-row 2-D array in exactly the column order the scaler was fitted on
input_features = np.array([[new_applicant[name] for name in X.columns]])

# Standardize the input features using the same scaler used during training
input_features_scaled = scaler.transform(input_features)

# Predict loan_status using the trained XGBoost model
predicted_loan_status = xgb_model.predict(input_features_scaled)

# Output as "Approved" or "Rejected" based on the prediction
loan_status_output = "Approved" if predicted_loan_status[0] == 1 else "Rejected"
print(f"Predicted Loan Status: {loan_status_output}")

Predicted Loan Status: Rejected


