In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, fname) for fname in files)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Problem Statement:** This dataset contains columns simulating credit bureau data. The primary goal is to understand the data, analyse it, clean it, discover correlations between data points, and build the best-performing machine learning model for default risk prediction.

# Features Description:
person_age: Age of the individual applying for the loan.

person_income: Annual income of the individual.

person_home_ownership: Type of home ownership of the individual.

    rent: The individual is currently renting a property.

    mortgage: The individual has a mortgage on the property they own.

    own: The individual owns their home outright.

    other: Other categories of home ownership that may be specific to the dataset.

person_emp_length: Employment length of the individual in years.

loan_intent: The intent behind the loan application.

loan_grade: The grade assigned to the loan based on the creditworthiness of the borrower.

    A: The borrower has a high creditworthiness, indicating low risk.
    B: The borrower is relatively low-risk, but not as creditworthy as Grade A.
    C: The borrower's creditworthiness is moderate.
    D: The borrower is considered to have higher risk compared to previous grades.
    E: The borrower's creditworthiness is lower, indicating a higher risk.
    F: The borrower poses a significant credit risk.
    G: The borrower's creditworthiness is the lowest, signifying the highest risk.
loan_amnt: The loan amount requested by the individual.

loan_int_rate: The interest rate associated with the loan.

loan_status: Loan status, where 0 indicates non-default and 1 indicates default.

    0: Non-default - The borrower successfully repaid the loan as agreed, and there was no default.
    1: Default - The borrower failed to repay the loan according to the agreed-upon terms and defaulted on the loan.
    
loan_percent_income: The percentage of income represented by the loan amount.

cb_person_default_on_file: Historical default of the individual as per credit bureau records.

    Y: The individual has a history of defaults on their credit file.
    N: The individual does not have any history of defaults.
    
cb_person_cred_hist_length: The length of credit history for the individual.

In [None]:
# Load the credit risk CSV from the Kaggle input directory into a DataFrame
# and display it as the cell output.
df = pd.read_csv('/kaggle/input/credit-risk-dataset/credit_risk_dataset.csv')
df

In [None]:
# Preview the first rows of the dataset.
df.head()

In [None]:
# Preview the last rows of the dataset.
df.tail()

In [None]:
# (rows, columns) of the raw dataset.
df.shape

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Count the null values in each column.
df.isnull().sum()

In [None]:
# Count the distinct values in each column.
df.nunique()

# Columns that can be categorized

In [None]:
# Frequency of each loan grade.
df['loan_grade'].value_counts()

In [None]:
# Frequency of each home-ownership category.
df['person_home_ownership'].value_counts()

In [None]:
# Frequency of each loan intent.
df['loan_intent'].value_counts()

In [None]:
# Frequency of each value of the target column (loan_status).
df['loan_status'].value_counts()

In [None]:
# Frequency of each historical-default flag value.
df['cb_person_default_on_file'].value_counts()

# Duplicate Rows

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Remove the duplicated rows, modifying df in place.
df.drop_duplicates(inplace=True)

New shape of the data after dropping 165 duplicated rows

In [None]:
# (rows, columns) after de-duplication.
df.shape

In [None]:
# Preview the first rows after de-duplication.
df.head()

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# Snapshot of the int64/float64 columns, used by the correlation heatmap and
# pair plot below. NOTE(review): this is taken before missing values and
# outliers are removed, so those plots reflect the uncleaned data.
num_cols= df.select_dtypes(include=['int64', 'float64'])
num_cols

# Visualization of Categorical Columns

In [None]:
# Visualize categorical columns: one count plot per column.
# A fresh figure is created on every iteration because plt.show() finishes the
# current figure; a single plt.figure() call before the loop would apply the
# requested size to the first plot only.
for col in ["person_home_ownership", "loan_intent", "loan_grade", "cb_person_default_on_file"]:
    plt.figure(figsize=(10, 8))
    sns.countplot(data=df, x=col)
    plt.xticks(rotation=45)  # tilt category labels so they do not overlap
    plt.show()

# Visualization of Numerical Columns

In [None]:
# Visualize distributions of numerical features: one 20-bin histogram per
# numeric column, laid out in a single 14x10 figure.
df.hist(bins=20, figsize=(14, 10))
plt.show()

In [None]:
# Correlation matrix of the numeric columns, drawn as an annotated heatmap.
# Uses the num_cols snapshot created earlier in this file.
plt.figure(figsize=(10, 8))
sns.heatmap(num_cols.corr(), annot=True, cmap='coolwarm')
plt.show()

In [None]:
# Pairwise relationship plots for the numeric columns.
sns.pairplot(data=num_cols)

In [None]:
# Handle missing values by dropping every row that contains at least one
# null (axis=0 selects rows); df is modified in place.
df.dropna(axis=0,inplace=True)

In [None]:
# Re-check the per-column null counts after dropping rows with missing values.
df.isnull().sum()

In [None]:
# Shape after dropping rows with missing values; earlier it was (32416, 12).
df.shape

# Identifying and Handling Outliers

In [None]:
# Count the applicants whose recorded age exceeds 80.
outlier_age = len(df.loc[df["person_age"] > 80])
outlier_age

In [None]:
# Drop ages greater than 80. The comparison is inclusive so that applicants
# aged exactly 80 are kept: only rows matching the `> 80` outlier definition
# are removed, consistent with the `<= 80` filter used for person_emp_length.
df = df[df["person_age"] <= 80]
df.shape

In [None]:
# Dropping outliers for person_emp_length
outliers_emp_length = df[df['person_emp_length'] > 80].shape[0]
outliers_emp_length

In [None]:
# Retain only rows with an employment length of at most 80 years, then
# re-display the summary statistics.
df = df.loc[df['person_emp_length'].le(80)]
df.describe()

In [None]:
# (rows, columns) after outlier removal.
df.shape

In [None]:
# Distribution of employment length after outlier removal.
sns.histplot(data= df, x='person_emp_length')
plt.show()

In [None]:
# Distribution of applicant age after outlier removal.
sns.histplot(data= df, x='person_age')
plt.show()

# Feature Engineering

In [None]:
# Encode categorical variables
df = pd.get_dummies(df, columns=['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file'], drop_first=True)
df

In [None]:
# Scale numerical features: standardize the listed columns with StandardScaler
# and write the scaled values back into df.
# NOTE(review): the scaler is fitted on the whole DataFrame before the
# train/test split performed later in this file, so statistics from the test
# rows influence the scaling (data leakage). Consider fitting on the training
# split only and applying transform() to the test split — TODO confirm.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
numerical_cols = ['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length']
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])


In [None]:
# Inspect the scaled numerical columns.
df[numerical_cols]

In [None]:
# Display the fully encoded and scaled DataFrame.
df

# Model Building

In [None]:
# Split the data into features and target
# Separate the target column (loan_status) from the predictor columns.
y = df['loan_status']
X = df.drop(columns=['loan_status'])

In [None]:
# Split the data into training and testing sets
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Import necessary libraries for the mentioned models
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [None]:
# List of models
# Candidate classifiers, keyed by display name.
# Every estimator that accepts a seed is given random_state=42 (the same seed
# used for the train/test split) so that the reported metrics are reproducible
# between runs. KNeighborsClassifier takes no random_state.
# SVC needs probability=True to expose predict_proba, which the evaluation
# loop uses for ROC AUC.
models = {
    'Support Vector Machine': SVC(probability=True, random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'Bagging Classifier': BaggingClassifier(random_state=42),
    'Extra Trees': ExtraTreesClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42),
    'LightGBM': lgb.LGBMClassifier(random_state=42),
}

In [None]:
# Fit every candidate model on the training split, score it on the test split
# and collect the metrics per model name.
model_results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # ROC AUC is computed from the second column of predict_proba when the
    # model exposes it; otherwise the hard predictions are used as scores.
    if hasattr(model, 'predict_proba'):
        y_pred_proba = model.predict_proba(X_test)[:, 1]
    else:
        y_pred_proba = y_pred

    model_results[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred),
        'ROC AUC': roc_auc_score(y_test, y_pred_proba),
    }

# One row per model, one column per metric.
results_df = pd.DataFrame(model_results).transpose()
print(results_df)

# Here are the top 5 performers with brief explanations:

1. **LightGBM**
Accuracy: 93.7%
Precision: 98.3%
ROC AUC: 94.7%
Reason: High precision and balanced metrics make LightGBM highly effective, especially in reducing false positives.

2. **XGBoost**
Accuracy: 93.6%
Precision: 95.6%
ROC AUC: 95.3%
Reason: Highest ROC AUC, indicating excellent ability to distinguish between positive and negative classes, making it very reliable.

3. **Random Forest**
Accuracy: 93.2%
Precision: 96.2%
ROC AUC: 93.5%
Reason: Strong overall performance with high precision and balanced recall, suitable for diverse datasets.

4. **Bagging Classifier**
Accuracy: 93.3%
Precision: 96.1%
ROC AUC: 91.5%
Reason: Provides robust accuracy with good generalization, reducing variance by averaging multiple models.

5. **Gradient Boosting**
Accuracy: 92.5%
Precision: 93.9%
ROC AUC: 92.8%
Reason: Effective at capturing complex patterns, with good balance across all metrics.