In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the loan-default dataset into a DataFrame.
# NOTE(review): DATA_PATH is an absolute path tied to one workspace layout; the value is
# kept unchanged so the notebook still runs there — adjust it when running elsewhere.
DATA_PATH = r"/workspaces/Predicting-loan-default/Loan_default.csv"
df = pd.read_csv(DATA_PATH)

# Left as the cell's last expression so the notebook renders the first rows as a rich table
df.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# Left as the cell's last expression so the notebook renders it as a rich table
# instead of the plain-text dump that print() produces.
df.describe()

In [None]:
# Count the missing entries in every column
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Inspect the dtype pandas inferred for each column
column_dtypes = df.dtypes
print(column_dtypes)

Dataset Overview: The dataset contains 255,347 entries and 18 columns. The target variable is 'Default', which is binary (0 or 1).

Missing Values: There are no missing values in the dataset, which simplifies the preprocessing step.

Data Types: Most of the features are numeric (int64 or float64), but there are several categorical features ('Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner') that need to be encoded before they can be used in a machine learning model.

Feature Distribution: The 'describe()' method provides a summary of the numeric features. It's important to look at the mean, standard deviation, and the range (min and max) to get a sense of the distribution and scale of each feature.



In [None]:
# Removing irrelevant features: drop the LoanID column.
# LoanID seems to be a unique identifier for each loan and is unlikely to have predictive power.
# errors='ignore' keeps this cell idempotent: re-running it after LoanID has already been
# removed is a no-op instead of raising KeyError.
df = df.drop(columns='LoanID', errors='ignore')

In [None]:
# Highly correlated features carry overlapping information, so one of each such pair
# may be redundant. The correlation matrix below is used to spot candidate pairs.

# Column labels of the int64/float64 features (this includes the Default target)
numerical_features = df.select_dtypes(include=['int64', 'float64']).columns

# CORRELATION ANALYSIS
# Compute the pairwise correlations once, then draw them as an annotated heatmap
corr_matrix = df[numerical_features].corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
plt.show()

In [None]:
# Show the same correlation matrix as a table (exact values rather than rounded annotations)
df.loc[:, numerical_features].corr()

In [None]:
# Age and Default: There is a weak negative correlation (-0.167783) between Age and Default, the largest in magnitude of the correlations listed here. This suggests that older borrowers are somewhat less likely to default on their loans.
# Income and Default: There is a negative correlation (-0.099119) between Income and Default, indicating that borrowers with higher incomes are less likely to default.
# LoanAmount and Default: There is a positive correlation (0.086659) between LoanAmount and Default, suggesting that higher loan amounts are associated with a higher likelihood of default.
# CreditScore and Default: There is a negative correlation (-0.034166) between CreditScore and Default, which is expected as higher credit scores are typically associated with lower default rates. (based on domain knowledge)
# MonthsEmployed and Default: There is a negative correlation (-0.097374) between MonthsEmployed and Default, indicating that borrowers with longer employment history are less likely to default.
# InterestRate and Default: There is a positive correlation (0.131273) between InterestRate and Default, suggesting that higher interest rates are associated with a higher likelihood of default.

In [None]:
# Caveat: the (Pearson) correlation computed above only captures linear relationships and might not detect non-linear relationships.
# It also doesn't take into account the interactions between features or the combined effect of multiple features on the target variable.

In [None]:
# To check for important features, an alternative is to use Feature Importance from Tree-based Models

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Separate features and target
X = df.drop('Default', axis=1)
y = df['Default']

# One-hot encode the object-dtype columns (dropping the first level of each to avoid a
# redundant dummy); every other column is passed through unchanged
categorical_cols = X.select_dtypes(include=['object']).columns
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first'), categorical_cols)
    ], remainder='passthrough')

# Apply the preprocessing
X_transformed = preprocessor.fit_transform(X)

# Hold out 30% of the rows as a test set; random_state fixes the split
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.3, random_state=42)

# Train a Random Forest model.
# n_jobs=-1 builds the trees on all available cores; the fitted forest is still
# determined by random_state, so only the wall-clock time changes.
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

# Impurity-based importance, one value per transformed column, paired with the
# column names produced by the preprocessor and sorted from most to least important
importances = rf.feature_importances_
feature_names = preprocessor.get_feature_names_out()
feature_importances = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

# Display the feature importance (last expression -> rich table)
feature_importances

In [None]:
# Based on the feature importance scores from the Random Forest model, we can see that the most important features for predicting loan defaults are:

# Income: This is the most important feature, which aligns with the intuition that a borrower's income is a crucial factor in their ability to repay a loan.
# Interest Rate: The interest rate of the loan is also highly important, suggesting that higher interest rates might be associated with a higher likelihood of default.
# Loan Amount: The amount of the loan is another key factor, which makes sense as larger loans might be harder to repay.
# Age: The borrower's age is also important, potentially reflecting different financial stability levels at different life stages.
# Credit Score: As expected, the borrower's credit score is a significant predictor, with higher scores indicating lower risk of default.
# Months Employed: The length of employment is also important, likely reflecting the borrower's job stability and income security.
# DTIRatio (Debt-to-Income Ratio): This is another critical factor, indicating the borrower's ability to manage loan repayments relative to their income.
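# The categorical features, such as HasMortgage, MaritalStatus, and Education, have lower importance scores compared to the numerical features.
# Part of this gap is expected: impurity-based importances tend to favor continuous features with many distinct values, and one-hot encoding spreads each categorical feature's importance across several dummy columns.
# The categorical features therefore still contribute to the model's predictive power and should not be disregarded.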

In [None]:
# Let's look at the categorical features specifically

import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency

# Define a function to plot the distribution of the target variable by category
def plot_target_distribution_by_category(df, feature, target):
    """Show a grouped count plot of `target` for each level of `feature`.

    Parameters
    ----------
    df : DataFrame containing both columns.
    feature : name of the column placed on the x-axis.
    target : name of the column used to color (hue) the bars.

    Returns
    -------
    None. The figure is displayed via ``plt.show()``.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(data=df, x=feature, hue=target, ax=ax)
    ax.set_title(f'Distribution of {target} by {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Count')
    plt.show()

# Define a function to perform a chi-squared test for independence
def chi_squared_test(df, feature, target):
    """Print the chi-squared independence test between two columns of `df`.

    Parameters
    ----------
    df : DataFrame containing both columns.
    feature : name of the first (row) column of the contingency table.
    target : name of the second (column) column of the contingency table.

    Returns
    -------
    None. The statistic and p-value are printed, followed by a blank line.
    """
    # Cross-tabulate observed counts: one row per feature level, one column per target value
    observed = pd.crosstab(index=df[feature], columns=df[target])
    # Only the statistic and p-value are reported; dof and expected counts are not needed
    chi2, p = chi2_contingency(observed)[:2]
    print(f'Chi-squared test for {feature} and {target}:')
    print(f'Chi-squared statistic: {chi2}')
    print(f'p-value: {p}\n')

# Collect the labels of the categorical (object-dtype) columns
categorical_features = df.select_dtypes(include=['object']).columns

# For every categorical column: plot the Default counts per level,
# then run the chi-squared independence test against Default
for cat_col in categorical_features:
    plot_target_distribution_by_category(df, cat_col, 'Default')
    chi_squared_test(df, cat_col, 'Default')

In [None]:
"""
For numerical features, we selected the below:
Income
InterestRate
LoanAmount
Age
CreditScore
MonthsEmployed
DTIRatio

For categoricals, we selected
Education
EmploymentType
MaritalStatus
HasMortgage
HasDependents
LoanPurpose
HasCoSigner
"""

In [None]:
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder