# Step 1: Business Problem Understanding

In this case study, we need to build a model that can predict the loss given default (LGD) for defaulted accounts to enhance risk management and compliance with regulatory standards.

# Step 2: Define Business Objectives

For this assignment, the business objective is to build a model that can predict the Loss Given Default (LGD) for defaulted accounts.

# Step 3: Data Understanding

In [None]:
#Importing Libraries and Warnings.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Raise pandas' display limits so long/wide frames are printed without truncation.
for option_name in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option_name, 500)

In [None]:
# # Load datasets
# main_loan_base = pd.read_csv("main_loan_base.csv")
# repayment_base = pd.read_csv("repayment_base.csv")
# monthly_balance_base = pd.read_csv("monthly_balance_base.csv")

In [None]:
# Read the three training CSV files into data frames.
main_loan_base, repayment_base, monthly_balance_base = (
    pd.read_csv(csv_path)
    for csv_path in ('main_loan_base.csv', 'repayment_base.csv', 'monthly_balance_base.csv')
)

In [None]:
# Print the (rows, columns) shape of the main loan table, then preview its first rows.
print(main_loan_base.shape)
main_loan_base.head()

In [None]:
# Print the (rows, columns) shape of the repayment table, then preview its first rows.
print(repayment_base.shape)
repayment_base.head()

In [None]:
# Print the (rows, columns) shape of the monthly balance table, then preview its first rows.
print(monthly_balance_base.shape)
monthly_balance_base.head()

# Step 4: Data Cleaning and Pre-processing

In [None]:
# Parse the repayment dates into real timestamps.
repayment_base['repayment_date'] = pd.to_datetime(repayment_base['repayment_date'])

# Total repayment amount per loan account, returned as a two-column frame.
repayments_by_loan = repayment_base.groupby('loan_acc_num')['repayment_amount']
repayment_sum_per_loan = repayments_by_loan.sum().reset_index()

# Shape first, then a preview of the totals.
print(repayment_sum_per_loan.shape)
repayment_sum_per_loan.head()


In [None]:
# Parse the balance snapshot dates into real timestamps.
monthly_balance_base['date'] = pd.to_datetime(monthly_balance_base['date'])

# Row label of the most recent snapshot for every loan account.
dates_by_loan = monthly_balance_base.groupby('loan_acc_num')['date']
latest_date_idx = dates_by_loan.idxmax()

# Keep only those most-recent rows (one per loan account).
latest_balance_rows = monthly_balance_base.loc[latest_date_idx]

# Shape first, then a preview.
print(latest_balance_rows.shape)
latest_balance_rows.head()

In [None]:
# Left-join the per-loan repayment totals and the latest balance rows onto the loan
# table, so every loan in main_loan_base is kept even when it has no match.
Train_Data = (
    main_loan_base
    .merge(repayment_sum_per_loan, on='loan_acc_num', how='left')
    .merge(latest_balance_rows, on='loan_acc_num', how='left')
)

print(Train_Data.shape)
Train_Data.head()

### Feature Engineering
#### Creating new column as "Regular_Payment_Period" = "default_date" - "disbursal_date"

In [None]:
# Parse both date columns into timestamps.
for date_col in ('disbursal_date', 'default_date'):
    Train_Data[date_col] = pd.to_datetime(Train_Data[date_col])

# Months elapsed between disbursal and default (days / 30.4375, i.e. 365.25 / 12),
# rounded to two decimals.
days_to_default = (Train_Data['default_date'] - Train_Data['disbursal_date']).dt.days
Train_Data['Regular_Payment_Period'] = (days_to_default / 30.4375).round(2)

# Loan tenure expressed in months rather than years.
Train_Data['tenure_months'] = Train_Data['tenure_years'] * 12

# Preview the frame with the two new columns.
Train_Data.head()


#### Removing the date columns (and other columns not needed for modelling), as they won't be useful for building the final model

In [None]:
# Date columns, customer text fields and the superseded tenure_years column.
drop_cols = [
    'disbursal_date', 'default_date', 'date',
    'customer_address', 'customer_name', 'tenure_years',
]

# Remove them from Train_Data in place.
Train_Data.drop(drop_cols, axis=1, inplace=True)

# Shape first, then a preview of the reduced frame.
print(Train_Data.shape)
Train_Data.head()

In [None]:
# Print the column names, non-null counts and dtypes of the merged training frame.
Train_Data.info()

### Handling missing values

In [None]:
# Percentage of missing values per column, largest first.
missing_pct = Train_Data.isnull().mean() * 100
missing_pct.round(2).sort_values(ascending=False)

In [None]:
# Replace missing values in 'repayment_amount' and 'balance_amount' with 0.
# Assign the filled columns back to the frame: calling fillna(inplace=True) on
# Train_Data['col'] operates on a temporary Series and leaves Train_Data unchanged
# when pandas Copy-on-Write is active.
amount_cols = ['repayment_amount', 'balance_amount']
Train_Data[amount_cols] = Train_Data[amount_cols].fillna(0)

# Re-check the missing-value percentages after the fill.
round(100*Train_Data.isnull().mean(),2).sort_values(ascending = False)

In [None]:
# Preview the first rows of Train_Data.
Train_Data.head()

In [None]:
# Count how many rows fall under each distinct loan_type value.
Train_Data['loan_type'].value_counts()

### Creating LGD column

In [None]:
# Loss Given Default: share of the loan amount not covered by collateral plus repayments.
recovered_amount = Train_Data['collateral_value'] + Train_Data['repayment_amount']
Train_Data['LGD'] = (Train_Data['loan_amount'] - recovered_amount) / Train_Data['loan_amount']

# Preview the frame with the new LGD column.
Train_Data.head()

### EDA
Performing Exploratory Data Analysis of Final Data to understand relationship of all variables with Target Variable as "LGD"

In [None]:
# Number of rows per loan type.
loan_type_counts = Train_Data['loan_type'].value_counts()

# Slice colours.
colors = ['#ff9999', '#66b3ff', '#99ff99', '#ffcc99']

# Pie chart of the loan-type shares, labelled with one-decimal percentages.
plt.figure(figsize=(4, 4))
plt.pie(
    loan_type_counts,
    labels=loan_type_counts.index,
    colors=colors,
    autopct='%1.1f%%',
    startangle=180,
)
plt.axis('equal')  # equal aspect ratio keeps the pie circular
plt.title('Distribution of Loan Types')
plt.show()


In [None]:
# Show the column labels currently present in Train_Data.
Train_Data.columns

In [None]:
plt.figure(figsize=(6, 4))

# Draw loan amount first and repayment amount on top of it, both in thousands.
# ci=None removes the error bars.
bar_specs = [
    ('loan_amount', 'lightgreen', 'Loan Amount'),
    ('repayment_amount', 'lightcoral', 'Repayment Amount'),
]
for amount_col, bar_color, bar_label in bar_specs:
    ax = sns.barplot(data=Train_Data, x='loan_type', y=Train_Data[amount_col] / 1000,
                     color=bar_color, label=bar_label, ci=None)

# Write each bar's height slightly above its top edge.
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.annotate(f'{bar_top:.2f}', (bar_centre, bar_top),
                ha='center', va='center', fontsize=8, color='black', xytext=(0, 5),
                textcoords='offset points')

plt.title('Loan Amount and Repayment Amount by Loan Type')
plt.xlabel('Loan Type')
plt.ylabel('Amount (in thousands)')
plt.xticks(rotation=0)
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Histograms of the repayment-behaviour counters, one figure per loan type.
from itertools import cycle

# Columns to plot (one subplot row each) and the palette for the figures.
columns_to_plot = ['cheque_bounces', 'missed_repayments', 'number_of_loans']
colors = ['orange', 'salmon', 'lightgreen','crimson']

# cycle() repeats the palette, so loan types beyond len(colors) still get a figure;
# zipping against the bare list silently skips them.
for loan_type, color in zip(Train_Data['loan_type'].unique(), cycle(colors)):
    # Rows belonging to the current loan type.
    loan_type_data = Train_Data[Train_Data['loan_type'] == loan_type]

    # squeeze=False keeps `axes` two-dimensional even when there is a single row.
    fig, axes = plt.subplots(len(columns_to_plot), 1,
                             figsize=(12, 3 * len(columns_to_plot)), squeeze=False)

    for ax, column in zip(axes[:, 0], columns_to_plot):
        pretty_name = column.replace("_", " ").title()
        ax.hist(loan_type_data[column], color=color, bins=20, edgecolor='black', linewidth=1.5, alpha=0.7)
        ax.set_title(f'Distribution of {pretty_name} for Loan Type: {loan_type}')
        ax.set_xlabel(pretty_name)
        ax.set_ylabel('Frequency')
        ax.grid(False)

        # Print the count above every histogram bar.
        for rect in ax.patches:
            height = rect.get_height()
            ax.text(rect.get_x() + rect.get_width() / 2, height + 5, f'{height:.0f}', ha='center', va='bottom')

        # Horizontal padding so the outer bars do not touch the frame.
        ax.margins(x=0.1)

    # Adjust the subplot layout and render this loan type's figure.
    plt.tight_layout()
    plt.show()


In [None]:
# Average LGD within each loan type.
mean_lgd_by_loan_type = Train_Data.groupby('loan_type')['LGD'].mean()

# Bar colour.
color ='limegreen'

# Bar chart of the per-type means.
plt.figure(figsize=(6, 4))
mean_lgd_by_loan_type.plot.bar(color=color)
plt.title('Mean LGD by Loan Type')
plt.xlabel('Loan Type')
plt.ylabel('Mean LGD')
plt.xticks(rotation=0)
plt.grid(axis='y')

# Write each mean, to two decimals, on top of its bar.
for bar_pos, bar_value in enumerate(mean_lgd_by_loan_type):
    plt.text(bar_pos, bar_value, f'{bar_value:.2f}', ha='center', va='bottom')

plt.show()


In [None]:
# Histograms of the monetary columns, one figure per loan type.
from itertools import cycle

# Columns to plot (one subplot row each) and the palette for the figures.
columns_to_plot = ['loan_amount', 'collateral_value', 'monthly_emi', 'repayment_amount','balance_amount']
colors = ['limegreen', 'dodgerblue','orchid', 'mediumorchid']

# cycle() repeats the palette, so loan types beyond len(colors) still get a figure;
# zipping against the bare list silently skips them.
for loan_type, color in zip(Train_Data['loan_type'].unique(), cycle(colors)):
    # Rows belonging to the current loan type.
    loan_type_data = Train_Data[Train_Data['loan_type'] == loan_type]

    # squeeze=False keeps `axes` two-dimensional even when there is a single row.
    fig, axes = plt.subplots(len(columns_to_plot), 1,
                             figsize=(12, 3 * len(columns_to_plot)), squeeze=False)

    for ax, column in zip(axes[:, 0], columns_to_plot):
        pretty_name = column.replace("_", " ").title()
        ax.hist(loan_type_data[column], color=color, bins=20, edgecolor='black', alpha=0.7)
        ax.set_title(f'Distribution of {pretty_name} for Loan Type: {loan_type}')
        ax.set_xlabel(pretty_name)
        ax.set_ylabel('Frequency')
        ax.grid(False)

        # Print the count above every histogram bar.
        for rect in ax.patches:
            height = rect.get_height()
            ax.text(rect.get_x() + rect.get_width() / 2, height + 5, f'{height:.0f}', ha='center', va='bottom')

        # Horizontal padding so the outer bars do not touch the frame.
        ax.margins(x=0.1)

    # Adjust the subplot layout and render this loan type's figure.
    plt.tight_layout()
    plt.show()


In [None]:
# Histograms of the duration columns, one figure per loan type.
from itertools import cycle

# Columns to plot (one subplot row each) and the palette for the figures.
columns_to_plot = ['vintage_in_months', 'Regular_Payment_Period', 'tenure_months']
colors = ['tomato', 'gold', 'skyblue','teal']

# cycle() repeats the palette, so loan types beyond len(colors) still get a figure;
# zipping against the bare list silently skips them.
for loan_type, color in zip(Train_Data['loan_type'].unique(), cycle(colors)):
    # Rows belonging to the current loan type.
    loan_type_data = Train_Data[Train_Data['loan_type'] == loan_type]

    # squeeze=False keeps `axes` two-dimensional even when there is a single row.
    fig, axes = plt.subplots(len(columns_to_plot), 1,
                             figsize=(12, 3 * len(columns_to_plot)), squeeze=False)

    for ax, column in zip(axes[:, 0], columns_to_plot):
        pretty_name = column.replace("_", " ").title()
        ax.hist(loan_type_data[column], color=color, bins=20, edgecolor='black', alpha=0.7)
        ax.set_title(f'Distribution of {pretty_name} for Loan Type: {loan_type}')
        ax.set_xlabel(pretty_name)
        ax.set_ylabel('Frequency')
        ax.grid(False)

        # Print the count above every histogram bar.
        for rect in ax.patches:
            height = rect.get_height()
            ax.text(rect.get_x() + rect.get_width() / 2, height + 5, f'{height:.0f}', ha='center', va='bottom')

        # Horizontal padding so the outer bars do not touch the frame.
        ax.margins(x=0.1)

    # Adjust the subplot layout and render this loan type's figure.
    plt.tight_layout()
    plt.show()


In [None]:
# Histogram of the interest column, one figure per loan type.
from itertools import cycle

# Columns to plot (one subplot row each) and the palette for the figures.
columns_to_plot = ['interest']
colors = ['indigo', 'mediumorchid', 'darkorange', 'limegreen', 'tomato',
          'skyblue', 'salmon', 'gold', 'teal', 'crimson', 'orchid']

# cycle() repeats the palette, so loan types beyond len(colors) still get a figure;
# zipping against the bare list silently skips them.
for loan_type, color in zip(Train_Data['loan_type'].unique(), cycle(colors)):
    # Rows belonging to the current loan type.
    loan_type_data = Train_Data[Train_Data['loan_type'] == loan_type]

    # squeeze=False keeps `axes` two-dimensional even for this single-row layout,
    # so no special case is needed to pick the axes object.
    fig, axes = plt.subplots(len(columns_to_plot), 1,
                             figsize=(12, 3 * len(columns_to_plot)), squeeze=False)

    for ax, column in zip(axes[:, 0], columns_to_plot):
        pretty_name = column.replace("_", " ").title()
        ax.hist(loan_type_data[column], color=color, bins=20, edgecolor='black', alpha=0.7)
        ax.set_title(f'Distribution of {pretty_name} for Loan Type: {loan_type}')
        ax.set_xlabel(pretty_name)
        ax.set_ylabel('Frequency')
        ax.grid(False)

        # Print the count above every histogram bar.
        for rect in ax.patches:
            height = rect.get_height()
            ax.text(rect.get_x() + rect.get_width() / 2, height + 5, f'{height:.0f}', ha='center', va='bottom')

        # Horizontal padding so the outer bars do not touch the frame.
        ax.margins(x=0.1)

    # Adjust the subplot layout and render this loan type's figure.
    plt.tight_layout()
    plt.show()


In [None]:
# Correlation heatmap of the numeric columns.
# Restrict to numeric dtypes first: Train_Data still carries non-numeric columns
# (such as loan_type), and DataFrame.corr() raises on those in pandas >= 2.0.
correlation_matrix = Train_Data.select_dtypes(include='number').corr()
plt.figure(figsize=(20, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='mako', center=0)
plt.title('Correlation Matrix Heatmap')
plt.show()


### As we can see, all loan types contribute equally to the overall distribution; hence this categorical input variable would introduce multicollinearity in model building, so we delete this column

In [None]:
# Remove the categorical loan_type column before modelling, then preview the result.
Train_Data = Train_Data.drop(columns=["loan_type"])
Train_Data.head()

### Train-Test Split

In [None]:
# The LGD column is the target; every other column is a candidate predictor.
y = Train_Data["LGD"]
X = Train_Data.drop(columns=["LGD"])

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 train/test split; the fixed random_state makes the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=100,
)

In [None]:
# Print the shape of the training features, then preview their first rows.
print(X_train.shape)
X_train.head()

In [None]:
# Print the shape of the hold-out features, then preview their first rows.
print(X_test.shape)
X_test.head()

In [None]:
# Keep references to the split frames that still contain loan_acc_num;
# X_train / X_test are rebound to reduced frames in later cells.
X_train_1, X_test_1 = X_train, X_test

In [None]:
# Print the shape of the training target, then show its first values with the
# original row labels exposed as an 'index' column.
print(y_train.shape)
y_train.head().reset_index()

In [None]:
# Print the shape of the hold-out target, then show its first values with the
# original row labels exposed as an 'index' column.
print(y_test.shape)
y_test.head().reset_index()

In [None]:
# Build the training feature frame without the loan_acc_num column.
X_train = X_train_1.drop(columns=['loan_acc_num'])
X_train.head()

In [None]:
# Build the hold-out feature frame without the loan_acc_num column.
X_test = X_test_1.drop(columns=['loan_acc_num'])
X_test.head()

In [None]:
# Correlation heatmap of the numeric columns that remain after dropping loan_type.
# Restrict to numeric dtypes first so that any non-numeric column still present
# (e.g. an identifier stored as text) cannot make DataFrame.corr() raise in
# pandas >= 2.0.
correlation_matrix = Train_Data.select_dtypes(include='number').corr()
plt.figure(figsize=(20, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='viridis', center=0)
plt.title('Correlation Matrix Heatmap')
plt.show()

### Rescaling the Features
### We will use MinMax scaling.

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn each training feature's min/max and rescale the training split with them.
scaler = MinMaxScaler()
train_scaled_values = scaler.fit_transform(X_train)

# fit_transform returns a bare array; wrap it to restore the column names
# (the row labels become 0..n-1).
X_train_scaled = pd.DataFrame(train_scaled_values, columns=X_train.columns)
X_train_scaled.head()

In [None]:
# Show the (rows, columns) shape of the hold-out features before scaling.
X_test.shape

In [None]:
# Scale the hold-out features with the scaler already fitted on X_train.
# Fitting a second MinMaxScaler on X_test would map the hold-out split onto its own
# min/max, so its values would not be on the scale the model is trained on.
X_test_scaled = scaler.transform(X_test)

# transform returns a bare array; wrap it to restore the column names
# (the row labels become 0..n-1).
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)
print(X_test.shape)
X_test_scaled.head()

### Scaling

### Modeling
### Feature Selection

In [None]:
from sklearn.feature_selection import RFE  # RFE (Recursive Feature Elimination)
from sklearn.linear_model import LinearRegression

# Let RFE, driven by a linear regression, retain 12 features of the scaled training data.
estimator = LinearRegression()
selector = RFE(estimator, n_features_to_select=12).fit(X_train_scaled, y_train)

# Boolean mask over the columns: True where the feature was retained.
selector.support_

In [None]:
# Names of the columns that RFE retained.
selected_features = X_train_scaled.columns[selector.support_].tolist()
selected_features

In [None]:
# Restrict both scaled splits to the RFE-selected columns.
X_train, X_test = (
    scaled_split[selected_features]
    for scaled_split in (X_train_scaled, X_test_scaled)
)

# Preview both reduced frames.
for reduced_split in (X_train, X_test):
    print(reduced_split.head())

In [None]:
# Preview the first rows of the reduced hold-out features.
X_test.head()

In [None]:
import statsmodels.api as sm

# Add the constant (intercept) column to both the training and hold-out features.
X_train_sm, X_test_sm = (sm.add_constant(split) for split in (X_train, X_test))

In [None]:
# Give the target and the design matrix the same 0..n-1 row labels.
y_train = y_train.reset_index(drop=True)
X_train_sm = X_train_sm.reset_index(drop=True)

# Model I: ordinary least squares on the current training design matrix.
model1 = sm.OLS(endog=y_train, exog=X_train_sm)
res1 = model1.fit()
res1.summary()

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor of every column of the training design matrix
# (the constant column included).
design_values = X_train_sm.values
vif_data = pd.DataFrame({
    "Feature": X_train_sm.columns,
    "VIF": [variance_inflation_factor(design_values, col_pos)
            for col_pos in range(design_values.shape[1])],
})
vif_data

In [None]:
# Drop loan_amount from both the training and hold-out design matrices.
X_train_sm, X_test_sm = (
    design.drop(columns=["loan_amount"])
    for design in (X_train_sm, X_test_sm)
)

In [None]:
# Model II: OLS refit on the current (reduced) training design matrix.
model2 = sm.OLS(endog=y_train, exog=X_train_sm)
res2 = model2.fit()
res2.summary()

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor of every column of the current training design matrix
# (the constant column included).
design_values = X_train_sm.values
vif_data = pd.DataFrame({
    "Feature": X_train_sm.columns,
    "VIF": [variance_inflation_factor(design_values, col_pos)
            for col_pos in range(design_values.shape[1])],
})
vif_data

In [None]:
# Drop Regular_Payment_Period from both the training and hold-out design matrices.
X_train_sm, X_test_sm = (
    design.drop(columns=["Regular_Payment_Period"])
    for design in (X_train_sm, X_test_sm)
)

In [None]:
# Model III: OLS refit on the current (further reduced) training design matrix.
model3 = sm.OLS(endog=y_train, exog=X_train_sm)
res3 = model3.fit()
res3.summary()

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor of every column of the current training design matrix
# (the constant column included).
design_values = X_train_sm.values
vif_data = pd.DataFrame({
    "Feature": X_train_sm.columns,
    "VIF": [variance_inflation_factor(design_values, col_pos)
            for col_pos in range(design_values.shape[1])],
})
vif_data

### Picking up Model-III for performing Residual Analysis

### Residual Analysis of the train data

Next, we check whether the error terms are normally distributed (which is, in fact, one of the major assumptions of linear regression) by plotting a histogram of the error terms and inspecting its shape.

In [None]:
# In-sample predictions of Model III on the training design matrix.
y_train_pred_1 = res3.predict(X_train_sm)

# Number of predictions, followed by their summary statistics. (A mid-cell
# head().reset_index() expression was removed: its result was never shown or stored.)
print(y_train_pred_1.shape)
y_train_pred_1.describe()

In [None]:
# Plot the distribution of the training error terms (actual - predicted).
# sns.distplot is deprecated; histplot with kde=True and stat='density' draws the
# same kind of density histogram with a KDE curve.
fig = plt.figure()
sns.histplot(y_train - y_train_pred_1, bins=20, kde=True, stat='density')
fig.suptitle('Error Terms', fontsize = 10)                  # Plot heading
plt.xlabel('Errors', fontsize = 10)                         # X-label

#### R^2 Value for Train Dataset

In [None]:
from sklearn.metrics import r2_score

# R^2 of Model III on the training split, shown to four decimals.
r2 = r2_score(y_true=y_train, y_pred=y_train_pred_1)
round(r2, 4)

In [None]:
# Predict LGD for the hold-out design matrix with Model III, then show how often
# each distinct predicted value occurs.
y_test_pred_1 = res3.predict(X_test_sm)
y_test_pred_1.value_counts()

### Plotting Loan Account number Distribution Plot wrt Predicted Loss Given Default (LGD) Values

In [None]:
# Name the prediction Series 'LGD' so it becomes the LGD column below.
y_test_pred_1 = y_test_pred_1.rename('LGD')

# Pair every hold-out loan account with its predicted LGD by row position.
# X_test_1 keeps the row labels of the original split, while the predictions carry the
# 0..n-1 labels of the rescaled frame; a label-aligned concat would pair the wrong
# rows and pad both columns with NaN, so both sides are reset to positional labels.
matched_data_train = pd.concat(
    [
        X_test_1['loan_acc_num'].reset_index(drop=True),
        y_test_pred_1.reset_index(drop=True),
    ],
    axis=1,
)

# Display the matched data
matched_data_train

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# White-grid seaborn theme for this and later plots.
sns.set(style="whitegrid")

# Histogram (with KDE curve) of the LGD values predicted for the hold-out split.
plt.figure(figsize=(20, 8))
ax = sns.histplot(matched_data_train['LGD'], kde=True, color='blue')

# Annotate every bar with its count.
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 4
    ax.text(label_x, bar_height, f'{bar_height:.0f}', ha='center', va='bottom')

plt.xlabel('LGD')
plt.ylabel('Frequency')
plt.title('Distribution of LGD Values')
plt.show()

In [None]:
# Show the (rows, columns) shape of the hold-out design matrix.
X_test_sm.shape

In [None]:
# Show the number of hold-out target values.
y_test.shape

In [None]:
# Show the number of hold-out predictions (should match y_test above).
y_test_pred_1.shape

#### Adjusted R^2 Value for Train

In [None]:
# Adjusted R^2 of Model III on the training split.
# Number of observations (n) is the shape along axis 0.
n = X_train_sm.shape[0]
# Number of predictors (p): X_train_sm also holds the constant (intercept) column
# added by sm.add_constant, which the adjusted-R^2 formula does not count as a feature.
p = X_train_sm.shape[1] - 1

# Recompute the training R^2 here instead of reading the global `r2`, which other
# cells overwrite with test-set values.
train_r2 = r2_score(y_train, y_train_pred_1)

# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1)
adjusted_r2 = 1-(1-train_r2)*(n-1)/(n-p-1)
adjusted_r2

#### R^2 Value for TEST

In [None]:
from sklearn.metrics import r2_score

# R^2 of Model III on the hold-out split, shown to four decimals.
r2 = r2_score(y_true=y_test, y_pred=y_test_pred_1)
round(r2, 4)

In [None]:
from sklearn.metrics import r2_score

# Hold-out R^2 expressed as a percentage with two decimals.
test_r2_pct = round(r2_score(y_test, y_test_pred_1) * 100, 2)
print("Test Performance:", test_r2_pct, "%")

In [None]:
# Plot the distribution of the hold-out error terms (actual - predicted).
# The residual pairs the hold-out targets (y_test, not y_train) with the hold-out
# predictions. Subtract by position: y_test keeps the row labels of the original
# split while the predictions are labelled 0..n-1, so label alignment would mismatch.
test_residuals = y_test.to_numpy() - y_test_pred_1.to_numpy()

# sns.distplot is deprecated; histplot with kde=True and stat='density' draws the
# same kind of density histogram with a KDE curve.
fig = plt.figure()
sns.histplot(test_residuals, bins=10, kde=True, stat='density')
fig.suptitle('Error Terms', fontsize=10)  # Plot heading
plt.xlabel('Errors', fontsize=10)

In [None]:
# Show the fitted coefficients of Model III (one per design-matrix column).
res3.params

## Making Predictions on Test Data Provided

In [None]:
# Read the three provided test CSV files into data frames.
main_loan_base_test, repayment_base_test, monthly_balance_base_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'test_main_loan_base.csv',
        'test_repayment_base.csv',
        'test_monthly_balance_base.csv',
    )
)

In [None]:
# Print the shape of the test loan table, then preview its first rows.
print(main_loan_base_test.shape)
main_loan_base_test.head()

In [None]:
# Print the shape of the test repayment table, then preview its first rows.
print(repayment_base_test.shape)
repayment_base_test.head()

In [None]:
# Print the shape of the test monthly balance table, then preview its first rows.
print(monthly_balance_base_test.shape)
monthly_balance_base_test.head()

In [None]:
# Parse the test repayment dates into real timestamps.
repayment_base_test['repayment_date'] = pd.to_datetime(repayment_base_test['repayment_date'])

# Total repayment amount per loan account, returned as a two-column frame.
test_repayments_by_loan = repayment_base_test.groupby('loan_acc_num')['repayment_amount']
repayment_sum_per_loan_test = test_repayments_by_loan.sum().reset_index()

# Shape first, then a preview of the totals.
print(repayment_sum_per_loan_test.shape)
repayment_sum_per_loan_test.head()


In [None]:
# Parse the test balance snapshot dates into real timestamps.
monthly_balance_base_test['date'] = pd.to_datetime(monthly_balance_base_test['date'])

# Row label of the most recent snapshot for every loan account.
test_dates_by_loan = monthly_balance_base_test.groupby('loan_acc_num')['date']
latest_date_idx = test_dates_by_loan.idxmax()

# Keep only those most-recent rows (one per loan account).
latest_balance_rows_test = monthly_balance_base_test.loc[latest_date_idx]

# Shape first, then a preview.
print(latest_balance_rows_test.shape)
latest_balance_rows_test.head()

### Merging all "TEST" datasets together

In [None]:

# Left-join the per-loan repayment totals and the latest balance rows onto the test
# loan table, so every test loan is kept even when it has no match.
Test_Data_1 = (
    main_loan_base_test
    .merge(repayment_sum_per_loan_test, on='loan_acc_num', how='left')
    .merge(latest_balance_rows_test, on='loan_acc_num', how='left')
)

print(Test_Data_1.shape)
Test_Data_1.head()

### Feature Engineering
#### Creating new column as "Regular_Payment_Period" = "default_date" - "disbursal_date"

In [None]:
# Parse both date columns into timestamps.
for date_col in ('disbursal_date', 'default_date'):
    Test_Data_1[date_col] = pd.to_datetime(Test_Data_1[date_col])

# Months elapsed between disbursal and default (days / 30.4375, i.e. 365.25 / 12),
# rounded to two decimals.
test_days_to_default = (Test_Data_1['default_date'] - Test_Data_1['disbursal_date']).dt.days
Test_Data_1['Regular_Payment_Period'] = (test_days_to_default / 30.4375).round(2)

# Loan tenure expressed in months rather than years.
Test_Data_1['tenure_months'] = Test_Data_1['tenure_years'] * 12

# Preview the frame with the two new columns.
Test_Data_1.head()


#### Removing the date columns (and other columns not needed for modelling), as they won't be useful for building the final model

In [None]:
# Date columns, customer text fields and the superseded tenure_years column.
drop_cols = [
    'disbursal_date', 'default_date', 'date',
    'customer_address', 'customer_name', 'tenure_years',
]

# Build Test_Data as a reduced copy; Test_Data_1 itself is left untouched.
Test_Data = Test_Data_1.drop(drop_cols, axis=1)

# Shape first, then a preview of the reduced frame.
print(Test_Data.shape)
Test_Data.head()


In [None]:
# Print the column names, non-null counts and dtypes of the merged test frame.
Test_Data.info()

### Handling missing values

In [None]:
# Percentage of missing values per test column, largest first.
test_missing_pct = Test_Data.isnull().mean() * 100
test_missing_pct.round(2).sort_values(ascending=False)

In [None]:
# Replace missing values in 'repayment_amount' and 'balance_amount' with 0.
# Assign the filled columns back to the frame: calling fillna(inplace=True) on
# Test_Data['col'] operates on a temporary Series and leaves Test_Data unchanged
# when pandas Copy-on-Write is active.
amount_cols = ['repayment_amount', 'balance_amount']
Test_Data[amount_cols] = Test_Data[amount_cols].fillna(0)

# Re-check the missing-value percentages after the fill.
round(100*Test_Data.isnull().mean(),2).sort_values(ascending = False)

In [None]:
# Preview the first rows of Test_Data.
Test_Data.head()

In [None]:
# Drop the loan_acc_num column from Test_Data (Test_Data_1 still holds it), then preview.
Test_Data = Test_Data.drop('loan_acc_num', axis=1)
Test_Data.head()

In [None]:
# Count how many test rows fall under each distinct loan_type value.
Test_Data['loan_type'].value_counts()

### Creating LGD column

In [None]:
# Loss Given Default: share of the loan amount not covered by collateral plus repayments.
test_recovered_amount = Test_Data['collateral_value'] + Test_Data['repayment_amount']
Test_Data['LGD'] = (Test_Data['loan_amount'] - test_recovered_amount) / Test_Data['loan_amount']

# Preview the frame with the new LGD column.
Test_Data.head()

### As we can see, all loan types contribute equally to the overall distribution; hence this categorical input variable would introduce multicollinearity in model building, so we delete this column

In [None]:
# Remove the categorical loan_type column, then report the shape and preview the result.
Test_Data = Test_Data.drop(columns=["loan_type"])
print(Test_Data.shape)
Test_Data.head()

### Rescaling the Features
### We will use MinMax scaling.

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Scale the provided test features with the min/max learned from the TRAINING
# features (X_train_1 is the unscaled training split, still holding loan_acc_num).
# Fitting a scaler on Test_Data itself would put the features on a different scale
# from the one the model was fitted on, and would also rescale the LGD target that
# the predictions are later compared with.
train_features = X_train_1.drop('loan_acc_num', axis=1)
scaler = MinMaxScaler().fit(train_features)

# Select the training columns by name so the column order matches the fitted scaler.
feature_cols = list(train_features.columns)
Test_Data_scaled = pd.DataFrame(
    scaler.transform(Test_Data[feature_cols]),
    columns=feature_cols,
    index=Test_Data.index,
)

# Carry the actual LGD across unscaled so it stays comparable with the model output.
Test_Data_scaled['LGD'] = Test_Data['LGD']

print(Test_Data_scaled.shape)
Test_Data_scaled.head()

### Modeling the Test_Data

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of every scaled test column.
Test_Data_scaled.describe()

In [None]:
# Split the scaled test frame into target and features.
# pop() removes 'LGD' from Test_Data_scaled in place, so X_test (the same object)
# holds only the feature columns afterwards. This rebinds the y_test / X_test names
# used earlier for the hold-out split.
# NOTE(review): if the scaling cell rescaled the LGD column together with the
# features, y_test is not on the scale of the model's predictions - confirm.
y_test = Test_Data_scaled.pop('LGD')
X_test = Test_Data_scaled

In [None]:
# Preview the first target values of the provided test data.
y_test.head()

In [None]:
# Preview the first feature rows of the provided test data.
X_test.head()

In [None]:
import statsmodels.api as sm

# Add the constant (intercept) column to the provided test features, then preview.
X_test_sm = sm.add_constant(data=X_test)
X_test_sm.head()

In [None]:
# Keep exactly the columns (constant included) of the final training design matrix,
# in the same order.
selected_features_test = X_train_sm.columns
X_test_sm = X_test_sm.loc[:, selected_features_test]
X_test_sm.head()

In [None]:
# LGD predicted by Model III for the provided test loans, followed by summary statistics.
y_test_pred_2 = res3.predict(exog=X_test_sm)
y_test_pred_2.describe()

In [None]:
# Plot the distribution of the error terms (actual - predicted) on the provided test data.
# sns.distplot is deprecated; histplot with kde=True and stat='density' draws the
# same kind of density histogram with a KDE curve.
fig = plt.figure()
sns.histplot(y_test - y_test_pred_2, bins=20, kde=True, stat='density')
fig.suptitle('Error Terms', fontsize = 10)                  # Plot heading
plt.xlabel('Errors', fontsize = 10)                         # X-label

#### Adjusted R^2 Value for TEST

In [None]:
# Adjusted R^2 of Model III on the provided test data.
# Number of observations (n) is the shape along axis 0.
n = X_test_sm.shape[0]
# Number of predictors (p): exclude the constant (intercept) column added by
# sm.add_constant, which the adjusted-R^2 formula does not count as a feature.
p = X_test_sm.shape[1] - 1

# Score these predictions explicitly; the `r2` left over from an earlier cell
# belongs to the hold-out split, not to this data set.
r2 = r2_score(y_test, y_test_pred_2)

# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1)
adjusted_r2 = 1-(1-r2)*(n-1)/(n-p-1)
adjusted_r2

## Model Evaluation of Test Data

In [None]:
# Scatter of actual against predicted LGD on the provided test data to see the spread.
fig = plt.figure()
plt.scatter(y_test,y_test_pred_2)
# The heading names the series that is actually plotted (y_test_pred_2).
fig.suptitle('y_test vs y_test_pred_2', fontsize=20)              # Plot heading
plt.xlabel('y_test', fontsize=10)                          # X-label
plt.ylabel('y_pred', fontsize=10)                          # Y-label

In [None]:
# Regression plot of predicted against actual LGD on the provided test data:
# green points, red fitted line, 68% confidence band.
plt.figure()
sns.regplot(
    x=y_test,
    y=y_test_pred_2,
    fit_reg=True,
    ci=68,
    scatter_kws={"color": "green"},
    line_kws={"color": "red"},
)
plt.xlabel('y_test', fontsize=18)
plt.ylabel('y_pred', fontsize=16)
plt.title('y_test vs y_pred', fontsize=20)
plt.show()

### Plotting Loan Account number Distribution Plot wrt Predicted Loss Given Default (LGD) Values

In [None]:
# Name the prediction Series 'LGD' so it becomes the LGD column below.
y_test_pred_2 = y_test_pred_2.rename('LGD')

# Put the loan account numbers and the predicted LGD side by side.
account_numbers = Test_Data_1['loan_acc_num']
matched_data_test = pd.concat([account_numbers, y_test_pred_2], axis=1)

# Show the combined frame.
matched_data_test


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# White-grid seaborn theme.
sns.set(style="whitegrid")

# Histogram (with KDE curve) of the LGD values predicted for the provided test data.
plt.figure(figsize=(20, 8))
ax = sns.histplot(matched_data_test['LGD'], kde=True, color='red')

# Annotate every bar with its count.
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 4
    ax.text(label_x, bar_height, f'{bar_height:.0f}', ha='center', va='bottom')

plt.xlabel('LGD')
plt.ylabel('Frequency')
plt.title('Distribution of LGD Values')
plt.show()

In [None]:
# Show the fitted coefficients of Model III rounded to three decimals.
round(res3.params, 3)