# Credit Risk Modeling Using Python
## Based on online course from 365DataScience

In [180]:
#import relevant libraries

import sklearn
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os 

In [181]:
#import the datasets

#first dataset contains the backup storage for the data, this will not be changed throughout the project, but only kept as backup in case of errors
loan_data1 = pd.read_csv("data/3.1 loan_data_2007_2014 - 1.csv")



In [182]:
loan_data2 = pd.read_csv("data/3.1 loan_data_2007_2014 - 2.csv")


  loan_data2 = pd.read_csv("data/3.1 loan_data_2007_2014 - 2.csv")


In [183]:
loan_data3 = pd.read_csv("data/3.1 loan_data_2007_2014 - 3.csv",encoding = 'unicode_escape')

  loan_data3 = pd.read_csv("data/3.1 loan_data_2007_2014 - 3.csv",encoding = 'unicode_escape')


In [184]:
# concatenate these three files to one 
loan_data = pd.concat([loan_data1, loan_data2, loan_data3])
pd.options.display.max_rows = 50
pd.options.display.max_columns = None

In [185]:
loan_data
loan_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 466272 entries, 0 to 173349
Data columns (total 75 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Unnamed: 0                   466272 non-null  int64  
 1   id                           466272 non-null  int64  
 2   member_id                    466272 non-null  int64  
 3   loan_amnt                    466272 non-null  int64  
 4   funded_amnt                  466272 non-null  int64  
 5   funded_amnt_inv              466272 non-null  float64
 6   term                         466272 non-null  object 
 7   int_rate                     466272 non-null  float64
 8   installment                  466272 non-null  float64
 9   grade                        466272 non-null  object 
 10  sub_grade                    466272 non-null  object 
 11  emp_title                    438684 non-null  object 
 12  emp_length                   445264 non-null  object 
 13 

In [186]:
loan_data = loan_data.drop(loan_data['annual_inc'].nlargest(10).index)


In [187]:
# preprocessing the continuous variables

# delete the "year" or "years" from employement length variable
loan_data['emp_length'].unique()
loan_data["emp_length_int"] = loan_data["emp_length"].str.replace("\+ years", '')
loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('< 1 year', str(0))
loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('n/a', str(0))
loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' years', '')
loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' year', '')

# transform the string into numeric
loan_data["emp_length_int"] = pd.to_numeric(loan_data["emp_length_int"])

# double check if the new variable is of a numeric type
type(loan_data["emp_length_int"][0])

  loan_data["emp_length_int"] = loan_data["emp_length"].str.replace("\+ years", '')


pandas.core.series.Series

In [188]:
# transfor the term variable into numeric using the same steps as above
loan_data["term"].unique()
loan_data["term_int"] = loan_data["term"].str.replace(" months", "")
loan_data["term_int"] = pd.to_numeric(loan_data["term_int"])
loan_data["term_int"].unique()

array([36, 60], dtype=int64)

In [189]:
#transform the string variable indicating date time
loan_data['earliest_date'] = pd.to_datetime(loan_data['earliest_cr_line'], format = "%b-%y")
loan_data['days_passed'] = pd.to_datetime('2022-12-31') - loan_data['earliest_date']
loan_data['month_passed'] = round(pd.to_numeric((pd.to_datetime('2022-12-31')-loan_data['earliest_date'])/np.timedelta64(1, 'M')))
loan_data['month_passed'].describe()

count    466213.000000
mean        300.482664
std          93.976589
min        -551.000000
25%         244.000000
50%         286.000000
75%         346.000000
max         648.000000
Name: month_passed, dtype: float64

In [190]:
# time since the credit issued can not be negative, so, let's check what is happening there
loan_data[loan_data['month_passed'] < 0]

In the above specification, we can see that we have measurement error in years. To be precise, earliest date the credit line issued is stated with future dates, such as '2065-05-27', which is obviously not the case. Since our data is rich enough, we will not investigate the causes of this error, and drop the values with month_passed being negative. This way, we can get rid of credits issued later than the real time. 

In [None]:
#the following 
loan_data.drop(loan_data[loan_data['month_passed'] < 0].index, inplace=True)
loan_data['month_passed'].describe()
#


In [None]:
# run the same procedure for term and issue_date variables

loan_data['earliest_date'] = pd.to_datetime(loan_data['earliest_cr_line'], format = "%b-%y")
loan_data['days_passed'] = pd.to_datetime('2022-12-31') - loan_data['earliest_date']
loan_data['month_passed'] = round(pd.to_numeric((pd.to_datetime('2022-12-31')-loan_data['earliest_date'])/np.timedelta64(1, 'M')))
loan_data['month_passed'].describe()

In [None]:
type(loan_data['term'])
loan_data['term_int'] = pd.to_numeric(loan_data['term'].str.replace(' months', ''))
term_dummies = pd.get_dummies(loan_data['term_int'], prefix='term')
loan_data = pd.concat([loan_data, term_dummies], axis=1)



Since the term variable has only two inputs (36 and 60) representing months, we can confidently consider this as categorical variables. get_dummies command is creating two dummy variables. term_36 is equal to 1, when term is equal to 36 month, and 0 otherwise. The same definition is valid for term_60 variable as well. 

In [None]:
# calculate the issue date from end of 2022

loan_data['issue_d_date'] = pd.to_datetime(loan_data['issue_d'], format = '%b-%y')
loan_data['months_issue_d'] = round(pd.to_numeric((pd.to_datetime('2022-12-31') - loan_data['issue_d_date']) / np.timedelta64(1, 'M')))
loan_data['months_issue_d'].describe()

# the minimum of the months issued seems quite reasonable, therefore, we do not need to further make changes in this variable


## Preprocessing some discrete variables 

In [None]:
# create dummy variables for gender of the clients
all_dummies = [ pd.get_dummies(loan_data['grade'], prefix = 'grade'),
                pd.get_dummies(loan_data['sub_grade'], prefix = 'sub_grade'),
                pd.get_dummies(loan_data['home_ownership'], prefix= 'home_ownership'),
                pd.get_dummies(loan_data['verification_status'], prefix = 'verif_status'),
                pd.get_dummies(loan_data['loan_status'], prefix = 'loan_status'),
                pd.get_dummies(loan_data['purpose'], prefix = 'purpose'),
                pd.get_dummies(loan_data['addr_state'], prefix = 'addr_state'),
                pd.get_dummies(loan_data['initial_list_status'], prefix = 'initial_list_status')]
all_dummies = pd.concat(all_dummies, axis=1)

# concatenate the new dummy variables to the main dataset
loan_data = pd.concat([loan_data, all_dummies], axis=1)



This step creates dummy variables for mentioned categorical variables, such as gender, loan or verification status, or home ownership. In further modeling, these variables are treated separately. In other words, type of the ownership the client has in terms of housing could be rent, mortgage or his/her own, and these dummy variables help us estimate each of these home ownership impacts on credit risk. 

# Dealing with missing values

In [None]:
pd.options.display.max_rows=None
loan_data.isnull().sum()


Dealing with missing values (MV) in credit risk modeling can have a significant impact on model accuracy and performance. Depending on the context of each variables, we can come up with a way to deal with missing values. For example, missing values in maximum revolving amount could be replaced by the loan amount itself, meaning the limit is the loan amount per se. Employment title (emp_title) has very large number of missing values, and this has been categorized among more than 17 thousand categories. Therefore, this variable is not likely to become our main variables of interest. Dropping unimportant missing variables decreases the sample size, but has no benefits after all, sp we leave it as it is.

In [None]:
# MVs revolving limit is replaced by funded amount
loan_data['total_rev_hi_lim'].fillna(loan_data['funded_amnt'], inplace=True)

# MVs in annual income is replaced by mean of annual income 
loan_data['annual_inc'].fillna(loan_data['annual_inc'].mean(), inplace=True)

# MVs of the rest of the variables are replaced by zero based on their nature
loan_data['months_issue_d'].fillna(0, inplace=True)
loan_data['acc_now_delinq'].fillna(0, inplace=True)
loan_data['total_acc'].fillna(0, inplace=True)
loan_data['pub_rec'].fillna(0, inplace=True)
loan_data['open_acc'].fillna(0, inplace=True)
loan_data['inq_last_6mths'].fillna(0, inplace=True)
loan_data['delinq_2yrs'].fillna(0, inplace=True)
loan_data['emp_length_int'].fillna(0, inplace=True)


# Model building 

We can create dummy variables to indicate whether a continuous variable falls within certain intervals, but this is not a typical approach for logistic regression. Logistic regression is designed to model the relationship between a binary response variable and one or more predictor variables, where the predictors can be continuous, categorical, or a combination of both.

In the case of a continuous predictor variable like income, it is more common to use the raw income values directly in the logistic regression model, rather than transforming them into dummy variables based on intervals. This is because the relationship between income and the response variable may be non-linear, and splitting income into discrete intervals may lead to loss of information and decrease the accuracy of the model.

That being said, in some cases, creating interval dummy variables may be useful if the relationship between the response variable and the predictor variable is not linear, and if the intervals are based on a priori knowledge or domain expertise. In these cases, the dummy variables can be included in the logistic regression model along with the raw income values to capture the non-linear relationship. However, this approach should be used with caution, as it can result in overfitting if the number of intervals is large.

Before we begin building the model, we need to specify the definition of default. We have loan status variable indicating the the loan status, whether it has fully been paid, charged off, defaulted, delaying the payment up to 120 days and so forth. 

In the following step, qualitative variable indicating if the loan status is good or bad is assigned with dummy variable. 1 represents the payment quality being good, that is, the loan is paid off in time, while 0 means all categories that represents critical situation with payments: default, charged off, payment is delayed for 31-120 days and not meeting the credit policy. This variable is further used in logistic and other regression models as dependent variable.

In [None]:
pd.options.display.max_rows=50
pd.options.display.max_columns=None
loan_data['loan_status'].value_counts()
loan_data["payment_quality"] = np.where(loan_data['loan_status'].isin(['Charged Off', 'Default', 
                                                                        'Does not meet the credit policy. Status_Charged Off',
                                                                        'Late (31-120 days)',]), 0, 1)

## Independent variables 
We can group the independent variables into two categories. Categorical (discrete) and continuous variables. We group the continuous variables into categorical variables, such as income and debt using "Weight of Evidence" method. This is  

## Training and Test data

In [None]:
from sklearn.model_selection import train_test_split
train_test_split(loan_data.drop('payment_quality', axis = 1), loan_data['payment_quality'])

In [None]:
loan_data_train_inputs, loan_data_test_inputs, loan_data_train_targets, loan_data_test_targets = train_test_split(loan_data.drop('payment_quality',
                                                                                                                  axis = 1), 
                                                                                                                  loan_data['payment_quality'],
                                                                                                                  test_size=0.25,
                                                                                                                  random_state=42)
shapes = [  loan_data_train_inputs.shape,
            loan_data_train_targets.shape,
            loan_data_test_inputs.shape,
            loan_data_test_targets.shape]

shapes



In [None]:
inputs = loan_data_train_inputs
targets = loan_data_train_targets

df=pd.concat([inputs['grade'], targets], axis = 1)

df.groupby(df.columns.values[0], as_index = False)[df.columns.values[1]].count()

df = pd.concat([df.groupby(df.columns.values[0], as_index=False)[df.columns.values[1]].count(),
                    df.groupby(df.columns.values[0], as_index=False)[df.columns.values[1]].mean()], axis = 1 )

df = df.iloc[:,[0,1,3]]
df

# Weight of Evidence and Information Value
Weight of Evidence (WOE) and Information Value (IV) are two statistical measures used in credit scoring and predictive modeling to evaluate the power of a predictor in explaining the target variable.

Weight of Evidence (WOE) is a measure of how well a predictor separates the positive (good) cases from the negative (bad) cases. It is the logarithmic transformation of the odds ratio and is expressed as the difference between the natural logarithm of the event rate for the positive cases and the event rate for the negative cases. WOE helps in transforming the original predictor into a new predictor that is better at separating the positive and negative cases.

Information Value (IV) is a measure of the strength of association between a predictor and the target variable. It is a simple summary statistic that can be used to select the best predictors for a predictive model. IV is the sum of the weighted differences between the event rate for the positive cases and the event rate for the negative cases, where the weight is the proportion of observations in that group.

In credit scoring, WOE and IV are used to select the best predictors for the credit scorecard, to transform the predictors so that they are better at separating the good and bad cases, and to evaluate the performance of the scorecard. In predictive modeling, they are used to identify the predictors that have the strongest association with the target variable and to transform the predictors so that they are better suited for building a predictive model.

## The formula for weight of evidence (WoE) is as following:
$$
WoE = ln\left(\frac{\text{proportion of good events}}{\text{proportion of bad events}}\right)
$$




The following woe_iv_dis function 

In [None]:
def woe_iv_dis(df, varname, qualitative_var):
    """
    Calculates the Weight of Evidence (WoE) and Information Value (IV) of a categorical variable.
    
    Parameters:
    df (pd.DataFrame): DataFrame containing the target variable and the categorical variable.
    varname (str): Name of the target variable.
    qualitative_var (str): Name of the categorical variable.
    
    Returns:
    pd.DataFrame: DataFrame containing the WoE and IV values for each category of the categorical variable.
    
    The columns of the output DataFrame are:
    - the categorical variable
    - 'freq. of class': frequency of each category
    - 'mean of class': mean value of the target variable for each category
    - 'proportions': proportion of each category in the data
    - 'n_good': number of good outcomes in each category
    - 'n_bad': number of bad outcomes in each category
    - 'prop_good': proportion of good outcomes in each category
    - 'prop_bad': proportion of bad outcomes in each category
    - 'weight_of_evidence': WoE value for each category
    - 'delta_WoE': difference in absolute value of WoE between consecutive categories
    - 'info_value': IV value for the categorical variable
    """
    df = pd.concat([df[varname], qualitative_var], axis = 1)
    df = pd.concat([df.groupby(df.columns.values[0], as_index=False)[df.columns.values[1]].count(),
                    df.groupby(df.columns.values[0], as_index=False)[df.columns.values[1]].mean()], axis = 1 )

    df = df.iloc[:, [0,1,3]]
    df.columns = [df.columns.values[0], 'freq. of class', 'mean of class']
    df['proportions'] = df['freq. of class'] / df['freq. of class'].sum()
    df['n_good'] = df['mean of class'] * df['freq. of class']
    df['n_bad'] = (1 - df['mean of class']) * df['freq. of class']
    df['prop_good'] = df['n_good'] / df['n_good'].sum()
    df['prop_bad'] = df['n_bad'] / df['n_bad'].sum()
    df['weight_of_evidence'] = np.log(df['prop_good']/df['prop_bad'])
    df = df.sort_values(['weight_of_evidence'])
    df = df.reset_index(drop=True)
    df['delta_WoE'] = df['weight_of_evidence'].diff().abs()
    df['info_value'] = (df['prop_good'] - df['prop_bad']) * df['weight_of_evidence']
    df['info_value'] = df['info_value'].sum()
    
    return df
    

In [None]:
inputs = loan_data_train_inputs
targets = loan_data_train_targets

df_test = woe_iv_dis(df=inputs, varname='grade',  qualitative_var= targets)
df_test

In [None]:
def plot_woe(df_woe_iv, fig_size=(20,8), x_degree=0):
    """"""
    x = np.array(df_woe_iv.iloc[:,0].apply(str))
    y = df_woe_iv['weight_of_evidence']
    plt.figure(figsize=fig_size)
    plt.plot(x,y, marker = 'o', linestyle = '--', color = 'k')
    plt.xlabel(df_woe_iv.columns[0])
    plt.ylabel('Weight of Evidence')
    plt.title(str('Weight of Evidence  ' + df_woe_iv.columns[0]))
    plt.xticks(rotation = x_degree)
    


In [None]:
plot_woe(df_test)

In [None]:
grades  = inputs['grade']
grades_order = ['A', 'B', 'C', 'D', 'E', 'F']
grades.sort_values()
#count the grades
grade_counts = {}
for grade in grades:
    if grade in grade_counts:
        grade_counts[grade] +=1
    else:
        grade_counts[grade] = 1

# plot the grade frequency
plt.figure(figsize=(10,5))
plt.bar(grades_order, [grade_counts[grade] for grade in grades_order])
plt.xlabel('Grade category')
plt.ylabel('Frequency')
plt.title('Frequency of grades in the Dataset')
plt.show()

In [None]:
# Estimate the weight of evidence of the home ownership variable
df_home_own = woe_iv_dis(inputs, 'home_ownership', targets)
df_home_own

In [None]:
plot_woe(df_home_own, (16,5))

In [None]:
# home ownership categories "other", "none" and "any" can be combined
inputs['home_own_none_other_any_combined'] = sum([inputs['home_ownership_ANY'], inputs['home_ownership_NONE'], inputs['home_ownership_OTHER']])

''' this step leaves us only four categories: rent, mortgage, own, and others combined in one variable. 
Using unification technique we combine the three less informative variables into one which does not impact on final result very much, 
yet cuts computational costs'''

In [None]:
# calculate weights of evidences for location
df_location = woe_iv_dis(inputs, 'addr_state', targets)
df_location

In [None]:
plot_woe(df_location, (16,5))

In [None]:
states = inputs['addr_state']
state_counts = {}
for state in states:
    if state in state_counts:
        state_counts[state] += 1
    else:
        state_counts[state] = 1

# Plot the frequency chart
plt.figure(figsize=(16,8))
plt.bar(state_counts.keys(), state_counts.values())
plt.xlabel('State Abbreviation')
plt.ylabel('Frequency')
plt.title('Frequency of State Abbreviations in the Dataset')
plt.show()

## Grouping the states based on states to create classes

This step of grouping the weights of evidence can be useful in credit risk modeling because it allows the model to group similar observations together. This can help to simplify the analysis and reduce the noise in the data, which can lead to more accurate predictions.

In credit risk modeling, the weights of evidence are often used to indicate the likelihood of a borrower defaulting on a loan. By grouping similar weights of evidence together, the analyst can identify patterns and relationships that may not be immediately apparent when examining each weight of evidence individually. This can help to inform the development of more accurate and effective credit risk models.

In the next several steps carry out the process of grouping the states. States with higher frequency in the data are taken as separate groups. States like California, New York, Florida and Texas are relatively more frequent than other states, therefore, each of these states constitute one group itself. For the rest, we employ the techniques to group them together and create classes based on weights of evidences.





In [None]:
# plot the weights of evidence removing the outliers
plot_woe(df_location.iloc[6:-6,:], (25,8))

In [None]:
# The 4 highly frequent states
inputs["st_group_TX"] = sum(inputs['addr_state_TX'])
inputs["st_group_FL"] = sum(inputs['addr_state_FL'])
inputs["st_group_NY"] = sum(inputs['addr_state_NY'])
inputs["st_group_CA"] = sum(inputs['addr_state_CA'])


# The rest 
inputs['st_group_NM_MD_NC_LA_MD'] = sum([inputs['addr_state_NM'],inputs['addr_state_MD'],inputs['addr_state_NC'], inputs['addr_state_LA'],
                                                inputs['addr_state_MD']])

inputs['st_group_MI_NJ_VA'] = sum([inputs['addr_state_MI'],inputs['addr_state_NJ'],inputs['addr_state_VA']])

inputs['st_group_OK_TN_AZ_DE_AR_UT'] = sum([inputs['addr_state_OK'],inputs['addr_state_TN'],inputs['addr_state_AZ'], inputs['addr_state_DE'],
                                                inputs['addr_state_AR'],inputs['addr_state_UT']])

inputs['st_group_KY_MN_NA_IN_OH'] = sum([inputs['addr_state_KY'],inputs['addr_state_MN'],inputs['addr_state_MA'],inputs['addr_state_IN'],
                                        inputs['addr_state_OH']])

inputs['st_group_RI_OR_GA_WA'] = sum([inputs['addr_state_RI'],inputs['addr_state_OR'],inputs['addr_state_GA'], inputs['addr_state_WA']])

inputs['st_group_SD_ID'] = sum([inputs['addr_state_SD'],inputs['addr_state_ID']])

inputs['st_group_MS_MT'] = sum([inputs['addr_state_MS'], inputs['addr_state_MT']])

inputs['st_group_IL_CT_CO'] = sum([inputs['addr_state_IL'], inputs['addr_state_CT'], inputs['addr_state_CT'], inputs['addr_state_CO']])

inputs['st_group_VT_SC'] = sum([inputs['addr_state_VT'], inputs['addr_state_SC']])

inputs['st_group_KS'] = sum([inputs['addr_state_KS']])



inputs.head()

In [None]:
inputs["term:36"] = np.where((inputs['term_int']==36),1,0)
inputs["term:60"] = np.where((inputs['term_int']==60),1,0)


In [None]:
df_ver_status = woe_iv_dis(inputs, 'verification_status', targets)
df_purpose = woe_iv_dis(inputs, "purpose", targets)
df_init_status = woe_iv_dis(inputs, "initial_list_status", targets)

In [None]:
plot_woe(df_ver_status, (16,5))

In [None]:
plot_woe(df_purpose, (20,8), x_degree=30)

In [None]:
plot_woe(df_init_status, (18,5))

# Continuous variables

In [None]:
def woe_iv_cont(df, varname, qualitative_var):
    """
    Calculates the Weight of Evidence (WoE) and Information Value (IV) of a categorical variable.
    
    Parameters:
    df (pd.DataFrame): DataFrame containing the target variable and the categorical variable.
    varname (str): Name of the target variable.
    qualitative_var (str): Name of the categorical variable.
    
    Returns:
    pd.DataFrame: DataFrame containing the WoE and IV values for each category of the categorical variable.
    
    The columns of the output DataFrame are:
    - the categorical variable
    - 'freq. of class': frequency of each category
    - 'mean of class': mean value of the target variable for each category
    - 'proportions': proportion of each category in the data
    - 'n_good': number of good outcomes in each category
    - 'n_bad': number of bad outcomes in each category
    - 'prop_good': proportion of good outcomes in each category
    - 'prop_bad': proportion of bad outcomes in each category
    - 'weight_of_evidence': WoE value for each category
    - 'delta_WoE': difference in absolute value of WoE between consecutive categories
    - 'info_value': IV value for the categorical variable
    """
    df = pd.concat([df[varname], qualitative_var], axis = 1)
    df = pd.concat([df.groupby(df.columns.values[0], as_index=False)[df.columns.values[1]].count(),
                    df.groupby(df.columns.values[0], as_index=False)[df.columns.values[1]].mean()], axis = 1 )

    df = df.iloc[:, [0,1,3]]
    df.columns = [df.columns.values[0], 'freq. of class', 'mean of class']
    df['proportions'] = df['freq. of class'] / df['freq. of class'].sum()
    df['n_good'] = df['mean of class'] * df['freq. of class']
    df['n_bad'] = (1 - df['mean of class']) * df['freq. of class']
    df['prop_good'] = df['n_good'] / df['n_good'].sum()
    df['prop_bad'] = df['n_bad'] / df['n_bad'].sum()
    df['weight_of_evidence'] = np.log(df['prop_good']/df['prop_bad'])
    
    # Unlike previous weight of evidence function, we do not sort the variable by weo, but instead, keep the variable's natural order

    df['delta_WoE'] = df['weight_of_evidence'].diff().abs()
    df['info_value'] = (df['prop_good'] - df['prop_bad']) * df['weight_of_evidence']
    df['info_value'] = df['info_value'].sum()
    
    return df
    

In [None]:
# employement length

df_emp_l = woe_iv_cont(inputs, 'emp_length_int', targets)
plot_woe(df_emp_l, (16,5))

In [None]:
df_emp_l

In [None]:
inputs['emp_length:0'] = np.where(inputs['emp_length'].isin([0]), 1, 0)
inputs['emp_length:1'] = np.where(inputs['emp_length'].isin([1]), 1, 0)
inputs['emp_length:2-4'] = np.where(inputs['emp_length'].isin(range(2,5)), 1, 0)
inputs['emp_length:5-6'] = np.where(inputs['emp_length'].isin(range(5,7)), 1, 0)
inputs['emp_length:7-9'] = np.where(inputs['emp_length'].isin(range(7,10)), 1, 0)
inputs['emp_length:10'] = np.where(inputs['emp_length'].isin([10]), 1, 0)

In [None]:
# since months_issued variable has a lot of classes, we have to decrease this number to make it easier to include in the model
inputs['months_factor'] = pd.cut(inputs['months_issue_d'], 10) 
inputs['months_factor']

In [None]:
df_months = woe_iv_cont(inputs, 'months_factor', targets)
df_months

In [None]:
plot_woe(df_months, x_degree=30)

In [None]:
# using graphical representation of the number of months passed since loan issued, we can create classes for all of the trends
inputs['months_since_issued:106'] = np.where(inputs['months_factor'].isin(range(95,106)), 1,0)
inputs['months_since_issued:115'] = np.where(inputs['months_factor'].isin(range(106,115)), 1,0)
inputs['months_since_issued:124'] = np.where(inputs['months_factor'].isin(range(115,124)), 1,0)
inputs['months_since_issued:133'] = np.where(inputs['months_factor'].isin(range(124,133)), 1,0)
inputs['months_since_issued:142'] = np.where(inputs['months_factor'].isin(range(133,142)), 1,0)
inputs['months_since_issued:151'] = np.where(inputs['months_factor'].isin(range(142,151)), 1,0)
inputs['months_since_issued:160'] = np.where(inputs['months_factor'].isin(range(151,160)), 1,0)
inputs['months_since_issued:169'] = np.where(inputs['months_factor'].isin(range(160,169)), 1,0)
inputs['months_since_issued:178'] = np.where(inputs['months_factor'].isin(range(169,178)), 1,0)
inputs['months_since_issued:187'] = np.where(inputs['months_factor'].isin(range(178,187)), 1,0)

In [None]:
inputs['int_rate_classes'] = pd.cut(inputs['int_rate'], 10)


In [None]:
df_interest = woe_iv_cont(inputs, 'int_rate_classes', targets)
plot_woe(df_interest)

In [None]:
inputs['int_rate:9.5'] = np.where((inputs['int_rate_classes'].cat.codes < 9.548), 1,0)
inputs['int_rate:13.6'] = np.where((inputs['int_rate_classes'].cat.codes<=13.676) & (inputs['int_rate_classes'].cat.codes > 9.548), 1,0)
inputs['int_rate:17.8'] = np.where((inputs['int_rate_classes'].cat.codes <=17.807) & (inputs['int_rate_classes'].cat.codes > 13.676) , 1,0)
inputs['int_rate:26.06'] = np.where((inputs['int_rate_classes'].cat.codes <=26.06) & (inputs['int_rate_classes'].cat.codes > 19.86), 1,0)


In [None]:
inputs['funded_amount_class'] = pd.cut(inputs['funded_amnt'],50)
df_funded = woe_iv_cont(inputs, 'funded_amount_class', targets)
plot_woe(df_funded, x_degree=30)

## Annual income - one of the most important variables in the model has to be carefully preprocessed 

In [None]:
inputs['income_classes'] = pd.cut(inputs['annual_inc'], 50)

In [None]:
df_income = woe_iv_cont(inputs, 'income_classes', targets)
df_income

In [None]:
'''
max_income_class = inputs['income_classes'].max()
inputs = inputs.reset_index(drop=True)
inputs['income_classes'] = inputs['income_classes'][inputs['income_classes'] != max_income_class]

'''


In [None]:
# inputs['income_classes'].unique()

In [None]:
max(inputs['annual_inc'])

In [None]:

max_income_observation = inputs.loc[inputs['annual_inc'].idxmax()]
print(max_income_observation)


In [None]:
inputs = inputs.reset_index(drop=True)
df_income = woe_iv_cont(inputs, 'income_classes', targets)

In [None]:
plt.figure(figsize=(8, 6))
inputs['annual_inc'].plot(kind='kde')

# Set the plot title and axis labels
plt.title('Density Plot of Annual Income')
plt.xlabel('Annual Income')
plt.ylabel('Density')

# Show the plot
plt.show()






In [None]:
plot_woe(df_income)