In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Impact of Borrowers Profile and Loan Characteristics on the Interest Rate Levels

# by Krutika Trivedi

# Investigation Overview

In this investigation, I look at how both loan characteristics and the borrower's profile affect the interest rate. It is often said that you can obtain a better interest rate depending on your financial situation, on the size of the loan you apply for, or on how risky the loan is given the category it falls into.

# Dataset Overview

There are 113,066 loans in the dataset. The 10 selected attributes include 5 features describing the borrower's profile (Income Range, Employment Status, Employment Status Duration, Is Borrower Homeowner?, Total Credit Lines in the past 7 years), followed by Listing Creation Date, Borrower Rate (interest rate), Loan Category, Loan Original Amount and Monthly Loan Payment.

In [None]:
# Code to Check the Path of the File
import os
# Each item yielded by os.walk is a (directory path, sub-directories, files) tuple;
# print it whole to see where the dataset lives.
for walk_entry in os.walk('/kaggle/input'):
    print(walk_entry)

In [None]:
# Import Important Libraries & Packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline

# Suppress Warnings from Final Output
# NOTE: with no category argument this filter ignores every warning class for
# the rest of the session, so deprecation and pandas warnings raised by later
# cells are hidden as well.
import warnings
warnings.simplefilter("ignore")

In [None]:
# Read the Prosper loan CSV from the Kaggle input directory into a DataFrame
DATA_PATH = '/kaggle/input/prosper-loan/prosperLoanData.csv'
loan_df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows of the freshly loaded dataset
loan_df.head(5)

Re-format the Dataset to only keep the Selected Variables

In [None]:
# Keep only the variables used in this analysis: listing identifiers, the
# interest rate, the loan characteristics and the borrower-profile features.
selected_columns = [
    'ListingNumber',
    'ListingCreationDate',
    'BorrowerRate',
    'ListingCategory (numeric)',
    'IncomeRange',
    'EmploymentStatus',
    'EmploymentStatusDuration',
    'IsBorrowerHomeowner',
    'TotalCreditLinespast7years',
    'LoanOriginalAmount',
    'MonthlyLoanPayment',
]

# Take an explicit copy of the subset: later cells assign new and converted
# columns to loan_df, and doing that on a plain column selection triggers
# pandas' SettingWithCopyWarning (which the global warnings filter would hide).
loan_df = loan_df[selected_columns].copy()

In [None]:
# Preview the frame after restricting it to the selected columns
loan_df.head()

# Assess the Data

In [None]:
# Summarize the columns: dtypes and non-null counts
loan_df.info()

In [None]:
# Descriptive statistics (by default, for the numeric columns)
loan_df.describe()

**Check Duplicates**

In [None]:
# Display every row that is an exact repeat of an earlier row
is_repeat = loan_df.duplicated()
loan_df[is_repeat]

In [None]:
# Parse the listing creation timestamps (read as text from the CSV) into datetimes
loan_df = loan_df.assign(
    ListingCreationDate=lambda frame: pd.to_datetime(frame['ListingCreationDate'])
)

In [None]:
# Discard exact duplicate rows, then renumber the index from 0 so it is
# contiguous again after the removal.
loan_df = loan_df.drop_duplicates().reset_index(drop=True)

In [None]:
# Numeric listing-category code -> human-readable category name
listing_cat = {
    0: 'Not Available',
    1: 'Debt Consolidation',
    2: 'Home Improvement',
    3: 'Business',
    4: 'Personal Loan',
    5: 'Student Use',
    6: 'Auto',
    7: 'Other',
    8: 'Baby&Adoption',
    9: 'Boat',
    10: 'Cosmetic Procedure',
    11: 'Engagement Ring',
    12: 'Green Loans',
    13: 'Household Expenses',
    14: 'Large Purchases',
    15: 'Medical/Dental',
    16: 'Motorcycle',
    17: 'RV',
    18: 'Taxes',
    19: 'Vacation',
    20: 'Wedding Loans',
}

# Two-column lookup table built straight from the dictionary
df_cat = pd.DataFrame(
    list(listing_cat.items()),
    columns=["CategoryNum", "LoanCategory"],
)

# Attach the category name to every loan (left join keeps all loans), then
# remove the numeric code columns that are no longer needed.
loan_df = loan_df.merge(
    df_cat,
    how='left',
    left_on='ListingCategory (numeric)',
    right_on='CategoryNum',
)
loan_df = loan_df.drop(columns=['ListingCategory (numeric)', 'CategoryNum'])

# Store the category names as a categorical column
loan_df['LoanCategory'] = loan_df['LoanCategory'].astype('category')

In [None]:
# Set every column dtype used by the plots in a single place.

# Display order for the employment-status categories
order_empl_status = ['Employed', 'Full-time', 'Self-employed', 'Part-time', 'Retired', 'Not employed', 'Not available', 'Other']
ordered_empl_status = pd.api.types.CategoricalDtype(categories=order_empl_status, ordered=True)

# Display order for the income-range categories
order_income_range = ['$100,000+', '$75,000-99,999', '$50,000-74,999', '$25,000-49,999', '$1-24,999', '$0', 'Not employed', 'Not displayed']
ordered_income_range = pd.api.types.CategoricalDtype(categories=order_income_range, ordered=True)

# Listing date -> datetime; loan category -> plain category;
# employment status and income range -> ordered categories.
loan_df = loan_df.assign(
    ListingCreationDate=pd.to_datetime(loan_df['ListingCreationDate'])
).astype({
    'LoanCategory': 'category',
    'EmploymentStatus': ordered_empl_status,
    'IncomeRange': ordered_income_range,
})

**Note:** The above cells have been set as "Skip"-type slides. That means that when the notebook is rendered as HTML slides, those cells won't show up.

# Distribution of Loan's Interest Rates

The interest rate roughly follows a normal distribution with its mean at around 19%. Interest rates are quite high in general, with values ranging between 0 and about 40%. The distribution is slightly right-skewed, and there is an isolated high peak at about 32%. There are very few loans with a rate lower than 5% or higher than 35%.

In [None]:
# Histogram of borrower interest rates on a linear scale, one bin per percentage point
bin_width = 0.01
bin_edges = np.arange(0, loan_df['BorrowerRate'].max() + bin_width, bin_width)
mean_rate = loan_df['BorrowerRate'].mean()

plt.figure(figsize=[8, 5])
plt.hist(data=loan_df, x='BorrowerRate', bins=bin_edges)

# Mark the mean rate with a dash-dot line and a label next to it
plt.axvline(mean_rate, color='orange', linestyle='-.', linewidth=1)
plt.text(0.21, 6000, 'mean = 0.19', bbox={'facecolor': 'orange', 'alpha': 0.5})

plt.title('Distribution of Interest Rates')
plt.xlabel('Interest Rate')
plt.show()

# Borrowers' profiles
Most borrowers are employed or working full-time, and they are more likely to be homeowners. Only a few are self-employed, and the rest are part-time, retired or have an unknown employment status. The most common borrowers are in the 25K to 75K dollar income ranges; the second-biggest group earns more than 75K dollars, and a minority earns less than 25K dollars or nothing. Lastly, the share of homeowners is larger among borrowers earning 50K dollars or more.

In [None]:
# Clustered bar charts: number of loans per employment status (left) and per
# income range (right), each split by whether the borrower is a homeowner.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (column to count, panel title, x-axis label) for each panel
panels = [
    ('EmploymentStatus', 'Number of Loans by Employment Status of the Borrower', 'Employment Status'),
    ('IncomeRange', 'Number of Loans by Income Range of the Borrower', 'Income Range'),
]

for ax, (column, panel_title, x_label) in zip(axes, panels):
    sb.countplot(data=loan_df, x=column, hue='IsBorrowerHomeowner', ax=ax)
    # Both panels share the same legend title (the left one used to read
    # 'IsBorrowerHomeowner?' without spaces).
    ax.legend(loc=1, ncol=3, framealpha=1, title='Is Borrower Homeowner?')
    ax.set_title(panel_title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Number of Loans')
    # Category names are long; rotate them so they do not overlap
    ax.tick_params(axis='x', labelrotation=90)

# Interest Rate vs. Borrower's Profile Features

As can be expected, employed borrowers have a lower interest rate than non-employed ones, and if they are also homeowners they get an even lower rate. Unexpectedly, someone who is not employed but is a homeowner is more likely to have a higher interest rate. Income range has a slight influence on the interest rate of a loan. As a general rule, the lower a person's income, the higher their loan interest rate. However, borrowers in the 0-dollar income range get an interest rate similar to that of borrowers in the average income ranges.

In [None]:
# Create Column Employed vs. Not Employed
# Only the status "Not employed" is labelled 'False'; every other status is
# labelled 'True'. The labels are strings, not booleans.
loan_df['Employed'] = loan_df['EmploymentStatus'].apply(lambda x: 'False' if x == "Not employed" else 'True')

# Boxplot 2 Categorical Variables (Homeowner & Employed) vs. Interest Rate
# The facet column must be the real column name 'IsBorrowerHomeowner'
# (no spaces); a name with spaces is not in the frame and raises KeyError.
g = sb.FacetGrid(data = loan_df, col = 'IsBorrowerHomeowner', height = 4)
# Pass an explicit order so both facets place 'True' and 'False' at the same
# x positions; without it each facet orders the categories on its own.
g.map(sb.boxplot, 'Employed', 'BorrowerRate', order = ['True', 'False'])
# Leave room above the facets for the figure-level title
plt.subplots_adjust(top=0.8)
g.fig.suptitle('Interest Rate by Employment Status & Homeownership')
g.axes[0,0].set_ylabel('Interest Rate');

In [None]:
# Plot boxplots to illustrate relationship between borrower's rate and categorical variables
plt.figure(figsize=(16, 6))

# Use one colour for every box: the first colour of the current seaborn palette
base_color = sb.color_palette()[0]
mean_rate = loan_df['BorrowerRate'].mean()

# Pass the colour variable itself; the quoted string 'base_color' is not a
# valid matplotlib colour name and makes the call fail.
sb.boxplot(data = loan_df, x = 'IncomeRange', y = 'BorrowerRate', color = base_color)

# Reference line at the overall mean rate, with its label
plt.axhline(mean_rate, color='orange', linestyle='-.', linewidth=1)
# NOTE(review): the label sits 0.2 above the mean line; confirm the offset is intended.
plt.text(0, mean_rate+0.2, 'mean = 0.19', bbox=dict(facecolor='orange', alpha=0.5))

plt.xticks(rotation = 90)
plt.ylabel('Interest Rate')
plt.xlabel('Income Range')
plt.title('Interest Rate by Income Range');

# Interest Rate by Loan Amount & Split by Loan Category

As the loan amount gets higher, the likelihood of obtaining an interest rate lower than the overall average (19%) increases.

There are categories in which we find fewer unemployed borrowers, such as Motorcycle, Boat, Baby and Adoption, and RV. We also see that unemployed borrowers generally borrow lower amounts. However, the plotting pattern does not vary much by category. Some categories, like Cosmetic Procedure and Motorcycle, do not have large loan amounts, but their interest rates still vary from very low to very high. For small loans in particular, the interest rate can be quite high, and as the loan amount increases, the maximum observed interest rate gets lower and lower. Regardless of the category, both the interest rate and the loan amount can be at a high or a low level.

In [None]:
# One scatter panel per loan category (3 panels per row): loan amount on x,
# interest rate on y, points coloured by the employed / not-employed flag.
g = sb.FacetGrid(data=loan_df, col='LoanCategory', col_wrap=3, hue='Employed', height=5)
g.map(plt.scatter, 'LoanOriginalAmount', 'BorrowerRate')
g.add_legend()
g.set_titles('{col_name}')

# Label the y axis on the first panel of each row (panels 0, 3, ..., 18)
for panel_idx in range(0, 19, 3):
    g.axes[panel_idx].set_ylabel('Interest Rate')

# Label the x axis on the three panels of the last row
for panel_idx in (18, 19, 20):
    g.axes[panel_idx].set_xlabel('Loan Amount')

# Leave room above the grid for the figure-level title
plt.subplots_adjust(top=0.95)
g.fig.suptitle('Interest Rate by Loan Amount & Split by Loan Category (Employed vs. Non-Employed Borrowers)', fontsize=16);

# Conclusion
In conclusion, it looks like the borrower's profile has an impact on the interest rate, but the effect is not very strong or easily predictable, and the rate can vary widely regardless of which category the loan is in or how many other loans the borrower has. What can be expected with more confidence, though, is that the interest rate will likely be lower for a bigger loan.

Once you're ready to finish your presentation, check your output by using nbconvert to export the notebook and set up a server for the slides. From the terminal or command line, use the following expression:

jupyter nbconvert <file_name>.ipynb --to slides --post serve --template output_toggle

This should open a tab in your web browser where you can scroll through your presentation. Sub-slides can be accessed by pressing 'down' when viewing their parent slide. Make sure you remove all of the quote-formatted guide notes like this one before you finish your presentation!