In [None]:
!pip install pyjanitor
!pip install icecream
!pip install opendatasets
!pip install kaleido
!pip install --upgrade orjson
!pip install pingouin

# **<font color='#74a57f'>Bondora</font> Peer to Peer Lending Loan Data**
### Exploratory Data Analysis and Random Forest Classification

---
👨‍🔧 Work in progress..


## <b>Background</b>
Peer-to-peer lending has attracted considerable attention in recent years, largely because it offers a novel way of connecting borrowers and lenders. But as with other innovative approaches to doing business, there is more to it than that. Some might wonder, for example, what makes peer-to-peer lending so different – or, perhaps, so much better – than working with a bank, or why it has become popular in many parts of the world.

Certainly, the industry has witnessed strong growth in recent years. According to Business Insider, transaction volumes in the U.S. and Europe, the world’s leading P2P markets, have expanded at double and, in some cases, triple-digit percentage rates, bolstered by widespread acceptance of doing business online and a supportive regulatory environment.

For investors, "peer-2-peer lending," or "P2P," offers an attractive way to diversify portfolios and enhance long-term performance. When they invest through a peer-to-peer platform, they can profit from an asset class that has proven itself in both good times and bad. Equally important, they can avoid the risks associated with putting all their eggs in one basket, especially at a time when many experts believe that traditional favorites such as stocks and bonds are riskier than ever.

Default risk has long been a significant factor in assessing borrowers’ behaviour in Peer-to-Peer (P2P) lending. In P2P lending, loans are typically uncollateralized and lenders seek higher returns as compensation for the financial risk they take. In addition, they need to make decisions under information asymmetry that works in favor of the borrowers. In order to make rational decisions, lenders want to minimize the risk of default of each lending decision and realize the return that compensates for the risk.

In the financial research domain, very few datasets are available for building and analyzing credit risk models. This dataset will help the research community build models and perform research in the credit risk domain.

## Content
This dataset has been retrieved from a publicly available data set of a leading European P2P lending platform, Bondora (https://www.bondora.com/en). The retrieved data is a pool of both defaulted and non-defaulted loans from the time period between February 2009 and July 2021. The data comprises demographic and financial information of borrowers and loan transactions.



In [None]:
import pandas as pd
import numpy as np
import janitor
from icecream import ic
import json
import pickle
import orjson
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import missingno as msno
import warnings
import time
from icecream import ic
from functools import lru_cache
from scipy.stats import iqr
import statsmodels.api as sm
from statsmodels.graphics.gofplots import qqplot


# kernel settings
# NOTE(review): this silences every warning for the whole kernel, including
# deprecation warnings from pandas/plotly -- confirm that is intended.
warnings.filterwarnings(action='ignore')
# let pandas display up to 200 columns before truncating
pd.set_option('display.max_columns', 200)
# default plotly template applied to every figure
pio.templates.default = 'plotly_white'
# shared colour palette (hex codes) that the plotting helpers index into
colors = ["#074f57","#077187","#74a57f","#9ece9a","#e4c5af"]

# data filepath
data_fp = '../input/bondora-peer-to-peer-lending-loan-data/LoanData_Bondora.csv'


@lru_cache(maxsize=128)
def _read_loan_csv(cols, fp) -> pd.DataFrame:
    """
    Reads and cleans the loan CSV once per distinct ``(cols, fp)`` pair.

    ``cols`` must be hashable (``None``, a tuple of column names or a
    callable) because the result is memoised with ``lru_cache``.
    """
    usecols = list(cols) if isinstance(cols, tuple) else cols
    # NOTE(review): parse_dates=True only asks pandas to parse the index;
    # no index_col is given here, so date columns presumably stay as text.
    data = pd.read_csv(fp, low_memory=False,
        usecols=usecols, parse_dates=True)

    # remove_empty / clean_names are DataFrame methods registered by pyjanitor.
    # errors='ignore' keeps the call working when `cols` leaves out the
    # identifier columns.
    return data.remove_empty() \
        .clean_names() \
        .drop(columns=['loanid', 'loannumber', 'username'],
              errors='ignore')


def load_data(cols=None, fp=data_fp)-> pd.DataFrame:
    """
    Loads the Bondora loan data with cleaned column names.

    :param cols: optional columns to read (forwarded to ``usecols``);
        lists and other iterables are accepted and converted to a tuple
        so that the call can be cached
    :param fp: path of the CSV file
    :returns: a fresh copy of the cached frame, so callers can modify it
        without corrupting later ``load_data`` calls
    """
    if cols is not None and not callable(cols):
        # lists are unhashable and would make lru_cache raise TypeError
        cols = tuple(cols)
    return _read_loan_csv(cols, fp).copy()


# keep the cache helpers reachable under the public name, as they were
# when ``load_data`` itself carried the decorator
load_data.cache_clear = _read_loan_csv.cache_clear
load_data.cache_info = _read_loan_csv.cache_info


def figure_show(plot, static: bool, **kwargs)-> None:
    """
    Applies the notebook's shared layout to ``plot`` and displays it.

    :param plot: figure object exposing ``update_layout`` and ``show``
    :param static: when True the figure is shown with ``staticPlot`` enabled
    :param kwargs: extra layout options forwarded to ``update_layout``;
        they take precedence over the defaults set here
    :returns: the value of ``plot.show(...)`` (``None`` for plotly figures)
    """
    layout = dict(
        font=dict(color='Dark Gray', size=10),
        margin=dict(pad=10),
        width=780)
    # Merge instead of passing both **kwargs and the defaults as keywords:
    # a caller supplying font/margin/width used to hit
    # "TypeError: got multiple values for keyword argument".
    layout.update(kwargs)
    plot.update_layout(**layout)

    if static:
        return plot.show(config={'staticPlot': True})
    return plot.show()

# The Loan <font color='#74a57f'>Bondora Dataset</font>
## Exploratory Data Analysis

---

Feature Reference: https://www.bondora.com/en/public-reports

The goal of this exploratory data analysis is to better understand the features of this dataset:
borrower demographics such as age, education, employment and income, as well as how each application progressed
between its contract date and its payment date.

In [None]:
# Load the cleaned loan data with all columns
df = load_data()
# alphabetically sorted list of the column names
cols = sorted(df.columns.tolist())
# preview the first three rows
df.head(3)

In [None]:
# Report the frame's dimensions: shape[0] is the row count, shape[1] the column count
print(f"""
The dataset has {df.shape[0]} observations with {df.shape[1]} features.
""")

## What is the <font color='#74a57f'>**Dominant** Data Type</font>
---

In [None]:
# The first positional argument of DataFrame.info is `verbose`; the string
# 'dtypes' only acted as a truthy flag, so spell the flag out.
df.info(verbose=True)

In [None]:
# Create list comprehension for unique dtypes
bool_dtype = sorted([*df.select_dtypes('bool')])
obj_dtype = sorted([*df.select_dtypes('object')])
float_dtype = sorted([*df.select_dtypes('float64')])
int_dtype = sorted([*df.select_dtypes('int')])


def plot_dtypes()-> go.Figure:
    """
    Draws a bar chart of the number of columns per data type.

    Uses the module-level lists ``float_dtype``, ``obj_dtype``,
    ``int_dtype`` and ``bool_dtype``.
    :returns: the result of ``figure_show`` for the chart
    """
    dtype_names = ['Float', 'Object', 'Integer', 'Bool']
    dtype_columns = [float_dtype, obj_dtype, int_dtype, bool_dtype]
    counts_by_dtype = {
        'DataType': dtype_names,
        'Counts': [len(columns) for columns in dtype_columns]}

    # the first bar (Float) is highlighted, the other three share one colour
    bar_colors = [colors[3], colors[2], colors[2], colors[2]]
    bar_outline = dict(line=dict(width=1, color='Black'))

    fig = px.bar(counts_by_dtype, x='DataType', y='Counts', text='Counts')
    fig.update_traces(
        textposition='outside',
        marker_color=bar_colors,
        marker=bar_outline)
    fig.update_yaxes(visible=False)

    return figure_show(
        fig, title="<b>Bondora Loan Data</b><br> Data Types", static=False)

plot_dtypes()

## Top 15 <font color='#74a57f'> Missing Features</font> for Loan Data
---

In [None]:
def get_top_missing()-> None:
    """
    Plots the 15 columns of the global ``df`` with the most missing values.

    :returns: the result of ``figure_show`` for the bar chart
    """
    top_n = 15
    n_highlight = 5

    def plot_bar_missing(df):
        """Horizontal bar chart of the first ``top_n`` rows of ``df``."""
        title = '<b>Top</b> 15 features <br> Missing data'

        # select the top rows from data (already sorted by missing count)
        top_rows = df[:top_n]

        # highlight the worst few features; sized from the data so frames
        # with fewer than 15 columns still get one colour per bar
        miss_color = [
            colors[3] if rank < n_highlight else colors[1]
            for rank in range(len(top_rows))]

        # set labels and values for figure
        labels = top_rows['Cols'].tolist()
        vals = np.round(top_rows['Percentage'], 0)

        # create figure
        fig = go.Figure() \
        .add_trace(go.Bar(
            orientation='h',
            x=vals,
            y=labels)) \
        .update_traces(marker_color=miss_color,
            marker=dict(line=dict(
                width=1, color='Black'))) \
        .update_xaxes(title='% of missing values') \
        .update_yaxes(title='Features')

        return figure_show(fig, title=title, static=False)

    # count missing data, largest first
    missing = df.isna().sum() \
        .sort_values(ascending=False)

    # convert to dataframe with explicit column names
    missing_df = missing \
        .rename_axis('Cols') \
        .reset_index(name='Counts')

    # Percentage of rows missing, rounded to two decimals. The original
    # passed the `2` to Series.apply instead of np.round, so the values
    # were rounded to whole numbers.
    missing_df['Percentage'] = \
        (missing_df['Counts'] / len(df) * 100).round(2)

    # The styled `.loc[:15]` table that used to be built here was never
    # returned or displayed (and selected 16 rows), so it has been removed.
    return plot_bar_missing(missing_df)


In [None]:
# Render the missing-values bar chart
get_top_missing()

## Bondora <font color='#74a57f'>Loan Transactions</font> 

---

In [None]:
def plot_ts_counts_yr()-> go.Figure:
    """
    Bar chart of the number of loans per year.

    Reads the module-level ``loan_freq_year`` Series (index on the x axis,
    values on the y axis); its first entry is drawn in the highlight colour.
    """
    bar_colors = [colors[2] for _ in loan_freq_year.index]
    bar_colors[0] = colors[3]

    yearly_bars = go.Bar(
        x=loan_freq_year.index,
        y=loan_freq_year.values)

    fig = go.Figure() \
        .add_trace(yearly_bars) \
        .update_xaxes(tickmode='linear') \
        .update_traces(
            marker_color=bar_colors,
            marker=dict(line=dict(width=1, color='Black')))

    return figure_show(
        fig, title='<b>Lifetime Loans Issues</b> Yearly', static=False)

# Parse the loan date column into datetimes
loan_freq_ts = pd.to_datetime(df['loandate'])
# Loans per calendar year; value_counts orders by count (largest first),
# so the first entry is the busiest year
loan_freq_year = loan_freq_ts.dt.year.value_counts()
plot_ts_counts_yr()

## Borrower <font color='#74a57f'>Demographics</font> 

---

In [None]:
# columns describing the borrower
demo_columns = ['age', 'gender', 'country', 'education', 'incometotal']
demo = df[demo_columns]

# columns describing the borrower's employment
employment_columns = ['employmentdurationcurrentemployer', 'employmentstatus']
demo_employment = df[employment_columns]

### Borrower <font color='#74a57f'>Country</font> 

---

In [None]:
# country code -> display name; codes that are not listed are skipped
country_labels = {
    'EE': 'Estonia',
    'ES': 'Spain',
    'FI': 'Finland',
    'SK': 'Slovakia'}

country_name = [
    country_labels[country]
    for country in demo.country
    if country in country_labels]

# share of each country among the labelled borrowers, in whole percent
user_country = pd.Series(country_name).value_counts()
user_country = (user_country / user_country.sum() * 100).round(0)

In [None]:
def plot_waffle():
    """
    Draws a 10x10 waffle chart of the three largest borrower countries.

    Reads the module-level ``user_country`` Series (country -> rounded
    percentage) and gives each of its first three entries one cell per
    percentage point.

    :returns: the result of ``figure_show`` for the chart
    """
    n_cells = 100
    top = user_country[:3]

    # Category k is encoded as the value k, repeated once per percentage
    # point. Truncating to the grid size guards against rounded shares
    # adding up to more than 100, which made the old slice assignment
    # raise ValueError.
    cells_per_country = [int(share) for share in top]
    filled = np.repeat(
        np.arange(len(cells_per_country), dtype=float),
        cells_per_country)[:n_cells]

    # Cells not claimed by the top three stay NaN so they are not drawn in
    # the first country's colour, as the zero-initialised grid used to do.
    z_val = np.full(n_cells, np.nan)
    z_val[:filled.size] = filled

    title = "Borrower <b>Country</b>"
    fig = px.imshow(
        z_val.reshape(-1, 10),
        # fixed range keeps the three colour bands aligned with codes 0..2
        zmin=0, zmax=2,
        color_continuous_scale=[
            (0.00, colors[0]), (0.33, colors[0]),
            (0.33, colors[1]), (0.66, colors[1]),
            (0.66, colors[2]), (1.00, colors[2])])\
        .update_traces(ygap=2, xgap=2)\
        .update_layout(
            title=title,
            width=780,
            coloraxis_colorbar=dict(
                title="Country",
                # labels come from the data rather than a hard-coded order
                tickvals=list(range(len(top))),
                ticktext=[str(name) for name in top.index],
                lenmode="pixels", len=100))\
        .update_yaxes(visible=False)\
        .update_xaxes(visible=False)

    return figure_show(fig, static=True)

In [None]:
# Render the borrower-country waffle chart
plot_waffle()

In [None]:
def plot_age_hist():
    """Histogram of the ``age`` column of the module-level ``demo`` frame."""
    fig = px.histogram(demo, x='age')
    fig.update_layout(title='Borrower <b>Age</b>', width=780)
    fig.update_traces(marker_color=colors[2], nbinsx=10)
    return figure_show(fig, static=False)

plot_age_hist()

In [None]:
# education code -> display name; codes that are not listed are skipped
education_labels = {
    1.0: 'Primary Education',
    2.0: 'Basic Education',
    3.0: 'Vocational Education',
    4.0: 'Secondary Education',
    5.0: 'Higher Education'}

education_list = [
    education_labels[code]
    for code in df.education
    if code in education_labels]

# number of borrowers per education level
education = pd.Series(education_list).value_counts()

In [None]:
def plot_education() -> None:
    """
    Plots a horizontal bar chart of borrower counts per education level.

    Reads the module-level ``education`` Series (label -> count) and draws
    one bar per entry. Colours come from ``colors`` and wrap around, so a
    category is never dropped when there are more categories than colours
    (zipping with ``colors`` used to truncate the loop silently).

    :returns: the result of ``figure_show`` for the chart
    """
    title = 'Borrower <b>Education</b>'

    # one trace per category so each bar gets its own legend entry
    fig = go.Figure()
    for position, (label, count) in enumerate(
            zip(education.index, education.values)):
        fig.add_trace(go.Bar(y=[label], x=[count], name=label,
            marker_color=colors[position % len(colors)]))

    # update orientation
    fig = fig.update_traces(
        marker=dict(line=dict(width=1, color='Black')),
        orientation='h')\
        .update_layout(title=title)

    # show figure
    return figure_show(fig, static=False)

plot_education()

In [None]:
# employment status code -> display name; codes not listed are skipped
employment_labels = {
    1: 'Unemployed',
    2: 'Partially Employed',
    3: 'Fully Employed',
    4: 'Self-Employed',
    5: 'Entrepreneur',
    6: 'Retiree'}

employment = [
    employment_labels[code]
    for code in df.employmentstatus
    if code in employment_labels]

# number of borrowers per employment status
employment = pd.Series(employment).value_counts()

In [None]:
def plot_employment() -> None:
    """
    Plots a horizontal bar chart of borrower counts per employment status.

    Reads the module-level ``employment`` Series (label -> count) and draws
    one bar per entry. Colours come from ``colors`` and wrap around:
    zipping with the five-entry palette used to drop every category after
    the fifth, although six statuses are labelled.

    :returns: the result of ``figure_show`` for the chart
    """
    title = 'Borrower <b>Employment</b>'

    # one trace per category so each bar gets its own legend entry
    fig = go.Figure()
    for position, (label, count) in enumerate(
            zip(employment.index, employment.values)):
        fig.add_trace(go.Bar(y=[label], x=[count], name=label,
            marker_color=colors[position % len(colors)]))

    # update orientation
    fig = fig.update_traces(
        marker=dict(line=dict(width=1, color='Black')),
        orientation='h')\
        .update_layout(title=title)

    # show figure
    return figure_show(fig, static=False)

plot_employment()

## The Loan

In [None]:
# use-of-loan code -> display name for the explicitly listed codes
use_of_loan_labels = {
    1: 'Real Estate',
    2: 'Home Improvement',
    3: 'Business',
    4: 'Education',
    5: 'Travel',
    6: 'Vehicle',
    7: 'Other',
    8: 'Health',
    101: 'Working Capital Financing',
    102: 'Purchase of Machinery Equipment',
    103: 'Renovation of Real Estate',
    # spelling fixed: this label is shown on the chart
    104: 'Accounts Receivable Financing',
    105: 'Acquisition of Means of Transport',
    106: 'Construction Finance',
    107: 'Acquisition of Stocks',
    108: 'Acquisition of Real Estate',
    109: 'Guaranteeing Obligation'}


def _use_of_loan_label(code):
    """Returns the label for ``code``, or None when the code is unmapped."""
    if code in use_of_loan_labels:
        return use_of_loan_labels[code]
    # every code from 110 upwards is grouped together
    if code >= 110:
        return 'Other Business'
    return None


use_loan = [
    label
    for label in map(_use_of_loan_label, df.useofloan)
    if label is not None]

# number of loans per purpose
use_loan = pd.Series(use_loan).value_counts()

def plot_loan_use() -> None:
    """
    Plots a horizontal bar chart of loan counts per loan purpose.

    Reads the module-level ``use_loan`` Series (label -> count) and draws
    one bar per entry. Colours come from ``colors`` and wrap around:
    zipping with the five-entry palette used to plot only the first five
    purposes and silently discard the rest.

    :returns: the result of ``figure_show`` for the chart
    """
    title = 'Borrower <b>Use of Loan</b>'

    # one trace per category so each bar gets its own legend entry
    fig = go.Figure()
    for position, (label, count) in enumerate(
            zip(use_loan.index, use_loan.values)):
        fig.add_trace(go.Bar(y=[label], x=[count], name=label,
            marker_color=colors[position % len(colors)]))

    # update orientation
    fig = fig.update_traces(
        marker=dict(line=dict(width=1, color='Black')),
        orientation='h')\
        .update_layout(title=title)

    # show figure
    return figure_show(fig, static=False)

plot_loan_use()

In [None]:
# sort values for by duration in days
loan_duration = df['loanduration'] \
    .value_counts() \
    .reset_index() \
    .sort_values(by='index', ascending=False)

# create label for 4 cuts
labels = [
    '1 to 8 days',
    '8 to 16 days',
    '16 to 27 days',
    '27 to 60 days']

# use pandas cut and sum counts
loan_duration['cat'] = pd.qcut(loan_duration['index'], q=4, labels=labels)
loan_duration_plot = loan_duration.groupby(['cat']).sum()['loanduration']

In [None]:
def plot_loan_duration() -> None:
    """
    Plots a horizontal bar chart of loan counts per duration bucket.

    Reads the module-level ``loan_duration_plot`` Series (bucket -> count)
    and draws one bar per entry. Colours come from ``colors`` and wrap
    around, so no bucket is dropped if there are ever more buckets than
    colours (zipping with ``colors`` used to truncate the loop silently).

    :returns: the result of ``figure_show`` for the chart
    """
    title = 'Loan <b>Duration</b>'

    # one trace per bucket so each bar gets its own legend entry
    fig = go.Figure()
    for position, (label, count) in enumerate(
            zip(loan_duration_plot.index, loan_duration_plot.values)):
        fig.add_trace(go.Bar(y=[label], x=[count], name=label,
            marker_color=colors[position % len(colors)]))

    # update orientation
    fig = fig.update_traces(
        marker=dict(line=dict(width=1, color='Black')),
        orientation='h')\
        .update_layout(title=title)

    # show figure
    return figure_show(fig, static=False)

plot_loan_duration()

## Current Income and Liabilities

We're doing outlier detection to get a better grasp of a borrower's income and liabilities without the extreme outliers that skew our mean. For our model, though, we don't have to normalize our data.

In [None]:
# Keep only the income and liabilities totals for the outlier analysis
income_liab = df[['incometotal', 'liabilitiestotal']]

### Outlier Detection

---

In [None]:
def plot_qq(qqplot_data, plt_title)-> go.Figure:
    """
    Re-draws a QQ plot as a plotly figure.

    :param qqplot_data: sequence whose first item holds the sample points
        and whose second item holds the reference line; both must expose
        ``get_xdata`` / ``get_ydata``
    :param plt_title: title passed on to ``figure_show``
    :returns: the result of ``figure_show`` for the figure
    """
    points, reference_line = qqplot_data[0], qqplot_data[1]

    point_trace = go.Scattergl(
        x=points.get_xdata(),
        y=points.get_ydata(),
        mode='markers',
        name='Scatter',
        marker_color=colors[3])

    line_trace = go.Scattergl(
        x=reference_line.get_xdata(),
        y=reference_line.get_ydata(),
        mode='lines',
        name='Line')

    # assemble the figure: points first, then the reference line
    fig = go.Figure()
    fig.add_trace(point_trace)
    fig.add_trace(line_trace)

    fig.update_xaxes(title='Theoretical Quantiles')
    fig.update_yaxes(title='Sample Quantiles')
    # black outline around the markers
    fig.update_traces(marker=dict(line=dict(width=1, color='Black')))

    return figure_show(fig, static=True, title=plt_title)

def get_qq(data_array):
    """
    Runs ``qqplot`` on ``data_array`` with ``line='s'`` and returns the
    line objects of the resulting figure's current axes.
    """
    qq_figure = qqplot(data_array, line='s')
    qq_axes = qq_figure.gca()
    return qq_axes.lines

In [None]:
# QQ-plot line data for both columns, computed before any outlier removal
income_qq = get_qq(income_liab.incometotal)
liab_qq = get_qq(income_liab.liabilitiestotal)

In [None]:
# QQ plot of total liabilities
plot_qq(liab_qq, plt_title='<b>QQ Plot</b><br>Liabilities')

In [None]:
# QQ plot of total income
plot_qq(income_qq, plt_title='<b>QQ Plot</b><br>Income')

In [None]:
# drop outliers
income_iqr = iqr(income_liab.incometotal)
liabilities_iqr = iqr(income_liab.liabilitiestotal)
n_income = income_liab[income_liab.loc[:, 'incometotal'] <= income_iqr]
income_liab = n_income[n_income.loc[:, 'liabilitiestotal'] <= liabilities_iqr]
income_qq = get_qq(income_liab.incometotal)
liab_qq = get_qq(income_liab.liabilitiestotal)

In [None]:
# QQ plot of total income, using the QQ data recomputed after the outlier filter
plot_qq(income_qq, plt_title='<b>QQ Plot</b><br>Income')

In [None]:
# QQ plot of total liabilities, using the QQ data recomputed after the outlier filter
plot_qq(liab_qq, plt_title='<b>QQ Plot</b><br>Liabilities')

In [None]:
# Summary statistics (one row per column) with in-cell bars on the
# standard deviation and the median ('50%') columns
income_liab.describe().T.style.bar(subset=['std', '50%'],
    color=colors[3])