In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import seaborn as sns
import plotly_express as px

----
-----
There are many LendingClub data sets on Kaggle. Here is the information on this particular data set:

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>LoanStatNew</th>
      <th>Description</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>loan_amnt</td>
      <td>The listed amount of the loan applied for by the borrower. If at some point in time, the credit department reduces the loan amount, then it will be reflected in this value.</td>
    </tr>
    <tr>
      <th>1</th>
      <td>term</td>
      <td>The number of payments on the loan. Values are in months and can be either 36 or 60.</td>
    </tr>
    <tr>
      <th>2</th>
      <td>int_rate</td>
      <td>Interest Rate on the loan</td>
    </tr>
    <tr>
      <th>3</th>
      <td>installment</td>
      <td>The monthly payment owed by the borrower if the loan originates.</td>
    </tr>
    <tr>
      <th>4</th>
      <td>grade</td>
      <td>LC assigned loan grade</td>
    </tr>
    <tr>
      <th>5</th>
      <td>sub_grade</td>
      <td>LC assigned loan subgrade</td>
    </tr>
    <tr>
      <th>6</th>
      <td>emp_title</td>
      <td>The job title supplied by the Borrower when applying for the loan.*</td>
    </tr>
    <tr>
      <th>7</th>
      <td>emp_length</td>
      <td>Employment length in years. Possible values are between 0 and 10 where 0 means less than one year and 10 means ten or more years.</td>
    </tr>
    <tr>
      <th>8</th>
      <td>home_ownership</td>
      <td>The home ownership status provided by the borrower during registration or obtained from the credit report. Our values are: RENT, OWN, MORTGAGE, OTHER</td>
    </tr>
    <tr>
      <th>9</th>
      <td>annual_inc</td>
      <td>The self-reported annual income provided by the borrower during registration.</td>
    </tr>
    <tr>
      <th>10</th>
      <td>verification_status</td>
      <td>Indicates if income was verified by LC, not verified, or if the income source was verified</td>
    </tr>
    <tr>
      <th>11</th>
      <td>issue_d</td>
      <td>The month which the loan was funded</td>
    </tr>
    <tr>
      <th>12</th>
      <td>loan_status</td>
      <td>Current status of the loan</td>
    </tr>
    <tr>
      <th>13</th>
      <td>purpose</td>
      <td>A category provided by the borrower for the loan request.</td>
    </tr>
    <tr>
      <th>14</th>
      <td>title</td>
      <td>The loan title provided by the borrower</td>
    </tr>
    <tr>
      <th>15</th>
      <td>zip_code</td>
      <td>The first 3 numbers of the zip code provided by the borrower in the loan application.</td>
    </tr>
    <tr>
      <th>16</th>
      <td>addr_state</td>
      <td>The state provided by the borrower in the loan application</td>
    </tr>
    <tr>
      <th>17</th>
      <td>dti</td>
      <td>A ratio calculated using the borrower’s total monthly debt payments on the total debt obligations, excluding mortgage and the requested LC loan, divided by the borrower’s self-reported monthly income.</td>
    </tr>
    <tr>
      <th>18</th>
      <td>earliest_cr_line</td>
      <td>The month the borrower's earliest reported credit line was opened</td>
    </tr>
    <tr>
      <th>19</th>
      <td>open_acc</td>
      <td>The number of open credit lines in the borrower's credit file.</td>
    </tr>
    <tr>
      <th>20</th>
      <td>pub_rec</td>
      <td>Number of derogatory public records</td>
    </tr>
    <tr>
      <th>21</th>
      <td>revol_bal</td>
      <td>Total credit revolving balance</td>
    </tr>
    <tr>
      <th>22</th>
      <td>revol_util</td>
      <td>Revolving line utilization rate, or the amount of credit the borrower is using relative to all available revolving credit.</td>
    </tr>
    <tr>
      <th>23</th>
      <td>total_acc</td>
      <td>The total number of credit lines currently in the borrower's credit file</td>
    </tr>
    <tr>
      <th>24</th>
      <td>initial_list_status</td>
      <td>The initial listing status of the loan. Possible values are – W, F</td>
    </tr>
    <tr>
      <th>25</th>
      <td>application_type</td>
      <td>Indicates whether the loan is an individual application or a joint application with two co-borrowers</td>
    </tr>
    <tr>
      <th>26</th>
      <td>mort_acc</td>
      <td>Number of mortgage accounts.</td>
    </tr>
    <tr>
      <th>27</th>
      <td>pub_rec_bankruptcies</td>
      <td>Number of public record bankruptcies</td>
    </tr>
  </tbody>
</table>

---
----

In [None]:
#lending_club_info = pd.read_csv("../DATA/lending_club_info.csv")

In [None]:
#lending_club_info

In [None]:
# Load the LendingClub loans CSV into the working DataFrame used by every cell below.
df = pd.read_csv("../DATA/lending_club_loan_two.csv")

In [None]:
# Preview the first rows, transposed so that each feature is shown on its own line.
df.head().T

# EDA

In [None]:
# Print a concise summary of the DataFrame (columns, dtypes, memory usage).
df.info()

In [None]:
# Descriptive statistics of the DataFrame columns.
df.describe()

In [None]:
# Count of loans per loan status (class balance of the target).
px.histogram(data_frame=df, x="loan_status")

In [None]:
# Annual income distribution, split by loan status.
px.histogram(data_frame=df, x="annual_inc", nbins=100, color="loan_status")

In [None]:
# The graph above shows us that there is a strong disparity in annual income. Let's take a deeper look at it.

In [None]:
# Box plot of annual income for each loan status.
px.box(data_frame=df, y="annual_inc", x="loan_status")

In [None]:
# Same histogram, restricted to annual incomes strictly below 1M.
px.histogram(
    df.loc[df["annual_inc"] < 1000000],
    x="annual_inc",
    color="loan_status",
    nbins=100,
)

In [None]:
# Same box plot, restricted to annual incomes strictly below 1M.
px.box(
    df.loc[df["annual_inc"] < 1000000],
    x="loan_status",
    y="annual_inc",
)

In [None]:
# What about the richest borrowers (annual income of 1M or more)?

In [None]:
# Income histogram for the borrowers with an annual income of 1M or more.
px.histogram(
    df.loc[df["annual_inc"] >= 1000000],
    x="annual_inc",
    color="loan_status",
    nbins=100,
)

<a id='remove_high_incomes'></a>
note:

In [None]:
# We may want to remove these extreme annual incomes, as there are few of them and they may add noise to the model,
#despite the fact that they carry information on loan status.

In [None]:
# Loan amount distribution, split by loan status.
px.histogram(data_frame=df, x="loan_amnt", nbins=50, color="loan_status")

In [None]:
# It shows that there is a barrier at 35k: the bank rarely seems to lend more than that.
#So the loan status above 35k should be better, as we can imagine that the bank's
#decision to lend or not is more carefully considered for such amounts.

In [None]:
# Share of "Charged Off" loans on each side of the 35k loan-amount threshold.
charged = df.loc[df["loan_status"] == "Charged Off"]

# Totals per side of the threshold (all loans, whatever their status).
total_above_35 = df.loc[df["loan_amnt"] > 35000, "loan_amnt"].count()
total_below_35 = df.loc[df["loan_amnt"] <= 35000, "loan_amnt"].count()

# Charged-off loans per side of the threshold.
nb_unpaid_above_35 = charged.loc[charged["loan_amnt"] > 35000, "loan_amnt"].count()
nb_unpaid_below_35 = charged.loc[charged["loan_amnt"] <= 35000, "loan_amnt"].count()

# Ratios expressed as percentages.
unpaid_above_35 = (nb_unpaid_above_35 / total_above_35) * 100
unpaid_below_35 = (nb_unpaid_below_35 / total_below_35) * 100

print("% of unpaid, for loan amount above 35k: {:.2f}%".format(unpaid_above_35))
print("% of unpaid, for loan amount below 35k: {:.2f}%".format(unpaid_below_35))

In [None]:
# Display the number of charged-off loans with an amount of at most 35k.
nb_unpaid_below_35

In [None]:
# Pie slices must be disjoint: the "Paid" slice is the total number of loans
# above 35k minus the charged-off ones. Passing the total itself would count
# the unpaid loans twice and understate the unpaid share.
px.pie(values=[nb_unpaid_above_35, total_above_35 - nb_unpaid_above_35],
       names=["Unpaid", "Paid"],
       title="Loan amount above 35k")

In [None]:
# Pie slices must be disjoint: the "Paid" slice is the total number of loans
# of at most 35k minus the charged-off ones. Passing the total itself would
# count the unpaid loans twice and understate the unpaid share.
px.pie(values=[nb_unpaid_below_35, total_below_35 - nb_unpaid_below_35],
       names=["Unpaid", "Paid"],
       title="Loan amount below 35k")

In [None]:
# Loan amount histogram with each bin normalised to 100%, to compare status shares per amount.
px.histogram(
    data_frame=df,
    x="loan_amnt",
    color="loan_status",
    barnorm="percent",
    nbins=40,
)

In [None]:
# Hypothesis is not verified! I took 35k as the limit, and there are indeed fewer charged-off loans above it,
#but only because of the ratio above 36k. 35k is NOT the boundary: the graph below shows us that the boundary is 36k.

In [None]:
# Loan amount against annual income, coloured by loan status.
px.scatter(data_frame=df, y="loan_amnt", x="annual_inc", color="loan_status")

In [None]:
# Share of "Charged Off" loans on each side of the 1M annual-income mark.
charged = df.loc[df["loan_status"] == "Charged Off"]

print(
    "% of unpaid, for incomes above 1M: {:.2f}%".format(
        charged.loc[charged["annual_inc"] > 1000000, "annual_inc"].count()
        / df.loc[df["annual_inc"] > 1000000, "annual_inc"].count()
        * 100
    )
)
print(
    "% of unpaid, for incomes below 1M: {:.2f}%".format(
        charged.loc[charged["annual_inc"] <= 1000000, "annual_inc"].count()
        / df.loc[df["annual_inc"] <= 1000000, "annual_inc"].count()
        * 100
    )
)

In [None]:
# Encode the target as an integer flag: 1 = "Fully Paid", 0 = any other status
# (i.e. "Charged Off"). A direct comparison avoids building a full dummy frame
# and yields the same int dtype on every pandas version (get_dummies returns
# uint8 before pandas 2.0 and bool afterwards).
# The guard makes the cell safe to re-run: once the column is numeric it is
# left untouched instead of being re-encoded.
if not pd.api.types.is_numeric_dtype(df["loan_status"]):
    df["loan_status"] = (df["loan_status"] == "Fully Paid").astype(int)

In [None]:
# Quick look at the first values of the loan_status column.
df["loan_status"].head()

In [None]:
# Correlation heatmap of the numeric features.
# Only numeric/boolean columns are correlated: since pandas 2.0,
# DataFrame.corr() no longer drops string columns silently and raises on them.
plt.figure(figsize=(16, 6))
sns.heatmap(df.select_dtypes(include=["number", "bool"]).corr(), annot=True)

In [None]:
# Bar plot of the correlation of every numeric feature with the target, sorted
# in ascending order. Only numeric/boolean columns are correlated: since
# pandas 2.0, DataFrame.corr() raises on string columns instead of dropping them.
# NOTE: `data` holds the matplotlib Axes returned by .plot(), not the values.
data = (
    df.select_dtypes(include=["number", "bool"])
    .corr()["loan_status"]
    .sort_values()
    .plot(kind="bar")
)

In [None]:
# Nothing seems to be directly correlated with the output.
#Attention: some features need to be converted to dummies to appear in the corr() output

In [None]:
# Number of loans per purpose, split by loan status.
px.histogram(data_frame=df, color="loan_status", x="purpose")

In [None]:
# Same chart with each purpose normalised to 100%, to compare status shares.
px.histogram(
    data_frame=df,
    x="purpose",
    barnorm="percent",
    color="loan_status",
)

In [None]:
# Interest rate distribution, split by loan status.
px.histogram(data_frame=df, x="int_rate", nbins=30, color="loan_status")

In [None]:
# Same chart with each interest-rate bin normalised to 100%.
px.histogram(
    data_frame=df,
    x="int_rate",
    color="loan_status",
    barnorm="percent",
    nbins=30,
)

<a id='remove_int_rate'></a>
note:

In [None]:
# The higher the interest rate, the higher the risk.
#OR it is probably the inverse: the higher the risk, the higher the interest rate.
# It is important, as including it or not could change the model.
#But if this value is computed from the other features... is there a need to keep it?

# Cleaning

* [Remove high incomes](#remove_high_incomes) ?
* Missing values
* Deal with time
* Deal with addresses -> I think I'll remove them, as even Google does not find anything with those addresses
* [Remove interest rate](#remove_int_rate) ?