# Import packages

In [1]:
import sys
import os

# Make the parent of the current working directory importable so that the
# `src.*` modules imported further down can be resolved.
project_root = os.path.abspath("..")
if sys.path.count(project_root) == 0:
    # Prepend (rather than append) so this location is searched first.
    sys.path.insert(0, project_root)

In [2]:
import pandas as pd 
import plotly.express as px
import kaleido
import os

from src.constants import PATH_RAW_DATA, PATH_VISUALIZATION_EDA
from src.data_cleaning import check_missing_values, check_outliers
from src.visualization_functions import (
    plot_histogram, plot_histogram_per_category, plot_boxplot, plot_boxplot_per_category, plot_violin, plot_violin_per_category,
    plot_scatter,
    plot_bar_frequency,
    plot_bar_frequency_per_category,
    plot_correlation_heatmap)

# 1. Dataset

## 1.1. Read data

In [3]:
# Load the raw dataset from the path configured in src.constants.
# The file is decoded as ISO-8859-1 rather than pandas' default UTF-8
# (presumably because the raw CSV contains non-UTF-8 bytes — TODO confirm).
df = pd.read_csv(PATH_RAW_DATA, encoding='ISO-8859-1')

In [8]:
# Preview the first rows of the raw data (pandas shows 5 rows by default).
df.head()

Unnamed: 0,id,address,amount,annual_income,application_type,date_funded,debt_to_income,earliest_cr_line,emp_length,emp_title,...,num_mortgages,num_open_credit,num_records,num_total_credit,purpose,revol_balance,revol_util,sub_grade,term,default
0,135197,"PSC 7791, Box 3858\r\r\nAPO AA 93700",13700,45000.0,INDIVIDUAL,Jun-2014,12.32,Aug-2000,5 years,financial analyst,...,4.0,8,0.0,23.0,debt_consolidation,11751,56.0,C2,60 months,1
1,277103,"502 Barrera Dam\r\r\nBlackbury, OR 00813",10000,,INDIVIDUAL,Aug-2015,3.56,Dec-1997,10+ years,secretary/Fingerprinting Technician,...,0.0,5,1.0,10.0,credit_card,7092,34.4,B1,36 months,0
2,384924,22486 Leonard Views Suite 601\r\r\nSouth Thoma...,10000,47620.0,INDIVIDUAL,Aug-2012,16.25,Apr-2002,9 years,City of Hillsboro,...,3.0,5,0.0,8.0,debt_consolidation,8343,92.7,C4,36 months,0
3,181727,"22309 Gould Freeway Apt. 620\r\r\nLynnberg, MT...",10000,26000.0,INDIVIDUAL,May-2014,6.6,Oct-1999,3 years,RPW,...,7.0,11,0.0,31.0,home_improvement,1408,8.2,C2,60 months,1
4,148886,27523 Chad Flats Suite 920\r\r\nEast Nathanlan...,23300,119000.0,INDIVIDUAL,Jan-2016,22.53,Feb-2003,8 years,Manager,...,2.0,11,0.0,43.0,debt_consolidation,51887,78.6,E3,60 months,0


## 1.2. DataFrame info

In [4]:
# Shape of the DataFrame as (number of rows, number of columns)
df.shape

(70954, 26)

In [5]:
# Per-column overview: non-null counts and dtypes (useful for spotting
# columns with missing values before the dedicated check further down)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70954 entries, 0 to 70953
Data columns (total 26 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   70954 non-null  int64  
 1   address              70954 non-null  object 
 2   amount               70954 non-null  int64  
 3   annual_income        66492 non-null  float64
 4   application_type     70954 non-null  object 
 5   date_funded          70954 non-null  object 
 6   debt_to_income       70954 non-null  float64
 7   earliest_cr_line     70954 non-null  object 
 8   emp_length           66363 non-null  object 
 9   emp_title            66898 non-null  object 
 10  grade                70954 non-null  object 
 11  home_status          53171 non-null  object 
 12  income_verif_status  70954 non-null  object 
 13  interest_rate        70954 non-null  float64
 14  monthly_payment      68441 non-null  float64
 15  num_bankrupts        66284 non-null 

# 2. Univariate Analysis

In [6]:
# Group the feature columns by type; the analyses below iterate over these lists.
numerical_columns = [
    "amount",
    "annual_income",
    "debt_to_income",
    "interest_rate",
    "monthly_payment",
    "num_bankrupts",
    "num_mortgages",
    "num_open_credit",
    "num_records",
    "num_total_credit",
    "revol_balance",
    "revol_util",
]

categorical_columns = [
    "application_type",
    "emp_length",
    "grade",
    "home_status",
    "income_verif_status",
    "purpose",
    "sub_grade",
    "term",
]

# Columns deliberately left out of both lists above.
other_columns = [
    "id",
    "address",
    "emp_title",
    "date_funded",
    "earliest_cr_line",
]

## 2.1. Numerical data

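### 2.1.1. Summary statistics

**TODO:** this section is currently empty — add a summary table of the numerical columns (e.g. `df[numerical_columns].describe()`).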

### 2.1.2. Histogram

In [7]:
# Draw one histogram per numerical feature.
# NOTE(review): the third positional argument (True) is presumably a
# save/export flag — confirm against src.visualization_functions.
for num_col in numerical_columns:
    plot_histogram(df, num_col, True)

### 2.1.3. Boxplot

In [8]:
# Draw one boxplot per numerical feature.
# NOTE(review): the third positional argument (True) is presumably a
# save/export flag — confirm against src.visualization_functions.
for num_col in numerical_columns:
    plot_boxplot(df, num_col, True)

## 2.2. Categorical data

In [9]:
# For every categorical feature, print the frequency of each level followed
# by the number of distinct levels.
for cat_col in categorical_columns:
    level_counts = df[cat_col].value_counts()
    n_levels = df[cat_col].nunique()
    print(f"Column: {cat_col}")
    print(level_counts)
    print(f"Number of unique values: {n_levels}\n")

Column: application_type
application_type
INDIVIDUAL    70813
DIRECT_PAY       75
JOINT            66
Name: count, dtype: int64
Number of unique values: 3

Column: emp_length
emp_length
10+ years    22183
2 years       6160
< 1 year      5648
3 years       5516
5 years       4729
1 year        4494
4 years       4287
7 years       3712
6 years       3659
8 years       3296
9 years       2679
Name: count, dtype: int64
Number of unique values: 11

Column: grade
grade
C    25880
D    15463
B    14317
E     7685
A     3915
F     2975
G      719
Name: count, dtype: int64
Number of unique values: 7

Column: home_status
home_status
MORTGAGE    25818
RENT        22185
OWN          5147
OTHER          15
NONE            4
ANY             2
Name: count, dtype: int64
Number of unique values: 6

Column: income_verif_status
income_verif_status
Verified           26969
Source Verified    24059
Not Verified       19926
Name: count, dtype: int64
Number of unique values: 3

Column: purpose
purpose
debt

In [10]:
# Draw one frequency bar chart per categorical feature.
# NOTE(review): the third positional argument (True) is presumably a
# save/export flag — confirm against src.visualization_functions.
for cat_col in categorical_columns:
    plot_bar_frequency(df, cat_col, True)

# 3. Multivariate Analysis

In [13]:
# Correlation heatmap restricted to the numerical features.
numeric_features = df[numerical_columns]
plot_correlation_heatmap(numeric_features, True)

# 4. Dependent variable

In [14]:
# Frequency bar chart of the `default` column (the dependent variable of
# this section).
plot_bar_frequency(df, "default", True)

In [15]:
# Compare each numerical feature's distribution across the values of `default`
# (one grouped boxplot per feature).
for num_col in numerical_columns:
    plot_boxplot_per_category(df, num_col, "default", True)

In [16]:
# Compare the level frequencies of each categorical feature across the values
# of `default` (one grouped bar chart per feature).
for cat_col in categorical_columns:
    plot_bar_frequency_per_category(df, cat_col, "default", True)

# 5. Check missing values and outliers

In [4]:
# Missing-value report for every column of the raw DataFrame; the helper
# prints the percentage and count of nulls per column (see output below).
check_missing_values(df)

Missing values:

id: 0.0% (0/70954)
address: 0.0% (0/70954)
amount: 0.0% (0/70954)
annual_income: 6.289% (4462/70954)
application_type: 0.0% (0/70954)
date_funded: 0.0% (0/70954)
debt_to_income: 0.0% (0/70954)
earliest_cr_line: 0.0% (0/70954)
emp_length: 6.47% (4591/70954)
emp_title: 5.716% (4056/70954)
grade: 0.0% (0/70954)
home_status: 25.063% (17783/70954)
income_verif_status: 0.0% (0/70954)
interest_rate: 0.0% (0/70954)
monthly_payment: 3.542% (2513/70954)
num_bankrupts: 6.582% (4670/70954)
num_mortgages: 8.577% (6086/70954)
num_open_credit: 0.0% (0/70954)
num_records: 6.441% (4570/70954)
num_total_credit: 2.872% (2038/70954)
purpose: 0.0% (0/70954)
revol_balance: 0.0% (0/70954)
revol_util: 1.87% (1327/70954)
sub_grade: 0.0% (0/70954)
term: 0.0% (0/70954)
default: 0.0% (0/70954)



In [7]:
# Outlier report, limited to the numerical columns.
# NOTE(review): the outlier criterion is defined in src.data_cleaning and is
# not visible here — confirm which rule is used before interpreting the rates.
numeric_subset = df[numerical_columns]
check_outliers(numeric_subset)

Outliers:

amount: 0.038% (27/70954)
annual_income: 3.99% (2831/70954)
debt_to_income: 0.089% (63/70954)
interest_rate: 1.023% (726/70954)
monthly_payment: 3.012% (2137/70954)
num_bankrupts: 11.653% (8268/70954)
num_mortgages: 1.672% (1186/70954)
num_open_credit: 2.713% (1925/70954)
num_records: 14.873% (10553/70954)
num_total_credit: 1.649% (1170/70954)
revol_balance: 5.192% (3684/70954)
revol_util: 0.004% (3/70954)

