# Import packages

In [1]:
import sys
import os

# Put the parent of the current working directory at the front of the module
# search path (at most once) so the `src.*` imports in later cells can resolve.
project_root = os.path.abspath("..")
if not any(entry == project_root for entry in sys.path):
    sys.path.insert(0, project_root)

In [2]:
import pandas as pd 
import plotly.express as px
import kaleido
import os

from src.constants import PATH_RAW_DATA, PATH_VISUALIZATION_EDA
from src.data_understanding import get_dataframe_info
from src.visualization_functions import (
    plot_histogram, plot_histogram_per_category, plot_boxplot, plot_boxplot_per_category, plot_violin, plot_violin_per_category,
    plot_scatter,
    plot_bar_frequency,
    plot_bar_frequency_per_category)

# 1. Dataset

## 1.1. Read data

In [3]:
# Load the raw dataset from the project-configured path. The file is decoded
# as ISO-8859-1 (Latin-1) explicitly rather than with pandas' default UTF-8.
df = pd.read_csv(PATH_RAW_DATA, encoding='ISO-8859-1')

## 1.2. DataFrame info

In [4]:
# Run the project's DataFrame overview helper on the raw data.
# NOTE(review): the output recorded for this cell is
# "TypeError: 'tuple' object is not callable". That error typically means a
# tuple-valued attribute (e.g. `df.shape`) is being called like a function,
# or that `get_dataframe_info` itself is bound to a tuple — check
# src.data_understanding; TODO confirm the actual cause.
get_dataframe_info(df)

TypeError: 'tuple' object is not callable

# 2. Univariate Analysis

In [None]:
# Column groups used by the analysis cells below.

# Columns analysed as numerical features (histograms, boxplots, correlation).
numerical_columns = [
    'amount',
    'annual_income',
    'debt_to_income',
    'interest_rate',
    'monthly_payment',
    'num_bankrupts',
    'num_mortgages',
    'num_open_credit',
    'num_records',
    'num_total_credit',
    'revol_balance',
    'revol_util',
]

# Columns analysed as categorical features (value counts, frequency bars).
categorical_columns = [
    'application_type',
    'emp_length',
    'grade',
    'home_status',
    'income_verif_status',
    'purpose',
    'sub_grade',
    'term',
]

# Remaining columns, kept out of both groups above.
other_columns = [
    'id',
    'address',
    'emp_title',
    'date_funded',
    'earliest_cr_line',
]

## 2.1. Numerical data

### 2.1.1. Summary statistics

### 2.1.2. Histogram

In [None]:
# Draw one histogram per numerical column.
# The third positional argument is passed as True for every plot; presumably a
# save/export flag — confirm against src.visualization_functions.
for numeric_col in numerical_columns:
    plot_histogram(df, numeric_col, True)

### 2.1.3. Boxplot

In [None]:
# Draw one boxplot per numerical column.
# The trailing True is forwarded unchanged to the helper; its meaning is
# defined in src.visualization_functions (presumably a save flag — confirm).
for numeric_col in numerical_columns:
    plot_boxplot(df, numeric_col, True)

## 2.2. Categorical Data

In [None]:
# For every categorical column, print its name, the frequency of each value,
# and how many distinct values it holds.
for category_col in categorical_columns:
    column_values = df[category_col]
    print(f"Column: {category_col}")
    print(column_values.value_counts())
    print(f"Number of unique values: {column_values.nunique()}\n")

In [None]:
# Draw one frequency bar chart per categorical column.
# The trailing True is forwarded unchanged to the helper (presumably a save
# flag — confirm against src.visualization_functions).
for category_col in categorical_columns:
    plot_bar_frequency(df, category_col, True)

# 3. Multivariate Analysis

## 3.1. Scatter Plots

In [None]:
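# NOTE: the pair plot is disabled. seaborn (`sns`) and matplotlib (`plt`) are
# not imported in this notebook, so import them before re-enabling these lines.
# sns.pairplot(df[numerical_columns])
# plt.show()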

## 3.2. Correlation Matrix

In [None]:
# Correlation heatmap over the numerical columns only.
# NOTE(review): `plot_correlation_heatmap` is not among the names imported in
# the visible import cells, so unless it is defined elsewhere this cell raises
# NameError. Presumably it belongs with the other plot_* helpers in
# src.visualization_functions — confirm and add it to that import.
plot_correlation_heatmap(df[numerical_columns], True)

# 4. Dependent variable

In [None]:
# Check the distribution of the `default` column, which the cells in this
# section use as the grouping variable. It is not listed in any of the
# column groups defined earlier in the notebook.
plot_bar_frequency(df, 'default', True)

In [None]:
# Compare each numerical column across the values of `default` with one
# grouped boxplot per column. The trailing True is forwarded unchanged to the
# helper (presumably a save flag — confirm).
for numeric_col in numerical_columns:
    plot_boxplot_per_category(df, numeric_col, 'default', True)

In [None]:
# Check numerical distributions per category using histograms (currently disabled)
# for col in numerical_columns:
#     plot_histogram_per_category(df, col, 'default')

In [None]:
# Compare the value frequencies of each categorical column across the values
# of `default`, one chart per column. The trailing True is forwarded unchanged
# to the helper (presumably a save flag — confirm).
for category_col in categorical_columns:
    plot_bar_frequency_per_category(df, category_col, 'default', True)

# 5. Check missing values and outliers

In [None]:
# Report null / missing values over the whole DataFrame.
# NOTE(review): `check_missing_values` is not imported in the visible import
# cells, so unless it is defined elsewhere this cell raises NameError.
# Presumably it is a project helper (e.g. in src.data_understanding) — confirm
# the module and add the import.
check_missing_values(df)

In [None]:
# Check for outliers, restricted to the numerical columns.
# NOTE(review): `check_outliers` is not imported in the visible import cells,
# so unless it is defined elsewhere this cell raises NameError. Presumably it
# is a project helper (e.g. in src.data_understanding) — confirm the module
# and add the import.
check_outliers(df[numerical_columns])