# EDA: Financial Generation Z in Indonesia
Notebook ini memuat langkah-langkah EDA yang reproducible untuk dataset kompetisi.

In [None]:
# Import libraries
import os, json, subprocess
import numpy as np
import pandas as pd
import matplotlib
# Use inline backend for notebooks; switch to Agg if exporting only.
# NOTE(review): the original `try:` had no body; this is a reconstruction
# based on the comment above -- confirm it matches the intended behavior.
try:
    # `get_ipython` only exists when running under IPython/Jupyter.
    get_ipython().run_line_magic('matplotlib', 'inline')
except (NameError, ImportError):
    # Plain Python (e.g. script export): use the non-interactive backend so
    # figures can still be saved to disk without a display.
    matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from pathlib import Path

def find_repo_root(start: str | None = None):
















os.makedirs(fig_dir, exist_ok=True)fig_dir = os.path.join(base, 'notebooks', 'figures')DATA = os.path.join(base, 'DATASET')base = find_repo_root()    return start            return str(parent)        if (parent/'.git').exists() or (parent/'DATASET').exists():    for parent in [p] + list(p.parents):    p = Path(start).resolve()        pass    except Exception:        return root        root = subprocess.check_output(['git','rev-parse','--show-toplevel'], cwd=start, stderr=subprocess.DEVNULL).decode().strip()    try:    start = start or os.getcwd()base = '/home/timtam/Documents/code/Dashboard_WIP'
DATA = os.path.join(base, 'DATASET')
fig_dir = os.path.join(base, 'notebooks', 'figures')
os.makedirs(fig_dir, exist_ok=True)

In [None]:
# Helper to read semicolon-delimited CSV with common encodings
def read_csv_any(path):
    """Load a ``;``-separated CSV, trying several text encodings in order.

    Each candidate encoding is attempted in turn; the first one that decodes
    without a ``UnicodeDecodeError`` wins. If every candidate fails to decode,
    a final attempt is made with pandas' Python parsing engine and the
    default encoding.
    """
    candidate_encodings = ('utf-8', 'utf-8-sig', 'latin-1')
    for codec in candidate_encodings:
        try:
            frame = pd.read_csv(path, encoding=codec, sep=';')
        except UnicodeDecodeError:
            # Wrong guess for this file; move on to the next encoding.
            continue
        else:
            return frame
    return pd.read_csv(path, engine='python', sep=';')
# Build the three input paths under the DATASET directory, then load each
# table with the encoding-tolerant reader.
survey_path = os.path.join(DATA, 'GenZ_Financial_Literacy_Survey.csv')
profile_path = os.path.join(DATA, 'GenZ_Financial_Profile.csv')
regional_path = os.path.join(DATA, 'Regional_Economic_Indicators.csv')

survey = read_csv_any(survey_path)
profile = read_csv_any(profile_path)
regional = read_csv_any(regional_path)

# Cell output: (rows, columns) of each table as a quick sanity check.
survey.shape, profile.shape, regional.shape

In [None]:
# Cleaning: convert monetary strings to numeric (new columns)
def to_num(s):
    """Convert a monetary value to a number.

    Values that are already numeric are returned unchanged; stringifying a
    float such as ``1500000.0`` and stripping the ``.`` would otherwise turn
    it into ``15000000``. Anything else is converted to text, has commas,
    periods and spaces removed (all treated as thousands separators), and is
    parsed with ``pd.to_numeric``.

    Args:
        s: A scalar cell value (string, number, ``None``/NaN, ...).

    Returns:
        The numeric value, or NaN when the text cannot be parsed.
    """
    # bool is a subclass of int; keep it on the text path so it still yields NaN.
    if isinstance(s, (int, float, np.number)) and not isinstance(s, bool):
        return s
    cleaned = str(s).translate(str.maketrans('', '', ',. '))
    return pd.to_numeric(cleaned, errors='coerce')
# Derive numeric income/expenditure columns from the raw columns, leaving the
# originals untouched. Each cell is cleaned with `to_num`; the outer
# `pd.to_numeric(..., errors='coerce')` then gives the column a numeric dtype
# and maps any leftover unparseable values to NaN.
if 'Est. Monthly Income' in profile.columns:
    profile['income_num'] = pd.to_numeric(
        profile['Est. Monthly Income'].map(to_num), errors='coerce'
    )
if 'Est. Monthly Expenditure' in profile.columns:
    profile['expense_num'] = pd.to_numeric(
        profile['Est. Monthly Expenditure'].map(to_num), errors='coerce'
    )
# Financial anxiety proxy score: row-wise mean of the negative-affect items
# that are actually present in the profile table.
anx_items = [
    'Because of my money situation, I feel I will never have the things I want in life',
    'I am behind with my finances',
    'My finances control my life',
    'Whenever I feel in control of my finances, something happens that sets me back',
    'I am unable to enjoy life because I obsess too much about money',
]
available_columns = profile.columns
cols = [item for item in anx_items if item in available_columns]
# Coerce each item column to numeric (unparseable answers become NaN) before
# averaging across the items of each respondent.
anx_numeric = profile[cols].apply(pd.to_numeric, errors='coerce')
profile['financial_anxiety_score'] = anx_numeric.mean(axis=1)
# Saving rate: share of income left after expenditure,
# (income - expense) / income. Zero income produces +/-inf, which is mapped
# to NaN so it does not distort later statistics.
if {'income_num','expense_num'}.issubset(profile.columns):
    with np.errstate(divide='ignore', invalid='ignore'):
        profile['saving_rate'] = (profile['income_num'] - profile['expense_num']) / profile['income_num']
    profile['saving_rate'] = profile['saving_rate'].replace([np.inf, -np.inf], np.nan)
else:
    profile['saving_rate'] = np.nan
# Preview only the derived columns that exist: income_num / expense_num are
# created conditionally, and selecting a missing column would raise KeyError.
preview_cols = [
    name
    for name in ['financial_anxiety_score', 'income_num', 'expense_num', 'saving_rate']
    if name in profile.columns
]
profile[preview_cols].head()

In [None]:
# Descriptive stats for every numeric column, including the 5th/95th
# percentiles alongside the quartiles.
numeric_part = profile.select_dtypes(include=[np.number])
num_desc = numeric_part.describe(percentiles=[.05, .25, .5, .75, .95])
num_desc.head()

In [None]:
# Categorical distributions: for each available categorical column, keep the
# ten most frequent values (NaN counted as its own value) as raw counts and
# as a percentage of all rows.
cats = {}
for c in ['Gender','Province of Origin','Last Education','Job','Residence Status']:
    if c not in profile.columns:
        continue
    vc = profile[c].value_counts(dropna=False)
    share = (vc / len(profile) * 100).round(2)
    cats[c] = {'top': vc.head(10), 'pct': share.head(10)}
cats

In [None]:
# Correlation matrix (selected): Pearson correlations between the derived
# variables that exist, drawn as an annotated heatmap and saved to fig_dir.
wanted = ['financial_anxiety_score','income_num','expense_num','saving_rate']
corr_cols = [name for name in wanted if name in profile.columns]
corr = profile[corr_cols].corr(method='pearson')

fig, ax = plt.subplots(figsize=(6,5))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='BrBG', center=0, ax=ax)
ax.set_title('Korelasi Variabel Utama')
fig.tight_layout()
heatmap_path = os.path.join(fig_dir, 'correlation_heatmap.png')
fig.savefig(heatmap_path, dpi=200)
corr

In [None]:
# Plots: distribution & scatter
# Income histogram: drawn only when at least one non-missing income exists.
if 'income_num' in profile.columns and profile['income_num'].notna().sum()>0:
    plt.figure(figsize=(8,5))
    sns.histplot(profile['income_num'].dropna(), bins=40, kde=True)
    plt.title('Distribusi Pendapatan Bulanan (IDR)')
    plt.tight_layout()
    plt.savefig(os.path.join(fig_dir, 'distribution_income.png'), dpi=200)
# Anxiety vs income scatter with an optional linear trend line.
if {'financial_anxiety_score','income_num'}.issubset(profile.columns):
    plt.figure(figsize=(6,5))
    sns.scatterplot(data=profile, x='income_num', y='financial_anxiety_score', alpha=0.4)
    try:
        sns.regplot(data=profile, x='income_num', y='financial_anxiety_score', scatter=False, color='red')
    except Exception as exc:
        # The trend line is best-effort: keep the scatter plot, but report
        # why the regression overlay was skipped instead of hiding it.
        print(f'Skipping regression line for anxiety vs income: {exc!r}')
    plt.title('Anxiety vs Pendapatan')
    plt.tight_layout()
    plt.savefig(os.path.join(fig_dir, 'fa_vs_income_scatter.png'), dpi=200)