In [None]:
import pandas as pd
import numpy as np
import psutil
import matplotlib.pyplot as plt
import statsmodels.api as sm
from linearmodels.panel import PanelOLS
from pathlib import Path
import math
import os, sys

# Put the parent of the notebook's working directory at the front of the
# import search path so that project-local modules can be imported below.
notebook_dir = Path.cwd()
project_root = notebook_dir.parent
sys.path.insert(0, f"{project_root}")
import importlib
import user_function as uf

In [None]:
# Data locations. The defaults are the original machine-specific folders; set
# the RA5_DATA_DIR / RA5_INTERMEDIATE_DIR environment variables to run the
# notebook on another machine without editing this cell.
path0 = os.environ.get('RA5_DATA_DIR') or '/mnt/sda1/RA5/data'  # input parquet files
path1 = os.environ.get('RA5_INTERMEDIATE_DIR') or '/mnt/sda1/RA5/intermediate/siyoung'  # intermediate files

This notebook conducts a panel event-study analysis.

In [None]:
# Load the birth-household KCB panel from the data folder and run it through
# the project's cleaning helper (uf.clean_data).
directory = os.path.join(path0, 'shc_birth_KCB_all.parquet')
df_kcb = pd.read_parquet(directory).pipe(uf.clean_data)

In [None]:
# Sanity check: show the (rows, columns) shape of the frame returned by uf.clean_data.
df_kcb.shape
# Result: (16128856, 135)

In [None]:
# Find each household's first flagged birth month and attach it to the panel.
# Rows with shc_ch_birth == 1 are the household-months carrying a birth flag.
df_hshd_birth_months = (
    df_kcb.loc[df_kcb['shc_ch_birth'] == 1, ['HSHD_SEQNO', 'BS_YR_MON']]
    .drop_duplicates()
    .rename(columns={'BS_YR_MON': 'BS_YR_MON_birth'})
    .sort_values(['HSHD_SEQNO', 'BS_YR_MON_birth'])
)

# Months elapsed since the household's previous flagged month (NaN for the
# household's first flag). A running month index plus a grouped diff gives the
# same numbers as a per-household Python lambda, without the per-group overhead.
month_index = (df_hshd_birth_months['BS_YR_MON_birth'].dt.year * 12
               + df_hshd_birth_months['BS_YR_MON_birth'].dt.month)
diff_months = month_index.groupby(df_hshd_birth_months['HSHD_SEQNO']).diff()

# Keep the first flag of every household and any later flag that is more than
# 10 months after the previous one.
# NOTE(review): presumably closer flags are repeats of the same birth - confirm.
is_new_birth = diff_months.isna() | (diff_months > 10)
df_hshd_birth_months = df_hshd_birth_months.loc[is_new_birth].copy()

# Number the retained births within each household and keep only the first.
df_hshd_birth_months['birth_order'] = df_hshd_birth_months.groupby('HSHD_SEQNO').cumcount() + 1
df_hshd_birth_months = df_hshd_birth_months[df_hshd_birth_months['birth_order'] == 1]

# One birth row per household is expected here; validate guards against a
# duplicated key silently multiplying the panel rows.
df_kcb = df_kcb.merge(df_hshd_birth_months, on='HSHD_SEQNO', how='left',
                      validate='many_to_one')

In [None]:
# Restrict the birth panel to two-person households made up of exactly one
# woman and one man aged 20-40 (inclusive) in the given month, then compute
# event time relative to the household's first birth month.
df_kcb['is_m_20_40'] = (df_kcb['SEX'] == 'Male') & df_kcb['AGE'].between(20, 40)
df_kcb['is_f_20_40'] = (df_kcb['SEX'] == 'Female') & df_kcb['AGE'].between(20, 40)

# Build the household-month grouping once and reuse it for all three counts.
hshd_month = df_kcb.groupby(['HSHD_SEQNO', 'BS_YR_MON'])
n_female = hshd_month['is_f_20_40'].transform('sum')
n_male = hshd_month['is_m_20_40'].transform('sum')
n_members = hshd_month['KEY'].transform('count')  # rows with a non-null KEY

df_kcb['hshd_is_two_parents'] = (n_female == 1) & (n_male == 1)
df_kcb['n_obs'] = n_members

# .copy() makes the filtered frame independent, so the column assignment below
# does not trigger SettingWithCopyWarning.
df_kcb = df_kcb.loc[df_kcb['hshd_is_two_parents'] & (df_kcb['n_obs'] == 2)].copy()

# Signed number of calendar months between the observation month and the
# household's first birth month (negative before the birth, NaN if unknown).
df_kcb['months_from_birth'] = ((df_kcb['BS_YR_MON'].dt.year - df_kcb['BS_YR_MON_birth'].dt.year) * 12 +
                               (df_kcb['BS_YR_MON'].dt.month - df_kcb['BS_YR_MON_birth'].dt.month))

df_kcb = df_kcb.sort_values(['HSHD_SEQNO', 'BS_YR_MON', 'KEY'])

In [None]:
# Random households: the comparison group for the birth households.
directory = os.path.join(path0, 'random_hshd_KCB_all_1.parquet')
ds_random_kcb = pd.read_parquet(directory)

# Keep only the columns that also exist in the birth panel so the two frames
# can be stacked later.
cols = [col for col in ds_random_kcb.columns if col in df_kcb.columns]
ds_random_kcb = ds_random_kcb[cols]

# Drop random households that also appear in the birth panel.
# NOTE(review): df_kcb has already been filtered to two-parent household-months
# at this point, so birth households removed by that filter are not excluded
# here - confirm this is intended.
birth_hshd_list = df_kcb['HSHD_SEQNO'].unique()
ds_random_kcb = ds_random_kcb[~ds_random_kcb['HSHD_SEQNO'].isin(birth_hshd_list)]
ds_random_kcb = uf.clean_data(ds_random_kcb)

# Same household definition as the birth sample: exactly one woman and one man
# aged 20-40 and exactly two records in the household-month.
ds_random_kcb['is_f_20_40'] = (ds_random_kcb['SEX'] == 'Female') & ds_random_kcb['AGE'].between(20, 40)
ds_random_kcb['is_m_20_40'] = (ds_random_kcb['SEX'] == 'Male') & ds_random_kcb['AGE'].between(20, 40)

hshd_month = ds_random_kcb.groupby(['HSHD_SEQNO', 'BS_YR_MON'])
n_female = hshd_month['is_f_20_40'].transform('sum')
n_male = hshd_month['is_m_20_40'].transform('sum')
n_members = hshd_month['KEY'].transform('count')

ds_random_kcb['hshd_is_two_parents'] = (n_female == 1) & (n_male == 1)
ds_random_kcb['n_obs'] = n_members

# Fix: the filter previously referenced 'hshd_is_two_person', a column this
# cell never creates; the flag built above is 'hshd_is_two_parents'.
ds_random_kcb = ds_random_kcb.loc[ds_random_kcb['hshd_is_two_parents'] &
                                  (ds_random_kcb['n_obs'] == 2)].copy()
ds_random_kcb = ds_random_kcb.sort_values(['HSHD_SEQNO', 'BS_YR_MON', 'KEY'])

# Control households have no birth. -1 is the event time left out of the
# event-dummy set built later, so all of their event dummies are 0.
ds_random_kcb['months_from_birth'] = -1

In [None]:
# Event window: number of months kept before (LAG) and after (LEAD) the birth.
LAG = 6
LEAD = 6

# Household-level controls used in the regressions.
control_var = ['ICM', 'TOT_ASST']

# Raw KCB column code -> readable label used for regression output and titles.
new_var_name = {
    'CD_USE_AMT': 'Card Spending AMT',
    'SIN_CD_USE_AMT': 'Credit Card Spending AMT',
    'CHK_CD_USE_AMT': 'Debit Card Spending AMT',
    'CD_FUL_USE_AMT': 'Lump-sum Payment AMT',
    'CD_INSTL_USE_AMT': 'Installment Payment AMT',
    'CD_CA_USE_AMT': 'Cash Advance AMT',
    'CD_ABRD_USE_AMT': 'Overseas Card Spending AMT',
}

# Outcomes of interest: every relabelled spending variable, in mapping order.
var_interested = list(new_var_name.values())

# Stack the birth households on top of the random households, then relabel.
ds_combined = pd.concat([df_kcb, ds_random_kcb], ignore_index=True)
ds_combined.rename(columns=new_var_name, inplace=True)

In [None]:
# Collapse the person-level rows to one row per household-month: outcomes and
# controls are summed over the rows, event time takes the maximum.
agg_dict = dict.fromkeys(var_interested + control_var, 'sum')
agg_dict['months_from_birth'] = 'max'

household_month_keys = ['HSHD_SEQNO', 'BS_YR_MON']
df_hshd = ds_combined.groupby(household_month_keys).agg(agg_dict).reset_index()

# Event time, with everything outside [-LAG, LEAD] binned into the endpoints.
evt = df_hshd['months_from_birth']
evt_cap = evt.clip(lower=-LAG, upper=LEAD)

# One 0/1 dummy per event time in the window; k = -1 is left out and serves
# as the reference period.
ks = [k for k in range(-LAG, LEAD + 1) if k != -1]
df_hshd = df_hshd.assign(**{f"e_{k}": evt_cap.eq(k).astype(int) for k in ks})

# (entity, time) MultiIndex, sorted, as the panel estimator expects.
df_hshd = df_hshd.set_index(household_month_keys).sort_index()

In [None]:
# Two-way fixed-effects event-study regression for every outcome, followed by
# a plot of the event-time coefficients with 95% confidence bands.
event_cols = [c for c in df_hshd.columns if c.startswith('e_')]
X = df_hshd[event_cols + control_var]

REF_PERIOD = -1  # event time omitted from the dummies; coefficients are relative to it
Z_95 = 1.96      # normal critical value for a two-sided 95% interval

# Event times parsed from the dummy names ('e_-3' -> -3). These do not depend
# on the outcome, so they are computed once outside the loop.
k_vals = np.array([int(c.split('_')[1]) for c in event_cols])
order = np.argsort(k_vals)
k_sorted = k_vals[order]

for var in var_interested:

    Y = df_hshd[var]
    # Household and month fixed effects; standard errors clustered by entity
    # (the first index level, HSHD_SEQNO) via the estimator's built-in option.
    res = (PanelOLS(
                    Y, X,
                    entity_effects=True,
                    time_effects=True
                    )
        .fit(cov_type='clustered', cluster_entity=True))
    print(res.summary)

    # Plot

    b = res.params[event_cols]
    se = res.std_errors[event_cols]
    b_sorted = b.values[order]
    se_sorted = se.values[order]

    # Add the omitted reference period as an exact zero so the plotted path is
    # anchored at the normalisation point instead of interpolating across it.
    if REF_PERIOD in k_sorted:
        k_plot, b_plot, se_plot = k_sorted, b_sorted, se_sorted
    else:
        pos = int(np.searchsorted(k_sorted, REF_PERIOD))
        k_plot = np.insert(k_sorted, pos, REF_PERIOD)
        b_plot = np.insert(b_sorted, pos, 0.0)
        se_plot = np.insert(se_sorted, pos, 0.0)

    ci_low = b_plot - Z_95 * se_plot
    ci_high = b_plot + Z_95 * se_plot

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.axhline(0, linestyle='--')
    ax.axvline(REF_PERIOD, linestyle=':')
    ax.plot(k_plot, b_plot, marker='o')
    ax.fill_between(k_plot, ci_low, ci_high, alpha=0.3)
    ax.set_xlabel('Event time (t − start)')
    ax.set_ylabel('Effect on y vs. k = −1')
    ax.set_title(f'{var}')
    fig.tight_layout()
    plt.show()
    plt.close(fig)  # release the figure so figures do not pile up across outcomes