In [None]:
import os
import pandas as pd
import json
from datetime import datetime
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Load the JSON-lines inputs (one record per line), keeping only the columns
# that the later merges use where a column list is given.
params_df = pd.read_json(
    "json_outputs/customer_params_df_clean.json",
    lines=True,
)
age_df = pd.read_json(
    "json_outputs/age_analysis_clean.json",
    lines=True,
).loc[:, ['CUSTOMER_NUMBER', 'TOTAL_DUE', 'OVERDUE_RATIO']]
payments_df = pd.read_json(
    "json_outputs/payment_lines_clean.json",
    lines=True,
)
rep_df = pd.read_json(
    "json_outputs/representatives_clean.json",
    lines=True,
).loc[:, ['REP_CODE', 'REP_DESC', 'COMM_METHOD', 'COMMISSION', 'REP_GROUP']]
time_df = pd.read_json(
    "json_outputs/dim_time.json",
    lines=True,
)
# NOTE(review): a later cell re-reads customer_df_clean.json without this
# column restriction and rebinds customer_df, so this subset is not what the
# merges end up using -- confirm which of the two loads is intended.
customer_df = pd.read_json(
    "json_outputs/customer_df_clean.json",
    lines=True,
).loc[:, [
    'CUSTOMER_NUMBER', 'CCAT_CODE', 'REGION_CODE', 'REP_CODE',
    'SETTLE_TERMS', 'NORMAL_PAYTERMS', 'DISCOUNT', 'CREDIT_LIMIT',
]]

In [None]:
# Output locations, resolved against the current working directory.
csv_folder = os.path.join(os.getcwd(), "csv_outputs")
json_folder = os.path.join(os.getcwd(), "json_outputs")

# Create the folders up front: to_csv / to_json do not create missing parent
# directories, so the final export would otherwise fail on a fresh checkout.
os.makedirs(csv_folder, exist_ok=True)
os.makedirs(json_folder, exist_ok=True)

In [None]:
# Customer master (fact table) enriched with representative attributes.
# The file is re-read in full here, which rebinds customer_df and makes this
# cell safe to re-run. The left join keeps every customer row even when its
# REP_CODE has no match in rep_df.
# NOTE(review): this replaces the column-restricted customer_df loaded in the
# first cell -- confirm the full column set is what should be exported.
customer_df = (
    pd.read_json("json_outputs/customer_df_clean.json", lines=True)
    .merge(rep_df, on="REP_CODE", how="left")
)

In [None]:
# Core customer data: customer master + parameter rows + ageing figures.
# Both joins are left joins on CUSTOMER_NUMBER, so no customer row is dropped.
# NOTE(review): if params_df holds several rows per customer, the first join
# yields one output row per (customer, parameter) pair -- confirm that this
# fan-out is intended.
merged_df = (
    customer_df
    .merge(
        params_df[['CUSTOMER_NUMBER', 'PARAMETER', 'PARAMETER_GROUP']],
        on="CUSTOMER_NUMBER",
        how="left",
    )
    .merge(age_df, on="CUSTOMER_NUMBER", how="left")
)

# Account status: the 'Account Status' parameter rows, de-duplicated and
# attached as a dedicated ACCOUNT_STATUS column.
account_status_df = (
    params_df
    .loc[params_df['PARAMETER_GROUP'] == 'Account Status', ['CUSTOMER_NUMBER', 'PARAMETER']]
    .drop_duplicates()
    .rename(columns={"PARAMETER": "ACCOUNT_STATUS"})
)
merged_df = merged_df.merge(account_status_df, on='CUSTOMER_NUMBER', how='left')

In [None]:
# Bring FIN_PERIOD to the same integer YYYYMM form in both frames so they can
# be joined on it.
# NOTE(review): this overwrites the columns in place, so re-running the cell
# feeds the already-converted integers back into pd.to_datetime -- restart
# and run all rather than re-executing this cell on its own.
time_df['FIN_PERIOD'] = (
    pd.to_datetime(time_df['FIN_PERIOD'])
    .dt.strftime('%Y%m')
    .astype(int)
)
# astype(int) raises on missing values; assumes FIN_PERIOD is fully
# populated in payments_df -- TODO confirm.
payments_df['FIN_PERIOD'] = payments_df['FIN_PERIOD'].astype(int)

In [None]:
# Create nested time_segments: for every customer, a list with one dict per
# distinct financial period in which that customer has payment lines.
customer_periods = payments_df[['CUSTOMER_NUMBER', 'FIN_PERIOD']].drop_duplicates()
customer_time_segments = customer_periods.merge(time_df, on='FIN_PERIOD', how='left')

# Select the non-key columns *before* apply instead of dropping the key inside
# the lambda. Operating on the grouping column inside groupby.apply is
# deprecated in pandas 2.2 and, once grouping columns are excluded from apply,
# drop(columns=["CUSTOMER_NUMBER"]) raises KeyError. Explicit selection gives
# the same records on old and new pandas alike.
segment_cols = [c for c in customer_time_segments.columns if c != "CUSTOMER_NUMBER"]
time_grouped = (
    customer_time_segments
    .groupby("CUSTOMER_NUMBER", group_keys=False)[segment_cols]
    .apply(lambda grp: grp.to_dict(orient="records"))
    .reset_index()
)
time_grouped.columns = ["CUSTOMER_NUMBER", "time_segments"]

In [None]:
# Create nested payment_history: for every customer, a list with one dict per
# payment line (all payment columns except the customer key).
#
# Select the non-key columns *before* apply instead of dropping the key inside
# the lambda. Operating on the grouping column inside groupby.apply is
# deprecated in pandas 2.2 and, once grouping columns are excluded from apply,
# drop(columns=["CUSTOMER_NUMBER"]) raises KeyError. Explicit selection gives
# the same records on old and new pandas alike.
payment_cols = [c for c in payments_df.columns if c != "CUSTOMER_NUMBER"]
payment_grouped = (
    payments_df
    .groupby("CUSTOMER_NUMBER", group_keys=False)[payment_cols]
    .apply(lambda grp: grp.to_dict(orient="records"))
    .reset_index()
)
payment_grouped.columns = ["CUSTOMER_NUMBER", "payment_history"]

In [None]:
# Merge nested arrays onto the customer rows (left joins: customers without
# payments keep NaN in both nested columns).
#
# Stale copies of the nested columns are dropped first, and only the two
# needed payment_grouped columns are merged, so re-running this cell replaces
# the columns instead of producing payment_history_x / payment_history_y
# (or dragging in helper columns that later cells add to payment_grouped).
merged_df = (
    merged_df
    .drop(columns=["payment_history", "time_segments"], errors="ignore")
    .merge(payment_grouped[["CUSTOMER_NUMBER", "payment_history"]], on="CUSTOMER_NUMBER", how="left")
    .merge(time_grouped, on="CUSTOMER_NUMBER", how="left")
)

In [None]:
# IS_OVERDUE (Boolean flag): True when OVERDUE_RATIO is positive; a missing
# ratio is treated as 0, i.e. not overdue.
merged_df['IS_OVERDUE'] = merged_df['OVERDUE_RATIO'].fillna(0) > 0

# NUM_PAYMENTS (Count of payment records)
payment_grouped['NUM_PAYMENTS'] = payment_grouped['payment_history'].apply(
    lambda x: len(x) if isinstance(x, list) else 0
)
# Drop any NUM_PAYMENTS left by a previous run of this cell so re-executing it
# does not produce NUM_PAYMENTS_x / NUM_PAYMENTS_y columns.
merged_df = (
    merged_df
    .drop(columns=['NUM_PAYMENTS'], errors='ignore')
    .merge(payment_grouped[['CUSTOMER_NUMBER', 'NUM_PAYMENTS']], on='CUSTOMER_NUMBER', how='left')
)
# Customers with no payment lines come out of the left join as NaN; report
# them as a count of 0 and keep the column integer-typed.
merged_df['NUM_PAYMENTS'] = merged_df['NUM_PAYMENTS'].fillna(0).astype(int)

#LAST_PAYMENT_DATE (Recency indicator)
def get_last_payment(payments):
    """Return the most recent DEPOSIT_DATE among a customer's payment records.

    Args:
        payments: List of payment-record dicts. Anything that is not a list
            (for example the NaN a left merge leaves for customers without
            payments) is treated as "no payments".

    Returns:
        The largest non-null ``DEPOSIT_DATE`` value, or ``None`` when there
        is no usable date. Values are compared with ``max`` as they are, so
        string dates only order correctly in a sortable format such as
        ISO 8601.
    """
    if not isinstance(payments, list):
        return None
    # Records produced by DataFrame.to_dict carry every column as a key, with
    # None/NaN/NaT standing in for missing values. Mixing those with real
    # dates makes max() raise TypeError or return a null, so drop them first.
    dates = [
        p['DEPOSIT_DATE']
        for p in payments
        if 'DEPOSIT_DATE' in p and pd.notna(p['DEPOSIT_DATE'])
    ]
    return max(dates) if dates else None

# Latest deposit date per customer, computed once per customer on the grouped
# frame and then joined back onto the (possibly multi-row) customer frame.
payment_grouped['LAST_PAYMENT_DATE'] = payment_grouped['payment_history'].apply(get_last_payment)
# Drop any LAST_PAYMENT_DATE left by a previous run of this cell so
# re-executing it does not produce LAST_PAYMENT_DATE_x / LAST_PAYMENT_DATE_y.
merged_df = (
    merged_df
    .drop(columns=['LAST_PAYMENT_DATE'], errors='ignore')
    .merge(payment_grouped[['CUSTOMER_NUMBER', 'LAST_PAYMENT_DATE']], on='CUSTOMER_NUMBER', how='left')
)

In [None]:
# Sanity check: (rows, columns) of the merged frame.
merged_df.shape

In [None]:
# Sanity check: list the columns that will be exported.
merged_df.columns.tolist()

In [None]:
# Preview only the first rows: the frame carries nested list columns
# (payment_history, time_segments), so rendering all of it bloats the saved
# notebook without adding information beyond the shape/columns checks.
merged_df.head()

In [None]:
# Persist the merged frame: a flat CSV without the index, and a JSON-lines
# file with one record per row.
# NOTE(review): payment_history / time_segments hold Python lists of dicts;
# to_csv writes them as their string representation, so the JSON file is
# presumably the one to use when the nested data must be read back -- confirm.
merged_df.to_csv(
    path_or_buf=os.path.join(csv_folder, "customer_merged.csv"),
    index=False,
)
merged_df.to_json(
    path_or_buf=os.path.join(json_folder, "customer_merged.json"),
    orient="records",
    lines=True,
)