In [75]:
import os
import pandas as pd
import json
from datetime import datetime
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Load JSON inputs
params_df = pd.read_json("json_outputs/customer_params_df_clean.json", lines=True)
age_df = pd.read_json("json_outputs/age_analysis_clean.json", lines=True)[['CUSTOMER_NUMBER', 'TOTAL_DUE', 'OVERDUE_RATIO']]
payments_df = pd.read_json("json_outputs/payment_lines_clean.json", lines=True)
rep_df = pd.read_json("json_outputs/representatives_clean.json", lines=True)[['REP_CODE', 'REP_DESC', 'COMM_METHOD', 'COMMISSION', 'REP_GROUP']]
time_df = pd.read_json("json_outputs/dim_time.json", lines=True)
customer_df = pd.read_json("json_outputs/customer_df_clean.json", lines=True)[[
    'CUSTOMER_NUMBER', 'CCAT_CODE', 'REGION_CODE', 'REP_CODE',
    'SETTLE_TERMS', 'NORMAL_PAYTERMS', 'DISCOUNT', 'CREDIT_LIMIT'
]]

In [76]:
csv_folder = os.path.join(os.getcwd(), "csv_outputs")
json_folder = os.path.join(os.getcwd(), "json_outputs")

In [77]:
# Load customer master (fact table)
customer_df = pd.read_json("json_outputs/customer_df_clean.json", lines=True)

# Merge rep info into master
customer_df = customer_df.merge(rep_df, on="REP_CODE", how="left")

In [78]:
# Merge core customer data
merged_df = customer_df \
    .merge(params_df[['CUSTOMER_NUMBER', 'PARAMETER', 'PARAMETER_GROUP']], on="CUSTOMER_NUMBER", how="left") \
    .merge(age_df, on="CUSTOMER_NUMBER", how="left")

# Add account status only
account_status_df = params_df[params_df['PARAMETER_GROUP'] == 'Account Status'][['CUSTOMER_NUMBER', 'PARAMETER']]
account_status_df = account_status_df.drop_duplicates().rename(columns={"PARAMETER": "ACCOUNT_STATUS"})
merged_df = merged_df.merge(account_status_df, on='CUSTOMER_NUMBER', how='left')

In [79]:
# Normalize time dimensions
time_df['FIN_PERIOD'] = pd.to_datetime(time_df['FIN_PERIOD']).dt.strftime('%Y%m').astype(int)
payments_df['FIN_PERIOD'] = payments_df['FIN_PERIOD'].astype(int)

In [80]:
# Create nested time_segments
customer_periods = payments_df[['CUSTOMER_NUMBER', 'FIN_PERIOD']].drop_duplicates()
customer_time_segments = customer_periods.merge(time_df, on='FIN_PERIOD', how='left')
time_grouped = customer_time_segments.groupby("CUSTOMER_NUMBER", group_keys=False) \
    .apply(lambda x: x.drop(columns=["CUSTOMER_NUMBER"]).to_dict(orient="records")) \
    .reset_index()
time_grouped.columns = ["CUSTOMER_NUMBER", "TIME_SEGMENTS"]

In [81]:
# Create nested payment_history
payment_grouped = payments_df.groupby("CUSTOMER_NUMBER", group_keys=False) \
    .apply(lambda x: x.drop(columns=["CUSTOMER_NUMBER"]).to_dict(orient="records")) \
    .reset_index()
payment_grouped.columns = ["CUSTOMER_NUMBER", "PAYMENT_HISTORY"]

In [82]:
# Merge nested arrays
merged_df = merged_df \
    .merge(payment_grouped, on="CUSTOMER_NUMBER", how="left") \
    .merge(time_grouped, on="CUSTOMER_NUMBER", how="left")

In [83]:
#IS_OVERDUE (Boolean flag)
merged_df['IS_OVERDUE'] = merged_df['OVERDUE_RATIO'].fillna(0) > 0

#NUM_PAYMENTS (Count of payment records)
payment_grouped['NUM_PAYMENTS'] = payment_grouped['PAYMENT_HISTORY'].apply(lambda x: len(x) if isinstance(x, list) else 0)
merged_df = merged_df.merge(payment_grouped[['CUSTOMER_NUMBER', 'NUM_PAYMENTS']], on='CUSTOMER_NUMBER', how='left')

#LAST_PAYMENT_DATE (Recency indicator)
def get_last_payment(payments):
    dates = [p.get('DEPOSIT_DATE') for p in payments if 'DEPOSIT_DATE' in p]
    return max(dates) if dates else None

payment_grouped['LAST_PAYMENT_DATE'] = payment_grouped['PAYMENT_HISTORY'].apply(get_last_payment)
merged_df = merged_df.merge(payment_grouped[['CUSTOMER_NUMBER', 'LAST_PAYMENT_DATE']], on='CUSTOMER_NUMBER', how='left')

In [84]:
merged_df.drop(columns=['FIN_PERIOD_x', 'FIN_PERIOD_y', 'YEAR', 'MONTH', 'MONTH_NAME'], errors='ignore', inplace=True)

null_ratio = merged_df.isnull().mean()
columns_to_drop = null_ratio[null_ratio > 0.95].index.tolist()
merged_df.drop(columns=columns_to_drop, inplace=True)

merged_df.drop_duplicates(subset=['CUSTOMER_NUMBER'], keep='last', inplace=True)
merged_df['OVERDUE_RATIO'] = merged_df['OVERDUE_RATIO'].round(4)

def trim_history(payments):
    return payments[-12:] if isinstance(payments, list) else []

merged_df['PAYMENT_HISTORY'] = merged_df['PAYMENT_HISTORY'].apply(trim_history)


In [85]:
latest_time = merged_df['TIME_SEGMENTS'].apply(lambda x: x[-1] if isinstance(x, list) and x else {})
latest_time_df = pd.json_normalize(latest_time)
merged_df['YEAR'] = latest_time_df['YEAR']
merged_df['MONTH'] = latest_time_df['MONTH']
merged_df['MONTH_NAME'] = latest_time_df['MONTH_NAME']

In [86]:
def get_avg_payment(payments):
    amounts = [p.get('AMOUNT') for p in payments if 'AMOUNT' in p and isinstance(p.get('AMOUNT'), (int, float))]
    return round(sum(amounts) / len(amounts), 2) if amounts else None

merged_df['AVG_PAYMENT_AMOUNT'] = merged_df['PAYMENT_HISTORY'].apply(get_avg_payment)

In [87]:
def classify_customer(row):
    rep = str(row.get('REP_GROUP', '')).upper()
    channel = str(row.get('PARAMETER_GROUP', '')).upper()
    ccat = str(row.get('CCAT_CODE', ''))

    if 'CONSIGNMENT' in rep or 'CONSIGNMENT' in channel:
        return 'Consignment'
    elif ccat in ['30', '999999']:
        return 'Retail'
    elif 'AGENT' in rep:
        return 'Agent'
    elif 'PROMO' in rep or 'PROMOTIONAL' in channel:
        return 'Promotional'
    elif 'BAD CREDIT' in rep or 'BAD CREDIT' in channel:
        return 'Bad Credit'
    elif 'EXPORT' in rep or 'EXPORT' in channel:
        return 'Export'
    elif 'TRADING ACCOUNT' in rep or 'TRADING ACCOUNT' in channel:
        return 'Trading Account'
    elif 'UNCLASSIFIED' in rep:
        return 'Unclassified'
    else:
        return 'Other'

merged_df['CUSTOMER_TYPE'] = merged_df.apply(classify_customer, axis=1)

In [88]:
merged_df.shape

(2757, 27)

In [89]:
merged_df.columns.tolist()

['CUSTOMER_NUMBER',
 'CCAT_CODE',
 'REGION_CODE',
 'REP_CODE',
 'SETTLE_TERMS',
 'NORMAL_PAYTERMS',
 'DISCOUNT',
 'CREDIT_LIMIT',
 'REP_DESC',
 'COMM_METHOD',
 'COMMISSION',
 'REP_GROUP',
 'PARAMETER',
 'PARAMETER_GROUP',
 'TOTAL_DUE',
 'OVERDUE_RATIO',
 'ACCOUNT_STATUS',
 'PAYMENT_HISTORY',
 'TIME_SEGMENTS',
 'IS_OVERDUE',
 'NUM_PAYMENTS',
 'LAST_PAYMENT_DATE',
 'YEAR',
 'MONTH',
 'MONTH_NAME',
 'AVG_PAYMENT_AMOUNT',
 'CUSTOMER_TYPE']

In [90]:
merged_df

Unnamed: 0,CUSTOMER_NUMBER,CCAT_CODE,REGION_CODE,REP_CODE,SETTLE_TERMS,NORMAL_PAYTERMS,DISCOUNT,CREDIT_LIMIT,REP_DESC,COMM_METHOD,...,PAYMENT_HISTORY,TIME_SEGMENTS,IS_OVERDUE,NUM_PAYMENTS,LAST_PAYMENT_DATE,YEAR,MONTH,MONTH_NAME,AVG_PAYMENT_AMOUNT,CUSTOMER_TYPE
0,599000,30,7a,05,0,60,0,6000,RL,Sales,...,[],,False,,,,,,,Retail
9,AACJ01,21,25b,ZZZ5,0,90,0,999999,,,...,[],,True,,,2019.0,8.0,August,,Other
10,AACJC1,21,25b,ZZZ5,0,120,0,999999,,,...,[],,False,,,2019.0,9.0,September,,Other
16,AACJC2,5,20a,CONS4,0,120,0,999999,CONSIGNMENTS BM,Gross Profit,...,[],,True,,,,,,,Consignment
38,AADPRG,6,21a,XX,0,120,0,999999,,,...,[],,True,,,,,,,Other
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21799,ZHAY02,19,4b,03,0,120,0,2000,BJ,Sales,...,"[{'FIN_PERIOD': 201904, 'DEPOSIT_DATE': 155952...","[{'FIN_PERIOD': 201904, 'YEAR': 2019.0, 'MONTH...",False,2.0,1.562112e+12,,,,,Agent
21805,ZMAU01,37,11a,03,0,120,0,0,BJ,Sales,...,"[{'FIN_PERIOD': 201903, 'DEPOSIT_DATE': 155891...","[{'FIN_PERIOD': 201903, 'YEAR': 2019.0, 'MONTH...",False,1.0,1.558915e+12,,,,,Agent
21828,ZNAE01,46,2b,05,0,120,0,30000,RL,Sales,...,"[{'FIN_PERIOD': 201903, 'DEPOSIT_DATE': 155831...","[{'FIN_PERIOD': 201903, 'YEAR': 2019.0, 'MONTH...",True,7.0,1.574208e+12,,,,,Agent
21829,ZNAEOC,5,20a,STAND,0,120,0,999999,,,...,[],,False,,,,,,,Other


In [91]:
merged_df.to_csv(os.path.join(csv_folder, "customer_merged.csv"), index=False)
merged_df.to_json(os.path.join(json_folder, "customer_merged.json"), orient="records", lines=True)