In [217]:
import glob
import os
import pprint
import random
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyspark
import pyspark.sql.functions as F
import seaborn as sns
from dateutil.relativedelta import relativedelta

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml.functions import vector_to_array

from pyspark.sql.functions import col, row_number, to_date, count, percentile_approx, desc, udf
from pyspark.sql.types import StringType, IntegerType, FloatType, DateType, DoubleType, ArrayType
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType
from pyspark.sql.window import Window
from pyspark.sql.functions import regexp_extract, regexp_replace, when, trim, initcap, round, lower, split, explode, array_distinct, array_sort, concat_ws, expr

import utils.features_financials_bronze_table
import utils.features_financials_silver_table
import utils.features_financials_gold_table

In [178]:
# Build (or reuse) a SparkSession named "dev" on the local[*] master
spark = (
    pyspark.sql.SparkSession.builder
    .appName("dev")
    .master("local[*]")
    .getOrCreate()
)

# Only surface ERROR-level Spark logs so warnings do not flood the cell output
spark.sparkContext.setLogLevel("ERROR")

In [179]:
# set up config
# NOTE(review): snapshot_date_str is not referenced by any other cell visible
# in this notebook — confirm whether it is still needed.
snapshot_date_str = "2023-01-01"

# Backfill window; both values are passed to generate_first_of_month_dates,
# which treats the end date as inclusive.
start_date_str = "2023-01-01"
end_date_str = "2024-12-01"

In [180]:
# generate list of dates to process
def generate_first_of_month_dates(start_date_str, end_date_str):
    """List the first day of every month in a date range.

    Args:
        start_date_str: Range start as "YYYY-MM-DD". Only its year and month
            matter: the sequence begins on the first day of that month.
        end_date_str: Range end as "YYYY-MM-DD" (inclusive).

    Returns:
        A list of "YYYY-MM-DD" strings, one per month, in ascending order.
        Empty when the first of the start month is later than the end date.

    Raises:
        ValueError: If either argument does not match "%Y-%m-%d".
    """
    range_start = datetime.strptime(start_date_str, "%Y-%m-%d")
    range_end = datetime.strptime(end_date_str, "%Y-%m-%d")

    # Number months consecutively (year * 12 + zero-based month) so that moving
    # to the next month is a plain +1, with no special case for December.
    month_index = range_start.year * 12 + (range_start.month - 1)

    first_of_month_dates = []
    while True:
        year, zero_based_month = divmod(month_index, 12)
        month_start = datetime(year, zero_based_month + 1, 1)
        if month_start > range_end:
            break
        first_of_month_dates.append(month_start.strftime("%Y-%m-%d"))
        month_index += 1

    return first_of_month_dates

# Snapshot dates the backfill loops iterate over; the bare name on the last
# line displays the list as the cell output.
dates_str_lst = generate_first_of_month_dates(start_date_str, end_date_str)
dates_str_lst

['2023-01-01',
 '2023-02-01',
 '2023-03-01',
 '2023-04-01',
 '2023-05-01',
 '2023-06-01',
 '2023-07-01',
 '2023-08-01',
 '2023-09-01',
 '2023-10-01',
 '2023-11-01',
 '2023-12-01',
 '2024-01-01',
 '2024-02-01',
 '2024-03-01',
 '2024-04-01',
 '2024-05-01',
 '2024-06-01',
 '2024-07-01',
 '2024-08-01',
 '2024-09-01',
 '2024-10-01',
 '2024-11-01',
 '2024-12-01']

In [181]:
# Source location - IRL this would be a connection to the back end source system
csv_file_path = "data/features_financials.csv"

# Read the CSV with a header row and let Spark infer each column's type
# (IRL this would be an ingest from the back end source system)
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(csv_file_path)
)

In [182]:
# Preview up to 10 rows of the raw frame as loaded from the CSV
df.show(10)

+-----------+------------------+---------------------+-----------------+---------------+-------------+-----------+--------------------+-------------------+----------------------+--------------------+--------------------+----------+----------------+------------------------+--------------------+---------------------+-------------------+-----------------------+--------------------+------------------+-------------+
|Customer_ID|     Annual_Income|Monthly_Inhand_Salary|Num_Bank_Accounts|Num_Credit_Card|Interest_Rate|Num_of_Loan|        Type_of_Loan|Delay_from_due_date|Num_of_Delayed_Payment|Changed_Credit_Limit|Num_Credit_Inquiries|Credit_Mix|Outstanding_Debt|Credit_Utilization_Ratio|  Credit_History_Age|Payment_of_Min_Amount|Total_EMI_per_month|Amount_invested_monthly|   Payment_Behaviour|   Monthly_Balance|snapshot_date|
+-----------+------------------+---------------------+-----------------+---------------+-------------+-----------+--------------------+-------------------+---------------

In [183]:
# List the raw frame's column names
df.columns

['Customer_ID',
 'Annual_Income',
 'Monthly_Inhand_Salary',
 'Num_Bank_Accounts',
 'Num_Credit_Card',
 'Interest_Rate',
 'Num_of_Loan',
 'Type_of_Loan',
 'Delay_from_due_date',
 'Num_of_Delayed_Payment',
 'Changed_Credit_Limit',
 'Num_Credit_Inquiries',
 'Credit_Mix',
 'Outstanding_Debt',
 'Credit_Utilization_Ratio',
 'Credit_History_Age',
 'Payment_of_Min_Amount',
 'Total_EMI_per_month',
 'Amount_invested_monthly',
 'Payment_Behaviour',
 'Monthly_Balance',
 'snapshot_date']

### Clean data for silver table

In [184]:
# Annual_Income
# Strip every character that is not a digit or ".", parse the remainder as a
# double, and keep two decimal places. This is the first cleaning step, so it
# starts from the raw frame `df`.
annual_income_digits = F.regexp_replace(F.col("Annual_Income"), r"[^\d.]", "")
df_cleaned = df.withColumn(
    "Annual_Income",
    F.round(annual_income_digits.cast("double"), 2),
)

In [185]:
# Monthly_Inhand_Salary
# Keep only digits and ".", parse as double, round to 2 decimal places.
monthly_salary_digits = F.regexp_replace(F.col("Monthly_Inhand_Salary"), r"[^\d.]", "")
df_cleaned = df_cleaned.withColumn(
    "Monthly_Inhand_Salary",
    F.round(monthly_salary_digits.cast("double"), 2),
)

In [186]:
# Num_Bank_Accounts
# Parse as integer and keep only non-negative counts. when() without an
# otherwise() leaves null for every row that fails the condition, so negative
# values and values that do not parse as an integer both become null.
bank_accounts_int = F.col("Num_Bank_Accounts").cast("int")
df_cleaned = df_cleaned.withColumn(
    "Num_Bank_Accounts",
    F.when(bank_accounts_int >= 0, bank_accounts_int),
)

In [187]:
# Num_of_Loan
# Keep only digits and "-" (so a negative sign survives), then parse as integer.
num_of_loan_int = F.regexp_replace(F.col("Num_of_Loan"), r"[^\d\-]", "").cast("int")

# A negative loan count is invalid: keep non-negative values, null the rest.
df_cleaned = df_cleaned.withColumn(
    "Num_of_Loan",
    F.when(num_of_loan_int >= 0, num_of_loan_int),
)

In [188]:
# Type_of_Loan
# Normalise both separators ("," and the standalone word "and") to a single
# comma, then split into an array of loan names. The \b word boundaries stop
# "and" from matching inside a longer word.
df_cleaned = df_cleaned.withColumn(
    "Type_of_Loan_Array",
    split(
        regexp_replace(col("Type_of_Loan"), r"\s*\band\b\s*|\s*,\s*", ","),
        ","
    )
)

# Trim entries, remove "Not Specified" and empty entries, deduplicate, sort.
# Empty entries arise when a list ends with ", and X": the comma and the "and"
# are each turned into a separator, leaving an empty string between them. Left
# in place, it sorts first and shows up as a leading ", " in the joined value.
df_cleaned = df_cleaned.withColumn(
    "Type_of_Loan",
    concat_ws(
        ", ",
        array_sort(  # sort for consistency
            array_distinct(  # Remove duplicates
                expr(
                    "filter(transform(Type_of_Loan_Array, x -> trim(x)), "
                    "x -> x != 'Not Specified' AND x != '')"
                )
            )
        )
    )
)

# Drop the intermediate Type_of_Loan_Array column
df_cleaned = df_cleaned.drop("Type_of_Loan_Array")

In [189]:
# Delay_from_due_date
# Parse as integer; keep non-negative delays and null everything else
# (negative values and values that do not parse as an integer).
delay_days_int = F.col("Delay_from_due_date").cast("int")
df_cleaned = df_cleaned.withColumn(
    "Delay_from_due_date",
    F.when(delay_days_int >= 0, delay_days_int),
)

In [190]:
# Num_of_Delayed_Payment
# Keep only digits and "-" (so a negative sign survives), then parse as integer.
delayed_payments_int = F.regexp_replace(
    F.col("Num_of_Delayed_Payment"), r"[^\d\-]", ""
).cast("int")

# A negative number of delayed payments is invalid: keep non-negative values,
# null the rest.
df_cleaned = df_cleaned.withColumn(
    "Num_of_Delayed_Payment",
    F.when(delayed_payments_int >= 0, delayed_payments_int),
)

In [191]:
# Changed_Credit_Limit
# Keep digits, "." and "-" (negative values are preserved for this column),
# parse as double, round to 2 decimal places.
credit_limit_change_text = F.regexp_replace(F.col("Changed_Credit_Limit"), r"[^\d\.-]", "")
df_cleaned = df_cleaned.withColumn(
    "Changed_Credit_Limit",
    F.round(credit_limit_change_text.cast("double"), 2),
)

In [192]:
# Num_Credit_Inquiries
# Parse as integer; keep non-negative inquiry counts and null everything else
# (negative values and values that do not parse as an integer).
credit_inquiries_int = F.col("Num_Credit_Inquiries").cast("int")
df_cleaned = df_cleaned.withColumn(
    "Num_Credit_Inquiries",
    F.when(credit_inquiries_int >= 0, credit_inquiries_int),
)

In [193]:
# Credit_Mix
# "_" is the placeholder for a missing value: map it to null. Every other
# value is trimmed and converted to initial-capital casing.
credit_mix_trimmed = F.trim(F.col("Credit_Mix"))
df_cleaned = df_cleaned.withColumn(
    "Credit_Mix",
    F.when(credit_mix_trimmed == "_", None).otherwise(F.initcap(credit_mix_trimmed)),
)

In [194]:
# Outstanding_Debt
# Keep only digits and ".", parse as double, round to 2 decimal places.
outstanding_debt_digits = F.regexp_replace(F.col("Outstanding_Debt"), r"[^\d.]", "")
df_cleaned = df_cleaned.withColumn(
    "Outstanding_Debt",
    F.round(outstanding_debt_digits.cast("double"), 2),
)

In [195]:
# Credit_Utilization_Ratio
# Keep only digits and ".", parse as double, round to 2 decimal places.
utilization_digits = F.regexp_replace(F.col("Credit_Utilization_Ratio"), r"[^\d.]", "")
df_cleaned = df_cleaned.withColumn(
    "Credit_Utilization_Ratio",
    F.round(utilization_digits.cast("double"), 2),
)

In [196]:
# Credit_History_Age
# Pull the "<n> Years" and "<n> Months" numbers out of the text and express
# the age as fractional years: years + months / 12, rounded to 2 decimals.
# The two parts are built as column expressions, so no temporary Years/Months
# columns are added to (and later dropped from) the frame.
history_age_text = F.col("Credit_History_Age")
history_years = F.regexp_extract(history_age_text, r"(\d+)\s+Years", 1).cast("int")
history_months = F.regexp_extract(history_age_text, r"(\d+)\s+Months", 1).cast("int")

df_cleaned = df_cleaned.withColumn(
    "Credit_History_Age",
    F.round(history_years + (history_months / 12), 2),
)

In [197]:
# Payment_of_Min_Amount
# Compare case-insensitively and ignoring surrounding whitespace: "yes" -> "Yes",
# "no" -> "No". NM or any other unexpected value becomes null.
min_amount_flag = F.trim(F.lower(F.col("Payment_of_Min_Amount")))
df_cleaned = df_cleaned.withColumn(
    "Payment_of_Min_Amount",
    F.when(min_amount_flag == "yes", "Yes")
    .when(min_amount_flag == "no", "No")
    .otherwise(None),
)


In [198]:
# Total_EMI_per_month
# Keep only digits and ".", parse as double, round to 2 decimal places.
total_emi_digits = F.regexp_replace(F.col("Total_EMI_per_month"), r"[^\d.]", "")
df_cleaned = df_cleaned.withColumn(
    "Total_EMI_per_month",
    F.round(total_emi_digits.cast("double"), 2),
)

In [199]:
# Amount_invested_monthly
# Keep only digits and ".", parse as double, round to 2 decimal places.
amount_invested_digits = F.regexp_replace(F.col("Amount_invested_monthly"), r"[^\d.]", "")
df_cleaned = df_cleaned.withColumn(
    "Amount_invested_monthly",
    F.round(amount_invested_digits.cast("double"), 2),
)

In [200]:
# Payment_Behaviour
# Accepted values have the form <High|Low>_spent_<Small|Medium|Large>_value_payments.
# "Large" was missing from the size alternatives, so any *_Large_value_payments
# value was treated as invalid and nulled.
# NOTE(review): assumes the source uses exactly these three size labels —
# confirm against the distinct values of the raw column.
valid_pattern = r"^(High|Low)_spent_(Small|Medium|Large)_value_payments$"

# Keep values that match the pattern; everything else becomes null
df_cleaned = df_cleaned.withColumn(
    "Payment_Behaviour",
    when(col("Payment_Behaviour").rlike(valid_pattern), col("Payment_Behaviour"))
    .otherwise(None)  # Set invalid values to null
)

In [201]:
# Monthly_Balance
# Keep only digits and ".", parse as double, round to 2 decimal places.
# NOTE(review): unlike Changed_Credit_Limit, "-" is stripped here, so a
# negative balance would come out positive — confirm that is intended.
monthly_balance_digits = F.regexp_replace(F.col("Monthly_Balance"), r"[^\d.]", "")
df_cleaned = df_cleaned.withColumn(
    "Monthly_Balance",
    F.round(monthly_balance_digits.cast("double"), 2),
)

In [202]:
# List the column names after cleaning (the temporary helper columns used
# during cleaning have been dropped by this point)
df_cleaned.columns

['Customer_ID',
 'Annual_Income',
 'Monthly_Inhand_Salary',
 'Num_Bank_Accounts',
 'Num_Credit_Card',
 'Interest_Rate',
 'Num_of_Loan',
 'Type_of_Loan',
 'Delay_from_due_date',
 'Num_of_Delayed_Payment',
 'Changed_Credit_Limit',
 'Num_Credit_Inquiries',
 'Credit_Mix',
 'Outstanding_Debt',
 'Credit_Utilization_Ratio',
 'Credit_History_Age',
 'Payment_of_Min_Amount',
 'Total_EMI_per_month',
 'Amount_invested_monthly',
 'Payment_Behaviour',
 'Monthly_Balance',
 'snapshot_date']

In [203]:
# Preview up to 10 rows of the cleaned frame
df_cleaned.show(10)

+-----------+-------------+---------------------+-----------------+---------------+-------------+-----------+--------------------+-------------------+----------------------+--------------------+--------------------+----------+----------------+------------------------+------------------+---------------------+-------------------+-----------------------+--------------------+---------------+-------------+
|Customer_ID|Annual_Income|Monthly_Inhand_Salary|Num_Bank_Accounts|Num_Credit_Card|Interest_Rate|Num_of_Loan|        Type_of_Loan|Delay_from_due_date|Num_of_Delayed_Payment|Changed_Credit_Limit|Num_Credit_Inquiries|Credit_Mix|Outstanding_Debt|Credit_Utilization_Ratio|Credit_History_Age|Payment_of_Min_Amount|Total_EMI_per_month|Amount_invested_monthly|   Payment_Behaviour|Monthly_Balance|snapshot_date|
+-----------+-------------+---------------------+-----------------+---------------+-------------+-----------+--------------------+-------------------+----------------------+-----------------

In [204]:
# clean data: enforce schema / data type
# Dictionary specifying columns and their desired datatypes.
# The imputation step derives its string/numeric column lists from this map,
# so a column that is missing here is neither cast nor imputed.
column_type_map = {
    "Customer_ID": StringType(),
    "Annual_Income": DoubleType(),
    "Monthly_Inhand_Salary": DoubleType(),
    "Num_Bank_Accounts": IntegerType(),
    "Num_Credit_Card": IntegerType(),
    "Interest_Rate": IntegerType(),
    "Num_of_Loan": IntegerType(),  # was missing: cleaned to int above but never typed/imputed
    "Type_of_Loan": StringType(),
    "Delay_from_due_date": IntegerType(),
    "Num_of_Delayed_Payment": IntegerType(),
    "Changed_Credit_Limit": DoubleType(),
    "Num_Credit_Inquiries": IntegerType(),
    "Credit_Mix": StringType(),
    "Outstanding_Debt": DoubleType(),
    "Credit_Utilization_Ratio": DoubleType(),
    "Credit_History_Age": DoubleType(),
    "Payment_of_Min_Amount": StringType(),
    "Total_EMI_per_month": DoubleType(),
    "Amount_invested_monthly": DoubleType(),
    "Payment_Behaviour": StringType(),
    "Monthly_Balance": DoubleType(),
    "snapshot_date": DateType(),
}

# Change to the new dtypes. Start from the cleaned frame once and keep
# building on the running result: restarting from df_cleaned on every
# iteration would discard all casts except the last one.
df = df_cleaned
for column, new_type in column_type_map.items():
    df = df.withColumn(column, col(column).cast(new_type))

### Gold

### Imputation

In [205]:
# List of excluded columns (identifiers, never imputed)
excluded_cols = ["Customer_ID", "snapshot_date"]

# Identify column types (exclude the excluded columns)
string_cols = [c for c, t in column_type_map.items() if isinstance(t, StringType) and c not in excluded_cols]
numeric_cols = [c for c, t in column_type_map.items() if isinstance(t, (IntegerType, DoubleType)) and c not in excluded_cols]

df_imputed = df

# Impute null / empty / space-only strings with the most frequent real value
for col_name in string_cols:
    is_missing = col(col_name).isNull() | (trim(col(col_name)) == "")

    # Compute the mode over non-missing values only. Counting missing values
    # too would let null or "" win the count, and the column would then be
    # filled with "Unknown" even though a real most-frequent value exists.
    # The secondary sort on the value makes ties deterministic.
    mode_row = (
        df_imputed.filter(~is_missing)
        .groupBy(col_name)
        .count()
        .orderBy(F.desc("count"), F.asc(col_name))
        .first()
    )

    # No non-missing value at all (entirely empty column): fall back to "Unknown"
    mode_val = mode_row[0] if mode_row is not None else "Unknown"

    df_imputed = df_imputed.withColumn(
        col_name,
        when(is_missing, mode_val).otherwise(col(col_name))
    )

# Impute numeric columns with the (approximate) median
for col_name in numeric_cols:
    quantiles = df_imputed.approxQuantile(col_name, [0.5], 0.01)
    if not quantiles:
        # approxQuantile returns an empty list for a column that holds only
        # nulls; there is nothing to impute from, so leave the column as is.
        continue
    median = quantiles[0]
    df_imputed = df_imputed.fillna({col_name: median})

### One-hot encoding (OHE)

In [206]:
categorical_cols = ["Type_of_Loan", "Credit_Mix", "Payment_Behaviour"]

# Step 1: Index and one-hot encode (with dropLast=True to mimic drop_first=True)
indexers = [StringIndexer(inputCol=c, outputCol=c+"_index", handleInvalid="keep") for c in categorical_cols]
encoders = [OneHotEncoder(inputCol=c+"_index", outputCol=c+"_ohe", dropLast=True) for c in categorical_cols]

pipeline = Pipeline(stages=indexers + encoders)
model = pipeline.fit(df_imputed)
df_encoded = model.transform(df_imputed)

# Steps 2 and 3: turn each one-hot vector into an array, then into one 0/1
# integer column per vector slot.
for c in categorical_cols:
    arr_col = c + "_arr"

    # Built-in vector -> array<double> conversion; avoids sending every row
    # through a Python UDF.
    df_encoded = df_encoded.withColumn(arr_col, vector_to_array(col(c + "_ohe")))

    # Every row's vector has the same length, so the first row tells us how
    # many binary columns to create.
    num_categories = len(df_encoded.select(arr_col).head()[arr_col])

    # Add all binary columns in a single select rather than one withColumn per
    # slot: a high-cardinality column would otherwise stack up one projection
    # per category in the query plan.
    binary_cols = [
        col(arr_col)[i].cast("int").alias(f"{c}_{i}")
        for i in range(num_categories)
    ]

    # Drop original column + intermediates
    df_encoded = (
        df_encoded
        .select("*", *binary_cols)
        .drop(c, c + "_index", c + "_ohe", arr_col)
    )

### Engineer features

In [208]:
# Engineered features, added in one chain on top of the encoded frame.
# The three continuous features are rounded to 2 decimal places as they are
# created.

# Annual income spread evenly over 12 months, used as the EMI ratio denominator
monthly_income = F.col("Annual_Income") / 12

# Combined count of bank accounts and credit cards
account_and_card_count = F.col("Num_Bank_Accounts") + F.col("Num_Credit_Card")

df = (
    df_encoded
    # 1 when the delay from the due date exceeds 10 days, else 0
    .withColumn(
        "Recent_Delays",
        F.when(F.col("Delay_from_due_date") > 10, 1).otherwise(0),
    )
    .withColumn(
        "Income_to_Debt_Ratio",
        F.round(F.col("Annual_Income") / F.col("Outstanding_Debt"), 2),
    )
    .withColumn(
        "EMI_to_Income_Ratio",
        F.round(F.col("Total_EMI_per_month") / monthly_income, 2),
    )
    # What is left of the in-hand salary after EMI and monthly investments
    .withColumn(
        "Monthly_Saving",
        F.round(
            F.col("Monthly_Inhand_Salary")
            - F.col("Total_EMI_per_month")
            - F.col("Amount_invested_monthly"),
            2,
        ),
    )
    # 1 when bank accounts + credit cards exceed 6, else 0
    .withColumn(
        "Is_Multi_Borrower",
        F.when(account_and_card_count > 6, 1).otherwise(0),
    )
)

### Get latest snapshot

In [209]:
# Keep only each customer's most recent snapshot for ML.
# Make sure snapshot_date is a date parsed with the yyyy-MM-dd format.
df_cleaned = df.withColumn(
    "snapshot_date",
    F.to_date(F.col("snapshot_date"), "yyyy-MM-dd"),
)

# Per-customer window with the newest snapshot first
window_spec = Window.partitionBy("Customer_ID").orderBy(F.desc("snapshot_date"))

# Number the rows inside each customer's window (1 = newest snapshot)
ranked_df = df_cleaned.withColumn("rn", F.row_number().over(window_spec))

# Retain the newest row per customer and drop the helper rank column
df = ranked_df.where(F.col("rn") == 1).drop("rn")

### Generate Bronze, Silver, Gold

In [210]:
# create bronze datalake
bronze_features_financials_directory = "datamart/bronze/features_financials/"

# exist_ok=True makes this a no-op when the directory is already there, which
# removes the separate existence check and the window between check and create.
os.makedirs(bronze_features_financials_directory, exist_ok=True)

In [212]:
# Bronze backfill: run the bronze table builder once per snapshot date
build_bronze_table = utils.features_financials_bronze_table.features_financials_bronze_table
for date_str in dates_str_lst:
    build_bronze_table(date_str, bronze_features_financials_directory, spark)

2023-01-01row count: 530
saved to: datamart/bronze/features_financials/bronze_features_financials_daily_2023_01_01.csv
2023-02-01row count: 501
saved to: datamart/bronze/features_financials/bronze_features_financials_daily_2023_02_01.csv
2023-03-01row count: 506
saved to: datamart/bronze/features_financials/bronze_features_financials_daily_2023_03_01.csv
2023-04-01row count: 510
saved to: datamart/bronze/features_financials/bronze_features_financials_daily_2023_04_01.csv
2023-05-01row count: 521
saved to: datamart/bronze/features_financials/bronze_features_financials_daily_2023_05_01.csv
2023-06-01row count: 517
saved to: datamart/bronze/features_financials/bronze_features_financials_daily_2023_06_01.csv
2023-07-01row count: 471
saved to: datamart/bronze/features_financials/bronze_features_financials_daily_2023_07_01.csv
2023-08-01row count: 481
saved to: datamart/bronze/features_financials/bronze_features_financials_daily_2023_08_01.csv
2023-09-01row count: 454
saved to: datamart/bron

In [214]:
# create silver datalake
silver_features_financials_directory = "datamart/silver/features_financials/"

# exist_ok=True makes this a no-op when the directory is already there, which
# removes the separate existence check and the window between check and create.
os.makedirs(silver_features_financials_directory, exist_ok=True)

In [215]:
# Silver backfill: run the silver table builder once per snapshot date,
# reading from the bronze directory and writing to the silver directory
build_silver_table = utils.features_financials_silver_table.features_financials_silver_table
for date_str in dates_str_lst:
    build_silver_table(
        date_str,
        bronze_features_financials_directory,
        silver_features_financials_directory,
        spark,
    )

loaded from: datamart/bronze/features_financials/bronze_features_financials_daily_2023_01_01.csv row count: 530


                                                                                

saved to: datamart/silver/features_financials/silver_features_financials_daily_2023_01_01.parquet
loaded from: datamart/bronze/features_financials/bronze_features_financials_daily_2023_02_01.csv row count: 501
saved to: datamart/silver/features_financials/silver_features_financials_daily_2023_02_01.parquet
loaded from: datamart/bronze/features_financials/bronze_features_financials_daily_2023_03_01.csv row count: 506
saved to: datamart/silver/features_financials/silver_features_financials_daily_2023_03_01.parquet
loaded from: datamart/bronze/features_financials/bronze_features_financials_daily_2023_04_01.csv row count: 510
saved to: datamart/silver/features_financials/silver_features_financials_daily_2023_04_01.parquet
loaded from: datamart/bronze/features_financials/bronze_features_financials_daily_2023_05_01.csv row count: 521
saved to: datamart/silver/features_financials/silver_features_financials_daily_2023_05_01.parquet
loaded from: datamart/bronze/features_financials/bronze_featur

In [216]:
# create gold datalake
gold_features_financials_directory = "datamart/gold/features_financials/"

# exist_ok=True makes this a no-op when the directory is already there, which
# removes the separate existence check and the window between check and create.
os.makedirs(gold_features_financials_directory, exist_ok=True)

In [218]:
# Gold backfill: run the gold table builder once per snapshot date,
# reading from the silver directory and writing to the gold directory
build_gold_table = utils.features_financials_gold_table.features_financials_gold_table
for date_str in dates_str_lst:
    build_gold_table(
        date_str,
        silver_features_financials_directory,
        gold_features_financials_directory,
        spark,
    )

loaded from: datamart/silver/features_financials/silver_features_financials_daily_2023_01_01.parquet row count: 530


                                                                                

saved to: datamart/gold/features_financials/gold_features_financials_daily_2023_01_01.parquet
loaded from: datamart/silver/features_financials/silver_features_financials_daily_2023_02_01.parquet row count: 501
saved to: datamart/gold/features_financials/gold_features_financials_daily_2023_02_01.parquet
loaded from: datamart/silver/features_financials/silver_features_financials_daily_2023_03_01.parquet row count: 506


                                                                                

saved to: datamart/gold/features_financials/gold_features_financials_daily_2023_03_01.parquet
loaded from: datamart/silver/features_financials/silver_features_financials_daily_2023_04_01.parquet row count: 510


                                                                                

saved to: datamart/gold/features_financials/gold_features_financials_daily_2023_04_01.parquet
loaded from: datamart/silver/features_financials/silver_features_financials_daily_2023_05_01.parquet row count: 521
saved to: datamart/gold/features_financials/gold_features_financials_daily_2023_05_01.parquet
loaded from: datamart/silver/features_financials/silver_features_financials_daily_2023_06_01.parquet row count: 517


                                                                                

saved to: datamart/gold/features_financials/gold_features_financials_daily_2023_06_01.parquet
loaded from: datamart/silver/features_financials/silver_features_financials_daily_2023_07_01.parquet row count: 471
saved to: datamart/gold/features_financials/gold_features_financials_daily_2023_07_01.parquet
loaded from: datamart/silver/features_financials/silver_features_financials_daily_2023_08_01.parquet row count: 481
saved to: datamart/gold/features_financials/gold_features_financials_daily_2023_08_01.parquet
loaded from: datamart/silver/features_financials/silver_features_financials_daily_2023_09_01.parquet row count: 454
saved to: datamart/gold/features_financials/gold_features_financials_daily_2023_09_01.parquet
loaded from: datamart/silver/features_financials/silver_features_financials_daily_2023_10_01.parquet row count: 487


                                                                                

saved to: datamart/gold/features_financials/gold_features_financials_daily_2023_10_01.parquet
loaded from: datamart/silver/features_financials/silver_features_financials_daily_2023_11_01.parquet row count: 491


                                                                                

saved to: datamart/gold/features_financials/gold_features_financials_daily_2023_11_01.parquet
loaded from: datamart/silver/features_financials/silver_features_financials_daily_2023_12_01.parquet row count: 489


                                                                                

saved to: datamart/gold/features_financials/gold_features_financials_daily_2023_12_01.parquet
loaded from: datamart/silver/features_financials/silver_features_financials_daily_2024_01_01.parquet row count: 485


                                                                                

saved to: datamart/gold/features_financials/gold_features_financials_daily_2024_01_01.parquet
loaded from: datamart/silver/features_financials/silver_features_financials_daily_2024_02_01.parquet row count: 518


                                                                                

saved to: datamart/gold/features_financials/gold_features_financials_daily_2024_02_01.parquet
loaded from: datamart/silver/features_financials/silver_features_financials_daily_2024_03_01.parquet row count: 511


                                                                                

saved to: datamart/gold/features_financials/gold_features_financials_daily_2024_03_01.parquet
loaded from: datamart/silver/features_financials/silver_features_financials_daily_2024_04_01.parquet row count: 513


                                                                                

saved to: datamart/gold/features_financials/gold_features_financials_daily_2024_04_01.parquet
loaded from: datamart/silver/features_financials/silver_features_financials_daily_2024_05_01.parquet row count: 491
saved to: datamart/gold/features_financials/gold_features_financials_daily_2024_05_01.parquet
loaded from: datamart/silver/features_financials/silver_features_financials_daily_2024_06_01.parquet row count: 498
saved to: datamart/gold/features_financials/gold_features_financials_daily_2024_06_01.parquet
loaded from: datamart/silver/features_financials/silver_features_financials_daily_2024_07_01.parquet row count: 505


                                                                                

saved to: datamart/gold/features_financials/gold_features_financials_daily_2024_07_01.parquet
loaded from: datamart/silver/features_financials/silver_features_financials_daily_2024_08_01.parquet row count: 543


                                                                                

saved to: datamart/gold/features_financials/gold_features_financials_daily_2024_08_01.parquet
loaded from: datamart/silver/features_financials/silver_features_financials_daily_2024_09_01.parquet row count: 493


                                                                                

saved to: datamart/gold/features_financials/gold_features_financials_daily_2024_09_01.parquet
loaded from: datamart/silver/features_financials/silver_features_financials_daily_2024_10_01.parquet row count: 456


                                                                                

saved to: datamart/gold/features_financials/gold_features_financials_daily_2024_10_01.parquet
loaded from: datamart/silver/features_financials/silver_features_financials_daily_2024_11_01.parquet row count: 488
saved to: datamart/gold/features_financials/gold_features_financials_daily_2024_11_01.parquet
loaded from: datamart/silver/features_financials/silver_features_financials_daily_2024_12_01.parquet row count: 515


[Stage 1807:>                                                       (0 + 1) / 1]

saved to: datamart/gold/features_financials/gold_features_financials_daily_2024_12_01.parquet


                                                                                