In [1]:
# --- 3_hybrid_data_prep.ipynb: Final Merging, Scaling, and Sequence Creation ---
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.compose import ColumnTransformer




In [2]:
# Define constants
MAX_SEQUENCE_LENGTH = 8
#TEST_SPLIT_DATE = '2023-10-01' # Define the date to split the data

In [3]:
print("--- Phase 3: Final Hybrid Data Preparation ---")

# --- Step 1: Data Loading ---
try:
    # 1. Load the feature-engineered LMS data
    df_lms = pd.read_csv('lms_feature_engineered.csv')
    # Convert dates and correct dtypes after loading from CSV
    df_lms['DUE_DATE'] = pd.to_datetime(df_lms['DUE_DATE'])
    df_lms['LOAN_DATE'] = pd.to_datetime(df_lms['LOAN_DATE'])

    #df_lms['DUE_DATE_DT'] = df_lms['DUE_DATE'] # Prepare for split check
    print(f"✅ Loaded feature-engineered LMS data. Shape: {df_lms.shape}")

    # 2. Load the preprocessed LOS data
    df_los = pd.read_csv('los_cleaned.csv')
    RENAME_MAP_LOS = {
        'LOAN_AMOUNT': 'LOAN_AMOUNT_STATIC',
        'TENURE': 'TENURE_STATIC',
        'INTEREST_RATE': 'INTEREST_RATE_STATIC'
    }
    df_los.rename(columns=RENAME_MAP_LOS, inplace=True)
    df_los['LOAN_DATE'] = pd.to_datetime(df_los['LOAN_DATE'])
    print(f"✅ Loaded preprocessed LOS data. Shape: {df_los.shape}")

except FileNotFoundError as e:
    print(f"❌ ERROR: Required file not found. Check if 1_los_data_prep.ipynb and 2_lms_data_prep.ipynb were run and saved correctly. Error: {e}")
    raise

--- Phase 3: Final Hybrid Data Preparation ---
✅ Loaded feature-engineered LMS data. Shape: (28351669, 21)
✅ Loaded preprocessed LOS data. Shape: (1985182, 16)


In [4]:
df_lms

Unnamed: 0,LOAN_ID,INSTALLMENT_NO,DUE_DATE,INSTALLMENT_AMOUNT,LOAN_DATE,LOAN_AMOUNT_LMS,TENURE_LMS,INTEREST_RATE_LMS,DAYS_LATE,DAYS_BETWEEN_DUE_DATES,...,LOAN_SCHEDULE_TYPE,IS_UNPAID,IS_DAYS_LATE_MISSING,PAID_RATIO,DELTA_DAYS_LATE,CURRENT_EMI_BEHAVIOR_LABEL,COMPOSITE_RISK,PAYMENT_SCORE,PAYMENT_SCORE_RANK,NEXT_EMI_LABEL
0,1001UNSL002793,1,2023-02-10,3000,2023-01-03,60000,2,24.5,0.0,,...,Monthly,0,0,1.000000,0.0,0,0.000000,1.5,1,0
1,1001UNSL002793,2,2023-03-10,3000,2023-01-03,60000,2,24.5,0.0,28.0,...,Monthly,0,0,1.000000,0.0,0,0.000000,1.5,1,0
2,1001UNSL002793,3,2023-04-07,3000,2023-01-03,60000,2,24.5,0.0,28.0,...,Monthly,0,0,1.000000,0.0,0,0.000000,1.5,1,0
3,1001UNSL002793,4,2023-05-05,3000,2023-01-03,60000,2,24.5,0.0,28.0,...,Monthly,0,0,1.000000,0.0,0,0.000000,1.5,1,0
4,1001UNSL002793,5,2023-06-02,3000,2023-01-03,60000,2,24.5,0.0,28.0,...,Monthly,0,0,1.000000,0.0,0,0.000000,1.5,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28351664,4209UNSL000970,15,2025-03-11,2010,2023-12-30,40000,2,25.0,0.0,28.0,...,Monthly,0,0,1.000000,0.0,0,0.000000,1.5,1,0
28351665,4209UNSL000970,16,2025-04-08,2010,2023-12-30,40000,2,25.0,0.0,28.0,...,Monthly,0,0,1.000000,0.0,0,0.000000,1.5,1,1
28351666,4209UNSL000970,17,2025-05-06,2010,2023-12-30,40000,2,25.0,25.0,28.0,...,Monthly,0,0,0.052239,25.0,1,34.477612,0.0,3,1
28351667,4209UNSL000970,18,2025-06-03,2010,2023-12-30,40000,2,25.0,0.0,28.0,...,Monthly,1,1,0.000000,0.0,1,10.000000,-100.0,4,1


In [5]:
df_lms.isna().sum()

LOAN_ID                             0
INSTALLMENT_NO                      0
DUE_DATE                            0
INSTALLMENT_AMOUNT                  0
LOAN_DATE                           0
LOAN_AMOUNT_LMS                     0
TENURE_LMS                          0
INTEREST_RATE_LMS                   0
DAYS_LATE                           0
DAYS_BETWEEN_DUE_DATES        1136897
REPAYMENT_SCHEDULE_CAT              0
LOAN_SCHEDULE_TYPE                  0
IS_UNPAID                           0
IS_DAYS_LATE_MISSING                0
PAID_RATIO                          0
DELTA_DAYS_LATE                     0
CURRENT_EMI_BEHAVIOR_LABEL          0
COMPOSITE_RISK                      0
PAYMENT_SCORE                       0
PAYMENT_SCORE_RANK                  0
NEXT_EMI_LABEL                      0
dtype: int64

In [6]:
df_los

Unnamed: 0,LOAN_ID,PIN_CODE,TOTAL_INCOME,TOTAL_EXPENSE,CUSTOMER_FLAG,MARITAL_STATUS_NAME,LOAN_DATE,LOAN_AMOUNT_STATIC,TENURE_STATIC,INTEREST_RATE_STATIC,DISBURSED_AMOUNT,CYCLE,STATE_NAME,OCCUPATION_NAME,LOAN_PURPOSE,AGE
0,1854UNSL009563,852114.0,25000.0,1760.0,A,MARRIED,2023-06-20,40000,2,25.0,40000,1,BIHAR,AGRI INPUT,Agri - Agriculture,27.0
1,2040UNSL002256,757036.0,21000.0,300.0,A,MARRIED,2023-06-13,40000,2,25.0,40000,1,ODISHA,AGRICULTURE AND RELATED INPUTS,Agri - Agriculture,51.0
2,1715UNSL001385,416211.0,0.0,0.0,D,MARRIED,2023-08-31,44000,2,25.0,44000,4,MAHARASHTRA,UNKNOWN,Dairy Farm,33.0
3,2651UNSL003253,123401.0,22000.0,2910.0,A,MARRIED,2023-11-07,50000,2,25.0,50000,2,HARYANA,ANIMAL HUSBANDRY,Agri - Agriculture,43.0
4,1165UNSL009368,571301.0,17500.0,2160.0,A,MARRIED,2023-10-25,70000,3,25.0,70000,2,KARNATAKA,AGRI INPUT,Agri - Agriculture,40.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1985177,1030UNSL004771,630606.0,15500.0,150.0,A,MARRIED,2023-05-24,70000,2,25.0,70000,6,TAMIL NADU,AGRICULTURE AND RELATED INPUTS,Agri - Agriculture,38.0
1985178,2709UNSL003338,388710.0,17500.0,1920.0,A,MARRIED,2023-10-14,60000,2,25.0,60000,2,GUJARAT,AGRI INPUT,Agri - Dairy,42.0
1985179,1293UNSL002341,461661.0,18000.0,2424.0,A,MARRIED,2023-10-07,50000,2,25.0,50000,2,MADHYA PRADESH,AGRI INPUT,Agri - Agriculture,44.0
1985180,1229UNSL005582,577002.0,17500.0,1280.0,A,MARRIED,2023-08-07,50000,2,25.0,50000,2,KARNATAKA,AGRICULTURE,Busket Making,30.0


In [7]:
df_los.isna().sum()

LOAN_ID                 0
PIN_CODE                0
TOTAL_INCOME            0
TOTAL_EXPENSE           0
CUSTOMER_FLAG           0
MARITAL_STATUS_NAME     0
LOAN_DATE               0
LOAN_AMOUNT_STATIC      0
TENURE_STATIC           0
INTEREST_RATE_STATIC    0
DISBURSED_AMOUNT        0
CYCLE                   0
STATE_NAME              0
OCCUPATION_NAME         0
LOAN_PURPOSE            0
AGE                     0
dtype: int64

In [8]:
# Identify the final static features (all columns in df_los_final except LOAN_ID)
#STATIC_FEATURES_FINAL = [col for col in df_los_final.columns if col != 'LOAN_ID']

# Merge LOS (static) and LMS (time-series) data
df_combined = pd.merge(
    df_los,
    df_lms,
    on='LOAN_ID',
    how='inner'
)
print(f"✅ Merged LMS and LOS data. Combined Shape: {df_combined.shape}")

✅ Merged LMS and LOS data. Combined Shape: (28135758, 36)


In [9]:
df_lms.shape

(28351669, 21)

In [10]:
28351669-28135758

215911

In [11]:
df_combined.columns

Index(['LOAN_ID', 'PIN_CODE', 'TOTAL_INCOME', 'TOTAL_EXPENSE', 'CUSTOMER_FLAG',
       'MARITAL_STATUS_NAME', 'LOAN_DATE_x', 'LOAN_AMOUNT_STATIC',
       'TENURE_STATIC', 'INTEREST_RATE_STATIC', 'DISBURSED_AMOUNT', 'CYCLE',
       'STATE_NAME', 'OCCUPATION_NAME', 'LOAN_PURPOSE', 'AGE',
       'INSTALLMENT_NO', 'DUE_DATE', 'INSTALLMENT_AMOUNT', 'LOAN_DATE_y',
       'LOAN_AMOUNT_LMS', 'TENURE_LMS', 'INTEREST_RATE_LMS', 'DAYS_LATE',
       'DAYS_BETWEEN_DUE_DATES', 'REPAYMENT_SCHEDULE_CAT',
       'LOAN_SCHEDULE_TYPE', 'IS_UNPAID', 'IS_DAYS_LATE_MISSING', 'PAID_RATIO',
       'DELTA_DAYS_LATE', 'CURRENT_EMI_BEHAVIOR_LABEL', 'COMPOSITE_RISK',
       'PAYMENT_SCORE', 'PAYMENT_SCORE_RANK', 'NEXT_EMI_LABEL'],
      dtype='object')

In [12]:
df_combined[df_combined["LOAN_DATE_x"]==df_combined["LOAN_DATE_y"]]

Unnamed: 0,LOAN_ID,PIN_CODE,TOTAL_INCOME,TOTAL_EXPENSE,CUSTOMER_FLAG,MARITAL_STATUS_NAME,LOAN_DATE_x,LOAN_AMOUNT_STATIC,TENURE_STATIC,INTEREST_RATE_STATIC,...,LOAN_SCHEDULE_TYPE,IS_UNPAID,IS_DAYS_LATE_MISSING,PAID_RATIO,DELTA_DAYS_LATE,CURRENT_EMI_BEHAVIOR_LABEL,COMPOSITE_RISK,PAYMENT_SCORE,PAYMENT_SCORE_RANK,NEXT_EMI_LABEL
0,2040UNSL002256,757036.0,21000.0,300.0,A,MARRIED,2023-06-13,40000,2,25.0,...,Monthly,0,0,1.0,0.0,0,0.0,1.5,1,0
1,2040UNSL002256,757036.0,21000.0,300.0,A,MARRIED,2023-06-13,40000,2,25.0,...,Monthly,0,0,1.0,0.0,0,0.0,1.5,1,0
2,2040UNSL002256,757036.0,21000.0,300.0,A,MARRIED,2023-06-13,40000,2,25.0,...,Monthly,0,0,1.0,0.0,0,0.0,1.5,1,0
3,2040UNSL002256,757036.0,21000.0,300.0,A,MARRIED,2023-06-13,40000,2,25.0,...,Monthly,0,0,1.0,0.0,0,0.0,1.5,1,0
4,2040UNSL002256,757036.0,21000.0,300.0,A,MARRIED,2023-06-13,40000,2,25.0,...,Monthly,0,0,1.0,0.0,0,0.0,1.5,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28135753,1390UNSL004000,731234.0,22500.0,2360.0,A,MARRIED,2023-09-28,30000,2,25.0,...,Monthly,0,0,1.0,0.0,0,0.0,1.5,1,0
28135754,1390UNSL004000,731234.0,22500.0,2360.0,A,MARRIED,2023-09-28,30000,2,25.0,...,Monthly,0,0,1.0,0.0,0,0.0,1.5,1,0
28135755,1390UNSL004000,731234.0,22500.0,2360.0,A,MARRIED,2023-09-28,30000,2,25.0,...,Monthly,0,0,1.0,0.0,0,0.0,1.5,1,0
28135756,1390UNSL004000,731234.0,22500.0,2360.0,A,MARRIED,2023-09-28,30000,2,25.0,...,Monthly,0,0,1.0,0.0,0,0.0,1.5,1,0


In [13]:
df_combined[df_combined["LOAN_DATE_x"]!=df_combined["LOAN_DATE_y"]]

Unnamed: 0,LOAN_ID,PIN_CODE,TOTAL_INCOME,TOTAL_EXPENSE,CUSTOMER_FLAG,MARITAL_STATUS_NAME,LOAN_DATE_x,LOAN_AMOUNT_STATIC,TENURE_STATIC,INTEREST_RATE_STATIC,...,LOAN_SCHEDULE_TYPE,IS_UNPAID,IS_DAYS_LATE_MISSING,PAID_RATIO,DELTA_DAYS_LATE,CURRENT_EMI_BEHAVIOR_LABEL,COMPOSITE_RISK,PAYMENT_SCORE,PAYMENT_SCORE_RANK,NEXT_EMI_LABEL


In [14]:
df_combined.drop('LOAN_DATE_y',axis=1,inplace=True)

In [15]:
df_combined.shape

(28135758, 35)

In [16]:
df_combined.columns

Index(['LOAN_ID', 'PIN_CODE', 'TOTAL_INCOME', 'TOTAL_EXPENSE', 'CUSTOMER_FLAG',
       'MARITAL_STATUS_NAME', 'LOAN_DATE_x', 'LOAN_AMOUNT_STATIC',
       'TENURE_STATIC', 'INTEREST_RATE_STATIC', 'DISBURSED_AMOUNT', 'CYCLE',
       'STATE_NAME', 'OCCUPATION_NAME', 'LOAN_PURPOSE', 'AGE',
       'INSTALLMENT_NO', 'DUE_DATE', 'INSTALLMENT_AMOUNT', 'LOAN_AMOUNT_LMS',
       'TENURE_LMS', 'INTEREST_RATE_LMS', 'DAYS_LATE',
       'DAYS_BETWEEN_DUE_DATES', 'REPAYMENT_SCHEDULE_CAT',
       'LOAN_SCHEDULE_TYPE', 'IS_UNPAID', 'IS_DAYS_LATE_MISSING', 'PAID_RATIO',
       'DELTA_DAYS_LATE', 'CURRENT_EMI_BEHAVIOR_LABEL', 'COMPOSITE_RISK',
       'PAYMENT_SCORE', 'PAYMENT_SCORE_RANK', 'NEXT_EMI_LABEL'],
      dtype='object')

In [17]:
df_combined.rename(columns={"LOAN_DATE_x":"LOAN_DATE"},inplace=True)

In [18]:
# Before the comparison/drop steps
print("Optimizing memory for LOAN_AMOUNT columns...")

# Ensure both are float32 for efficient, low-memory comparison
df_combined['LOAN_AMOUNT_STATIC'] = df_combined['LOAN_AMOUNT_STATIC'].astype('float32')
df_combined['LOAN_AMOUNT_LMS'] = df_combined['LOAN_AMOUNT_LMS'].astype('float32')

Optimizing memory for LOAN_AMOUNT columns...


In [19]:
# Check for equality using NumPy arrays
# np.equal creates the boolean mask directly from arrays
is_equal_mask = np.equal(df_combined['LOAN_AMOUNT_STATIC'].values, df_combined['LOAN_AMOUNT_LMS'].values)

# Now, apply the mask (this is still memory-intensive, but often works)
# If this fails, proceed to the next section (Goal-Oriented Fix).

In [20]:
num_equal = is_equal_mask.sum()
print(f"Number of equal LOAN_AMOUNT records: {num_equal}")

Number of equal LOAN_AMOUNT records: 28135758


In [21]:
#df_combined[df_combined["LOAN_AMOUNT_STATIC"]==df_combined["LOAN_AMOUNT_LMS"]]

In [22]:
#df_combined.drop('LOAN_AMOUNT_LMS',axis=1,inplace=True)
# Use the 'del' statement for a more memory-efficient column removal
del df_combined['LOAN_AMOUNT_LMS']

In [23]:
#df_combined.rename(columns={"LOAN_AMOUNT_STATIC":"LOAN_AMOUNT"},inplace=True)
# Now, rename the remaining column
df_combined.rename(columns={"LOAN_AMOUNT_STATIC":"LOAN_AMOUNT"}, inplace=True)

In [24]:
df_combined.shape

(28135758, 34)

In [25]:
df_combined.columns

Index(['LOAN_ID', 'PIN_CODE', 'TOTAL_INCOME', 'TOTAL_EXPENSE', 'CUSTOMER_FLAG',
       'MARITAL_STATUS_NAME', 'LOAN_DATE', 'LOAN_AMOUNT', 'TENURE_STATIC',
       'INTEREST_RATE_STATIC', 'DISBURSED_AMOUNT', 'CYCLE', 'STATE_NAME',
       'OCCUPATION_NAME', 'LOAN_PURPOSE', 'AGE', 'INSTALLMENT_NO', 'DUE_DATE',
       'INSTALLMENT_AMOUNT', 'TENURE_LMS', 'INTEREST_RATE_LMS', 'DAYS_LATE',
       'DAYS_BETWEEN_DUE_DATES', 'REPAYMENT_SCHEDULE_CAT',
       'LOAN_SCHEDULE_TYPE', 'IS_UNPAID', 'IS_DAYS_LATE_MISSING', 'PAID_RATIO',
       'DELTA_DAYS_LATE', 'CURRENT_EMI_BEHAVIOR_LABEL', 'COMPOSITE_RISK',
       'PAYMENT_SCORE', 'PAYMENT_SCORE_RANK', 'NEXT_EMI_LABEL'],
      dtype='object')

In [26]:
#df_combined[df_combined["TENURE_STATIC"]==df_combined["TENURE_LMS"]]
# Check for equality using NumPy arrays
# np.equal creates the boolean mask directly from arrays
is_equal_mask = np.equal(df_combined['TENURE_STATIC'].values, df_combined['TENURE_LMS'].values)

# Now, apply the mask (this is still memory-intensive, but often works)
# If this fails, proceed to the next section (Goal-Oriented Fix).
num_equal = is_equal_mask.sum()
print(f"Number of equal TENURE records: {num_equal}")

Number of equal TENURE records: 28135758


In [27]:
#df_combined.drop("TENURE_LMS",axis=1,inplace=True)
del df_combined['TENURE_LMS']

In [28]:
df_combined.rename(columns={"TENURE_STATIC":"TENURE"},inplace=True)

In [29]:
df_combined.columns

Index(['LOAN_ID', 'PIN_CODE', 'TOTAL_INCOME', 'TOTAL_EXPENSE', 'CUSTOMER_FLAG',
       'MARITAL_STATUS_NAME', 'LOAN_DATE', 'LOAN_AMOUNT', 'TENURE',
       'INTEREST_RATE_STATIC', 'DISBURSED_AMOUNT', 'CYCLE', 'STATE_NAME',
       'OCCUPATION_NAME', 'LOAN_PURPOSE', 'AGE', 'INSTALLMENT_NO', 'DUE_DATE',
       'INSTALLMENT_AMOUNT', 'INTEREST_RATE_LMS', 'DAYS_LATE',
       'DAYS_BETWEEN_DUE_DATES', 'REPAYMENT_SCHEDULE_CAT',
       'LOAN_SCHEDULE_TYPE', 'IS_UNPAID', 'IS_DAYS_LATE_MISSING', 'PAID_RATIO',
       'DELTA_DAYS_LATE', 'CURRENT_EMI_BEHAVIOR_LABEL', 'COMPOSITE_RISK',
       'PAYMENT_SCORE', 'PAYMENT_SCORE_RANK', 'NEXT_EMI_LABEL'],
      dtype='object')

In [30]:
#df_combined[df_combined["INTEREST_RATE_STATIC"]==df_combined["INTEREST_RATE_LMS"]]
# Check for equality using NumPy arrays
# np.equal creates the boolean mask directly from arrays
is_equal_mask = np.equal(df_combined['INTEREST_RATE_STATIC'].values, df_combined['INTEREST_RATE_LMS'].values)

# Now, apply the mask (this is still memory-intensive, but often works)
# If this fails, proceed to the next section (Goal-Oriented Fix).
num_equal = is_equal_mask.sum()
print(f"Number of equal INTEREST RATE records: {num_equal}")

Number of equal INTEREST RATE records: 28135758


In [31]:
#df_combined[df_combined["INTEREST_RATE_STATIC"]!=df_combined["INTEREST_RATE_LMS"]]

In [32]:
df_combined.drop("INTEREST_RATE_LMS",axis=1,inplace=True)

In [33]:
df_combined.columns

Index(['LOAN_ID', 'PIN_CODE', 'TOTAL_INCOME', 'TOTAL_EXPENSE', 'CUSTOMER_FLAG',
       'MARITAL_STATUS_NAME', 'LOAN_DATE', 'LOAN_AMOUNT', 'TENURE',
       'INTEREST_RATE_STATIC', 'DISBURSED_AMOUNT', 'CYCLE', 'STATE_NAME',
       'OCCUPATION_NAME', 'LOAN_PURPOSE', 'AGE', 'INSTALLMENT_NO', 'DUE_DATE',
       'INSTALLMENT_AMOUNT', 'DAYS_LATE', 'DAYS_BETWEEN_DUE_DATES',
       'REPAYMENT_SCHEDULE_CAT', 'LOAN_SCHEDULE_TYPE', 'IS_UNPAID',
       'IS_DAYS_LATE_MISSING', 'PAID_RATIO', 'DELTA_DAYS_LATE',
       'CURRENT_EMI_BEHAVIOR_LABEL', 'COMPOSITE_RISK', 'PAYMENT_SCORE',
       'PAYMENT_SCORE_RANK', 'NEXT_EMI_LABEL'],
      dtype='object')

In [34]:
df_combined.rename(columns={"INTEREST_RATE_STATIC":"INTEREST_RATE"},inplace=True)

In [35]:
df_combined.shape

(28135758, 32)

In [36]:
#df_combined.drop("DISBURSED_AMOUNT",axis=1,inplace=True)
del df_combined["DISBURSED_AMOUNT"]

In [37]:
df_combined["LOAN_DATE"].min()

Timestamp('2023-01-02 00:00:00')

In [38]:
df_combined["LOAN_DATE"].max()

Timestamp('2023-12-30 00:00:00')

In [39]:
df_combined["PIN_CODE"].nunique()

13562

In [40]:
del df_combined["PIN_CODE"]

In [41]:
df_combined["STATE_NAME"].nunique()

23

In [42]:
df_combined.columns

Index(['LOAN_ID', 'TOTAL_INCOME', 'TOTAL_EXPENSE', 'CUSTOMER_FLAG',
       'MARITAL_STATUS_NAME', 'LOAN_DATE', 'LOAN_AMOUNT', 'TENURE',
       'INTEREST_RATE', 'CYCLE', 'STATE_NAME', 'OCCUPATION_NAME',
       'LOAN_PURPOSE', 'AGE', 'INSTALLMENT_NO', 'DUE_DATE',
       'INSTALLMENT_AMOUNT', 'DAYS_LATE', 'DAYS_BETWEEN_DUE_DATES',
       'REPAYMENT_SCHEDULE_CAT', 'LOAN_SCHEDULE_TYPE', 'IS_UNPAID',
       'IS_DAYS_LATE_MISSING', 'PAID_RATIO', 'DELTA_DAYS_LATE',
       'CURRENT_EMI_BEHAVIOR_LABEL', 'COMPOSITE_RISK', 'PAYMENT_SCORE',
       'PAYMENT_SCORE_RANK', 'NEXT_EMI_LABEL'],
      dtype='object')

In [43]:
df_combined

Unnamed: 0,LOAN_ID,TOTAL_INCOME,TOTAL_EXPENSE,CUSTOMER_FLAG,MARITAL_STATUS_NAME,LOAN_DATE,LOAN_AMOUNT,TENURE,INTEREST_RATE,CYCLE,...,LOAN_SCHEDULE_TYPE,IS_UNPAID,IS_DAYS_LATE_MISSING,PAID_RATIO,DELTA_DAYS_LATE,CURRENT_EMI_BEHAVIOR_LABEL,COMPOSITE_RISK,PAYMENT_SCORE,PAYMENT_SCORE_RANK,NEXT_EMI_LABEL
0,2040UNSL002256,21000.0,300.0,A,MARRIED,2023-06-13,40000.0,2,25.0,1,...,Monthly,0,0,1.0,0.0,0,0.0,1.5,1,0
1,2040UNSL002256,21000.0,300.0,A,MARRIED,2023-06-13,40000.0,2,25.0,1,...,Monthly,0,0,1.0,0.0,0,0.0,1.5,1,0
2,2040UNSL002256,21000.0,300.0,A,MARRIED,2023-06-13,40000.0,2,25.0,1,...,Monthly,0,0,1.0,0.0,0,0.0,1.5,1,0
3,2040UNSL002256,21000.0,300.0,A,MARRIED,2023-06-13,40000.0,2,25.0,1,...,Monthly,0,0,1.0,0.0,0,0.0,1.5,1,0
4,2040UNSL002256,21000.0,300.0,A,MARRIED,2023-06-13,40000.0,2,25.0,1,...,Monthly,0,0,1.0,0.0,0,0.0,1.5,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28135753,1390UNSL004000,22500.0,2360.0,A,MARRIED,2023-09-28,30000.0,2,25.0,2,...,Monthly,0,0,1.0,0.0,0,0.0,1.5,1,0
28135754,1390UNSL004000,22500.0,2360.0,A,MARRIED,2023-09-28,30000.0,2,25.0,2,...,Monthly,0,0,1.0,0.0,0,0.0,1.5,1,0
28135755,1390UNSL004000,22500.0,2360.0,A,MARRIED,2023-09-28,30000.0,2,25.0,2,...,Monthly,0,0,1.0,0.0,0,0.0,1.5,1,0
28135756,1390UNSL004000,22500.0,2360.0,A,MARRIED,2023-09-28,30000.0,2,25.0,2,...,Monthly,0,0,1.0,0.0,0,0.0,1.5,1,0


In [44]:
# df_combined["PIN_CODE"]=df_combined["PIN_CODE"].astype("str")

In [44]:
df_combined[df_combined["DAYS_BETWEEN_DUE_DATES"].isna()]["INSTALLMENT_NO"].nunique()

1

In [45]:
df_combined["DAYS_BETWEEN_DUE_DATES"] = df_combined["DAYS_BETWEEN_DUE_DATES"].fillna(0)

In [46]:
null=df_combined.isna().sum()
null[null>0]

Series([], dtype: int64)

In [47]:
# --- Step 2: Final Feature Creation and Imputation ---

# --- 2.2: Create the Rolling Window Feature ---
print("\n--- Creating and Shifting Rolling Features (No Leakage) ---")
ROLLING_WINDOW_SIZE = 3

# 1. Create the rolling mean of the payment score
df_combined['RECENT_PAYMENT_SCORE'] = df_combined.groupby('LOAN_ID')['PAYMENT_SCORE'].rolling(
    window=ROLLING_WINDOW_SIZE, min_periods=1
).mean().reset_index(level=0, drop=True)

# 2. Shift the rolling feature by 1 to prevent data leakage (look-ahead)
df_combined['RECENT_PAYMENT_SCORE'] = df_combined.groupby('LOAN_ID')['RECENT_PAYMENT_SCORE'].shift(1)

# 3. Impute the first EMI's shifted value with the overall mean score
overall_mean_score = df_combined['PAYMENT_SCORE'].mean()
df_combined['RECENT_PAYMENT_SCORE'] = df_combined['RECENT_PAYMENT_SCORE'].fillna(overall_mean_score)

print(f"✅ Created and shifted RECENT_PAYMENT_SCORE (rolling window {ROLLING_WINDOW_SIZE}).")


--- Creating and Shifting Rolling Features (No Leakage) ---
✅ Created and shifted RECENT_PAYMENT_SCORE (rolling window 3).


In [48]:
df_combined.columns

Index(['LOAN_ID', 'TOTAL_INCOME', 'TOTAL_EXPENSE', 'CUSTOMER_FLAG',
       'MARITAL_STATUS_NAME', 'LOAN_DATE', 'LOAN_AMOUNT', 'TENURE',
       'INTEREST_RATE', 'CYCLE', 'STATE_NAME', 'OCCUPATION_NAME',
       'LOAN_PURPOSE', 'AGE', 'INSTALLMENT_NO', 'DUE_DATE',
       'INSTALLMENT_AMOUNT', 'DAYS_LATE', 'DAYS_BETWEEN_DUE_DATES',
       'REPAYMENT_SCHEDULE_CAT', 'LOAN_SCHEDULE_TYPE', 'IS_UNPAID',
       'IS_DAYS_LATE_MISSING', 'PAID_RATIO', 'DELTA_DAYS_LATE',
       'CURRENT_EMI_BEHAVIOR_LABEL', 'COMPOSITE_RISK', 'PAYMENT_SCORE',
       'PAYMENT_SCORE_RANK', 'NEXT_EMI_LABEL', 'RECENT_PAYMENT_SCORE'],
      dtype='object')

In [49]:
df_combined.drop(columns=["IS_DAYS_LATE_MISSING", "TENURE", "INTEREST_RATE"], inplace=True, errors='ignore')

In [50]:
# Feature Lists
SEQUENTIAL_COLS_NUMERICAL = [
    'INSTALLMENT_NO','INSTALLMENT_AMOUNT', 'DAYS_LATE', 'DAYS_BETWEEN_DUE_DATES','PAID_RATIO', 'DELTA_DAYS_LATE','PAYMENT_SCORE', 
    'COMPOSITE_RISK', 'RECENT_PAYMENT_SCORE','PAYMENT_SCORE_RANK','IS_UNPAID','CURRENT_EMI_BEHAVIOR_LABEL',
]
SEQUENTIAL_COLS_CATEGORICAL = [
    'REPAYMENT_SCHEDULE_CAT',
]
STATIC_COLS_NUMERICAL = [
    'TOTAL_INCOME', 'TOTAL_EXPENSE', 'LOAN_AMOUNT', 'AGE','CYCLE',
]
STATIC_COLS_OHE = [
    'MARITAL_STATUS_NAME', 'STATE_NAME', 'LOAN_SCHEDULE_TYPE',
]
STATIC_COLS_EMBEDDING = [
    'OCCUPATION_NAME', 'LOAN_PURPOSE',
]
TARGET_COL = 'NEXT_EMI_LABEL'


In [51]:
# Custom Encoding (Ordinal & Embedding) on df_combined 
print("\n--- Encoding Ordinal & Embedding Features (Pre-Split) ---")

# 1. Ordinal Encoding for CUSTOMER_FLAG
FLAG_ORDER = {'A': 4, 'B': 3, 'C': 2, 'D': 1}
df_combined['CUSTOMER_FLAG_ENCODED'] = df_combined['CUSTOMER_FLAG'].map(FLAG_ORDER)
STATIC_COLS_NUMERICAL.append('CUSTOMER_FLAG_ENCODED')
del df_combined['CUSTOMER_FLAG']
print("✅ CUSTOMER_FLAG: Ordinal mapped and added to STATIC_COLS_NUMERICAL.")


--- Encoding Ordinal & Embedding Features (Pre-Split) ---
✅ CUSTOMER_FLAG: Ordinal mapped and added to STATIC_COLS_NUMERICAL.


In [52]:
# 2. Label Encoding for Embedding Features (OCCUPATION_NAME, LOAN_PURPOSE)
df_combined['OCCUPATION_NAME'] = df_combined['OCCUPATION_NAME'].astype('category')
df_combined['LOAN_PURPOSE'] = df_combined['LOAN_PURPOSE'].astype('category')

df_combined['OCCUPATION_NAME_ENCODED'] = df_combined['OCCUPATION_NAME'].cat.codes + 1
df_combined['LOAN_PURPOSE_ENCODED'] = df_combined['LOAN_PURPOSE'].cat.codes + 1

STATIC_COLS_EMBEDDING_FINAL = ['OCCUPATION_NAME_ENCODED', 'LOAN_PURPOSE_ENCODED']
del df_combined['OCCUPATION_NAME']
del df_combined['LOAN_PURPOSE']
print("✅ OCCUPATION_NAME & LOAN_PURPOSE: Label Encoded for Embedding Layers.")

✅ OCCUPATION_NAME & LOAN_PURPOSE: Label Encoded for Embedding Layers.


In [53]:
# --- OHE Sequential Categorical Feature (REPAYMENT_SCHEDULE_CAT) ---
print(" One-Hot Encoding Sequential Categorical Feature...")
df_combined = pd.get_dummies(df_combined, columns=SEQUENTIAL_COLS_CATEGORICAL, prefix='REPAYMENT_CAT')

REPAYMENT_CAT_OHE_COLS = [col for col in df_combined.columns if col.startswith('REPAYMENT_CAT_')]
NUMERICAL_FEATURES_FINAL = SEQUENTIAL_COLS_NUMERICAL + STATIC_COLS_NUMERICAL + REPAYMENT_CAT_OHE_COLS
print(f"✅ REPAYMENT_SCHEDULE_CAT encoded.")

 One-Hot Encoding Sequential Categorical Feature...
✅ REPAYMENT_SCHEDULE_CAT encoded.


In [None]:
# # --- 3.4: Chronological Train/Test Split (The Final Split) ---
# TEST_SPLIT_DATE = pd.to_datetime('2025-04-01')
# print(f"\n--- Chronological Train/Test Split (Cutoff: {TEST_SPLIT_DATE.strftime('%Y-%m-%d')}) ---")

# # CRITICAL: Split the fully encoded df_combined based on DUE_DATE
# train_df = df_combined[df_combined['DUE_DATE'] < TEST_SPLIT_DATE].copy()
# test_df = df_combined[df_combined['DUE_DATE'] >= TEST_SPLIT_DATE].copy()

# # Drop the date columns now that the split is complete
# train_df.drop(columns=['LOAN_DATE', 'DUE_DATE'], inplace=True)
# test_df.drop(columns=['LOAN_DATE', 'DUE_DATE'], inplace=True)

# # Store LOAN_ID lists for sequence creation validation
# train_loan_ids = train_df['LOAN_ID'].unique()
# test_loan_ids = test_df['LOAN_ID'].unique()

# print(f"Train Shape: {train_df.shape}")
# print(f"Test Shape: {test_df.shape}")


--- Chronological Train/Test Split (Cutoff: 2025-04-01) ---
Train Shape: (24856962, 30)
Test Shape: (3278796, 30)


In [54]:
# ######################################################################
# --- CRITICAL CHANGE: Step 3: LOAN_ID Split for Generalization ---
# ######################################################################
import random
TEST_RATIO = 0.20
print(f"\n--- CRITICAL: Splitting by LOAN_ID ({int(TEST_RATIO*100)}% for Test) ---")

# 1. Get all unique LOAN_IDs
all_loan_ids = df_combined['LOAN_ID'].unique()

# 2. Randomly shuffle the IDs to ensure a good mix across train/test
# Setting seed for reproducibility
random.seed(42) 
random.shuffle(all_loan_ids)

# 3. Determine the split point
split_point = int(len(all_loan_ids) * (1 - TEST_RATIO))

# 4. Split the IDs
train_ids = all_loan_ids[:split_point]
test_ids = all_loan_ids[split_point:]

# 5. Filter the main DataFrame based on these ID lists
train_df = df_combined[df_combined['LOAN_ID'].isin(train_ids)].copy()
test_df = df_combined[df_combined['LOAN_ID'].isin(test_ids)].copy()

# 6. Drop the date columns (no longer the primary split mechanism)
train_df.drop(columns=['LOAN_DATE', 'DUE_DATE'], inplace=True, errors='ignore')
test_df.drop(columns=['LOAN_DATE', 'DUE_DATE'], inplace=True, errors='ignore')

# Store LOAN_ID lists for sequence creation validation
train_loan_ids = train_df['LOAN_ID'].unique()
test_loan_ids = test_df['LOAN_ID'].unique()

print(f"Train Loans: {len(train_loan_ids)} | Train Records: {train_df.shape}")
print(f"Test Loans: {len(test_loan_ids)} | Test Records: {test_df.shape}")
print("✅ Data successfully split by LOAN_ID, eliminating customer identity leakage.")


--- CRITICAL: Splitting by LOAN_ID (20% for Test) ---
Train Loans: 901865 | Train Records: (22512327, 30)
Test Loans: 225467 | Test Records: (5623431, 30)
✅ Data successfully split by LOAN_ID, eliminating customer identity leakage.


In [55]:
22512327+5623431

28135758

In [56]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22512327 entries, 0 to 28135668
Data columns (total 30 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   LOAN_ID                     object 
 1   TOTAL_INCOME                float64
 2   TOTAL_EXPENSE               float64
 3   MARITAL_STATUS_NAME         object 
 4   LOAN_AMOUNT                 float32
 5   CYCLE                       int64  
 6   STATE_NAME                  object 
 7   AGE                         float64
 8   INSTALLMENT_NO              int64  
 9   INSTALLMENT_AMOUNT          int64  
 10  DAYS_LATE                   float64
 11  DAYS_BETWEEN_DUE_DATES      float64
 12  LOAN_SCHEDULE_TYPE          object 
 13  IS_UNPAID                   int64  
 14  PAID_RATIO                  float64
 15  DELTA_DAYS_LATE             float64
 16  CURRENT_EMI_BEHAVIOR_LABEL  int64  
 17  COMPOSITE_RISK              float64
 18  PAYMENT_SCORE               float64
 19  PAYMENT_SCORE_RANK      

In [57]:
#NEW
# ######################################################################
# --- Step 4: ColumnTransformer (Scaling & OHE) with Memory Optimization ---
# ######################################################################

# --- 4.1: Memory Optimization for Preprocessor Fitting ---
FIT_SAMPLE_RATIO = 0.5 # Use 50% of the training data for fitting the scaler/OHE
sample_size = len(train_df) // 2

# Create a small, temporary sample for fitting the memory-intensive scalers
train_df_sample = train_df.sample(n=sample_size, random_state=42).copy()
print(f"\n--- Fitting Preprocessor on Sample ({FIT_SAMPLE_RATIO*100:.0f}%) to Save RAM ---")


--- Fitting Preprocessor on Sample (50%) to Save RAM ---


In [58]:
#NEW
# --- 4.2: Define the Column Transformer ---
# The columns passed through must include LOAN_ID, Target, and the Embedding features
PASSTHROUGH_COLS = ['LOAN_ID', TARGET_COL] + STATIC_COLS_EMBEDDING_FINAL

# Columns to be Scaled and OHE'd (excluding the passthrough and embedding features)
SCALER_OHE_COLS = NUMERICAL_FEATURES_FINAL + STATIC_COLS_OHE

preprocessor = ColumnTransformer(
    transformers=[
        # Transformer 1: Scale ALL numerical features (sequential + static + OHE sequential)
        ('num', StandardScaler(), NUMERICAL_FEATURES_FINAL),
        
        # Transformer 2: OHE the remaining static categorical features
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=True), STATIC_COLS_OHE)
    ],
    # The Embedding features, LOAN_ID, and TARGET_COL are passed through untouched
    remainder='passthrough',
    # Ensure ColumnTransformer only processes features defined in SCALER_OHE_COLS
    # The actual columns used will be drawn from the input dataframe
    # The output will include all columns in transformers + remainder='passthrough'
)

In [None]:
#NEW
# --- 4.3: Fit and Save the Preprocessor ---

# CRITICAL: Fit the preprocessor ONLY on the SMALLER SAMPLE
preprocessor.fit(train_df_sample)
joblib.dump(preprocessor, 'preprocessor.pkl')
print(f"✅ Preprocessor fitted on sample size {len(train_df_sample)} and saved to preprocessor.pkl.")

# --- 4.4: Apply Transformation to FULL Data ---

# Apply the fitted preprocessor to the FULL training and testing sets
# The output is a sparse matrix due to sparse_output=True
train_transformed = preprocessor.transform(train_df)
test_transformed = preprocessor.transform(test_df)

print(f"✅ Transformation applied to FULL train/test data. Output is sparse matrix.")
print(f"Train Transformed Shape: {train_transformed.shape}")
print(f"Test Transformed Shape: {test_transformed.shape}")

In [None]:
#OLD STARTING:1
########### MEMORY OPTIMIZATION BY SETTING SPARSE_OUTPUT TO TRUE
# --- Step 4: ColumnTransformer (Scaling & OHE) ---

print("\n--- Step 4: Applying Scaling and OHE with ColumnTransformer ---")

# Define the Column Transformer
preprocessor = ColumnTransformer(
    transformers=[
        # Transformer 1: Scale ALL numerical features (sequential + static + OHE sequential)
        ('num', StandardScaler(), NUMERICAL_FEATURES_FINAL),
        
        # Transformer 2: OHE the remaining static categorical features
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=True), STATIC_COLS_OHE)
    ],
    # The Embedding features, LOAN_ID, and TARGET_COL are passed through untouched
    remainder='passthrough' 
)


--- Step 4: Applying Scaling and OHE with ColumnTransformer ---


In [None]:
#OLD 2
# 1. Fit and Save the preprocessor ONLY on the training data (CRITICAL: prevents leakage)
preprocessor.fit(train_df)
joblib.dump(preprocessor, 'preprocessor.pkl')
print("✅ Preprocessor fitted on train data and saved.")

# 2. Apply the transformation
# train_transformed and test_transformed will now be sparse matrices
train_transformed = preprocessor.transform(train_df)
test_transformed = preprocessor.transform(test_df)
print("✅ Transformation applied successfully (using sparse matrices).")

In [None]:
@############@@@@@@@@@@

In [None]:
# Get all feature names
static_ohe_names = list(preprocessor.named_transformers_['cat'].get_feature_names_out(STATIC_COLS_OHE))
remainder_cols = ['LOAN_ID', TARGET_COL] + STATIC_COLS_EMBEDDING_FINAL

feature_names_out = NUMERICAL_FEATURES_FINAL + static_ohe_names + remainder_cols

In [None]:
# Convert the transformed arrays back to DataFrames
train_df_scaled = pd.DataFrame(train_transformed, columns=feature_names_out)
test_df_scaled = pd.DataFrame(test_transformed, columns=feature_names_out)

print(f"Train Scaled Shape: {train_df_scaled.shape}")
print(f"Test Scaled Shape: {test_df_scaled.shape}")

In [None]:
#-----------------------------------------------------------
#------------------------------------------------------------

In [49]:
# Define the chronological split point
TEST_SPLIT_DATE = pd.to_datetime('2025-04-01')

print(f"\n--- Step 3: Chronological Train/Test Split (Cutoff: {TEST_SPLIT_DATE.strftime('%Y-%m-%d')}) ---")

# CRITICAL: Split based on DUE_DATE
train_df = df_combined[df_combined['DUE_DATE'] < TEST_SPLIT_DATE].copy()
test_df = df_combined[df_combined['DUE_DATE'] >= TEST_SPLIT_DATE].copy()

# Note: We do NOT need a separate validation set yet. We will split the train_df later.

print(f"Train Shape: {train_df.shape}")
print(f"Test Shape: {test_df.shape}")


--- Step 3: Chronological Train/Test Split (Cutoff: 2025-04-01) ---
Train Shape: (24856962, 31)
Test Shape: (3278796, 31)


In [None]:
# Drop the columns that are no longer needed for feature engineering
# Dues dates are not needed as features, only for splitting/reference
#train_df.drop(columns=['LOAN_DATE', 'DUE_DATE'], inplace=True)
#test_df.drop(columns=['LOAN_DATE', 'DUE_DATE'], inplace=True)

In [50]:
train_df.columns

Index(['LOAN_ID', 'TOTAL_INCOME', 'TOTAL_EXPENSE', 'CUSTOMER_FLAG',
       'MARITAL_STATUS_NAME', 'LOAN_DATE', 'LOAN_AMOUNT', 'TENURE',
       'INTEREST_RATE', 'CYCLE', 'STATE_NAME', 'OCCUPATION_NAME',
       'LOAN_PURPOSE', 'AGE', 'INSTALLMENT_NO', 'DUE_DATE',
       'INSTALLMENT_AMOUNT', 'DAYS_LATE', 'DAYS_BETWEEN_DUE_DATES',
       'REPAYMENT_SCHEDULE_CAT', 'LOAN_SCHEDULE_TYPE', 'IS_UNPAID',
       'IS_DAYS_LATE_MISSING', 'PAID_RATIO', 'DELTA_DAYS_LATE',
       'CURRENT_EMI_BEHAVIOR_LABEL', 'COMPOSITE_RISK', 'PAYMENT_SCORE',
       'PAYMENT_SCORE_RANK', 'NEXT_EMI_LABEL', 'RECENT_PAYMENT_SCORE'],
      dtype='object')

In [52]:
train_df["IS_UNPAID"].value_counts()

IS_UNPAID
0    18233163
1     6623799
Name: count, dtype: int64

In [53]:
train_df["IS_DAYS_LATE_MISSING"].value_counts()

IS_DAYS_LATE_MISSING
0    18233163
1     6623799
Name: count, dtype: int64

In [54]:
del train_df["IS_DAYS_LATE_MISSING"]
del test_df["IS_DAYS_LATE_MISSING"]

In [55]:
del train_df["TENURE"]
del test_df["TENURE"]

In [56]:
del train_df["INTEREST_RATE"]
del test_df["INTEREST_RATE"]

In [57]:
train_df["CYCLE"].value_counts()

CYCLE
1     14827181
2      3855060
4      3087590
3      1345617
5      1167345
6       475323
7        63819
8        20960
9         7375
10        6692
Name: count, dtype: int64

In [58]:
train_df["CURRENT_EMI_BEHAVIOR_LABEL"].value_counts()

CURRENT_EMI_BEHAVIOR_LABEL
0    17154867
1     7702095
Name: count, dtype: int64

In [59]:
train_df.columns

Index(['LOAN_ID', 'TOTAL_INCOME', 'TOTAL_EXPENSE', 'CUSTOMER_FLAG',
       'MARITAL_STATUS_NAME', 'LOAN_AMOUNT', 'CYCLE', 'STATE_NAME',
       'OCCUPATION_NAME', 'LOAN_PURPOSE', 'AGE', 'INSTALLMENT_NO',
       'INSTALLMENT_AMOUNT', 'DAYS_LATE', 'DAYS_BETWEEN_DUE_DATES',
       'REPAYMENT_SCHEDULE_CAT', 'LOAN_SCHEDULE_TYPE', 'IS_UNPAID',
       'PAID_RATIO', 'DELTA_DAYS_LATE', 'CURRENT_EMI_BEHAVIOR_LABEL',
       'COMPOSITE_RISK', 'PAYMENT_SCORE', 'PAYMENT_SCORE_RANK',
       'NEXT_EMI_LABEL', 'RECENT_PAYMENT_SCORE'],
      dtype='object')

In [74]:
# Removed tenure and interest rate
SEQUENTIAL_COLS_NUMERICAL = [
    'INSTALLMENT_NO','INSTALLMENT_AMOUNT', 'DAYS_LATE', 'DAYS_BETWEEN_DUE_DATES','PAID_RATIO', 'DELTA_DAYS_LATE','PAYMENT_SCORE', 
    'COMPOSITE_RISK', 'RECENT_PAYMENT_SCORE','PAYMENT_SCORE_RANK','IS_UNPAID','CURRENT_EMI_BEHAVIOR_LABEL',
]
SEQUENTIAL_COLS_CATEGORICAL = [
    'REPAYMENT_SCHEDULE_CAT',
]
STATIC_COLS_NUMERICAL = [
    'TOTAL_INCOME', 'TOTAL_EXPENSE', 'LOAN_AMOUNT', 'AGE','CYCLE',
]
STATIC_COLS_OHE = [
    'MARITAL_STATUS_NAME', 'STATE_NAME', 'LOAN_SCHEDULE_TYPE',
]
STATIC_COLS_EMBEDDING = [
    'OCCUPATION_NAME', 'LOAN_PURPOSE',
]
STATIC_COL_ORDINAL = [
    'CUSTOMER_FLAG' 
]
TARGET_COL = 'NEXT_EMI_LABEL'

In [75]:
# Custom Encoding of Static Features

print("\n Encoding Ordinal/Embedding Features ")

# Combine data temporarily for consistent encoding (train_df and test_df were split in the previous step)
df_combined = pd.concat([train_df, test_df], ignore_index=True)

# 1. Ordinal Encoding for CUSTOMER_FLAG
FLAG_ORDER = {'A': 4, 'B': 3, 'C': 2, 'D': 1} # A=Best (4), D=Worst (1)
df_combined['CUSTOMER_FLAG_ENCODED'] = df_combined['CUSTOMER_FLAG'].map(FLAG_ORDER)


 Encoding Ordinal/Embedding Features 


In [76]:
# Update lists and drop original column
STATIC_COLS_NUMERICAL.append('CUSTOMER_FLAG_ENCODED')
del df_combined['CUSTOMER_FLAG']
print("✅ CUSTOMER_FLAG: Ordinal mapped (A:4 to D:1) and added to STATIC_COLS_NUMERICAL.")

✅ CUSTOMER_FLAG: Ordinal mapped (A:4 to D:1) and added to STATIC_COLS_NUMERICAL.


In [None]:
# 2. Label Encoding for Embedding Features (OCCUPATION_NAME, LOAN_PURPOSE)
# These are encoded to integers (index + 1) for the Keras Embedding layer
df_combined['OCCUPATION_NAME'] = df_combined['OCCUPATION_NAME'].astype('category')
df_combined['LOAN_PURPOSE'] = df_combined['LOAN_PURPOSE'].astype('category')

df_combined['OCCUPATION_NAME_ENCODED'] = df_combined['OCCUPATION_NAME'].cat.codes + 1
df_combined['LOAN_PURPOSE_ENCODED'] = df_combined['LOAN_PURPOSE'].cat.codes + 1

# Update the feature list for final use and drop originals
STATIC_COLS_EMBEDDING_FINAL = ['OCCUPATION_NAME_ENCODED', 'LOAN_PURPOSE_ENCODED']
del df_combined['OCCUPATION_NAME']
del df_combined['LOAN_PURPOSE']
print("✅ OCCUPATION_NAME & LOAN_PURPOSE: Label Encoded for Embedding Layers.")

✅ OCCUPATION_NAME & LOAN_PURPOSE: Label Encoded for Embedding Layers.


In [None]:
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

In [80]:
TEST_SPLIT_DATE

Timestamp('2025-04-01 00:00:00')

In [82]:
# Define the chronological split point
#TEST_SPLIT_DATE = pd.to_datetime('2025-04-01')

print("\n--- Re-splitting DataFrames after Custom Encoding ---")

train_df = df_combined[df_combined['DUE_DATE'] < TEST_SPLIT_DATE].copy()
test_df = df_combined[df_combined['DUE_DATE'] >= TEST_SPLIT_DATE].copy()


--- Re-splitting DataFrames after Custom Encoding ---


KeyError: 'DUE_DATE'

In [None]:


# Drop the date columns that are no longer needed for feature engineering
# (They were kept temporarily for the split logic)
train_df.drop(columns=['LOAN_DATE', 'DUE_DATE'], inplace=True)
test_df.drop(columns=['LOAN_DATE', 'DUE_DATE'], inplace=True)

print(f"✅ Re-split successful based on DUE_DATE < {TEST_SPLIT_DATE.strftime('%Y-%m-%d')}")
print(f"Train Shape: {train_df.shape}")
print(f"Test Shape: {test_df.shape}")

In [78]:
# --- Re-split the DataFrame based on the chronological cutoff date ---
# Use the LOAN_ID lists from the previous split to ensure integrity
train_loan_ids = train_df['LOAN_ID'].unique()
test_loan_ids = test_df['LOAN_ID'].unique()

train_df = df_combined[df_combined['LOAN_ID'].isin(train_loan_ids)].copy()
test_df = df_combined[df_combined['LOAN_ID'].isin(test_loan_ids)].copy()

In [None]:
train_df.shape

(28135758, 26)

In [None]:
# Remove remaining unused feature list
del STATIC_COL_ORDINAL

In [None]:
#####################################################################
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

In [61]:
# --- STEP 4.1: OHE Sequential Categorical Feature (REPAYMENT_SCHEDULE_CAT) ---
print("\n--- Step 4: Applying Scaling and Encoding ---")
print("4.1: One-Hot Encoding Sequential Categorical Feature...")

# 1. Recombine data for OHE to ensure all categories are captured
df_combined = pd.concat([train_df, test_df], ignore_index=True)

# 2. Perform OHE on the sequential categorical column
df_combined = pd.get_dummies(df_combined, columns=SEQUENTIAL_COLS_CATEGORICAL, prefix='REPAYMENT_CAT')

# 3. Identify the new OHE columns
REPAYMENT_CAT_OHE_COLS = [col for col in df_combined.columns if col.startswith('REPAYMENT_CAT_')]


--- Step 4: Applying Scaling and Encoding ---
4.1: One-Hot Encoding Sequential Categorical Feature...


In [None]:


# 4. Create the final comprehensive list of numerical features for the StandardScaler
# This includes all original numerical sequential/static features + the new OHE columns
NUMERICAL_FEATURES_FINAL = SEQUENTIAL_COLS_NUMERICAL + STATIC_COLS_NUMERICAL + REPAYMENT_CAT_OHE_COLS

print(f"✅ REPAYMENT_SCHEDULE_CAT encoded, resulting in {len(REPAYMENT_CAT_OHE_COLS)} new columns.")

# 5. Re-split the DataFrame based on the chronological cutoff date
TEST_SPLIT_DATE = pd.to_datetime('2025-04-01') # Use the date defined in Step 3
train_df = df_combined[df_combined['LOAN_ID'].isin(train_df['LOAN_ID'])].copy()
test_df = df_combined[df_combined['LOAN_ID'].isin(test_df['LOAN_ID'])].copy()

# Note: Using LOAN_ID ensures entire loan histories remain in their original split.

In [None]:
# --- Step 4: Define Feature Sets for Hybrid Model ---

# Features unique to the LOS data (Static Branch)
STATIC_UNIQUE_COLS = [
    'AGE', 'TOTAL_EXPENSE', 'TOTAL_INCOME', 'OCCUPATION_NAME',
    'MARITAL_STATUS_NAME', 'STATE_NAME', 'CUSTOMER_FLAG', 'LOAN_PURPOSE', 
    'PIN_CODE' # PIN_CODE will be target encoded later, but grouped here
]

# Core Loan Static Features (Shared by LMS/LOS, now cleaned and renamed)
CORE_STATIC_COLS = ['LOAN_AMOUNT', 'TENURE', 'INTEREST_RATE', 'LOAN_SCHEDULE_TYPE']

# Final Static Features (for the Dense layer input)
STATIC_COLS = CORE_STATIC_COLS + STATIC_UNIQUE_COLS

# Sequential features (time-steps for the RNN/LSTM input)
SEQUENTIAL_COLS = [
    'INSTALLMENT_AMOUNT', 'DAYS_LATE', 'DAYS_BETWEEN_DUE_DATES', 'PAID_RATIO', 
    'DELTA_DAYS_LATE', 'IS_UNPAID', 'PAYMENT_SCORE', 'PAYMENT_SCORE_RANK', 
    'COMPOSITE_RISK', 'RECENT_PAYMENT_SCORE', 'REPAYMENT_SCHEDULE_CAT'
]

# Categorical columns for One-Hot Encoding (OHE)
OHE_COLS = [
    'MARITAL_STATUS_NAME', 'STATE_NAME', 'CUSTOMER_FLAG', 'REPAYMENT_SCHEDULE_CAT', 'LOAN_SCHEDULE_TYPE'
]

# Numerical columns for Scaling
NUM_COLS = [
    col for col in STATIC_COLS + SEQUENTIAL_COLS 
    if df_combined[col].dtype in ['int64', 'float64'] and col not in OHE_COLS
]

# Features requiring special handling (PIN_CODE, OCCUPATION_NAME, LOAN_PURPOSE)
# These will be treated as numerical for now and their high-cardinality handled post-split/post-scaling
HIGH_CARD_COLS = ['PIN_CODE', 'OCCUPATION_NAME', 'LOAN_PURPOSE']
NUM_COLS.extend([c for c in HIGH_CARD_COLS if c not in NUM_COLS])


# --- Step 5: Scaling and Encoding Pipelines (Fit on Train Data ONLY) ---
print("\n--- Step 5: Fitting Scalers and Encoders on Training Data ---")

# Numerical Scaler (Fit on ALL numerical features from the training set)
scaler = StandardScaler()
scaler.fit(df_train[NUM_COLS])
joblib.dump(scaler, 'hybrid_scaler.pkl')
print(f"✅ Fitted and saved StandardScaler for {len(NUM_COLS)} numerical features.")

# One-Hot Encoder (Fit on ALL low-cardinality categorical features)
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
ohe.fit(df_train[OHE_COLS])
joblib.dump(ohe, 'hybrid_ohe.pkl')
print(f"✅ Fitted and saved OneHotEncoder for {len(OHE_COLS)} categorical features.")


# --- Step 6: Transform and Reshape Data for Hybrid Model ---

def transform_and_reshape(df, scaler, ohe, static_cols, seq_cols, ohe_cols, num_cols, max_len):
    """Applies preprocessing and reshapes data into Static and Sequential inputs."""
    
    # 1. Prepare Features (Apply transformations)
    df_temp = df.copy()

    # Numerical Scaling (Apply to all numerical columns)
    # Convert high-cardinality columns (PIN_CODE) to numeric before scaling if they contain only numeric data
    for col in HIGH_CARD_COLS:
        if col in df_temp.columns and df_temp[col].dtype == 'object':
            # This is a simplification; ideally, target encoding would happen here.
            # For now, we convert PIN_CODE (stored as str) to numeric for scaling.
            df_temp[col] = pd.to_numeric(df_temp[col], errors='coerce').fillna(df_temp[col].median())
            
    df_temp[num_cols] = scaler.transform(df_temp[num_cols])

    # OHE (Apply to all OHE columns)
    ohe_output = ohe.transform(df_temp[ohe_cols])
    ohe_col_names = ohe.get_feature_names_out(ohe_cols)
    df_ohe = pd.DataFrame(ohe_output, columns=ohe_col_names, index=df_temp.index)
    df_temp = pd.concat([df_temp.drop(columns=ohe_cols), df_ohe], axis=1)

    # 2. Sequential Data Preparation (X_seq)
    # Define the final sequential column names (scaled numericals + OHE)
    # Includes numericals and OHE features relevant to the sequence
    final_seq_cols = [c for c in seq_cols if c not in ohe_cols] + [c for c in ohe_col_names if 'REPAYMENT_SCHEDULE_CAT' in c]
    X_seq_features = df_temp[final_seq_cols].values
    SEQ_FEAT_DIM = X_seq_features.shape[1] 
    
    # 3. Static Data Preparation (X_static)
    # Includes numericals and all OHE features (as they are static per loan)
    final_static_cols = [c for c in static_cols if c not in ohe_cols] + [c for c in ohe_col_names if any(ohe_col in static_cols + seq_cols for ohe_col in c.split('_'))]
    
    # --- IMPORTANT: Filter for unique static rows (one per loan) ---
    df_static_unique = df_temp.drop_duplicates(subset=['LOAN_ID'], keep='first').set_index('LOAN_ID')
    
    X_static_all_features = df_static_unique[final_static_cols].values
    STATIC_FEAT_DIM = X_static_all_features.shape[1]

    # --- 4. Group, Pad, and Flatten ---
    
    X_seq_list = []
    y_list = [] 
    
    grouped = df_temp.groupby('LOAN_ID')

    for loan_id, group in grouped:
        X_seq_list.append(group[final_seq_cols].values)
        y_list.append(group['NEXT_EMI_LABEL'].values)
        
    # Pad sequences and targets
    X_seq_padded = pad_sequences(X_seq_list, maxlen=max_len, dtype='float32', padding='pre', truncating='pre', value=0.0)
    y_padded = pad_sequences(y_list, maxlen=max_len, dtype='int32', padding='pre', truncating='pre', value=-1) # -1 is padding value
    
    # Flatten y_padded and remove padding indices
    y_flat = y_padded[y_padded != -1].flatten()

    # Filter out sequence rows that were padded (where y_padded != -1)
    X_seq_flat = X_seq_padded[y_padded != -1].reshape(-1, SEQ_FEAT_DIM)
    
    # The static input must be "broadcast" to match the flattened sequence length.
    
    # Match the order of static features to the sequence order
    loan_id_order = [loan_id for loan_id, _ in grouped]
    X_static_ordered = df_static_unique.loc[loan_id_order][final_static_cols].values

    # Broadcast static features
    X_static_broadcast = np.repeat(X_static_ordered, [len(y[y!=-1]) for y in y_list], axis=0)

    print(f"Final X_seq shape (flat): {X_seq_flat.shape}")
    print(f"Final X_static shape (broadcast): {X_static_broadcast.shape}")
    print(f"Final y shape (flat): {y_flat.shape}")

    return X_seq_flat, X_static_broadcast, y_flat, SEQ_FEAT_DIM, STATIC_FEAT_DIM


# --- Transform Training Data ---
X_seq_train, X_static_train, y_train, SEQ_FEAT_DIM, STATIC_FEAT_DIM = transform_and_reshape(
    df_train, scaler, ohe, STATIC_COLS, SEQUENTIAL_COLS, OHE_COLS, NUM_COLS, MAX_SEQUENCE_LENGTH
)
print(f"\nTraining Data Prepared. SEQ_DIM: {SEQ_FEAT_DIM}, STATIC_DIM: {STATIC_FEAT_DIM}")


# --- Transform Testing Data ---
X_seq_test, X_static_test, y_test, _, _ = transform_and_reshape(
    df_test, scaler, ohe, STATIC_COLS, SEQUENTIAL_COLS, OHE_COLS, NUM_COLS, MAX_SEQUENCE_LENGTH
)
print("\nTesting Data Prepared.")


# --- Step 7: Final Saving of Preprocessed Data ---

# Save the preprocessed numpy arrays
np.savez_compressed(
    'hybrid_model_data.npz',
    X_seq_train=X_seq_train,
    X_static_train=X_static_train,
    y_train=y_train,
    X_seq_test=X_seq_test,
    X_static_test=X_static_test,
    y_test=y_test,
    SEQ_FEAT_DIM=SEQ_FEAT_DIM,
    STATIC_FEAT_DIM=STATIC_FEAT_DIM
)

print(f"\n✅ All preprocessed hybrid model data saved to 'hybrid_model_data.npz'.")
print("Project ready for final Model Building and Training (4_hybrid_model.ipynb).")