In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
import joblib
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [1]:
# Notebook-wide constants
# Label names for the two-class status encoding (class index -> label).
TWO_CLASS_STATUS_MAP = {
    0: 'Paid',
    1: 'Not Paid',
}
# NOTE(review): not referenced in the visible cells — presumably the number of days past due still tolerated; confirm.
GRACE_PERIOD_DAYS = 2
# NOTE(review): not referenced in the visible cells — presumably the padded per-loan sequence length; confirm.
MAX_SEQUENCE_LENGTH = 8

In [3]:
print("--- Phase 1: Preparing Data for Hybrid Model ---")
# --- Step 1: Data Loading ---
folder_path = "CorrectedCollection2023formatted"

# One entry per month: (dictionary key, file-name prefix, number of part files).
# September is keyed 'sep' but its files use the 'sept' prefix.
_month_parts = [
    ('jan', 'jan', 2),
    ('feb', 'feb', 2),
    ('mar', 'mar', 3),
    ('apr', 'apr', 3),
    ('may', 'may', 1),
    ('jun', 'jun', 1),
    ('jul', 'jul', 2),
    ('aug', 'aug', 3),
    ('sep', 'sept', 4),
    ('oct', 'oct', 4),
    ('nov', 'nov', 4),
    ('dec', 'dec', 4),
]
# Expand the table into {month: [path, ...]} with parts numbered from 1.
excel_files = {
    key: [f'{folder_path}/{prefix}{part}_2023.xlsx' for part in range(1, n_parts + 1)]
    for key, prefix, n_parts in _month_parts
}

# Best-effort load: a missing or unreadable file is reported and skipped, never fatal.
all_dfs = []
for month, files in excel_files.items():
    print(f"Loading files for {month.capitalize()}...")
    for file_name in files:
        if not os.path.exists(file_name):
            print(f"⚠️ Warning: {file_name} not found.")
            continue
        try:
            df_month = pd.read_excel(file_name)
            all_dfs.append(df_month)
            print(f"✅ Successfully loaded: {file_name}")
        except Exception as e:
            print(f"❌ Error loading {file_name}: {e}")

--- Phase 1: Preparing Data for Hybrid Model ---
Loading files for Jan...
✅ Successfully loaded: CorrectedCollection2023formatted/jan1_2023.xlsx
✅ Successfully loaded: CorrectedCollection2023formatted/jan2_2023.xlsx
Loading files for Feb...
✅ Successfully loaded: CorrectedCollection2023formatted/feb1_2023.xlsx
✅ Successfully loaded: CorrectedCollection2023formatted/feb2_2023.xlsx
Loading files for Mar...
✅ Successfully loaded: CorrectedCollection2023formatted/mar1_2023.xlsx
✅ Successfully loaded: CorrectedCollection2023formatted/mar2_2023.xlsx
✅ Successfully loaded: CorrectedCollection2023formatted/mar3_2023.xlsx
Loading files for Apr...
✅ Successfully loaded: CorrectedCollection2023formatted/apr1_2023.xlsx
✅ Successfully loaded: CorrectedCollection2023formatted/apr2_2023.xlsx
✅ Successfully loaded: CorrectedCollection2023formatted/apr3_2023.xlsx
Loading files for May...
✅ Successfully loaded: CorrectedCollection2023formatted/may1_2023.xlsx
Loading files for Jun...
✅ Successfully loade

In [4]:
# Stack the per-file frames into one installment-level table with a fresh 0..n-1 index.
# The loader only prints a message for missing/unreadable files, so fail here with an
# actionable error (same ValueError type pd.concat would raise) if nothing was loaded.
if not all_dfs:
    raise ValueError(
        "No installment files were loaded; check that the data folder exists "
        "and that the Excel files are readable."
    )
df_lms = pd.concat(all_dfs, ignore_index=True)

In [5]:
# Display the combined frame (the notebook renders a truncated preview of it).
df_lms

Unnamed: 0,LOAN_ID,INSTALLMENT_NO,DUE_DATE,INSTALLMENT_AMOUNT,INTEREST_AMOUNT,STATUS,PAID_DT,LOAN_DATE,LOAN_AMOUNT,TENURE,INTEREST_RATE,PRINCIPAL_AMOUNT,PAID_AMOUNT
0,1001UNSL002793,1,2023-02-10,3000,1571,0,2023-02-10 10:48:17,2023-01-03,60000,2,24.5,1429,3000.0
1,1001UNSL002793,2,2023-03-10,3000,1101,0,2023-03-10 12:54:15,2023-01-03,60000,2,24.5,1899,3000.0
2,1001UNSL002793,3,2023-04-07,3000,1065,0,2023-04-07 12:31:44,2023-01-03,60000,2,24.5,1935,3000.0
3,1001UNSL002793,4,2023-05-05,3000,1029,0,2023-05-05 14:40:36,2023-01-03,60000,2,24.5,1971,3000.0
4,1001UNSL002793,5,2023-06-02,3000,992,0,2023-06-02 16:56:10,2023-01-03,60000,2,24.5,2008,3000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
29488564,4209UNSL000933,17,2025-04-10,2010,330,0,2025-04-10 12:54:54,2023-12-16,40000,2,25.0,1680,2010.0
29488565,4209UNSL000933,18,2025-05-08,2010,298,0,2025-05-08 09:52:06,2023-12-16,40000,2,25.0,1712,2010.0
29488566,4209UNSL000933,19,2025-06-05,2010,265,0,2025-06-05 10:25:19,2023-12-16,40000,2,25.0,1745,2010.0
29488567,4209UNSL000933,20,2025-07-03,2010,231,0,2025-07-03 09:51:50,2023-12-16,40000,2,25.0,1779,2010.0


In [6]:
# Parse both date columns; values that cannot be parsed become NaT instead of raising.
for date_col in ('DUE_DATE', 'PAID_DT'):
    df_lms[date_col] = pd.to_datetime(df_lms[date_col], errors='coerce')

In [7]:
# Rows with STATUS == 1 must not carry a payment timestamp: blank PAID_DT (NaT) for them.
df_lms['PAID_DT'] = df_lms['PAID_DT'].mask(df_lms['STATUS'].eq(1))

In [9]:
# # --- BEFORE THE SORTING STEP ---
# # 1. Optimize Numeric Types
# for col in ['INSTALLMENT_NO', 'TENURE', 'STATUS']:
#     # Use the smallest integer type that fits the data range
#     df_lms[col] = pd.to_numeric(df_lms[col], downcast='integer', errors='coerce')

# for col in ['LOAN_AMOUNT', 'INSTALLMENT_AMOUNT', 'PAID_AMOUNT']:
#     # Use float32 instead of float64
#     df_lms[col] = df_lms[col].astype('float32')

# # 2. Optimize LOAN_ID
# # Since LOAN_ID is used for grouping/sorting but is an object (string), 
# # converting it to the 'category' type often helps pandas with memory efficiency
# # during groupby/sort operations.
# df_lms['LOAN_ID'] = df_lms['LOAN_ID'].astype('category')

# # 3. Optimize Date Types
# # This is already done, but ensure they are datetime64[ns]

# # Now, attempt the sort again with the much smaller DataFrame:
# df_lms = df_lms.sort_values(by=['LOAN_ID', 'INSTALLMENT_NO']).reset_index(drop=True)

In [8]:
# Order rows by loan, then installment number, and renumber the index 0..n-1.
df_lms = df_lms.sort_values(['LOAN_ID', 'INSTALLMENT_NO'], ignore_index=True)

In [10]:
# Row counts per STATUS code (later cells treat 1 as unpaid and 2 as partially paid).
df_lms["STATUS"].value_counts()

STATUS
0    20177202
1     8882206
2      429161
Name: count, dtype: int64

In [11]:
# Column dtypes and memory footprint of the combined frame.
df_lms.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29488569 entries, 0 to 29488568
Data columns (total 13 columns):
 #   Column              Dtype         
---  ------              -----         
 0   LOAN_ID             object        
 1   INSTALLMENT_NO      int64         
 2   DUE_DATE            datetime64[ns]
 3   INSTALLMENT_AMOUNT  int64         
 4   INTEREST_AMOUNT     int64         
 5   STATUS              int64         
 6   PAID_DT             datetime64[ns]
 7   LOAN_DATE           datetime64[ns]
 8   LOAN_AMOUNT         int64         
 9   TENURE              int64         
 10  INTEREST_RATE       float64       
 11  PRINCIPAL_AMOUNT    int64         
 12  PAID_AMOUNT         float64       
dtypes: datetime64[ns](3), float64(2), int64(7), object(1)
memory usage: 2.9+ GB


In [12]:
# Column names currently present in df_lms.
df_lms.columns

Index(['LOAN_ID', 'INSTALLMENT_NO', 'DUE_DATE', 'INSTALLMENT_AMOUNT',
       'INTEREST_AMOUNT', 'STATUS', 'PAID_DT', 'LOAN_DATE', 'LOAN_AMOUNT',
       'TENURE', 'INTEREST_RATE', 'PRINCIPAL_AMOUNT', 'PAID_AMOUNT'],
      dtype='object')

In [13]:
# Missing-value count per column.
df_lms.isna().sum()

LOAN_ID                     0
INSTALLMENT_NO              0
DUE_DATE                    0
INSTALLMENT_AMOUNT          0
INTEREST_AMOUNT             0
STATUS                      0
PAID_DT               8882209
LOAN_DATE                   0
LOAN_AMOUNT                 0
TENURE                      0
INTEREST_RATE               0
PRINCIPAL_AMOUNT            0
PAID_AMOUNT                 0
dtype: int64

In [14]:
# Inconsistent rows: no payment date even though the installment is not marked unpaid (STATUS != 1).
rows_to_check = df_lms.loc[df_lms['PAID_DT'].isna() & df_lms['STATUS'].ne(1)]

In [15]:
# Display the rows that lack a payment date although STATUS != 1.
rows_to_check

Unnamed: 0,LOAN_ID,INSTALLMENT_NO,DUE_DATE,INSTALLMENT_AMOUNT,INTEREST_AMOUNT,STATUS,PAID_DT,LOAN_DATE,LOAN_AMOUNT,TENURE,INTEREST_RATE,PRINCIPAL_AMOUNT,PAID_AMOUNT
17482449,1783UNSL002475,16,2024-04-29,315,71,2,NaT,2023-04-29,25000,2,25.0,244,0.0
18572024,1830UNSL006290,31,2025-04-09,349,6,2,NaT,2023-04-19,70000,2,25.0,343,0.0
18572025,1830UNSL006290,32,2025-04-16,349,8,2,NaT,2023-04-19,70000,2,25.0,341,0.0


In [16]:
# Keep a row when it has a payment date or is marked unpaid (STATUS == 1); this drops
# exactly the rows that have no payment date and a STATUS other than 1.
df_lms = df_lms.loc[
    df_lms['PAID_DT'].notna() | df_lms['STATUS'].eq(1)
].reset_index(drop=True)

In [17]:
# Re-check missing values after dropping the rows without a payment date and STATUS != 1.
df_lms.isna().sum()

LOAN_ID                     0
INSTALLMENT_NO              0
DUE_DATE                    0
INSTALLMENT_AMOUNT          0
INTEREST_AMOUNT             0
STATUS                      0
PAID_DT               8882206
LOAN_DATE                   0
LOAN_AMOUNT                 0
TENURE                      0
INTEREST_RATE               0
PRINCIPAL_AMOUNT            0
PAID_AMOUNT                 0
dtype: int64

In [18]:
# Row counts per TENURE value.
df_lms["TENURE"].value_counts()

TENURE
2    25835535
3     3653031
Name: count, dtype: int64

In [19]:
# --- Step 2: Feature Engineering & Target Variable Creation ---
# Whole days between payment and due date: negative = paid early, NaN = no payment date.
df_lms['DAYS_LATE'] = df_lms['PAID_DT'].sub(df_lms['DUE_DATE']).dt.days

In [20]:
# Missing values after adding DAYS_LATE (it is NaN wherever PAID_DT is NaT).
df_lms.isna().sum()

LOAN_ID                     0
INSTALLMENT_NO              0
DUE_DATE                    0
INSTALLMENT_AMOUNT          0
INTEREST_AMOUNT             0
STATUS                      0
PAID_DT               8882206
LOAN_DATE                   0
LOAN_AMOUNT                 0
TENURE                      0
INTEREST_RATE               0
PRINCIPAL_AMOUNT            0
PAID_AMOUNT                 0
DAYS_LATE             8882206
dtype: int64

In [21]:
# Largest observed DAYS_LATE value (max() ignores the NaN rows).
df_lms["DAYS_LATE"].max()

np.float64(782.0)

In [22]:
#df_lms['DAYS_LATE'] = df_lms['DAYS_LATE'].fillna(9999)

In [23]:
# Distribution of PAID_AMOUNT among rows marked unpaid (STATUS == 1).
df_lms.loc[df_lms['STATUS'].eq(1), 'PAID_AMOUNT'].value_counts()

PAID_AMOUNT
0.0    8882206
Name: count, dtype: int64

In [24]:
# Consistency check: STATUS == 0 rows that nevertheless have no payment date.
df_lms.loc[df_lms['PAID_DT'].isna() & df_lms['STATUS'].eq(0)]

Unnamed: 0,LOAN_ID,INSTALLMENT_NO,DUE_DATE,INSTALLMENT_AMOUNT,INTEREST_AMOUNT,STATUS,PAID_DT,LOAN_DATE,LOAN_AMOUNT,TENURE,INTEREST_RATE,PRINCIPAL_AMOUNT,PAID_AMOUNT,DAYS_LATE


In [25]:
# Repeats the previous cell's check: STATUS == 0 rows with a missing payment date.
df_lms[(df_lms['PAID_DT'].isnull()) & (df_lms['STATUS'] == 0)]

Unnamed: 0,LOAN_ID,INSTALLMENT_NO,DUE_DATE,INSTALLMENT_AMOUNT,INTEREST_AMOUNT,STATUS,PAID_DT,LOAN_DATE,LOAN_AMOUNT,TENURE,INTEREST_RATE,PRINCIPAL_AMOUNT,PAID_AMOUNT,DAYS_LATE


In [26]:
# Consistency check: STATUS == 0 rows whose paid amount differs from the installment amount.
df_lms.loc[
    df_lms["STATUS"].eq(0)
    & df_lms["PAID_AMOUNT"].ne(df_lms["INSTALLMENT_AMOUNT"])
]

Unnamed: 0,LOAN_ID,INSTALLMENT_NO,DUE_DATE,INSTALLMENT_AMOUNT,INTEREST_AMOUNT,STATUS,PAID_DT,LOAN_DATE,LOAN_AMOUNT,TENURE,INTEREST_RATE,PRINCIPAL_AMOUNT,PAID_AMOUNT,DAYS_LATE


In [27]:
# STATUS == 0 rows whose paid amount is exactly 100% of the installment amount.
df_lms.loc[
    df_lms["STATUS"].eq(0)
    & df_lms["PAID_AMOUNT"].div(df_lms["INSTALLMENT_AMOUNT"]).mul(100).eq(100)
]

Unnamed: 0,LOAN_ID,INSTALLMENT_NO,DUE_DATE,INSTALLMENT_AMOUNT,INTEREST_AMOUNT,STATUS,PAID_DT,LOAN_DATE,LOAN_AMOUNT,TENURE,INTEREST_RATE,PRINCIPAL_AMOUNT,PAID_AMOUNT,DAYS_LATE
0,1001UNSL002793,1,2023-02-10,3000,1571,0,2023-02-10 10:48:17,2023-01-03,60000,2,24.5,1429,3000.0,0.0
1,1001UNSL002793,2,2023-03-10,3000,1101,0,2023-03-10 12:54:15,2023-01-03,60000,2,24.5,1899,3000.0,0.0
2,1001UNSL002793,3,2023-04-07,3000,1065,0,2023-04-07 12:31:44,2023-01-03,60000,2,24.5,1935,3000.0,0.0
3,1001UNSL002793,4,2023-05-05,3000,1029,0,2023-05-05 14:40:36,2023-01-03,60000,2,24.5,1971,3000.0,0.0
4,1001UNSL002793,5,2023-06-02,3000,992,0,2023-06-02 16:56:10,2023-01-03,60000,2,24.5,2008,3000.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29488557,4209UNSL000970,12,2024-12-17,2010,490,0,2024-12-17 10:35:58,2023-12-30,40000,2,25.0,1520,2010.0,0.0
29488558,4209UNSL000970,13,2025-01-14,2010,461,0,2025-01-14 14:55:04,2023-12-30,40000,2,25.0,1549,2010.0,0.0
29488559,4209UNSL000970,14,2025-02-11,2010,431,0,2025-02-11 13:31:36,2023-12-30,40000,2,25.0,1579,2010.0,0.0
29488560,4209UNSL000970,15,2025-03-11,2010,401,0,2025-03-11 12:27:35,2023-12-30,40000,2,25.0,1609,2010.0,0.0


In [28]:
# Scratch note: hard-coded count; it equals the STATUS == 0 row count reported by value_counts().
20177202

20177202

In [29]:
# STATUS distribution after the rows without a payment date and STATUS != 1 were dropped.
df_lms["STATUS"].value_counts()

STATUS
0    20177202
1     8882206
2      429158
Name: count, dtype: int64

In [30]:
# Partial payments (STATUS == 2) where at least 85% of the installment amount was paid.
partial_85percentpaid = df_lms.loc[
    df_lms["STATUS"].eq(2)
    & df_lms["PAID_AMOUNT"].div(df_lms["INSTALLMENT_AMOUNT"]).mul(100).ge(85)
]

In [31]:
# Number of partial-payment rows with at least 85% of the installment paid.
len(partial_85percentpaid)

62211

In [32]:
# Percent of partial payments (STATUS == 2) with at least 85% of the installment paid.
# Derived from the data instead of hand-copied counts so it stays correct on re-run.
len(partial_85percentpaid) / int(df_lms["STATUS"].eq(2).sum()) * 100

14.496059726254664

In [33]:
# Columns carried into the 85% subset.
partial_85percentpaid.columns

Index(['LOAN_ID', 'INSTALLMENT_NO', 'DUE_DATE', 'INSTALLMENT_AMOUNT',
       'INTEREST_AMOUNT', 'STATUS', 'PAID_DT', 'LOAN_DATE', 'LOAN_AMOUNT',
       'TENURE', 'INTEREST_RATE', 'PRINCIPAL_AMOUNT', 'PAID_AMOUNT',
       'DAYS_LATE'],
      dtype='object')

In [34]:
# Outstanding balance per row; assign() returns a new frame, so the slice of df_lms is not modified.
partial_85percentpaid = partial_85percentpaid.assign(
    BALANCE_AMOUNT=lambda subset: subset["INSTALLMENT_AMOUNT"] - subset["PAID_AMOUNT"]
)

In [35]:
# Largest outstanding balance in the 85% subset.
partial_85percentpaid["BALANCE_AMOUNT"].max()

np.float64(526.0)

In [36]:
# Smallest outstanding balance in the 85% subset.
partial_85percentpaid["BALANCE_AMOUNT"].min()

np.float64(0.009999999999990905)

In [37]:
# Rows of the 85% subset with an outstanding balance of at most 100.
partial_85percentpaid.loc[partial_85percentpaid["BALANCE_AMOUNT"].le(100)]

Unnamed: 0,LOAN_ID,INSTALLMENT_NO,DUE_DATE,INSTALLMENT_AMOUNT,INTEREST_AMOUNT,STATUS,PAID_DT,LOAN_DATE,LOAN_AMOUNT,TENURE,INTEREST_RATE,PRINCIPAL_AMOUNT,PAID_AMOUNT,DAYS_LATE,BALANCE_AMOUNT
178,1001UNSL002832,23,2024-11-21,2010,144,2,2024-12-21 12:33:52,2023-01-20,40000,2,25.0,1866,1969.00,30.0,41.00
539,1001UNSL002933,20,2024-09-06,3010,360,2,2025-03-22 00:00:00,2023-02-09,60000,2,25.0,2650,2969.26,197.0,40.74
904,1001UNSL003067,21,2024-10-15,2010,194,2,2024-12-07 09:42:59,2023-03-06,40000,2,25.0,1816,1913.00,53.0,97.00
1165,1001UNSL003100,22,2024-12-11,2010,178,2,2025-03-21 00:00:00,2023-03-11,40000,2,25.0,1832,1971.42,100.0,38.58
1491,1001UNSL003191,10,2024-01-10,2010,542,2,2024-02-07 19:13:24,2023-03-27,40000,2,25.0,1468,1964.00,28.0,46.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29478780,4208UNSL000070,24,2024-11-18,2000,89,2,2024-12-28 09:35:31,2023-01-11,40000,2,24.5,1911,1939.00,40.0,61.00
29479191,4208UNSL000558,20,2025-02-17,2010,230,2,2025-03-13 18:47:15,2023-08-04,40000,2,25.0,1780,1977.00,24.0,33.00
29480085,4208UNSL000662,23,2025-06-19,2010,123,2,2025-07-17 12:28:58,2023-09-14,40000,2,25.0,1887,1975.00,28.0,35.00
29480431,4208UNSL000693,11,2024-07-25,2010,508,2,2025-03-22 00:00:00,2023-09-21,40000,2,25.0,1502,2001.00,240.0,9.00


In [38]:
# Band edges and labels for the outstanding balance.
bins = [0, 100, 200, 300, 400, np.inf]
labels = ['0-100', '101-200', '201-300', '301-400', '>400']

# Right-closed bands; include_lowest makes the first band also contain its lower edge.
partial_85percentpaid = partial_85percentpaid.assign(
    BALANCE_AMOUNT_CAT=lambda subset: pd.cut(
        subset['BALANCE_AMOUNT'],
        bins=bins,
        labels=labels,
        right=True,
        include_lowest=True,
    )
)

In [39]:
# Row counts per balance band in the 85% subset.
partial_85percentpaid["BALANCE_AMOUNT_CAT"].value_counts()

BALANCE_AMOUNT_CAT
0-100      31314
101-200    15145
201-300    11645
301-400     2655
>400        1452
Name: count, dtype: int64

In [40]:
# Sanity check: total rows across all balance bands. It equals len(partial_85percentpaid)
# when no balance falls outside the bins (value_counts() skips unbinned NaN rows).
# Summed from the data instead of hand-copied counts so the check cannot go stale.
int(partial_85percentpaid["BALANCE_AMOUNT_CAT"].value_counts().sum())

62211

In [41]:
# Total rows in the 85% subset, for comparison with the summed band counts.
len(partial_85percentpaid)

62211

In [42]:
# Counts and percentage share per balance band (85% subset), side by side.
counts = partial_85percentpaid["BALANCE_AMOUNT_CAT"].value_counts()
percentages = partial_85percentpaid["BALANCE_AMOUNT_CAT"].value_counts(normalize=True) * 100

# Start from the counts column and attach the shares; both are indexed by band.
combined_df = (
    counts.to_frame('Counts')
    .assign(Percentages=percentages)
    .round(2)  # percentages to 2 decimal places
)

print(combined_df)

                    Counts  Percentages
BALANCE_AMOUNT_CAT                     
0-100                31314        50.34
101-200              15145        24.34
201-300              11645        18.72
301-400               2655         4.27
>400                  1452         2.33


In [43]:
# Partial payments (STATUS == 2) where at least 90% of the installment amount was paid.
partial_90percentpaid = df_lms.loc[
    df_lms["STATUS"].eq(2)
    & df_lms["PAID_AMOUNT"].div(df_lms["INSTALLMENT_AMOUNT"]).mul(100).ge(90)
]

In [44]:
# Number of partial-payment rows with at least 90% of the installment paid.
len(partial_90percentpaid)

48583

In [45]:
# Percent of partial payments (STATUS == 2) with at least 90% of the installment paid.
# Derived from the data instead of hand-copied counts so it stays correct on re-run.
len(partial_90percentpaid) / int(df_lms["STATUS"].eq(2).sum()) * 100

11.32053928856039

In [46]:
# Outstanding balance per row; assign() returns a new frame, so the slice of df_lms is not modified.
partial_90percentpaid = partial_90percentpaid.assign(
    BALANCE_AMOUNT=lambda subset: subset["INSTALLMENT_AMOUNT"] - subset["PAID_AMOUNT"]
)

In [47]:
# Largest outstanding balance in the 90% subset.
partial_90percentpaid["BALANCE_AMOUNT"].max()

np.float64(351.0)

In [48]:
# Same band edges and labels as used for the 85% subset.
bins = [0, 100, 200, 300, 400, np.inf]
labels = ['0-100', '101-200', '201-300', '301-400', '>400']

# Right-closed bands; include_lowest makes the first band also contain its lower edge.
partial_90percentpaid = partial_90percentpaid.assign(
    BALANCE_AMOUNT_CAT=lambda subset: pd.cut(
        subset['BALANCE_AMOUNT'],
        bins=bins,
        labels=labels,
        right=True,
        include_lowest=True,
    )
)

In [49]:
# Row counts per balance band in the 90% subset.
partial_90percentpaid["BALANCE_AMOUNT_CAT"].value_counts()

BALANCE_AMOUNT_CAT
0-100      30900
101-200    14704
201-300     2621
301-400      358
>400           0
Name: count, dtype: int64

In [50]:
# Sanity check: total rows across all balance bands. It equals len(partial_90percentpaid)
# when no balance falls outside the bins (value_counts() skips unbinned NaN rows).
# Summed from the data instead of hand-copied counts so the check cannot go stale.
int(partial_90percentpaid["BALANCE_AMOUNT_CAT"].value_counts().sum())

48583

In [51]:
# Counts and percentage share per balance band (90% subset), side by side.
counts = partial_90percentpaid["BALANCE_AMOUNT_CAT"].value_counts()
percentages = partial_90percentpaid["BALANCE_AMOUNT_CAT"].value_counts(normalize=True) * 100

# Start from the counts column and attach the shares; both are indexed by band.
combined_df = (
    counts.to_frame('Counts')
    .assign(Percentages=percentages)
    .round(2)  # percentages to 2 decimal places
)

print(combined_df)

                    Counts  Percentages
BALANCE_AMOUNT_CAT                     
0-100                30900        63.60
101-200              14704        30.27
201-300               2621         5.39
301-400                358         0.74
>400                     0         0.00


In [52]:
# Step 1: From each threshold cohort keep only the rows in the '0-100' balance band
df_85_cat_0_100 = partial_85percentpaid.loc[
    lambda cohort: cohort['BALANCE_AMOUNT_CAT'] == '0-100'
]
df_90_cat_0_100 = partial_90percentpaid.loc[
    lambda cohort: cohort['BALANCE_AMOUNT_CAT'] == '0-100'
]

In [53]:
# Step 2: Left-join the 85% rows to the 90% rows on (LOAN_ID, INSTALLMENT_NO).
# indicator=True adds a '_merge' column recording which side(s) each row came from.
merge_keys = ['LOAN_ID', 'INSTALLMENT_NO']
merged_df = df_85_cat_0_100.merge(
    df_90_cat_0_100[merge_keys],
    how='left',
    on=merge_keys,
    indicator=True,
)

In [54]:
# Step 3: Keep the rows found only on the left side (85% cohort), then drop the helper column
records_unique_to_85 = (
    merged_df
    .loc[lambda joined: joined['_merge'] == 'left_only']
    .drop(columns=['_merge'])
)

In [55]:
# Rows in the 85% cohort's '0-100' band that are absent from the 90% cohort's '0-100' band.
records_unique_to_85

Unnamed: 0,LOAN_ID,INSTALLMENT_NO,DUE_DATE,INSTALLMENT_AMOUNT,INTEREST_AMOUNT,STATUS,PAID_DT,LOAN_DATE,LOAN_AMOUNT,TENURE,INTEREST_RATE,PRINCIPAL_AMOUNT,PAID_AMOUNT,DAYS_LATE,BALANCE_AMOUNT,BALANCE_AMOUNT_CAT
589,1025UNSL004561,58,2025-02-17,503,35,2,2025-05-19 11:01:15,2023-05-03,40000,2,25.0,468,448.0,91.0,55.0,0-100
1352,1069UNSL003706,6,2023-11-17,650,210,2,2025-03-28 21:08:37,2023-05-31,13000,2,25.0,440,563.0,497.0,87.0,0-100
1384,1071UNSL003646,6,2023-10-13,600,193,2,2024-11-12 20:49:42,2023-04-28,12000,2,25.0,407,518.0,396.0,82.0,0-100
1735,1082UNSL003626,6,2023-10-13,700,226,2,2024-11-12 21:07:54,2023-04-28,14000,2,25.0,474,603.0,396.0,97.0,0-100
1939,1088UNSL003319,17,2024-05-24,501,128,2,2024-08-17 10:44:47,2023-06-29,40000,2,25.0,373,444.0,85.0,57.0,0-100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30481,3017UNSL000209,26,2025-02-20,793,15,2,2025-04-27 12:45:00,2023-02-20,40000,2,25.0,778,698.0,66.0,95.0,0-100
30971,4144UNSL000663,34,2025-05-05,501,40,2,2025-05-10 10:34:36,2023-08-26,40000,2,25.0,461,450.0,5.0,51.0,0-100
30974,4144UNSL000687,34,2025-05-12,501,40,2,2025-05-24 16:03:53,2023-09-02,40000,2,25.0,461,432.0,12.0,69.0,0-100
30983,4144UNSL000949,24,2025-03-03,503,75,2,2025-03-24 12:49:56,2023-10-30,40000,2,25.0,428,445.0,21.0,58.0,0-100


We will move forward by treating partial payments that cover at least 90% of the installment amount as paid.

In [56]:
# To identify weekly vs. monthly repayment schedules, order each loan's rows by due date
# (the existing index labels are kept, not renumbered).
df_lms = df_lms.sort_values(by=['LOAN_ID', 'DUE_DATE'])

In [57]:
# Whole days between consecutive due dates of the same loan, in current row order
# (NaN on each loan's first row, which has no predecessor).
df_lms['DAYS_BETWEEN_DUE_DATES'] = (
    df_lms['DUE_DATE'].groupby(df_lms['LOAN_ID']).diff().dt.days
)

In [58]:
# Frequency of each gap length (in days) between consecutive due dates.
df_lms["DAYS_BETWEEN_DUE_DATES"].value_counts()

DAYS_BETWEEN_DUE_DATES
28.0    24729287
7.0      3622379
56.0           2
14.0           1
Name: count, dtype: int64

In [59]:
# Label every row by the gap to the loan's previous due date. np.select uses the first
# matching condition; rows matching none are labelled 'Other'.
due_gap_days = df_lms['DAYS_BETWEEN_DUE_DATES']
df_lms['REPAYMENT_SCHEDULE_CAT'] = np.select(
    condlist=[
        due_gap_days.isna(),                  # first installment of a loan: no previous due date
        due_gap_days.isin([28, 29, 30, 31]),
        due_gap_days.eq(7),
        due_gap_days.eq(14),
        due_gap_days.eq(56),
    ],
    choicelist=[
        'Initial',
        'Monthly',
        'Weekly',
        'Bi-Weekly',
        'Bi-Monthly',
    ],
    default='Other',
)

In [60]:
# Row counts per derived schedule category.
df_lms["REPAYMENT_SCHEDULE_CAT"].value_counts()

REPAYMENT_SCHEDULE_CAT
Monthly       24729287
Weekly         3622379
Initial        1136897
Bi-Monthly           2
Bi-Weekly            1
Name: count, dtype: int64

In [61]:
# Same check as a few cells earlier: gap-length frequencies, for comparison with the category counts.
df_lms["DAYS_BETWEEN_DUE_DATES"].value_counts()

DAYS_BETWEEN_DUE_DATES
28.0    24729287
7.0      3622379
56.0           2
14.0           1
Name: count, dtype: int64

In [62]:
# Number of distinct loans.
df_lms["LOAN_ID"].nunique()

1136897

In [63]:
# Group by LOAN_ID and get the unique repayment schedules for each loan
unique_schedules = df_lms.groupby('LOAN_ID')['REPAYMENT_SCHEDULE_CAT'].unique()

# Find the LOAN_IDs where both 'Weekly' and 'Monthly' are present
mixed_schedule_loans = unique_schedules[
    unique_schedules.apply(lambda x: 'Weekly' in x and 'Monthly' in x)
]

# Full list of matching LOAN_IDs (kept in the variable for later use)
mixed_schedule_loan_ids = mixed_schedule_loans.index.tolist()

print(f"Number of loans with both weekly and monthly schedules: {len(mixed_schedule_loan_ids)}")
# Print only a short preview: dumping every ID floods the notebook output
# (tens of thousands of entries) without adding information.
preview_ids = mixed_schedule_loan_ids[:10]
print(f"\nFirst {len(preview_ids)} such LOAN_IDs:")
print(preview_ids)

Number of loans with both weekly and monthly schedules: 70414

List of such LOAN_IDs:
['1020UNSL004107', '1020UNSL004108', '1020UNSL004126', '1020UNSL004142', '1020UNSL004144', '1020UNSL004151', '1020UNSL004152', '1020UNSL004154', '1020UNSL004165', '1020UNSL004171', '1020UNSL004195', '1020UNSL004267', '1020UNSL004290', '1020UNSL004317', '1020UNSL004350', '1020UNSL004390', '1020UNSL004391', '1020UNSL004395', '1020UNSL004396', '1020UNSL004400', '1020UNSL004435', '1020UNSL004461', '1020UNSL004463', '1020UNSL004467', '1020UNSL004471', '1020UNSL004487', '1020UNSL004499', '1020UNSL004515', '1020UNSL004544', '1020UNSL004561', '1020UNSL004642', '1020UNSL004645', '1020UNSL004689', '1020UNSL004717', '1020UNSL004731', '1020UNSL004732', '1020UNSL004751', '1020UNSL004752', '1020UNSL004761', '1020UNSL004762', '1020UNSL004763', '1020UNSL004766', '1020UNSL004792', '1020UNSL004794', '1020UNSL004800', '1020UNSL004801', '1020UNSL004805', '1020UNSL004815', '1020UNSL004820', '1020UNSL004831', '1020UNSL0048

In [64]:
# Column names after adding the due-date gap and schedule category.
df_lms.columns

Index(['LOAN_ID', 'INSTALLMENT_NO', 'DUE_DATE', 'INSTALLMENT_AMOUNT',
       'INTEREST_AMOUNT', 'STATUS', 'PAID_DT', 'LOAN_DATE', 'LOAN_AMOUNT',
       'TENURE', 'INTEREST_RATE', 'PRINCIPAL_AMOUNT', 'PAID_AMOUNT',
       'DAYS_LATE', 'DAYS_BETWEEN_DUE_DATES', 'REPAYMENT_SCHEDULE_CAT'],
      dtype='object')

In [65]:
# Inspect all installments of one loan ID taken from the mixed weekly/monthly list.
df_lms[df_lms["LOAN_ID"]=="1020UNSL004144"]

Unnamed: 0,LOAN_ID,INSTALLMENT_NO,DUE_DATE,INSTALLMENT_AMOUNT,INTEREST_AMOUNT,STATUS,PAID_DT,LOAN_DATE,LOAN_AMOUNT,TENURE,INTEREST_RATE,PRINCIPAL_AMOUNT,PAID_AMOUNT,DAYS_LATE,DAYS_BETWEEN_DUE_DATES,REPAYMENT_SCHEDULE_CAT
290969,1020UNSL004144,1,2023-02-08,2000,886,0,2023-02-08 14:16:13,2023-01-07,40000,2,24.5,1114,2000.0,0.0,,Initial
290970,1020UNSL004144,2,2023-03-08,2000,731,0,2023-03-08 13:15:29,2023-01-07,40000,2,24.5,1269,2000.0,0.0,28.0,Monthly
290971,1020UNSL004144,3,2023-04-05,2000,707,0,2023-04-05 13:23:34,2023-01-07,40000,2,24.5,1293,2000.0,0.0,28.0,Monthly
290972,1020UNSL004144,4,2023-05-03,2000,683,0,2023-05-03 13:30:50,2023-01-07,40000,2,24.5,1317,2000.0,0.0,28.0,Monthly
290973,1020UNSL004144,5,2023-05-31,2000,658,0,2023-05-31 12:20:48,2023-01-07,40000,2,24.5,1342,2000.0,0.0,28.0,Monthly
290974,1020UNSL004144,6,2023-06-28,2000,633,0,2023-06-28 13:03:45,2023-01-07,40000,2,24.5,1367,2000.0,0.0,28.0,Monthly
290975,1020UNSL004144,7,2023-07-26,2000,607,0,2023-07-26 15:19:21,2023-01-07,40000,2,24.5,1393,2000.0,0.0,28.0,Monthly
290976,1020UNSL004144,8,2023-08-23,2000,581,0,2023-08-23 15:09:55,2023-01-07,40000,2,24.5,1419,2000.0,0.0,28.0,Monthly
290977,1020UNSL004144,9,2023-09-20,2000,554,0,2023-09-20 14:50:06,2023-01-07,40000,2,24.5,1446,2000.0,0.0,28.0,Monthly
290978,1020UNSL004144,10,2023-10-18,2000,527,0,2023-10-18 16:27:53,2023-01-07,40000,2,24.5,1473,2000.0,0.0,28.0,Monthly


In [66]:
# Weekly-gap rows that belong to loans with TENURE == 3.
df_lms.loc[df_lms["REPAYMENT_SCHEDULE_CAT"].eq("Weekly") & df_lms["TENURE"].eq(3)]

Unnamed: 0,LOAN_ID,INSTALLMENT_NO,DUE_DATE,INSTALLMENT_AMOUNT,INTEREST_AMOUNT,STATUS,PAID_DT,LOAN_DATE,LOAN_AMOUNT,TENURE,INTEREST_RATE,PRINCIPAL_AMOUNT,PAID_AMOUNT,DAYS_LATE,DAYS_BETWEEN_DUE_DATES,REPAYMENT_SCHEDULE_CAT
1068525,1054UNSL005634,7,2024-03-29,755,347,0,2024-04-08 20:01:23,2023-09-09,80000,3,25.0,408,755.0,10.0,7.0,Weekly
1068526,1054UNSL005634,8,2024-04-05,755,347,0,2024-04-08 20:01:23,2023-09-09,80000,3,25.0,408,755.0,3.0,7.0,Weekly
1068527,1054UNSL005634,9,2024-04-12,755,347,0,2024-04-12 12:12:02,2023-09-09,80000,3,25.0,408,755.0,0.0,7.0,Weekly
1068528,1054UNSL005634,10,2024-04-19,755,346,0,2024-04-17 16:05:17,2023-09-09,80000,3,25.0,409,755.0,-2.0,7.0,Weekly
1068529,1054UNSL005634,11,2024-04-26,755,339,0,2024-04-26 14:06:03,2023-09-09,80000,3,25.0,416,755.0,0.0,7.0,Weekly
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28594556,2955UNSL002539,39,2025-07-03,660,199,1,NaT,2023-12-23,70000,3,25.0,461,0.0,,7.0,Weekly
28594557,2955UNSL002539,40,2025-07-10,660,199,1,NaT,2023-12-23,70000,3,25.0,461,0.0,,7.0,Weekly
28594558,2955UNSL002539,41,2025-07-17,660,198,1,NaT,2023-12-23,70000,3,25.0,462,0.0,,7.0,Weekly
28594559,2955UNSL002539,42,2025-07-24,660,190,1,NaT,2023-12-23,70000,3,25.0,470,0.0,,7.0,Weekly


In [67]:
# Scratch note: hard-coded count; it equals the number of 'Weekly' rows reported by value_counts().
3622379

3622379

In [68]:
# Monthly-gap rows that belong to loans with TENURE == 3.
df_lms.loc[df_lms["REPAYMENT_SCHEDULE_CAT"].eq("Monthly") & df_lms["TENURE"].eq(3)]

Unnamed: 0,LOAN_ID,INSTALLMENT_NO,DUE_DATE,INSTALLMENT_AMOUNT,INTEREST_AMOUNT,STATUS,PAID_DT,LOAN_DATE,LOAN_AMOUNT,TENURE,INTEREST_RATE,PRINCIPAL_AMOUNT,PAID_AMOUNT,DAYS_LATE,DAYS_BETWEEN_DUE_DATES,REPAYMENT_SCHEDULE_CAT
9925,1001UNSL003980,2,2023-11-06,2640,1327,0,2023-11-04 21:42:12,2023-09-02,70000,3,25.0,1313,2640.0,-2.0,28.0,Monthly
9926,1001UNSL003980,3,2023-12-04,2640,1302,0,2023-12-06 08:35:14,2023-09-02,70000,3,25.0,1338,2640.0,2.0,28.0,Monthly
9927,1001UNSL003980,4,2024-01-01,2640,1276,0,2023-12-30 18:23:30,2023-09-02,70000,3,25.0,1364,2640.0,-2.0,28.0,Monthly
9928,1001UNSL003980,5,2024-01-29,2640,1250,0,2024-01-29 11:33:28,2023-09-02,70000,3,25.0,1390,2640.0,0.0,28.0,Monthly
9929,1001UNSL003980,6,2024-02-26,2640,1223,0,2024-02-26 11:10:32,2023-09-02,70000,3,25.0,1417,2640.0,0.0,28.0,Monthly
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29300402,4149UNSL001130,16,2025-03-21,2640,923,0,2025-03-21 11:39:46,2023-12-23,70000,3,25.0,1717,2640.0,0.0,28.0,Monthly
29300403,4149UNSL001130,17,2025-04-18,2640,890,0,2025-04-18 19:31:36,2023-12-23,70000,3,25.0,1750,2640.0,0.0,28.0,Monthly
29300404,4149UNSL001130,18,2025-05-16,2640,857,0,2025-05-16 14:13:05,2023-12-23,70000,3,25.0,1783,2640.0,0.0,28.0,Monthly
29300405,4149UNSL001130,19,2025-06-13,2640,822,0,2025-06-13 12:20:02,2023-12-23,70000,3,25.0,1818,2640.0,0.0,28.0,Monthly


In [69]:
# Step 1: Collect the distinct schedule categories seen on each loan
unique_schedules = df_lms.groupby('LOAN_ID')['REPAYMENT_SCHEDULE_CAT'].unique()


def _is_pure_weekly(categories):
    """Return True when 'Weekly' is the loan's only category, ignoring 'Initial'."""
    return set(categories) - {'Initial'} == {'Weekly'}


# Step 2: Keep the loans whose categories are exactly {'Weekly'} or {'Initial', 'Weekly'}
pure_weekly_loans = unique_schedules[unique_schedules.apply(_is_pure_weekly)]

# Plain Python list of the matching LOAN_IDs
pure_weekly_loan_ids = list(pure_weekly_loans.index)

print(f"Number of loans with a pure weekly schedule: {len(pure_weekly_loan_ids)}")
# print("\nFirst 5 loan IDs with only weekly schedules:")
# print(pure_weekly_loan_ids[:5])

Number of loans with a pure weekly schedule: 70


In [70]:
# Display the list of pure-weekly loan IDs.
pure_weekly_loan_ids

['1603UNSL008036',
 '1603UNSL008041',
 '1603UNSL008047',
 '1603UNSL008048',
 '1603UNSL008049',
 '1603UNSL008050',
 '1603UNSL008051',
 '1603UNSL008053',
 '1603UNSL008062',
 '1603UNSL008063',
 '1603UNSL008064',
 '1603UNSL008065',
 '1603UNSL008066',
 '1603UNSL008070',
 '1603UNSL008071',
 '1603UNSL008072',
 '1603UNSL008074',
 '1603UNSL008075',
 '1603UNSL008076',
 '1603UNSL008077',
 '1603UNSL008078',
 '1603UNSL008086',
 '1603UNSL008087',
 '1603UNSL008089',
 '1603UNSL008092',
 '1603UNSL008093',
 '1603UNSL008094',
 '1603UNSL008101',
 '1603UNSL008102',
 '1603UNSL008103',
 '1603UNSL008104',
 '1603UNSL008105',
 '1603UNSL008106',
 '1603UNSL008108',
 '1603UNSL008109',
 '1603UNSL008110',
 '1606UNSL006450',
 '1606UNSL006451',
 '1606UNSL006452',
 '1606UNSL006453',
 '1606UNSL006454',
 '1606UNSL006458',
 '1606UNSL006459',
 '1606UNSL006460',
 '1606UNSL006461',
 '1606UNSL006462',
 '1606UNSL006463',
 '1606UNSL006464',
 '1606UNSL006465',
 '1606UNSL006466',
 '1606UNSL006467',
 '1606UNSL006468',
 '1606UNSL00

In [71]:
# Inspect all installments of a single loan ID.
df_lms[df_lms["LOAN_ID"]=="1610UNSL003260"]

Unnamed: 0,LOAN_ID,INSTALLMENT_NO,DUE_DATE,INSTALLMENT_AMOUNT,INTEREST_AMOUNT,STATUS,PAID_DT,LOAN_DATE,LOAN_AMOUNT,TENURE,INTEREST_RATE,PRINCIPAL_AMOUNT,PAID_AMOUNT,DAYS_LATE,DAYS_BETWEEN_DUE_DATES,REPAYMENT_SCHEDULE_CAT
13010380,1610UNSL003260,1,2024-02-07,503,288,0,2023-12-20 00:00:00,2023-12-14,40000,2,25.0,215,503.0,-49.0,,Initial
13010381,1610UNSL003260,2,2024-02-14,503,288,0,2023-12-20 00:00:00,2023-12-14,40000,2,25.0,215,503.0,-56.0,7.0,Weekly
13010382,1610UNSL003260,3,2024-02-21,503,288,0,2023-12-20 00:00:00,2023-12-14,40000,2,25.0,215,503.0,-63.0,7.0,Weekly
13010383,1610UNSL003260,4,2024-02-28,501,287,0,2023-12-20 00:00:00,2023-12-14,40000,2,25.0,214,501.0,-70.0,7.0,Weekly
13010384,1610UNSL003260,5,2024-03-06,503,188,0,2024-06-30 16:14:07,2023-12-14,40000,2,25.0,315,503.0,116.0,7.0,Weekly
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13010453,1610UNSL003260,74,2025-07-02,503,68,1,NaT,2023-12-14,40000,2,25.0,435,0.0,,7.0,Weekly
13010454,1610UNSL003260,75,2025-07-09,503,68,1,NaT,2023-12-14,40000,2,25.0,435,0.0,,7.0,Weekly
13010455,1610UNSL003260,76,2025-07-16,501,67,1,NaT,2023-12-14,40000,2,25.0,434,0.0,,7.0,Weekly
13010456,1610UNSL003260,77,2025-07-23,503,60,1,NaT,2023-12-14,40000,2,25.0,443,0.0,,7.0,Weekly


In [72]:
# Column names currently present in df_lms.
df_lms.columns

Index(['LOAN_ID', 'INSTALLMENT_NO', 'DUE_DATE', 'INSTALLMENT_AMOUNT',
       'INTEREST_AMOUNT', 'STATUS', 'PAID_DT', 'LOAN_DATE', 'LOAN_AMOUNT',
       'TENURE', 'INTEREST_RATE', 'PRINCIPAL_AMOUNT', 'PAID_AMOUNT',
       'DAYS_LATE', 'DAYS_BETWEEN_DUE_DATES', 'REPAYMENT_SCHEDULE_CAT'],
      dtype='object')

In [73]:
print("Starting vectorized computation for loan schedule type")

print("Determining loan schedule type (Hybrid/Weekly/Monthly)...")

loan_ids = df_lms['LOAN_ID']
schedule_cat = df_lms['REPAYMENT_SCHEDULE_CAT']

# Row-aligned flags: does the row's loan contain at least one Weekly / Monthly gap?
# A boolean comparison plus the built-in 'any' kernel avoids calling a Python lambda
# once per loan, which is what made the earlier version slow.
has_weekly = schedule_cat.eq('Weekly').groupby(loan_ids).transform('any')
has_monthly = schedule_cat.eq('Monthly').groupby(loan_ids).transform('any')

# A loan is Hybrid when it has both weekly and monthly gaps
is_hybrid = has_weekly & has_monthly

# Fallback for non-hybrid loans: the most frequent category per loan, from one
# grouped count instead of Series.mode() per loan.
category_counts = (
    df_lms.groupby(['LOAN_ID', 'REPAYMENT_SCHEDULE_CAT'], observed=True)
    .size()
    .reset_index(name='ROW_COUNT')
)
# 'Initial' only marks a loan's first installment, so rank it last: it is chosen only
# when it is the loan's sole category. Otherwise a very short loan (e.g. one 'Initial'
# row and one 'Monthly' row) would tie and be labelled 'Initial'.
category_counts['IS_INITIAL'] = category_counts['REPAYMENT_SCHEDULE_CAT'].eq('Initial')
dominant_schedule = (
    category_counts
    .sort_values(
        ['LOAN_ID', 'IS_INITIAL', 'ROW_COUNT', 'REPAYMENT_SCHEDULE_CAT'],
        # highest count first; remaining ties resolved alphabetically, like mode()[0]
        ascending=[True, True, False, True],
    )
    .drop_duplicates('LOAN_ID')
    .set_index('LOAN_ID')['REPAYMENT_SCHEDULE_CAT']
)
# Broadcast the per-loan result back onto every installment row
mode_schedule = loan_ids.map(dominant_schedule)

# Hybrid wins over the dominant category
df_lms['LOAN_SCHEDULE_TYPE'] = np.where(
    is_hybrid,
    'Hybrid',
    mode_schedule
)

print("Vectorized computation complete. Final distribution:")
print(df_lms['LOAN_SCHEDULE_TYPE'].value_counts())

Starting vectorized computation for loan schedule type
Determining loan schedule type (Hybrid/Weekly/Monthly)...
Vectorized computation complete. Final distribution:
LOAN_SCHEDULE_TYPE
Monthly    25004751
Hybrid      4478267
Weekly         5548
Name: count, dtype: int64


In [74]:
# Sanity check: total rows across all schedule types. It equals the row count of df_lms
# when no row is missing a type (value_counts() skips NaN).
# Summed from the data instead of hand-copied counts so the check cannot go stale.
int(df_lms['LOAN_SCHEDULE_TYPE'].value_counts().sum())

29488566

In [75]:
# Column names after adding LOAN_SCHEDULE_TYPE.
df_lms.columns

Index(['LOAN_ID', 'INSTALLMENT_NO', 'DUE_DATE', 'INSTALLMENT_AMOUNT',
       'INTEREST_AMOUNT', 'STATUS', 'PAID_DT', 'LOAN_DATE', 'LOAN_AMOUNT',
       'TENURE', 'INTEREST_RATE', 'PRINCIPAL_AMOUNT', 'PAID_AMOUNT',
       'DAYS_LATE', 'DAYS_BETWEEN_DUE_DATES', 'REPAYMENT_SCHEDULE_CAT',
       'LOAN_SCHEDULE_TYPE'],
      dtype='object')

In [76]:
# (rows, columns) after adding LOAN_SCHEDULE_TYPE.
df_lms.shape

(29488566, 17)

In [77]:
# Null count per column before the Step 2 feature engineering.
df_lms.isna().sum()

LOAN_ID                         0
INSTALLMENT_NO                  0
DUE_DATE                        0
INSTALLMENT_AMOUNT              0
INTEREST_AMOUNT                 0
STATUS                          0
PAID_DT                   8882206
LOAN_DATE                       0
LOAN_AMOUNT                     0
TENURE                          0
INTEREST_RATE                   0
PRINCIPAL_AMOUNT                0
PAID_AMOUNT                     0
DAYS_LATE                 8882206
DAYS_BETWEEN_DUE_DATES    1136897
REPAYMENT_SCHEDULE_CAT          0
LOAN_SCHEDULE_TYPE              0
dtype: int64

In [78]:
# Quick range check: the largest LOAN_AMOUNT in the data.
df_lms["LOAN_AMOUNT"].max()

np.int64(80000)

In [79]:
# --- Step 2: Feature Engineering & Target Variable Creation ---
print("\n--- Step 2: Creating Final Features and Target Variable ---")

# Define constants
# GRACE_PERIOD_DAYS repeats the value already set in the notebook's constants
# cell: DAYS_LATE up to this many days still counts as 'Paid' when labelling.
GRACE_PERIOD_DAYS = 2
# Minimum PAID_AMOUNT / INSTALLMENT_AMOUNT for a STATUS == 2 EMI to be
# labelled 'Paid' in the label conditions below.
PAID_PERCENTAGE_THRESHOLD = 0.90 # 90%


--- Step 2: Creating Final Features and Target Variable ---


In [80]:
# --- CORE FEATURE ENGINEERING: DAYS_LATE and IS_UNPAID ---
# Binary unpaid indicator: 1 where STATUS == 1, 0 for every other status.
df_lms['IS_UNPAID'] = df_lms['STATUS'].eq(1).astype(int)

In [81]:
# 0/1 indicator of rows whose DAYS_LATE is null. This must run before the
# nulls are filled with 0, otherwise the information is lost.
df_lms['IS_DAYS_LATE_MISSING'] = df_lms['DAYS_LATE'].isna().astype(int)

In [82]:
# 2. Impute the null DAYS_LATE values with 0.
# After this fill a 0 no longer distinguishes "on time" from "no payment date":
# the IS_UNPAID and IS_DAYS_LATE_MISSING flags carry that distinction.
# NOTE(review): every feature derived from DAYS_LATE after this point
# (DELTA_DAYS_LATE, COMPOSITE_RISK) sees the imputed 0 — confirm that is intended.
df_lms['DAYS_LATE'] = df_lms['DAYS_LATE'].fillna(0)

In [83]:
# Re-check nulls: DAYS_LATE should now report 0 missing values.
df_lms.isna().sum()

LOAN_ID                         0
INSTALLMENT_NO                  0
DUE_DATE                        0
INSTALLMENT_AMOUNT              0
INTEREST_AMOUNT                 0
STATUS                          0
PAID_DT                   8882206
LOAN_DATE                       0
LOAN_AMOUNT                     0
TENURE                          0
INTEREST_RATE                   0
PRINCIPAL_AMOUNT                0
PAID_AMOUNT                     0
DAYS_LATE                       0
DAYS_BETWEEN_DUE_DATES    1136897
REPAYMENT_SCHEDULE_CAT          0
LOAN_SCHEDULE_TYPE              0
IS_UNPAID                       0
IS_DAYS_LATE_MISSING            0
dtype: int64

In [84]:
# NEW HIGH-VALUE SEQUENTIAL FEATURES
# Feature 1: PAID_RATIO (Payment Magnitude)
# Share of the installment that was paid, capped at 1.0 so overpayments do not
# exceed "fully paid". A positive amount over a zero installment yields inf,
# which the cap also maps to 1.0; 0/0 stays NaN.
df_lms['PAID_RATIO'] = (
    df_lms['PAID_AMOUNT'] / df_lms['INSTALLMENT_AMOUNT']
).clip(upper=1.0)

# Feature 2: DELTA_DAYS_LATE (Lateness Momentum)
# Change in DAYS_LATE versus the previous installment of the same loan.
# groupby().diff() follows the current row order, so the order is made explicit
# here instead of relying on a sort done by some other cell. Only the three
# needed columns are sorted to keep the temporary copy small, and the result is
# aligned back to df_lms by index on assignment.
days_late_in_sequence = df_lms[['LOAN_ID', 'INSTALLMENT_NO', 'DAYS_LATE']].sort_values(
    by=['LOAN_ID', 'INSTALLMENT_NO'], kind='stable'
)
# The first installment of each loan has no predecessor (diff is NaN) -> 0.
df_lms['DELTA_DAYS_LATE'] = (
    days_late_in_sequence.groupby('LOAN_ID')['DAYS_LATE'].diff().fillna(0)
)

In [85]:
# Building blocks for the Paid / Not Paid label of the current EMI.
status = df_lms['STATUS']
past_grace = df_lms['DAYS_LATE'] > GRACE_PERIOD_DAYS
within_grace = df_lms['DAYS_LATE'] <= GRACE_PERIOD_DAYS
paid_fraction = df_lms['PAID_AMOUNT'] / df_lms['INSTALLMENT_AMOUNT']
underpaid = paid_fraction < PAID_PERCENTAGE_THRESHOLD
sufficiently_paid = paid_fraction >= PAID_PERCENTAGE_THRESHOLD

# Conditions are evaluated in order by np.select; the first match wins.
conditions = [
    # 'Not Paid' (1): STATUS 1, or STATUS 0 paid after the grace period,
    # or STATUS 2 that is either past the grace period or below the threshold.
    (status == 1)
    | ((status == 0) & past_grace)
    | ((status == 2) & (past_grace | underpaid)),

    # 'Paid' (0): STATUS 0 within the grace period.
    (status == 0) & within_grace,

    # 'Paid' (0): STATUS 2 within the grace period and at/above the threshold.
    (status == 2) & within_grace & sufficiently_paid,
]

# Label assigned for each condition above, in the same order.
choices = [1, 0, 0]  # Not Paid, Paid, Paid

In [86]:
# Apply the conditions to create the new feature.
# Rows that match none of the conditions receive the sentinel -1.
df_lms['CURRENT_EMI_BEHAVIOR_LABEL'] = np.select(conditions, choices, default=-1)

# Fail fast on unlabelled rows: left alone, the -1 sentinel would be cast to
# int8 and shifted into NEXT_EMI_LABEL as if it were a real class.
unlabelled_rows = int((df_lms['CURRENT_EMI_BEHAVIOR_LABEL'] == -1).sum())
if unlabelled_rows:
    raise ValueError(
        f"{unlabelled_rows} rows matched no labelling condition "
        "(unexpected STATUS value or undefined PAID_AMOUNT / INSTALLMENT_AMOUNT ratio)."
    )

In [87]:
# Class balance of the current-EMI label; a -1 entry would mean unlabelled rows.
df_lms["CURRENT_EMI_BEHAVIOR_LABEL"].value_counts()

CURRENT_EMI_BEHAVIOR_LABEL
0    19235740
1    10252826
Name: count, dtype: int64

In [88]:
# Sanity check: the two label counts should add up to the total row count.
19235740+10252826

29488566

In [89]:
# (rows, columns) after adding the label and the new sequential features.
df_lms.shape

(29488566, 22)

In [90]:
# Share of rows labelled 1 ('Not Paid'), using the counts printed above.
10252826/(10252826+19235740)

0.34768818531223256

In [91]:
# NOTE(review): EPSILON is not referenced by any of the surrounding cells —
# confirm it is used further down or remove it.
EPSILON = 0.001  # Small constant to prevent division by zero

In [92]:
# Null check after adding PAID_RATIO, DELTA_DAYS_LATE and the label.
df_lms.isna().sum()

LOAN_ID                             0
INSTALLMENT_NO                      0
DUE_DATE                            0
INSTALLMENT_AMOUNT                  0
INTEREST_AMOUNT                     0
STATUS                              0
PAID_DT                       8882206
LOAN_DATE                           0
LOAN_AMOUNT                         0
TENURE                              0
INTEREST_RATE                       0
PRINCIPAL_AMOUNT                    0
PAID_AMOUNT                         0
DAYS_LATE                           0
DAYS_BETWEEN_DUE_DATES        1136897
REPAYMENT_SCHEDULE_CAT              0
LOAN_SCHEDULE_TYPE                  0
IS_UNPAID                           0
IS_DAYS_LATE_MISSING                0
PAID_RATIO                          0
DELTA_DAYS_LATE                     0
CURRENT_EMI_BEHAVIOR_LABEL          0
dtype: int64

In [93]:
# --- PAYMENT SCORE LOGIC ---
# Four ordered masks that drive both PAYMENT_SCORE and PAYMENT_SCORE_RANK.
# np.select takes the first mask that is True for a row.
unpaid = df_lms['IS_UNPAID'] == 1
not_unpaid = df_lms['IS_UNPAID'] == 0
labelled_not_paid = df_lms['CURRENT_EMI_BEHAVIOR_LABEL'] == 1
labelled_paid = df_lms['CURRENT_EMI_BEHAVIOR_LABEL'] == 0
days_late = df_lms['DAYS_LATE']

conditions_score = [
    # 1. OUTRIGHT DEFAULT / NOT PAID: flagged by IS_UNPAID.
    unpaid,

    # 2. HIGH-RISK BUT NOT UNPAID: labelled 1 without being unpaid, i.e. paid
    #    after the grace period or (STATUS 2) paid below the threshold —
    #    this includes early but heavily under-paid EMIs.
    labelled_not_paid & not_unpaid,

    # 3. SLIGHTLY LATE, LOW RISK: labelled 0 with 0 < DAYS_LATE <= GRACE_PERIOD_DAYS.
    labelled_paid & (days_late > 0) & (days_late <= GRACE_PERIOD_DAYS) & not_unpaid,

    # 4. ON-TIME or EARLY, LOW RISK: labelled 0 with DAYS_LATE <= 0.
    labelled_paid & (days_late <= 0) & not_unpaid,
]

In [94]:
# Inspect EMIs paid 2 days early that are still labelled 1 without being unpaid.
early_but_flagged = (
    df_lms["DAYS_LATE"].eq(-2)
    & df_lms['CURRENT_EMI_BEHAVIOR_LABEL'].eq(1)
    & df_lms['IS_UNPAID'].eq(0)
)
df_lms.loc[early_but_flagged]


Unnamed: 0,LOAN_ID,INSTALLMENT_NO,DUE_DATE,INSTALLMENT_AMOUNT,INTEREST_AMOUNT,STATUS,PAID_DT,LOAN_DATE,LOAN_AMOUNT,TENURE,...,PAID_AMOUNT,DAYS_LATE,DAYS_BETWEEN_DUE_DATES,REPAYMENT_SCHEDULE_CAT,LOAN_SCHEDULE_TYPE,IS_UNPAID,IS_DAYS_LATE_MISSING,PAID_RATIO,DELTA_DAYS_LATE,CURRENT_EMI_BEHAVIOR_LABEL
39695,1002UNSL002890,19,2025-06-02,3020,936,2,2025-05-31 00:00:00,2023-12-14,80000,3,...,8.35,-2.0,28.0,Monthly,Monthly,0,0,0.002765,-2.0,1
151324,1010UNSL001587,25,2025-03-24,2010,57,2,2025-03-22 00:00:00,2023-04-15,40000,2,...,40.00,-2.0,28.0,Monthly,Monthly,0,0,0.019900,-2.0,1
581695,1032UNSL005517,18,2025-06-02,2010,310,2,2025-05-31 00:00:00,2023-12-23,40000,2,...,3.00,-2.0,28.0,Monthly,Monthly,0,0,0.001493,-2.0,1
604643,1033UNSL006529,17,2025-03-24,3010,516,2,2025-03-22 00:00:00,2023-11-10,60000,2,...,7.00,-2.0,28.0,Monthly,Monthly,0,0,0.002326,-2.0,1
700603,1037UNSL005600,20,2025-06-02,2010,248,2,2025-05-31 10:28:00,2023-10-24,40000,2,...,0.04,-2.0,28.0,Monthly,Monthly,0,0,0.000020,-28.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29074626,4105UNSL000192,26,2025-06-02,1059,20,2,2025-05-31 00:00:00,2023-05-27,40000,2,...,3.00,-2.0,28.0,Monthly,Monthly,0,0,0.002833,-2.0,1
29074652,4105UNSL000194,26,2025-06-02,1059,20,2,2025-05-31 00:00:00,2023-05-27,40000,2,...,5.00,-2.0,28.0,Monthly,Monthly,0,0,0.004721,-2.0,1
29438445,4171UNSL000181,24,2025-03-24,2010,108,2,2025-03-22 00:00:00,2023-04-25,40000,2,...,8.00,-2.0,28.0,Monthly,Monthly,0,0,0.003980,-7.0,1
29438549,4171UNSL000196,24,2025-03-24,2010,103,2,2025-03-22 00:00:00,2023-05-02,40000,2,...,8.00,-2.0,28.0,Monthly,Monthly,0,0,0.003980,-7.0,1


In [95]:
# Fraction of the EMI still outstanding (0 when PAID_RATIO hit its 1.0 cap).
remaining_share = 1 - df_lms["PAID_RATIO"]
df_lms['REMAINING_EMI_RATIO'] = remaining_share

# Composite risk = lateness in days plus the outstanding fraction weighted by 10,
# so a fully unpaid EMI adds the equivalent of 10 days late.
df_lms['COMPOSITE_RISK'] = df_lms['DAYS_LATE'] + (remaining_share * 10)

In [96]:
# Score values, one entry per mask in `conditions_score` (same order).
# Intended ordering of the bands:
#   default (-100)  <  high-risk [0.0, 0.30]  <  slightly late [0.33, ...)  <  on-time (1.5+)

# Cap for the high-risk band and floor for the slightly-late band. Keeping the
# floor above the cap is what guarantees the two bands never overlap.
HIGH_RISK_SCORE_CAP = 0.30
SLIGHTLY_LATE_SCORE_FLOOR = 0.33

choices_score = [
    -100, # Choice 1: Default

    # Choice 2: HIGH-RISK (LABEL=1, not unpaid). Score must be LOW.
    # Linear in COMPOSITE_RISK, clamped to [0.0, HIGH_RISK_SCORE_CAP]; anything
    # with COMPOSITE_RISK >= 10 bottoms out at 0.0.
    np.maximum(
        0.0,
        np.minimum(
            HIGH_RISK_SCORE_CAP,
            HIGH_RISK_SCORE_CAP - (df_lms['COMPOSITE_RISK'] * 0.03)
        )
    ),

    # Choice 3: SLIGHTLY LATE LOW-RISK (LABEL=0). Score is MID-RANGE.
    # 1 / (1 + COMPOSITE_RISK) alone is only >= 0.33 when nothing is outstanding:
    # e.g. DAYS_LATE = 2 with 10% of the EMI remaining gives 1 / (1 + 3) = 0.25,
    # which would fall below the high-risk cap. The floor restores the ordering.
    np.maximum(
        SLIGHTLY_LATE_SCORE_FLOOR,
        1.0 / (1 + df_lms['COMPOSITE_RISK'])
    ),

    # Choice 4: ON-TIME or EARLY LOW-RISK (LABEL=0). Score is HIGH (1.5+).
    # NOTE(review): this term is unbounded — a very early PAID_DT produces a very
    # large score (the recorded run peaks at 97.8). Consider capping it.
    1.5 + (np.abs(df_lms['DAYS_LATE']) / 10)
]

In [97]:
# Pick, for every row, the score of the first matching condition in
# conditions_score. Rows matching none get 0, which is indistinguishable from a
# legitimate 0.0 produced by Choice 2.
df_lms['PAYMENT_SCORE'] = np.select(conditions_score, choices_score, default=0)

# Intended ordering of the score bands:
# Max Score(Choice 2) <= 0.30
# Min Score(Choice 3) >= 0.33 (1 / (1 + 2))
# NOTE(review): the second bound assumes COMPOSITE_RISK <= 2 for Choice 3 rows,
# i.e. that nothing of the EMI is left outstanding — verify against the data.

# CAPPING REMAINS AS SAFETY NET
# df_lms['PAYMENT_SCORE'] = df_lms['PAYMENT_SCORE'].replace([np.inf, -np.inf], [9999.0, -9999.0])

In [98]:
# Largest PAYMENT_SCORE; only the on-time/early formula (1.5 + |DAYS_LATE| / 10)
# can exceed 1.5.
df_lms["PAYMENT_SCORE"].max()

np.float64(97.8)

In [99]:
# Rank for each entry of conditions_score, in order (1 = best, 4 = worst):
#   4 -> outright default
#   3 -> high-risk, not unpaid
#   2 -> slightly late, low-risk
#   1 -> on-time or early, low-risk
rank_choices = [4, 3, 2, 1]

In [100]:
# Rank of the first matching score condition; a row matching none gets 0.
df_lms['PAYMENT_SCORE_RANK'] = np.select(
    condlist=conditions_score,
    choicelist=rank_choices,
    default=0,
)

In [101]:
# Rows per rank; a rank of 0 would mean a row matched none of the conditions.
df_lms["PAYMENT_SCORE_RANK"].value_counts()

PAYMENT_SCORE_RANK
1    18973419
4     8882206
3     1370620
2      262321
Name: count, dtype: int64

In [102]:
# Sanity check: the four rank counts should add up to the total row count.
18973419+8882206+1370620+262321

29488566

In [103]:
# (rows, columns) after adding the score and rank columns.
df_lms.shape

(29488566, 26)

In [104]:
# Inspect the rows whose PAYMENT_SCORE is exactly 0.
zero_score = df_lms["PAYMENT_SCORE"].eq(0)
df_lms.loc[
    zero_score,
    ["LOAN_ID", "DAYS_LATE", "REMAINING_EMI_RATIO", "PAYMENT_SCORE", "PAYMENT_SCORE_RANK"],
]

Unnamed: 0,LOAN_ID,DAYS_LATE,REMAINING_EMI_RATIO,PAYMENT_SCORE,PAYMENT_SCORE_RANK
44,1001UNSL002803,18.0,0.000000,0.0,3
48,1001UNSL002803,11.0,0.000000,0.0,3
72,1001UNSL002817,204.0,0.985060,0.0,3
83,1001UNSL002818,56.0,0.000000,0.0,3
84,1001UNSL002818,84.0,0.000000,0.0,3
...,...,...,...,...,...
29485059,4209UNSL000644,59.0,0.000000,0.0,3
29485083,4209UNSL000645,39.0,0.000000,0.0,3
29485084,4209UNSL000645,11.0,0.000000,0.0,3
29488423,4209UNSL000961,45.0,0.997015,0.0,3


In [105]:
# Rows with a negative PAYMENT_SCORE (only the -100 default score is negative).
negative_score = df_lms["PAYMENT_SCORE"].lt(0)
default_records = df_lms.loc[negative_score]

In [106]:
# Every negative-score row is expected to carry rank 4.
default_records["PAYMENT_SCORE_RANK"].value_counts()

PAYMENT_SCORE_RANK
4    8882206
Name: count, dtype: int64

In [107]:
# Every negative-score row is expected to hold exactly -100.
default_records["PAYMENT_SCORE"].value_counts()

PAYMENT_SCORE
-100.0    8882206
Name: count, dtype: int64

In [108]:
# # Create the final rolling window feature (RECENT_PAYMENT_SCORE)
# # This block is removed to prevent data leakage. It will be re-implemented in 3_hybrid_data_prep.ipynb.
# ROLLING_WINDOW_SIZE = 3
# df_lms['RECENT_PAYMENT_SCORE'] = df_lms.groupby('LOAN_ID')['PAYMENT_SCORE'].rolling(window=ROLLING_WINDOW_SIZE, min_periods=1).mean().reset_index(level=0, drop=True)
# df_lms['RECENT_PAYMENT_SCORE'] = df_lms.groupby('LOAN_ID')['RECENT_PAYMENT_SCORE'].shift(1)

In [109]:
# Null check after adding REMAINING_EMI_RATIO, COMPOSITE_RISK and the score columns.
df_lms.isna().sum()

LOAN_ID                             0
INSTALLMENT_NO                      0
DUE_DATE                            0
INSTALLMENT_AMOUNT                  0
INTEREST_AMOUNT                     0
STATUS                              0
PAID_DT                       8882206
LOAN_DATE                           0
LOAN_AMOUNT                         0
TENURE                              0
INTEREST_RATE                       0
PRINCIPAL_AMOUNT                    0
PAID_AMOUNT                         0
DAYS_LATE                           0
DAYS_BETWEEN_DUE_DATES        1136897
REPAYMENT_SCHEDULE_CAT              0
LOAN_SCHEDULE_TYPE                  0
IS_UNPAID                           0
IS_DAYS_LATE_MISSING                0
PAID_RATIO                          0
DELTA_DAYS_LATE                     0
CURRENT_EMI_BEHAVIOR_LABEL          0
REMAINING_EMI_RATIO                 0
COMPOSITE_RISK                      0
PAYMENT_SCORE                       0
PAYMENT_SCORE_RANK                  0
dtype: int64

In [110]:
#df_lms["RECENT_PAYMENT_SCORE"].max()

In [111]:
# Zero out the lateness momentum on unpaid rows (IS_UNPAID == 1).
# NOTE(review): null DAYS_LATE values were imputed to 0 earlier, so a diff that
# lands on such a row is not a real change in lateness — presumably the reason
# for this reset. The row *after* an unpaid EMI still carries a diff measured
# against the imputed 0; confirm that is intended.
df_lms.loc[df_lms['IS_UNPAID'] == 1, 'DELTA_DAYS_LATE'] = 0

In [112]:
# Null check after resetting DELTA_DAYS_LATE on unpaid rows.
df_lms.isna().sum()

LOAN_ID                             0
INSTALLMENT_NO                      0
DUE_DATE                            0
INSTALLMENT_AMOUNT                  0
INTEREST_AMOUNT                     0
STATUS                              0
PAID_DT                       8882206
LOAN_DATE                           0
LOAN_AMOUNT                         0
TENURE                              0
INTEREST_RATE                       0
PRINCIPAL_AMOUNT                    0
PAID_AMOUNT                         0
DAYS_LATE                           0
DAYS_BETWEEN_DUE_DATES        1136897
REPAYMENT_SCHEDULE_CAT              0
LOAN_SCHEDULE_TYPE                  0
IS_UNPAID                           0
IS_DAYS_LATE_MISSING                0
PAID_RATIO                          0
DELTA_DAYS_LATE                     0
CURRENT_EMI_BEHAVIOR_LABEL          0
REMAINING_EMI_RATIO                 0
COMPOSITE_RISK                      0
PAYMENT_SCORE                       0
PAYMENT_SCORE_RANK                  0
dtype: int64

In [113]:
#df_lms["RECENT_PAYMENT_SCORE"]=df_lms["RECENT_PAYMENT_SCORE"].fillna(0)

In [114]:
# Next-EMI target, part 1: put the rows in sequence and pull the label out as a
# plain NumPy array, so the shift can be done positionally rather than through
# a groupby.
print("Feature engineering updated successfully. Applying fastest vectorized shift via NumPy...")

# Order installments within each loan; the positional shift below relies on it.
df_lms.sort_values(by=['LOAN_ID', 'INSTALLMENT_NO'], inplace=True)

# Store the label as int8 to keep the column and the extracted array small.
df_lms['CURRENT_EMI_BEHAVIOR_LABEL'] = df_lms['CURRENT_EMI_BEHAVIOR_LABEL'].astype('int8')
source_column_np = df_lms['CURRENT_EMI_BEHAVIOR_LABEL'].to_numpy()

Feature engineering updated successfully. Applying fastest vectorized shift via NumPy...


In [115]:
# Next-EMI target, part 2: position i receives the label found at position i + 1.
# float32 is used because the array must be able to hold NaN.
shifted_target = np.empty(source_column_np.shape, dtype=np.float32)
shifted_target[:-1] = source_column_np[1:]
# The very last row has no successor.
shifted_target[-1:] = np.nan

In [116]:
# Next-EMI target, part 3: stop the shift from leaking across loans.
# Position i is flagged when the row at i + 1 belongs to a different LOAN_ID,
# i.e. when row i is the last EMI of its loan (the final row of the frame is
# not covered here; it is already NaN from the shift).
loan_ids_np = df_lms['LOAN_ID'].values
loan_change_indices = loan_ids_np[:-1] != loan_ids_np[1:]
last_emi_indices = np.flatnonzero(loan_change_indices)

# A loan's last EMI has no "next" label inside the same loan.
shifted_target[last_emi_indices] = np.nan

In [117]:
# Next-EMI target, part 4: attach the shifted labels (positionally, so the
# frame must still be in the order the array was built from).
df_lms['NEXT_EMI_LABEL'] = np.array(shifted_target, copy=True)

# Rows without a next label (the last EMI of each loan) cannot be used as
# training examples and are removed.
df_lms.dropna(subset=['NEXT_EMI_LABEL'], inplace=True)

# No NaN remains, so the target can go back to a compact integer dtype.
df_lms['NEXT_EMI_LABEL'] = df_lms['NEXT_EMI_LABEL'].astype('int8')

# Rebuild a contiguous 0..n-1 index after the sort and the row removal.
df_lms.reset_index(drop=True, inplace=True)

print(f"Target variable creation complete. Final shape: {df_lms.shape}. Vectorization successful.")

Target variable creation complete. Final shape: (28351669, 27). Vectorization successful.


In [118]:
# List the columns to confirm NEXT_EMI_LABEL was added.
df_lms.columns

Index(['LOAN_ID', 'INSTALLMENT_NO', 'DUE_DATE', 'INSTALLMENT_AMOUNT',
       'INTEREST_AMOUNT', 'STATUS', 'PAID_DT', 'LOAN_DATE', 'LOAN_AMOUNT',
       'TENURE', 'INTEREST_RATE', 'PRINCIPAL_AMOUNT', 'PAID_AMOUNT',
       'DAYS_LATE', 'DAYS_BETWEEN_DUE_DATES', 'REPAYMENT_SCHEDULE_CAT',
       'LOAN_SCHEDULE_TYPE', 'IS_UNPAID', 'IS_DAYS_LATE_MISSING', 'PAID_RATIO',
       'DELTA_DAYS_LATE', 'CURRENT_EMI_BEHAVIOR_LABEL', 'REMAINING_EMI_RATIO',
       'COMPOSITE_RISK', 'PAYMENT_SCORE', 'PAYMENT_SCORE_RANK',
       'NEXT_EMI_LABEL'],
      dtype='object')

In [119]:
# --- FINAL CLEANUP AND COLUMN SELECTION ---
print("\n--- Step 3: Final Column Selection and Cleanup ---")

# Tag the loan-level columns with an _LMS suffix.
RENAME_MAP = {
    column: f'{column}_LMS'
    for column in ('LOAN_AMOUNT', 'TENURE', 'INTEREST_RATE')
}

df_lms.rename(columns=RENAME_MAP, inplace=True)


--- Step 3: Final Column Selection and Cleanup ---


In [120]:
# List the columns to confirm the _LMS renames were applied.
df_lms.columns

Index(['LOAN_ID', 'INSTALLMENT_NO', 'DUE_DATE', 'INSTALLMENT_AMOUNT',
       'INTEREST_AMOUNT', 'STATUS', 'PAID_DT', 'LOAN_DATE', 'LOAN_AMOUNT_LMS',
       'TENURE_LMS', 'INTEREST_RATE_LMS', 'PRINCIPAL_AMOUNT', 'PAID_AMOUNT',
       'DAYS_LATE', 'DAYS_BETWEEN_DUE_DATES', 'REPAYMENT_SCHEDULE_CAT',
       'LOAN_SCHEDULE_TYPE', 'IS_UNPAID', 'IS_DAYS_LATE_MISSING', 'PAID_RATIO',
       'DELTA_DAYS_LATE', 'CURRENT_EMI_BEHAVIOR_LABEL', 'REMAINING_EMI_RATIO',
       'COMPOSITE_RISK', 'PAYMENT_SCORE', 'PAYMENT_SCORE_RANK',
       'NEXT_EMI_LABEL'],
      dtype='object')

In [121]:
# Display the frame after the rename (the repr is truncated to the first and last rows).
df_lms

Unnamed: 0,LOAN_ID,INSTALLMENT_NO,DUE_DATE,INSTALLMENT_AMOUNT,INTEREST_AMOUNT,STATUS,PAID_DT,LOAN_DATE,LOAN_AMOUNT_LMS,TENURE_LMS,...,IS_UNPAID,IS_DAYS_LATE_MISSING,PAID_RATIO,DELTA_DAYS_LATE,CURRENT_EMI_BEHAVIOR_LABEL,REMAINING_EMI_RATIO,COMPOSITE_RISK,PAYMENT_SCORE,PAYMENT_SCORE_RANK,NEXT_EMI_LABEL
0,1001UNSL002793,1,2023-02-10,3000,1571,0,2023-02-10 10:48:17,2023-01-03,60000,2,...,0,0,1.000000,0.0,0,0.000000,0.000000,1.5,1,0
1,1001UNSL002793,2,2023-03-10,3000,1101,0,2023-03-10 12:54:15,2023-01-03,60000,2,...,0,0,1.000000,0.0,0,0.000000,0.000000,1.5,1,0
2,1001UNSL002793,3,2023-04-07,3000,1065,0,2023-04-07 12:31:44,2023-01-03,60000,2,...,0,0,1.000000,0.0,0,0.000000,0.000000,1.5,1,0
3,1001UNSL002793,4,2023-05-05,3000,1029,0,2023-05-05 14:40:36,2023-01-03,60000,2,...,0,0,1.000000,0.0,0,0.000000,0.000000,1.5,1,0
4,1001UNSL002793,5,2023-06-02,3000,992,0,2023-06-02 16:56:10,2023-01-03,60000,2,...,0,0,1.000000,0.0,0,0.000000,0.000000,1.5,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28351664,4209UNSL000970,15,2025-03-11,2010,401,0,2025-03-11 12:27:35,2023-12-30,40000,2,...,0,0,1.000000,0.0,0,0.000000,0.000000,1.5,1,0
28351665,4209UNSL000970,16,2025-04-08,2010,370,0,2025-04-08 11:27:10,2023-12-30,40000,2,...,0,0,1.000000,0.0,0,0.000000,0.000000,1.5,1,1
28351666,4209UNSL000970,17,2025-05-06,2010,338,2,2025-05-31 00:00:00,2023-12-30,40000,2,...,0,0,0.052239,25.0,1,0.947761,34.477612,0.0,3,1
28351667,4209UNSL000970,18,2025-06-03,2010,306,1,NaT,2023-12-30,40000,2,...,1,1,0.000000,0.0,1,1.000000,10.000000,-100.0,4,1


In [122]:
# Columns written to the feature-engineered LMS file; everything else is dropped.
LMS_COLS_TO_KEEP = [
    'LOAN_ID',
    'LOAN_DATE', #ADDED for future reference/splitting
    'DUE_DATE',

    # Loan-level attributes (carry the _LMS suffix added by the rename step)
    'LOAN_AMOUNT_LMS', 'TENURE_LMS', 'INTEREST_RATE_LMS',
    'LOAN_SCHEDULE_TYPE',

    # Core Sequential Features (Input for the RNN - ALL values that change per EMI)
    'INSTALLMENT_NO',
    'INSTALLMENT_AMOUNT',
    'DAYS_LATE',
    'DAYS_BETWEEN_DUE_DATES',
    'PAID_RATIO',
    'DELTA_DAYS_LATE',
    'IS_UNPAID',
    'PAYMENT_SCORE_RANK',
    'PAYMENT_SCORE',
    'COMPOSITE_RISK',
    'REPAYMENT_SCHEDULE_CAT',
    'IS_DAYS_LATE_MISSING',

    # Target and Split Reference
    'NEXT_EMI_LABEL',
    'CURRENT_EMI_BEHAVIOR_LABEL'
]

# Fail fast if an expected column is absent (e.g. the rename cell was skipped):
# otherwise the file would be written without it and the problem would only
# surface in the downstream notebook.
missing_cols = [col for col in LMS_COLS_TO_KEEP if col not in df_lms.columns]
if missing_cols:
    raise KeyError(f"Expected columns missing from df_lms: {missing_cols}")

# Everything not explicitly kept is dropped. This already covers the raw and
# intermediate columns (STATUS, PAID_DT, PAID_AMOUNT, REMAINING_EMI_RATIO, ...),
# and every label comes from df_lms.columns, so no extra filtering or
# errors='ignore' is needed. Re-running the cell is a no-op drop.
cols_to_drop = [col for col in df_lms.columns if col not in LMS_COLS_TO_KEEP]
df_lms.drop(columns=cols_to_drop, inplace=True)

print(f"LMS columns finalized. Shape: {df_lms.shape}")
print(f"Final columns: {list(df_lms.columns)}")

# --- FINAL SAVE OF FEATURE-ENGINEERED LMS DATA ---
df_lms.to_csv('lms_feature_engineered.csv', index=False)
print(f"✅ Feature-engineered LMS data saved to 'lms_feature_engineered.csv' with shape: {df_lms.shape}")

LMS columns finalized. Shape: (28351669, 21)
Final columns: ['LOAN_ID', 'INSTALLMENT_NO', 'DUE_DATE', 'INSTALLMENT_AMOUNT', 'LOAN_DATE', 'LOAN_AMOUNT_LMS', 'TENURE_LMS', 'INTEREST_RATE_LMS', 'DAYS_LATE', 'DAYS_BETWEEN_DUE_DATES', 'REPAYMENT_SCHEDULE_CAT', 'LOAN_SCHEDULE_TYPE', 'IS_UNPAID', 'IS_DAYS_LATE_MISSING', 'PAID_RATIO', 'DELTA_DAYS_LATE', 'CURRENT_EMI_BEHAVIOR_LABEL', 'COMPOSITE_RISK', 'PAYMENT_SCORE', 'PAYMENT_SCORE_RANK', 'NEXT_EMI_LABEL']
✅ Feature-engineered LMS data saved to 'lms_feature_engineered.csv' with shape: (28351669, 21)


In [None]:
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@