# Overall Dataset Handler and Ideas

In [2]:
import sys
import os

# Derive the project layout from the notebook's location (resolved against the
# current working directory) so the project's packages can be imported below.
script_dir = os.path.dirname(os.path.abspath('dataset_handler.ipynb'))
parent_directory = os.path.dirname(script_dir)
module_directory, utils_directory = (
    os.path.join(parent_directory, sub_dir) for sub_dir in ('module', 'utils')
)

# Put each directory on the import path once, so re-running the cell does not
# keep growing sys.path. The membership test is evaluated lazily, i.e. against
# sys.path as it is being extended.
sys.path.extend(
    directory
    for directory in (parent_directory, module_directory, utils_directory)
    if directory not in sys.path
)
    
from module.preprocess.load_and_batch import TableInfoManagers
from utils import config  
import os
import polars as pl

In [3]:
# Load the "train" data from config.DATA_LOCATION through the project's loader.
# NOTE(review): presumably returns a polars DataFrame, since later cells call
# group_by / filter / height on it — confirm against TableInfoManagers.
full_data = TableInfoManagers.load_data_downloaded(config.DATA_LOCATION, "train")

------> Size of train dataset: (1526659, 224) <------


# Overview of the Issue
The dataset exhibits a significant class imbalance: roughly 31:1 (majority to minority class), as the counts below show. This can lead to biased models that favor the majority class.

In [5]:
# Count how many rows fall into each value of 'target'.
# group_by does not guarantee row order, so sort by 'target' to keep the
# printed tables stable from run to run.
value_counts = (
    full_data.group_by("target")
    .agg(pl.len().alias("count"))
    .sort("target")
)

# Total number of records in the dataset
total_count = full_data.height

# Share of the whole dataset held by each class
value_ratio = value_counts.with_columns(
    (pl.col("count") / total_count).alias("ratio")
)

# Size of the smallest (minority) class
min_count = value_counts["count"].min()

# How many times larger each class is than the minority class
comp_ratio = value_counts.with_columns(
    (pl.col("count") / min_count).alias("majority_ratio")
)

# Print the raw counts, the share of the dataset, and the size relative to
# the minority class
print("Split and ration information per class:")
print(value_counts)
print("----------")
print(value_ratio)
print("----------")
print(comp_ratio)

Split and ration information per class:
shape: (2, 2)
┌────────┬─────────┐
│ target ┆ count   │
│ ---    ┆ ---     │
│ i64    ┆ u32     │
╞════════╪═════════╡
│ 0      ┆ 1478665 │
│ 1      ┆ 47994   │
└────────┴─────────┘
----------
shape: (2, 3)
┌────────┬─────────┬──────────┐
│ target ┆ count   ┆ ratio    │
│ ---    ┆ ---     ┆ ---      │
│ i64    ┆ u32     ┆ f64      │
╞════════╪═════════╪══════════╡
│ 0      ┆ 1478665 ┆ 0.968563 │
│ 1      ┆ 47994   ┆ 0.031437 │
└────────┴─────────┴──────────┘
----------
shape: (2, 3)
┌────────┬─────────┬────────────────┐
│ target ┆ count   ┆ majority_ratio │
│ ---    ┆ ---     ┆ ---            │
│ i64    ┆ u32     ┆ f64            │
╞════════╪═════════╪════════════════╡
│ 0      ┆ 1478665 ┆ 30.809372      │
│ 1      ┆ 47994   ┆ 1.0            │
└────────┴─────────┴────────────────┘


### Proposed Solution
To address this imbalance, we implement an ensemble strategy:
- **Data Segmentation**: The majority class is segmented into five separate datasets. This segmentation is strategic, allowing for diverse learning scopes within each subset.
- **Balanced Training**: Each segmented dataset is then combined with the full minority class dataset. This approach ensures each model trained on these combinations has exposure to balanced data, thereby mitigating the risk of bias.
- **Ensemble Technique**: After training individual models on these balanced datasets, a soft voting mechanism is employed to aggregate the predictions from each model. Soft voting considers the predicted probability of each output class, which enhances decision-making by leveraging the strengths of each trained model.


<figure>
    <img src="images/Method.png" alt="Centered Image" style="display: block; margin-left: auto; margin-right: auto; width: 65%;" />
    <figcaption style="text-align: center;">Moving From a Single Model to an Ensemble Model Approach.</figcaption>
</figure>

# Data Splitting

## Storing Minority Class

In [12]:
# Save the minority class (target == 1) separately
filtered_data = full_data.filter(pl.col("target") == 1)

# Create the output folder first: on a fresh top-to-bottom run this cell
# executes before split_datasets, which is the only other place that creates it.
os.makedirs(config.SPLIT_DATASETS, exist_ok=True)

file_path = os.path.join(config.SPLIT_DATASETS, "target.csv")
filtered_data.write_csv(file_path)

## Stratified Sampling Of Majority Class
The majority class is stratified on the `WEEK_NUM` column, ensuring that every split contains records from each **decision week**.

In [8]:
def split_datasets(data: pl.DataFrame, folder_dir: str, num_splits: int = 5,
                   seed: "int | None" = None):
    """Split the majority class into ``num_splits`` CSV files, stratified by week.

    Rows with ``target == 0`` are shuffled, grouped by ``WEEK_NUM``, and each
    week's rows are dealt out in equal contiguous chunks to the splits. Any
    remainder of a week (fewer than ``num_splits`` rows) goes to the last split.
    Each split is written to ``<folder_dir>/dataset_<k>.csv`` with ``k``
    starting at 1.

    Args:
        data: Frame containing at least the ``target`` and ``WEEK_NUM`` columns.
        folder_dir: Output directory; created if it does not exist.
        num_splits: Number of split files to produce (must be >= 1).
        seed: Optional seed for the shuffle, so the same splits can be
            regenerated. ``None`` keeps the previous unseeded behaviour.

    Raises:
        ValueError: If ``num_splits`` is smaller than 1.
    """
    if num_splits < 1:
        raise ValueError(f"num_splits must be a positive integer, got {num_splits}")

    # Keep only the majority class and shuffle it so that the contiguous
    # chunks taken below are random samples of each week.
    majority = data.filter(pl.col("target") == 0)
    majority = majority.sample(n=majority.height, shuffle=True, seed=seed)

    # Collect the per-week chunks of every split and concatenate once at the
    # end instead of growing a frame inside the loop.
    pieces = [[] for _ in range(num_splits)]

    # One pass over the data yields one frame per week.
    for week_rows in majority.partition_by("WEEK_NUM", maintain_order=True):
        size = week_rows.height
        # Integer division: int(size * (1 / num_splits)) can land one below the
        # true quotient because of floating-point error.
        samples_per_split = size // num_splits

        for i in range(num_splits - 1):
            pieces[i].append(week_rows.slice(samples_per_split * i, samples_per_split))

        # The last split takes everything that is left, including the remainder.
        pieces[-1].append(week_rows.slice(samples_per_split * (num_splits - 1)))

    # Ensure the directory exists
    os.makedirs(folder_dir, exist_ok=True)

    # Save each split to a CSV file
    for i, parts in enumerate(pieces):
        # With no majority rows there are no chunks; write an empty frame that
        # still carries the input schema.
        split = pl.concat(parts) if parts else majority.head(0)
        print(f"Datasize for {i+1} split is {split.height}")
        file_path = os.path.join(folder_dir, f"dataset_{i+1}.csv")
        split.write_csv(file_path)

In [6]:
# Write the five majority-class splits (dataset_1.csv ... dataset_5.csv) into config.SPLIT_DATASETS
split_datasets(full_data, num_splits=5, folder_dir=config.SPLIT_DATASETS)

def dataset_info(idx):
    """Print the shape of one saved majority-class split.

    Args:
        idx: Zero-based split index; the file read is ``dataset_{idx+1}.csv``
            inside ``config.SPLIT_DATASETS``.
    """
    # os.path.join matches how split_datasets writes the files and does not
    # rely on config.SPLIT_DATASETS ending with a path separator.
    loc_ = os.path.join(config.SPLIT_DATASETS, f"dataset_{idx+1}.csv")
    data = pl.read_csv(loc_)
    print(f"Shape for Dataset {idx+1}: {data.shape}")

## Data Properties Post Split

In [7]:
# Inspect split 1 (reads dataset_1.csv).
# NOTE(review): the saved output below ("Counts for Dataset 1", per-week counts,
# row preview) is not what dataset_info currently prints (shape only) — re-run
# the notebook to refresh the outputs.
dataset_info(0)

Counts for Dataset 1:
shape: (92, 2)
┌──────────┬───────┐
│ WEEK_NUM ┆ count │
│ ---      ┆ ---   │
│ i64      ┆ u32   │
╞══════════╪═══════╡
│ 0        ┆ 3267  │
│ 1        ┆ 3673  │
│ 2        ┆ 3402  │
│ 3        ┆ 3137  │
│ 4        ┆ 2781  │
│ …        ┆ …     │
│ 87       ┆ 3511  │
│ 88       ┆ 2793  │
│ 89       ┆ 2658  │
│ 90       ┆ 2354  │
│ 91       ┆ 2477  │
└──────────┴───────┘


case_id,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,applicationscnt_629L,applicationscnt_867L,avgdbddpdlast24m_3658932P,avgdbddpdlast3m_4187120P,avgdbdtollast24m_4525197P,avgdpdtolclosure24_3658938P,avginstallast24m_3658937A,avglnamtstart24m_4525187A,avgmaxdpdlast9m_3716943P,avgoutstandbalancel6m_4187114A,avgpmtlast12m_4525200A,bankacctype_710L,cardtype_51L,clientscnt12m_3712952L,clientscnt3m_3712950L,clientscnt6m_3712949L,clientscnt_100L,clientscnt_1022L,clientscnt_1071L,clientscnt_1130L,clientscnt_136L,clientscnt_157L,clientscnt_257L,clientscnt_304L,clientscnt_360L,clientscnt_493L,clientscnt_533L,clientscnt_887L,…,formonth_118L,formonth_206L,formonth_535L,forquarter_1017L,forquarter_462L,forquarter_634L,fortoday_1092L,forweek_1077L,forweek_528L,forweek_601L,foryear_618L,foryear_818L,foryear_850L,fourthquarter_440L,maritalst_385M,maritalst_893M,numberofqueries_373L,pmtaverage_3A,pmtaverage_4527227A,pmtaverage_4955615A,pmtcount_4527229L,pmtcount_4955617L,pmtcount_693L,pmtscount_423L,pmtssum_45A,requesttype_4525192L,responsedate_1012D,responsedate_4527233D,responsedate_4917613D,riskassesment_302T,riskassesment_940T,secondquarter_766L,thirdquarter_1082L,date_decision,MONTH,WEEK_NUM,target
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,str,f64,f64,str,str,str,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,…,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,str,str,f64,f64,str,str,str,str,f64,f64,f64,str,str,str,str,str,f64,f64,f64,str,i64,i64,i64
2557190,0.0,104088.86,4166.6,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-12.0,-5.0,,0.0,5478.4,,0.0,17366.666,,"""CA""",,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,,,,,,,,,,,,,,,"""a55475b1""","""a55475b1""",,0.0,,,,,0.0,,,,"""2019-04-13""",,,,,,,"""2019-03-30""",201903,12,0
661746,,,2786.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,1.0,0.0,…,,,,,,,,,,,,,,,"""a55475b1""","""a55475b1""",,,,,,,,0.0,0.0,,"""2019-04-14""",,,,,,,"""2019-03-31""",201903,12,0
2556685,0.0,94346.4,9342.4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-2.0,,,1.0,10483.0,,,,,"""CA""",,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,,,,,,,,,,,,,,2.0,"""3439d993""","""a55475b1""",1.0,,,,,,,3.0,5232.6,,"""2019-04-12""",,,,,0.0,0.0,"""2019-03-29""",201903,12,0
118623,0.0,26950.4,4980.2,2395.2,0.0,0.0,0.0,0.0,0.0,2.0,-12.0,-14.0,,0.0,3850.0,,0.0,10463.335,,"""CA""",,0.0,0.0,0.0,0.0,1.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,,,,,,,,,,,,,,1.0,"""a55475b1""","""a55475b1""",4.0,,,,,,,0.0,0.0,,"""2019-04-09""",,,,,2.0,1.0,"""2019-03-26""",201903,12,0
1325776,0.0,11469.4,1808.2001,0.0,0.0,0.0,0.0,0.0,0.0,4.0,-3.0,,,1.0,1433.8,,0.0,720.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,,,,,,,,,,,,,,7.0,"""a55475b1""","""a55475b1""",16.0,,,,,,,0.0,0.0,,"""2019-04-13""",,,,,3.0,4.0,"""2019-03-30""",201903,12,0


In [8]:
# Inspect split 2 (reads dataset_2.csv)
dataset_info(1)

Counts for Dataset 2:
shape: (92, 2)
┌──────────┬───────┐
│ WEEK_NUM ┆ count │
│ ---      ┆ ---   │
│ i64      ┆ u32   │
╞══════════╪═══════╡
│ 0        ┆ 3267  │
│ 1        ┆ 3673  │
│ 2        ┆ 3402  │
│ 3        ┆ 3137  │
│ 4        ┆ 2781  │
│ …        ┆ …     │
│ 87       ┆ 3511  │
│ 88       ┆ 2793  │
│ 89       ┆ 2658  │
│ 90       ┆ 2354  │
│ 91       ┆ 2477  │
└──────────┴───────┘


case_id,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,applicationscnt_629L,applicationscnt_867L,avgdbddpdlast24m_3658932P,avgdbddpdlast3m_4187120P,avgdbdtollast24m_4525197P,avgdpdtolclosure24_3658938P,avginstallast24m_3658937A,avglnamtstart24m_4525187A,avgmaxdpdlast9m_3716943P,avgoutstandbalancel6m_4187114A,avgpmtlast12m_4525200A,bankacctype_710L,cardtype_51L,clientscnt12m_3712952L,clientscnt3m_3712950L,clientscnt6m_3712949L,clientscnt_100L,clientscnt_1022L,clientscnt_1071L,clientscnt_1130L,clientscnt_136L,clientscnt_157L,clientscnt_257L,clientscnt_304L,clientscnt_360L,clientscnt_493L,clientscnt_533L,clientscnt_887L,…,formonth_118L,formonth_206L,formonth_535L,forquarter_1017L,forquarter_462L,forquarter_634L,fortoday_1092L,forweek_1077L,forweek_528L,forweek_601L,foryear_618L,foryear_818L,foryear_850L,fourthquarter_440L,maritalst_385M,maritalst_893M,numberofqueries_373L,pmtaverage_3A,pmtaverage_4527227A,pmtaverage_4955615A,pmtcount_4527229L,pmtcount_4955617L,pmtcount_693L,pmtscount_423L,pmtssum_45A,requesttype_4525192L,responsedate_1012D,responsedate_4527233D,responsedate_4917613D,riskassesment_302T,riskassesment_940T,secondquarter_766L,thirdquarter_1082L,date_decision,MONTH,WEEK_NUM,target
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,str,f64,f64,str,str,str,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,f64,str,str,str,str,f64,f64,f64,str,str,str,str,str,f64,f64,f64,str,i64,i64,i64
660511,,,2362.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,,,,,,,,,,,,,,2.0,"""3439d993""","""a55475b1""",2.0,6987.1,,,,,6.0,,,,"""2019-04-12""",,,,,1.0,2.0,"""2019-03-29""",201903,12,0
658533,,,1503.2001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,"""INSTANT""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,10.0,…,,,,,,,,,,,,,,0.0,"""3439d993""","""a55475b1""",0.0,,,,,,,6.0,15193.4,,"""2019-04-09""",,,,,0.0,0.0,"""2019-03-26""",201903,12,0
662820,,,4772.8003,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,1.0,0.0,,1.0,0.0,0.0,0.0,0.0,1.0,3.0,…,,,,,,,,,,,,,,6.0,"""3439d993""","""a55475b1""",7.0,,,,,,,13.0,20610.0,,"""2019-04-15""",,,,,2.0,14.0,"""2019-04-01""",201904,12,0
1327589,0.0,26337.36,2644.0,2836.2,0.0,0.0,0.0,0.0,0.0,1.0,1.0,-4.0,,1.0,3488.8,,1.0,21370.066,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,,"""a55475b1""","""1a19667c""",,,,,,,,8.0,4696.8003,,"""2019-04-15""",,,,,,,"""2019-04-01""",201904,12,0
1324643,0.0,11385.0,1854.4,3795.0,0.0,0.0,0.0,0.0,0.0,2.0,7.0,9.0,,0.0,3795.0,,0.0,85153.4,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,,,,,,,,,,,,,,7.0,"""a55475b1""","""a55475b1""",7.0,3585.9001,,,,,12.0,,,,"""2019-04-13""",,,,,0.0,0.0,"""2019-03-30""",201903,12,0


In [9]:
# Inspect split 3 (reads dataset_3.csv)
dataset_info(2)

Counts for Dataset 3:
shape: (92, 2)
┌──────────┬───────┐
│ WEEK_NUM ┆ count │
│ ---      ┆ ---   │
│ i64      ┆ u32   │
╞══════════╪═══════╡
│ 0        ┆ 3267  │
│ 1        ┆ 3673  │
│ 2        ┆ 3402  │
│ 3        ┆ 3137  │
│ 4        ┆ 2781  │
│ …        ┆ …     │
│ 87       ┆ 3511  │
│ 88       ┆ 2793  │
│ 89       ┆ 2658  │
│ 90       ┆ 2354  │
│ 91       ┆ 2477  │
└──────────┴───────┘


case_id,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,applicationscnt_629L,applicationscnt_867L,avgdbddpdlast24m_3658932P,avgdbddpdlast3m_4187120P,avgdbdtollast24m_4525197P,avgdpdtolclosure24_3658938P,avginstallast24m_3658937A,avglnamtstart24m_4525187A,avgmaxdpdlast9m_3716943P,avgoutstandbalancel6m_4187114A,avgpmtlast12m_4525200A,bankacctype_710L,cardtype_51L,clientscnt12m_3712952L,clientscnt3m_3712950L,clientscnt6m_3712949L,clientscnt_100L,clientscnt_1022L,clientscnt_1071L,clientscnt_1130L,clientscnt_136L,clientscnt_157L,clientscnt_257L,clientscnt_304L,clientscnt_360L,clientscnt_493L,clientscnt_533L,clientscnt_887L,…,formonth_118L,formonth_206L,formonth_535L,forquarter_1017L,forquarter_462L,forquarter_634L,fortoday_1092L,forweek_1077L,forweek_528L,forweek_601L,foryear_618L,foryear_818L,foryear_850L,fourthquarter_440L,maritalst_385M,maritalst_893M,numberofqueries_373L,pmtaverage_3A,pmtaverage_4527227A,pmtaverage_4955615A,pmtcount_4527229L,pmtcount_4955617L,pmtcount_693L,pmtscount_423L,pmtssum_45A,requesttype_4525192L,responsedate_1012D,responsedate_4527233D,responsedate_4917613D,riskassesment_302T,riskassesment_940T,secondquarter_766L,thirdquarter_1082L,date_decision,MONTH,WEEK_NUM,target
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,str,f64,f64,str,str,str,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,f64,str,str,str,str,f64,f64,f64,str,str,str,str,str,f64,f64,f64,str,i64,i64,i64
661371,,,3540.6,0.0,0.0,0.0,0.0,2.0,2.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,1.0,0.0,0.0,0.0,,0.0,1.0,0.0,0.0,0.0,0.0,0.0,…,,,,,,,,,,,,,,1.0,"""a55475b1""","""a55475b1""",4.0,,,,,,,0.0,0.0,,"""2019-04-13""",,,,,0.0,0.0,"""2019-03-30""",201903,12,0
659658,0.0,,5381.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,,,,,,,,,,,,,,0.0,"""a7fcb6e5""","""a55475b1""",5.0,,,,,,,6.0,10931.358,,"""2019-04-11""",,,,,5.0,1.0,"""2019-03-28""",201903,12,0
659728,,,4000.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,,,,,,,,,,,,,,0.0,"""a55475b1""","""a55475b1""",1.0,,,,,,,9.0,6036.2,,"""2019-04-11""",,,,,0.0,0.0,"""2019-03-28""",201903,12,0
2555776,0.0,93365.66,4420.6,3074.4001,0.0,0.0,0.0,0.0,0.0,2.0,-3.0,-3.0,,0.0,6224.4,,0.0,30609.6,,"""CA""",,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,1.0,0.0,0.0,…,,,,,,,,,,,,,,0.0,"""a55475b1""","""a55475b1""",0.0,8434.9,,,,,6.0,,,,"""2019-04-10""",,,,,0.0,0.0,"""2019-03-27""",201903,12,0
1325031,0.0,73904.305,2018.6,4680.6,0.0,0.0,0.0,0.0,0.0,0.0,-6.0,-10.0,,0.0,6071.4,,0.0,60433.34,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,,,,,,,,,,,,,,2.0,"""a55475b1""","""a55475b1""",3.0,,,,,,,10.0,13024.4,,"""2019-04-13""",,,,,0.0,1.0,"""2019-03-30""",201903,12,0


In [10]:
# Inspect split 4 (reads dataset_4.csv)
dataset_info(3)

Counts for Dataset 4:
shape: (92, 2)
┌──────────┬───────┐
│ WEEK_NUM ┆ count │
│ ---      ┆ ---   │
│ i64      ┆ u32   │
╞══════════╪═══════╡
│ 0        ┆ 3267  │
│ 1        ┆ 3673  │
│ 2        ┆ 3402  │
│ 3        ┆ 3137  │
│ 4        ┆ 2781  │
│ …        ┆ …     │
│ 87       ┆ 3511  │
│ 88       ┆ 2793  │
│ 89       ┆ 2658  │
│ 90       ┆ 2354  │
│ 91       ┆ 2477  │
└──────────┴───────┘


case_id,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,applicationscnt_629L,applicationscnt_867L,avgdbddpdlast24m_3658932P,avgdbddpdlast3m_4187120P,avgdbdtollast24m_4525197P,avgdpdtolclosure24_3658938P,avginstallast24m_3658937A,avglnamtstart24m_4525187A,avgmaxdpdlast9m_3716943P,avgoutstandbalancel6m_4187114A,avgpmtlast12m_4525200A,bankacctype_710L,cardtype_51L,clientscnt12m_3712952L,clientscnt3m_3712950L,clientscnt6m_3712949L,clientscnt_100L,clientscnt_1022L,clientscnt_1071L,clientscnt_1130L,clientscnt_136L,clientscnt_157L,clientscnt_257L,clientscnt_304L,clientscnt_360L,clientscnt_493L,clientscnt_533L,clientscnt_887L,…,formonth_118L,formonth_206L,formonth_535L,forquarter_1017L,forquarter_462L,forquarter_634L,fortoday_1092L,forweek_1077L,forweek_528L,forweek_601L,foryear_618L,foryear_818L,foryear_850L,fourthquarter_440L,maritalst_385M,maritalst_893M,numberofqueries_373L,pmtaverage_3A,pmtaverage_4527227A,pmtaverage_4955615A,pmtcount_4527229L,pmtcount_4955617L,pmtcount_693L,pmtscount_423L,pmtssum_45A,requesttype_4525192L,responsedate_1012D,responsedate_4527233D,responsedate_4917613D,riskassesment_302T,riskassesment_940T,secondquarter_766L,thirdquarter_1082L,date_decision,MONTH,WEEK_NUM,target
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,str,f64,f64,str,str,str,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,…,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,str,str,f64,f64,str,str,str,str,f64,f64,f64,str,str,str,str,str,str,f64,f64,str,i64,i64,i64
2556171,0.0,134969.5,3386.2,3274.2,0.0,0.0,0.0,0.0,0.0,16.0,-22.0,-14.0,,0.0,5491.4,,0.0,47069.375,,"""CA""",,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,,,,,,,,,,,,,,2.0,"""38c061ee""","""a55475b1""",1.0,7428.7,,,,,5.0,,,,"""2019-04-11""",,,,,0.0,1.0,"""2019-03-28""",201903,12,0
1323244,0.0,19168.6,1218.8,0.0,0.0,0.0,0.0,0.0,0.0,9.0,-1.0,,,1.0,3560.0,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,,,,,,,,,,,,,,2.0,"""3439d993""","""a55475b1""",4.0,,,,,,,13.0,9349.0,,"""2019-04-11""",,,,,4.0,3.0,"""2019-03-28""",201903,12,0
1323644,0.0,,2515.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,,,,,,,,,,,,,,0.0,"""a55475b1""","""a55475b1""",4.0,,,,,,,1.0,1252.964,,"""2019-04-11""",,,,,5.0,0.0,"""2019-03-28""",201903,12,0
662056,,,5557.4,0.0,0.0,0.0,5.0,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,5.0,0.0,0.0,0.0,0.0,…,,,,,,,,,,,,,,2.0,"""b6cabe76""","""a55475b1""",0.0,17453.9,,,,,6.0,,,,"""2019-04-14""",,,,,0.0,0.0,"""2019-03-31""",201903,12,0
1322973,0.0,15175.8,1575.0,1683.4,0.0,0.0,0.0,0.0,0.0,2.0,0.0,-1.0,,1.0,1686.2001,,0.0,6704.742,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,,,,,,,,,,,,,,1.0,"""3439d993""","""a55475b1""",2.0,,,,,,,1.0,75.75,,"""2019-04-11""",,,,,0.0,1.0,"""2019-03-28""",201903,12,0


In [11]:
# Inspect split 5 (reads dataset_5.csv); this last split also holds each week's remainder rows
dataset_info(4)

Counts for Dataset 5:
shape: (92, 2)
┌──────────┬───────┐
│ WEEK_NUM ┆ count │
│ ---      ┆ ---   │
│ i64      ┆ u32   │
╞══════════╪═══════╡
│ 0        ┆ 3267  │
│ 1        ┆ 3677  │
│ 2        ┆ 3403  │
│ 3        ┆ 3141  │
│ 4        ┆ 2781  │
│ …        ┆ …     │
│ 87       ┆ 3513  │
│ 88       ┆ 2797  │
│ 89       ┆ 2659  │
│ 90       ┆ 2357  │
│ 91       ┆ 2478  │
└──────────┴───────┘


case_id,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,applicationscnt_629L,applicationscnt_867L,avgdbddpdlast24m_3658932P,avgdbddpdlast3m_4187120P,avgdbdtollast24m_4525197P,avgdpdtolclosure24_3658938P,avginstallast24m_3658937A,avglnamtstart24m_4525187A,avgmaxdpdlast9m_3716943P,avgoutstandbalancel6m_4187114A,avgpmtlast12m_4525200A,bankacctype_710L,cardtype_51L,clientscnt12m_3712952L,clientscnt3m_3712950L,clientscnt6m_3712949L,clientscnt_100L,clientscnt_1022L,clientscnt_1071L,clientscnt_1130L,clientscnt_136L,clientscnt_157L,clientscnt_257L,clientscnt_304L,clientscnt_360L,clientscnt_493L,clientscnt_533L,clientscnt_887L,…,formonth_118L,formonth_206L,formonth_535L,forquarter_1017L,forquarter_462L,forquarter_634L,fortoday_1092L,forweek_1077L,forweek_528L,forweek_601L,foryear_618L,foryear_818L,foryear_850L,fourthquarter_440L,maritalst_385M,maritalst_893M,numberofqueries_373L,pmtaverage_3A,pmtaverage_4527227A,pmtaverage_4955615A,pmtcount_4527229L,pmtcount_4955617L,pmtcount_693L,pmtscount_423L,pmtssum_45A,requesttype_4525192L,responsedate_1012D,responsedate_4527233D,responsedate_4917613D,riskassesment_302T,riskassesment_940T,secondquarter_766L,thirdquarter_1082L,date_decision,MONTH,WEEK_NUM,target
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,str,f64,f64,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,str,str,f64,f64,str,str,str,str,f64,f64,f64,str,str,str,str,str,f64,f64,f64,str,i64,i64,i64
662955,,,1033.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,3.0,0.0,0.0,0.0,0.0,0.0,0.0,…,,,,,,,,,,,,,,0.0,"""a55475b1""","""a55475b1""",2.0,,,,,,,0.0,0.0,,"""2019-04-15""",,,,,0.0,0.0,"""2019-04-01""",201904,12,0
1323895,0.0,2872.0,2877.8,0.0,0.0,0.0,0.0,0.0,0.0,9.0,14.0,,,8.0,2871.2,,11.0,,,,,1.0,1.0,1.0,0.0,1.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,,,,,,,,,,,,,,1.0,"""3439d993""","""a55475b1""",4.0,,,,,,,11.0,22756.47,,"""2019-04-12""",,,,,1.0,4.0,"""2019-03-29""",201903,12,0
2556073,0.0,61887.7,2051.2,1932.0,0.0,0.0,0.0,0.0,0.0,5.0,-12.0,-15.0,,0.0,3526.8,,0.0,25444.473,,"""CA""",,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,,,,,,,,,,,,,,1.0,"""a7fcb6e5""","""a55475b1""",1.0,,,,,,,6.0,12603.822,,"""2019-04-11""",,,,,0.0,1.0,"""2019-03-28""",201903,12,0
2555501,0.0,7351.2,2316.2,2450.4001,0.0,0.0,0.0,0.0,0.0,2.0,-3.0,-3.0,,0.0,2450.4001,,0.0,25692.201,,"""CA""",,0.0,0.0,0.0,0.0,1.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,,,,,,,,,,,,,,7.0,"""3439d993""","""a55475b1""",3.0,,,,,,,9.0,5437.6,,"""2019-04-10""",,,,,5.0,1.0,"""2019-03-27""",201903,12,0
660736,,,2697.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,5.0,…,,,,,,,,,,,,,,2.0,"""a55475b1""","""a55475b1""",1.0,,,,,,,5.0,57017.137,,"""2019-04-12""",,,,,2.0,2.0,"""2019-03-29""",201903,12,0


# Additional Training Strategies
**Sampling Techniques**: During training, various sampling strategies such as oversampling the minority class, undersampling the majority class, and maintaining the original distribution are employed. These techniques are crucial for enhancing the model's ability to generalize well across different data distributions. For more info, check the `loading_test` notebook. This ensemble method not only addresses the imbalance in the dataset but also improves the overall predictive accuracy and robustness of the model by incorporating diverse perspectives from multiple trained models.