# 1. Import libraries & define global variables

In [22]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

INPUT_PATH = "data_in"
OUTPUT_PATH = "data_out"

ID_LABELS = ["SK_ID_CURR"]
TARGET_LABEL = "TARGET"
OLD_PD_LABEL = "pd_v2023"

NUM_WOE_QUANTILES = 5
IV_THRESHOLD = 0.05
CORRELATION_FILTER = 0.50

RANDOM_STATE = 42

# 2. Create helper classes & functions

In [23]:
# def calculate_woe_iv(df, feature, target, num_bins=5):
#     """
#     Calculate Weight of Evidence (WOE) and Information Value (IV) for a single feature.
#
#     WOE = ln(% of Events / % of Non-Events)
#     IV = sum((% Events - % Non-Events) * WOE)
#
#     Parameters:
#     -----------
#     df : DataFrame
#         Input dataframe containing feature and target
#     feature : str
#         Column name of the feature to analyze
#     target : str
#         Column name of the target variable
#     num_bins : int
#         Number of quantile bins to create
#
#     Returns:
#     --------
#     DataFrame with WOE and IV statistics per bin
#     """
#     # Create a copy to avoid modifying original data
#     temp_df = df[[feature, target]].copy()
#     temp_df = temp_df.dropna()
#
#     if len(temp_df) == 0:
#         return None
#
#     # Create quantile bins using pandas qcut
#     try:
#         temp_df['bin'] = pd.qcut(temp_df[feature], q=num_bins, duplicates='drop')
#     except ValueError:
#         # If qcut fails (e.g., too few unique values), use cut instead
#         try:
#             temp_df['bin'] = pd.cut(temp_df[feature], bins=num_bins, duplicates='drop')
#         except ValueError:
#             return None
#
#     # Calculate statistics per bin
#     grouped = temp_df.groupby('bin', observed=True).agg(
#         num_obs=(target, 'count'),
#         num_events=(target, 'sum')
#     ).reset_index()
#
#     grouped['num_non_events'] = grouped['num_obs'] - grouped['num_events']
#
#     # Calculate percentages
#     total_events = grouped['num_events'].sum()
#     total_non_events = grouped['num_non_events'].sum()
#
#     if total_events == 0 or total_non_events == 0:
#         return None
#
#     grouped['perc_events'] = grouped['num_events'] / total_events
#     grouped['perc_non_events'] = grouped['num_non_events'] / total_non_events
#
#     # Handle zeros to avoid log(0)
#     grouped['perc_events'] = grouped['perc_events'].replace(0, 0.0001)
#     grouped['perc_non_events'] = grouped['perc_non_events'].replace(0, 0.0001)
#
#     # Calculate WOE
#     grouped['woe'] = np.log(grouped['perc_events'] / grouped['perc_non_events'])
#
#     # Calculate IV per bin and total IV
#     grouped['iv_bin'] = (grouped['perc_events'] - grouped['perc_non_events']) * grouped['woe']
#     total_iv = grouped['iv_bin'].sum()
#     grouped['iv'] = total_iv
#
#     # Add variable name
#     grouped['variable'] = feature
#
#     # Reorder columns
#     grouped = grouped[['variable', 'bin', 'num_obs', 'num_events', 'num_non_events',
#                        'perc_events', 'perc_non_events', 'woe', 'iv_bin', 'iv']]
#
#     return grouped
def calculate_woe_iv(df, feature, target, num_bins=5):
    """
    Calculate Weight of Evidence (WOE) and Information Value (IV) for a single feature.

    WOE = ln(% of Events / % of Non-Events)
    IV = sum((% Events - % Non-Events) * WOE)

    Parameters:
    -----------
    df : DataFrame
        Input dataframe containing feature and target
    feature : str
        Column name of the feature to analyze
    target : str
        Column name of the target variable
    num_bins : int
        Number of bins to create

    Returns:
    --------
    DataFrame with WOE and IV statistics per bin
    """
    # Create a copy to avoid modifying original data
    temp_df = df[[feature, target]].copy()
    temp_df = temp_df.dropna()

    if len(temp_df) == 0:
        return None

    # First get quantile boundaries
    try:
        _, bin_edges = pd.qcut(temp_df[feature], q=num_bins, retbins=True, duplicates='drop')
    except ValueError:
        # If qcut fails (e.g., too few unique values), fall back to evenly spaced bins
        try:
            bin_edges = np.linspace(temp_df[feature].min(), temp_df[feature].max(), num_bins + 1)
        except ValueError:
            return None

    # Now use cut with right=False to create left-closed, right-open intervals
    try:
        temp_df['bin'] = pd.cut(temp_df[feature], bins=bin_edges, right=False, duplicates='drop')
    except ValueError:
        return None

    # Calculate statistics per bin
    grouped = temp_df.groupby('bin', observed=True).agg(
        num_obs=(target, 'count'),
        num_events=(target, 'sum')
    ).reset_index()

    grouped['num_non_events'] = grouped['num_obs'] - grouped['num_events']

    # Calculate percentages
    total_events = grouped['num_events'].sum()
    total_non_events = grouped['num_non_events'].sum()

    if total_events == 0 or total_non_events == 0:
        return None

    grouped['perc_events'] = grouped['num_events'] / total_events
    grouped['perc_non_events'] = grouped['num_non_events'] / total_non_events

    # Handle zeros to avoid log(0)
    grouped['perc_events'] = grouped['perc_events'].replace(0, 0.0001)
    grouped['perc_non_events'] = grouped['perc_non_events'].replace(0, 0.0001)

    # Calculate WOE
    grouped['woe'] = np.log(grouped['perc_events'] / grouped['perc_non_events'])

    # Calculate IV per bin and total IV
    grouped['iv_bin'] = (grouped['perc_events'] - grouped['perc_non_events']) * grouped['woe']
    total_iv = grouped['iv_bin'].sum()
    grouped['iv'] = total_iv

    # Add variable name
    grouped['variable'] = feature

    # Reorder columns
    grouped = grouped[['variable', 'bin', 'num_obs', 'num_events', 'num_non_events',
                       'perc_events', 'perc_non_events', 'woe', 'iv_bin', 'iv']]

    return grouped

def calculate_iv_for_all_features(df, features, target, num_bins=5):
    """
    Calculate IV for all features in the dataset.
    
    Returns:
    --------
    Tuple of (detailed_df, summary_df)
    """
    all_results = []
    
    for feature in features:
        result = calculate_woe_iv(df, feature, target, num_bins)
        if result is not None:
            all_results.append(result)
    
    if not all_results:
        return None, None
    
    detailed_df = pd.concat(all_results, ignore_index=True)
    
    # Create summary with unique IV per variable
    summary_df = detailed_df.groupby('variable')['iv'].first().reset_index()
    summary_df = summary_df.sort_values('iv', ascending=False).reset_index(drop=True)
    
    return detailed_df, summary_df


def get_correlation_matrix(df, features, method='spearman'):
    """
    Calculate correlation matrix for selected features.
    
    Parameters:
    -----------
    df : DataFrame
        Input dataframe
    features : list
        List of feature columns to include
    method : str
        Correlation method ('pearson' or 'spearman')
        
    Returns:
    --------
    Correlation matrix as DataFrame
    """
    corr_matrix = df[features].corr(method=method)
    return corr_matrix


def create_prioritization_table(iv_summary, corr_matrix):
    """
    Merge IV summary with correlation matrix to create prioritization table.
    
    Parameters:
    -----------
    iv_summary : DataFrame
        DataFrame with 'variable' and 'iv' columns
    corr_matrix : DataFrame
        Correlation matrix
        
    Returns:
    --------
    Merged DataFrame with IV and correlations
    """
    # Reset correlation matrix to have variable column
    corr_df = corr_matrix.reset_index()
    corr_df = corr_df.rename(columns={'index': 'variable'})
    
    # Merge with IV summary
    merged = pd.merge(iv_summary, corr_df, on='variable', how='inner')
    
    # Sort by IV (highest first)
    merged = merged.sort_values('iv', ascending=False).reset_index(drop=True)
    
    return merged


def apply_correlation_filter(prioritization_df, corr_threshold):
    """
    Apply correlation-based feature filtering.
    
    For each pair of highly correlated features (|corr| >= threshold),
    keep the one with higher IV.
    
    Parameters:
    -----------
    prioritization_df : DataFrame
        Prioritization table with 'variable', 'iv', and correlation columns
    corr_threshold : float
        Correlation threshold (e.g., 0.50)
        
    Returns:
    --------
    DataFrame with max_abs_corr and keep_feature columns added
    """
    df = prioritization_df.copy()
    
    # Get feature columns (excluding 'variable' and 'iv')
    feature_cols = [col for col in df.columns if col not in ['variable', 'iv', 'max_abs_corr', 'keep_feature']]
    
    # Keep track of which features to keep
    kept_features = set()
    max_abs_corrs = []
    keep_flags = []
    
    # Process features in order of IV (highest first)
    for idx, row in df.iterrows():
        current_var = row['variable']
        
        if len(kept_features) == 0:
            # First feature is always kept
            max_abs_corrs.append(np.nan)
            keep_flags.append(1)
            kept_features.add(current_var)
        else:
            # Calculate max absolute correlation with already kept features
            corrs_with_kept = []
            for kept_var in kept_features:
                if kept_var in feature_cols:
                    corr_val = abs(row[kept_var])
                    corrs_with_kept.append(corr_val)
            
            if corrs_with_kept:
                max_corr = max(corrs_with_kept)
            else:
                max_corr = 0
            
            max_abs_corrs.append(max_corr)
            
            # Keep if below threshold
            if max_corr < corr_threshold:
                keep_flags.append(1)
                kept_features.add(current_var)
            else:
                keep_flags.append(0)
    
    df['max_abs_corr'] = max_abs_corrs
    df['keep_feature'] = keep_flags
    
    # Reorder columns to put max_abs_corr and keep_feature after iv
    cols = ['variable', 'iv', 'max_abs_corr', 'keep_feature'] + feature_cols
    df = df[cols]
    
    return df

# 3. Import data

In [24]:
# Four datasets: "data_treatment_train.csv", "data_treatment_val.csv", "data_treatment_test.csv",
# "data_treatment_impact.csv".

df_train = pd.read_csv(f"{INPUT_PATH}/data_treatment_train.csv")
df_val = pd.read_csv(f"{INPUT_PATH}/data_treatment_val.csv")
df_test = pd.read_csv(f"{INPUT_PATH}/data_treatment_test.csv")
df_impact = pd.read_csv(f"{INPUT_PATH}/data_treatment_impact.csv")

print(f"Train shape: {df_train.shape}")
print(f"Validation shape: {df_val.shape}")
print(f"Test shape: {df_test.shape}")
print(f"Impact shape: {df_impact.shape}")

# Display first few rows of training data
df_train.head()

Train shape: (23423, 197)
Validation shape: (2928, 197)
Test shape: (2928, 197)
Impact shape: (439, 197)


Unnamed: 0,SK_ID_CURR,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY_x,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,...,is_missing_YEARS_BEGINEXPLUATATION_MEDI,is_missing_YEARS_BEGINEXPLUATATION_MODE,is_missing_TOTALAREA_MODE,is_missing_EXT_SOURCE_3,is_missing_AMT_REQ_CREDIT_BUREAU_DAY,is_missing_AMT_REQ_CREDIT_BUREAU_HOUR,is_missing_AMT_REQ_CREDIT_BUREAU_MON,is_missing_AMT_REQ_CREDIT_BUREAU_QRT,is_missing_AMT_REQ_CREDIT_BUREAU_WEEK,is_missing_AMT_REQ_CREDIT_BUREAU_YEAR
0,162883,0.0,0.0,90000.0,180000.0,9000.0,180000.0,0.02461,-10706.0,-192.0,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,117041,0.0,0.0,90000.0,270000.0,13500.0,270000.0,0.018634,-17489.0,-1464.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,281109,0.0,1.0,148500.0,315000.0,15750.0,315000.0,0.009657,-18134.0,-7993.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,325062,0.0,0.0,166500.0,157500.0,7875.0,157500.0,0.010032,-18791.0,-4540.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,349323,0.0,0.0,225000.0,675000.0,33750.0,675000.0,0.019689,-20450.0,-2586.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 4. Conduct pre-model feature selection

## 4.1. Step 1: Conduct univariate variable selection

### 4.1.1. Apply WOE binning for each feature and calculate IV

In [25]:
# Calculate information value. Use 5 quantile bin via the pandas qcut() function.
# https://pandas.pydata.org/docs/reference/api/pandas.qcut.html
# Ignore the following columns from the treatment: ID, target label, old pd label.

# Get list of feature columns (exclude ID, TARGET, and OLD_PD_LABEL)
exclude_cols = ID_LABELS + [TARGET_LABEL, OLD_PD_LABEL]
feature_cols = [col for col in df_train.columns if col not in exclude_cols]

print(f"Number of features to analyze: {len(feature_cols)}")

# Calculate IV for all features using training data
iv_detailed, iv_summary = calculate_iv_for_all_features(
    df_train, 
    feature_cols, 
    TARGET_LABEL, 
    num_bins=NUM_WOE_QUANTILES
)

print(f"\nIV calculation completed for {len(iv_summary)} features.")
print(f"\nDetailed WOE/IV table (first 20 rows):")
iv_detailed.head(20)

# Output example.
# variable 	bin 	num_obs 	num_events 	num_non_events 	perc_events 	perc_non_events 	woe 	iv_bin 	iv
# 0 	EXT_SOURCE_2 	(-0.00099868, 0.366] 	4685 	518.0 	4167.0 	0.403741 	0.188211 	0.763208 	0.164494 	0.349617
# 1 	EXT_SOURCE_2 	(0.366, 0.522] 	4684 	281.0 	4403.0 	0.219018 	0.198871 	0.096498 	0.001944 	0.349617
# 2 	EXT_SOURCE_2 	(0.522, 0.612] 	4685 	226.0 	4459.0 	0.176150 	0.201400 	-0.133960 	0.003383 	0.349617
# 3 	EXT_SOURCE_2 	(0.612, 0.685] 	4684 	167.0 	4517.0 	0.130164 	0.204020 	-0.449425 	0.033193 	0.349617
# 4 	EXT_SOURCE_2 	(0.685, 0.855] 	4685 	91.0 	4594.0 	0.070928 	0.207498 	-1.073462 	0.146603 	0.349617

Number of features to analyze: 194

IV calculation completed for 194 features.

Detailed WOE/IV table (first 20 rows):


Unnamed: 0,variable,bin,num_obs,num_events,num_non_events,perc_events,perc_non_events,woe,iv_bin,iv
0,CNT_CHILDREN,"[0.0, 1.0)",15153,811.0,14342.0,0.645187,0.657558,-0.018993,0.000235,0.000674
1,CNT_CHILDREN,"[1.0, 3.0)",7915,446.0,7469.0,0.354813,0.342442,0.035489,0.000439,0.000674
2,AMT_INCOME_TOTAL,"[45000.0, 90000.0)",3225,232.0,2993.0,0.181961,0.136879,0.284693,0.012834,0.072544
3,AMT_INCOME_TOTAL,"[90000.0, 117000.0)",6098,415.0,5683.0,0.32549,0.259901,0.225031,0.01476,0.072544
4,AMT_INCOME_TOTAL,"[117000.0, 157500.0)",4160,236.0,3924.0,0.185098,0.179457,0.030952,0.000175,0.072544
5,AMT_INCOME_TOTAL,"[157500.0, 225000.0)",5210,238.0,4972.0,0.186667,0.227385,-0.19732,0.008035,0.072544
6,AMT_INCOME_TOTAL,"[225000.0, 675000.0)",4448,154.0,4294.0,0.120784,0.196378,-0.486035,0.036741,0.072544
7,AMT_CREDIT,"[135000.0, 180000.0)",2426,160.0,2266.0,0.124805,0.104237,0.180089,0.003704,0.126529
8,AMT_CREDIT,"[180000.0, 225000.0)",6698,516.0,6182.0,0.402496,0.284374,0.347396,0.041035,0.126529
9,AMT_CREDIT,"[225000.0, 270000.0)",2221,113.0,2108.0,0.088144,0.096969,-0.095421,0.000842,0.126529


### 4.1.2. Keep features that are equal to or greater than IV_THRESHOLD

In [26]:
# Filter features based on IV threshold
iv_filtered = iv_summary[iv_summary['iv'] >= IV_THRESHOLD].copy()
iv_filtered = iv_filtered.reset_index(drop=True)

print(f"IV threshold: {IV_THRESHOLD}")
print(f"Features meeting threshold: {len(iv_filtered)}")
print(f"Features removed: {len(iv_summary) - len(iv_filtered)}")

# Get the list of features to keep
features_to_keep = iv_filtered['variable'].tolist()

# Apply filter to training data
cols_to_keep = ID_LABELS + [TARGET_LABEL, OLD_PD_LABEL] + features_to_keep
df_train_filtered = df_train[cols_to_keep].copy()
df_val_filtered = df_val[cols_to_keep].copy()
df_test_filtered = df_test[cols_to_keep].copy()
df_impact_filtered = df_impact[cols_to_keep].copy()

print(f"\nSize of train after removal = {df_train_filtered.shape}.")
print(f"# features removed = {len(feature_cols) - len(features_to_keep)}.")

# Display IV summary for kept features
print(f"\nIV Summary for features meeting threshold:")
iv_filtered.head()

# Sample output example.
# variable 	iv
# 0 	AMT_ANNUITY_x 	0.141543
# 1 	AMT_CREDIT 	0.141543
# 2 	AMT_CREDIT_SUM 	0.019923
# 3 	AMT_CREDIT_SUM_DEBT 	0.026903
# 4 	AMT_GOODS_PRICE 	0.139077

# Size of train after removal = (23423, 26).
# # features removed = 171.

IV threshold: 0.05
Features meeting threshold: 22
Features removed: 172

Size of train after removal = (23423, 25).
# features removed = 172.

IV Summary for features meeting threshold:


Unnamed: 0,variable,iv
0,EXT_SOURCE_2,0.349476
1,EXT_SOURCE_3,0.247875
2,EXT_SOURCE_1,0.140765
3,DAYS_EMPLOYED,0.138171
4,AMT_CREDIT,0.126529


### 4.2.1. Get correlation matrix

In [27]:
# Use Pandas's Spearman correlation
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.corr.html

# Calculate Spearman correlation matrix for the filtered features
corr_matrix = get_correlation_matrix(df_train_filtered, features_to_keep, method='spearman')

# Convert to long format for display
corr_df = corr_matrix.reset_index()
corr_df = corr_df.rename(columns={'index': 'variable'})

print(f"Correlation matrix shape: {corr_matrix.shape}")
print(f"\nCorrelation matrix (first 5 rows):")
corr_df.head()

# Sample output example.
# variable 	AMT_ANNUITY_x 	AMT_CREDIT 	AMT_GOODS_PRICE 	AMT_INCOME_TOTAL 	APARTMENTS_AVG 	APARTMENTS_MODE 	DAYS_BIRTH 	DAYS_EMPLOYED 	DAYS_ID_PUBLISH 	... 	EXT_SOURCE_1 	EXT_SOURCE_2 	EXT_SOURCE_3 	FLOORSMAX_AVG 	FLOORSMAX_MEDI 	FLOORSMAX_MODE 	MAX_12M_DAYS_CREDIT 	NUM_TIMES_12M_CREDIT_ACTIVE_Active 	REGION_POPULATION_RELATIVE 	TOTALAREA_MODE
# 0 	AMT_ANNUITY_x 	1.000000 	0.999990 	0.984831 	0.504992 	0.063521 	0.060415 	-0.266947 	-0.136266 	-0.135793 	... 	0.213585 	0.242036 	0.087469 	0.130693 	0.127550 	0.121965 	-0.042097 	-0.015691 	0.154382 	0.085155
# 1 	AMT_CREDIT 	0.999990 	1.000000 	0.984840 	0.504980 	0.063466 	0.060366 	-0.266957 	-0.136284 	-0.135762 	... 	0.213523 	0.242006 	0.087436 	0.130646 	0.127505 	0.121924 	-0.042102 	-0.015690 	0.154359 	0.085112
# 2 	AMT_GOODS_PRICE 	0.984831 	0.984840 	1.000000 	0.499723 	0.062735 	0.059585 	-0.265253 	-0.138838 	-0.137305 	... 	0.210233 	0.242111 	0.085854 	0.129219 	0.126134 	0.120523 	-0.043285 	-0.014198 	0.152982 	0.084184
# 3 	AMT_INCOME_TOTAL 	0.504992 	0.504980 	0.499723 	1.000000 	0.093269 	0.088529 	-0.070374 	-0.146622 	-0.066916 	... 	0.129191 	0.224685 	-0.013639 	0.167823 	0.164531 	0.156526 	-0.045623 	0.032767 	0.147849 	0.121696
# 4 	APARTMENTS_AVG 	0.063521 	0.063466 	0.062735 	0.093269 	1.000000 	0.925845 	0.007444 	-0.009409 	-0.016673 	... 	0.029550 	0.073258 	0.012635 	0.656267 	0.656158 	0.654745 	-0.003585 	0.003451 	0.107505 	0.813276

# 5 rows × 24 columns

Correlation matrix shape: (22, 22)

Correlation matrix (first 5 rows):


Unnamed: 0,variable,EXT_SOURCE_2,EXT_SOURCE_3,EXT_SOURCE_1,DAYS_EMPLOYED,AMT_CREDIT,AMT_ANNUITY_x,AMT_GOODS_PRICE,DAYS_BIRTH,DAYS_ID_PUBLISH,...,ELEVATORS_MEDI,TOTALAREA_MODE,ELEVATORS_MODE,FLOORSMAX_MEDI,FLOORSMAX_AVG,LIVINGAREA_AVG,APARTMENTS_MODE,FLOORSMAX_MODE,REGION_RATING_CLIENT,APARTMENTS_MEDI
0,EXT_SOURCE_2,1.0,0.099211,0.157079,-0.10324,0.242006,0.242036,0.242111,-0.146328,-0.106413,...,0.161402,0.087936,0.156647,0.139015,0.140547,0.086822,0.067636,0.131985,-0.348961,0.070036
1,EXT_SOURCE_3,0.099211,1.0,0.092587,-0.068018,0.087436,0.087469,0.085854,-0.170191,-0.148008,...,0.025224,0.014025,0.023058,0.025808,0.026233,0.012164,0.013862,0.02407,0.000334,0.013879
2,EXT_SOURCE_1,0.157079,0.092587,1.0,-0.078025,0.213523,0.213585,0.210233,-0.374384,-0.139382,...,0.081838,0.045436,0.079123,0.06557,0.066425,0.042253,0.028365,0.061478,-0.068768,0.028305
3,DAYS_EMPLOYED,-0.10324,-0.068018,-0.078025,1.0,-0.136284,-0.136266,-0.138838,-0.013514,0.009264,...,-0.020975,-0.021045,-0.019445,-0.023158,-0.021893,-0.016359,-0.009519,-0.022408,0.002354,-0.007411
4,AMT_CREDIT,0.242006,0.087436,0.213523,-0.136284,1.0,0.99999,0.98484,-0.266957,-0.135762,...,0.144179,0.085112,0.138368,0.127505,0.130646,0.077888,0.060366,0.121924,-0.204004,0.060588


### 4.2.2. Create prioritization table

In [28]:
# Merge iv and corr.
prioritization_df = create_prioritization_table(iv_filtered, corr_matrix)

print(f"Prioritization table shape: {prioritization_df.shape}")
print(f"\nPrioritization table (first 5 rows):")
prioritization_df.head()

# Sample output example:
# variable 	iv 	AMT_ANNUITY_x 	AMT_CREDIT 	AMT_GOODS_PRICE 	AMT_INCOME_TOTAL 	APARTMENTS_AVG 	APARTMENTS_MODE 	DAYS_BIRTH 	DAYS_EMPLOYED 	... 	EXT_SOURCE_1 	EXT_SOURCE_2 	EXT_SOURCE_3 	FLOORSMAX_AVG 	FLOORSMAX_MEDI 	FLOORSMAX_MODE 	MAX_12M_DAYS_CREDIT 	NUM_TIMES_12M_CREDIT_ACTIVE_Active 	REGION_POPULATION_RELATIVE 	TOTALAREA_MODE
# 0 	AMT_ANNUITY_x 	0.141543 	1.000000 	0.999990 	0.984831 	0.504992 	0.063521 	0.060415 	-0.266947 	-0.136266 	... 	0.213585 	0.242036 	0.087469 	0.130693 	0.127550 	0.121965 	-0.042097 	-0.015691 	0.154382 	0.085155
# 1 	AMT_CREDIT 	0.141543 	0.999990 	1.000000 	0.984840 	0.504980 	0.063466 	0.060366 	-0.266957 	-0.136284 	... 	0.213523 	0.242006 	0.087436 	0.130646 	0.127505 	0.121924 	-0.042102 	-0.015690 	0.154359 	0.085112
# 2 	AMT_GOODS_PRICE 	0.139077 	0.984831 	0.984840 	1.000000 	0.499723 	0.062735 	0.059585 	-0.265253 	-0.138838 	... 	0.210233 	0.242111 	0.085854 	0.129219 	0.126134 	0.120523 	-0.043285 	-0.014198 	0.152982 	0.084184
# 3 	AMT_INCOME_TOTAL 	0.081092 	0.504992 	0.504980 	0.499723 	1.000000 	0.093269 	0.088529 	-0.070374 	-0.146622 	... 	0.129191 	0.224685 	-0.013639 	0.167823 	0.164531 	0.156526 	-0.045623 	0.032767 	0.147849 	0.121696
# 4 	APARTMENTS_AVG 	0.051764 	0.063521 	0.063466 	0.062735 	0.093269 	1.000000 	0.925845 	0.007444 	-0.009409 	... 	0.029550 	0.073258 	0.012635 	0.656267 	0.656158 	0.654745 	-0.003585 	0.003451 	0.107505 	0.813276

Prioritization table shape: (22, 24)

Prioritization table (first 5 rows):


Unnamed: 0,variable,iv,EXT_SOURCE_2,EXT_SOURCE_3,EXT_SOURCE_1,DAYS_EMPLOYED,AMT_CREDIT,AMT_ANNUITY_x,AMT_GOODS_PRICE,DAYS_BIRTH,...,ELEVATORS_MEDI,TOTALAREA_MODE,ELEVATORS_MODE,FLOORSMAX_MEDI,FLOORSMAX_AVG,LIVINGAREA_AVG,APARTMENTS_MODE,FLOORSMAX_MODE,REGION_RATING_CLIENT,APARTMENTS_MEDI
0,EXT_SOURCE_2,0.349476,1.0,0.099211,0.157079,-0.10324,0.242006,0.242036,0.242111,-0.146328,...,0.161402,0.087936,0.156647,0.139015,0.140547,0.086822,0.067636,0.131985,-0.348961,0.070036
1,EXT_SOURCE_3,0.247875,0.099211,1.0,0.092587,-0.068018,0.087436,0.087469,0.085854,-0.170191,...,0.025224,0.014025,0.023058,0.025808,0.026233,0.012164,0.013862,0.02407,0.000334,0.013879
2,EXT_SOURCE_1,0.140765,0.157079,0.092587,1.0,-0.078025,0.213523,0.213585,0.210233,-0.374384,...,0.081838,0.045436,0.079123,0.06557,0.066425,0.042253,0.028365,0.061478,-0.068768,0.028305
3,DAYS_EMPLOYED,0.138171,-0.10324,-0.068018,-0.078025,1.0,-0.136284,-0.136266,-0.138838,-0.013514,...,-0.020975,-0.021045,-0.019445,-0.023158,-0.021893,-0.016359,-0.009519,-0.022408,0.002354,-0.007411
4,AMT_CREDIT,0.126529,0.242006,0.087436,0.213523,-0.136284,1.0,0.99999,0.98484,-0.266957,...,0.144179,0.085112,0.138368,0.127505,0.130646,0.077888,0.060366,0.121924,-0.204004,0.060588


In [29]:
# Determine max absolute correlation with above features (sorted by IV importance) and flag to keep if at or below
# threshold.

prioritization_with_filter = apply_correlation_filter(prioritization_df, CORRELATION_FILTER)

print(f"Correlation filter threshold: {CORRELATION_FILTER}")
print(f"\nPrioritization table with filter (first 5 rows):")
prioritization_with_filter.head()

# Sample output example:
# variable 	iv 	max_abs_corr 	keep_feature 	AMT_ANNUITY_x 	AMT_CREDIT 	AMT_GOODS_PRICE 	AMT_INCOME_TOTAL 	APARTMENTS_AVG 	APARTMENTS_MODE 	... 	EXT_SOURCE_1 	EXT_SOURCE_2 	EXT_SOURCE_3 	FLOORSMAX_AVG 	FLOORSMAX_MEDI 	FLOORSMAX_MODE 	MAX_12M_DAYS_CREDIT 	NUM_TIMES_12M_CREDIT_ACTIVE_Active 	REGION_POPULATION_RELATIVE 	TOTALAREA_MODE
# 0 	AMT_ANNUITY_x 	0.141543 	NaN 	1 	1.000000 	0.999990 	0.984831 	0.504992 	0.063521 	0.060415 	... 	0.213585 	0.242036 	0.087469 	0.130693 	0.127550 	0.121965 	-0.042097 	-0.015691 	0.154382 	0.085155
# 1 	AMT_CREDIT 	0.141543 	0.999990 	0 	0.999990 	1.000000 	0.984840 	0.504980 	0.063466 	0.060366 	... 	0.213523 	0.242006 	0.087436 	0.130646 	0.127505 	0.121924 	-0.042102 	-0.015690 	0.154359 	0.085112
# 2 	AMT_GOODS_PRICE 	0.139077 	0.984840 	0 	0.984831 	0.984840 	1.000000 	0.499723 	0.062735 	0.059585 	... 	0.210233 	0.242111 	0.085854 	0.129219 	0.126134 	0.120523 	-0.043285 	-0.014198 	0.152982 	0.084184
# 3 	AMT_INCOME_TOTAL 	0.081092 	0.504992 	0 	0.504992 	0.504980 	0.499723 	1.000000 	0.093269 	0.088529 	... 	0.129191 	0.224685 	-0.013639 	0.167823 	0.164531 	0.156526 	-0.045623 	0.032767 	0.147849 	0.121696
# 4 	APARTMENTS_AVG 	0.051764 	0.093269 	1 	0.063521 	0.063466 	0.062735 	0.093269 	1.000000 	0.925845 	... 	0.029550 	0.073258 	0.012635 	0.656267 	0.656158 	0.654745 	-0.003585 	0.003451 	0.107505 	0.813276

Correlation filter threshold: 0.5

Prioritization table with filter (first 5 rows):


Unnamed: 0,variable,iv,max_abs_corr,keep_feature,EXT_SOURCE_2,EXT_SOURCE_3,EXT_SOURCE_1,DAYS_EMPLOYED,AMT_CREDIT,AMT_ANNUITY_x,...,ELEVATORS_MEDI,TOTALAREA_MODE,ELEVATORS_MODE,FLOORSMAX_MEDI,FLOORSMAX_AVG,LIVINGAREA_AVG,APARTMENTS_MODE,FLOORSMAX_MODE,REGION_RATING_CLIENT,APARTMENTS_MEDI
0,EXT_SOURCE_2,0.349476,,1,1.0,0.099211,0.157079,-0.10324,0.242006,0.242036,...,0.161402,0.087936,0.156647,0.139015,0.140547,0.086822,0.067636,0.131985,-0.348961,0.070036
1,EXT_SOURCE_3,0.247875,0.099211,1,0.099211,1.0,0.092587,-0.068018,0.087436,0.087469,...,0.025224,0.014025,0.023058,0.025808,0.026233,0.012164,0.013862,0.02407,0.000334,0.013879
2,EXT_SOURCE_1,0.140765,0.157079,1,0.157079,0.092587,1.0,-0.078025,0.213523,0.213585,...,0.081838,0.045436,0.079123,0.06557,0.066425,0.042253,0.028365,0.061478,-0.068768,0.028305
3,DAYS_EMPLOYED,0.138171,0.10324,1,-0.10324,-0.068018,-0.078025,1.0,-0.136284,-0.136266,...,-0.020975,-0.021045,-0.019445,-0.023158,-0.021893,-0.016359,-0.009519,-0.022408,0.002354,-0.007411
4,AMT_CREDIT,0.126529,0.242006,1,0.242006,0.087436,0.213523,-0.136284,1.0,0.99999,...,0.144179,0.085112,0.138368,0.127505,0.130646,0.077888,0.060366,0.121924,-0.204004,0.060588


### 4.2.3. Remove the highly correlated, less import feature for each pair with a correlation exceeding CORRELATION_FILTER

In [30]:
# Get features to keep after correlation filtering
final_features = prioritization_with_filter[prioritization_with_filter['keep_feature'] == 1]['variable'].tolist()

print(f"Features kept after correlation filter: {len(final_features)}")
print(f"Features removed by correlation filter: {len(features_to_keep) - len(final_features)}")

# Apply final filter to all datasets
final_cols = ID_LABELS + [TARGET_LABEL, OLD_PD_LABEL] + final_features

df_train_final = df_train[final_cols].copy()
df_val_final = df_val[final_cols].copy()
df_test_final = df_test[final_cols].copy()
df_impact_final = df_impact[final_cols].copy()

print(f"\nSize of train after removal = {df_train_final.shape}.")
print(f"features removed = {len(features_to_keep) - len(final_features)}.")

# Display the final list of selected features
print(f"\nFinal selected features:")
for i, feat in enumerate(final_features, 1):
    iv_val = iv_filtered[iv_filtered['variable'] == feat]['iv'].values[0]
    print(f"  {i}. {feat} (IV: {iv_val:.4f})")

# Output example:
# Size of train after removal = (23423, 15).
# features removed = 11.

Features kept after correlation filter: 11
Features removed by correlation filter: 11

Size of train after removal = (23423, 14).
features removed = 11.

Final selected features:
  1. EXT_SOURCE_2 (IV: 0.3495)
  2. EXT_SOURCE_3 (IV: 0.2479)
  3. EXT_SOURCE_1 (IV: 0.1408)
  4. DAYS_EMPLOYED (IV: 0.1382)
  5. AMT_CREDIT (IV: 0.1265)
  6. DAYS_BIRTH (IV: 0.0930)
  7. DAYS_ID_PUBLISH (IV: 0.0899)
  8. DAYS_LAST_PHONE_CHANGE (IV: 0.0659)
  9. ELEVATORS_AVG (IV: 0.0657)
  10. APARTMENTS_MODE (IV: 0.0531)
  11. REGION_RATING_CLIENT (IV: 0.0520)


# 5. Store final datasets

In [31]:
# Save the final datasets to OUTPUT_PATH
import os

# Create output directory if it doesn't exist
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Save datasets
df_train_final.to_csv(f"{OUTPUT_PATH}/variable_selection_train.csv", index=False)
df_val_final.to_csv(f"{OUTPUT_PATH}/variable_selection_val.csv", index=False)
df_test_final.to_csv(f"{OUTPUT_PATH}/variable_selection_test.csv", index=False)
df_impact_final.to_csv(f"{OUTPUT_PATH}/variable_selection_impact.csv", index=False)

print(f"Datasets saved to '{OUTPUT_PATH}/' directory:")
print(f"  - variable_selection_train.csv: {df_train_final.shape}")
print(f"  - variable_selection_val.csv: {df_val_final.shape}")
print(f"  - variable_selection_test.csv: {df_test_final.shape}")
print(f"  - variable_selection_impact.csv: {df_impact_final.shape}")

# Also save the IV summary and prioritization table for reference
iv_summary.to_csv(f"{OUTPUT_PATH}/iv_summary.csv", index=False)
prioritization_with_filter.to_csv(f"{OUTPUT_PATH}/prioritization_table.csv", index=False)

print(f"\nReference files saved:")
print(f"  - iv_summary.csv: IV values for all features")
print(f"  - prioritization_table.csv: Full prioritization with correlation filter")

Datasets saved to 'data_out/' directory:
  - variable_selection_train.csv: (23423, 14)
  - variable_selection_val.csv: (2928, 14)
  - variable_selection_test.csv: (2928, 14)
  - variable_selection_impact.csv: (439, 14)

Reference files saved:
  - iv_summary.csv: IV values for all features
  - prioritization_table.csv: Full prioritization with correlation filter
