# 1. Import Libraries and datasets, and define global variables

In [11]:
import os
import warnings

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
# NOTE(review): this silences every warning for the whole notebook, including
# pandas deprecation/performance warnings — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Directory the extraction CSV is read from, and directory the treated CSVs are written to.
INPUT_PATH = "data_in"
OUTPUT_PATH = "data_out"

# Identifier column(s) and target column; both are excluded whenever feature lists are built.
ID_LABELS = ["SK_ID_CURR"]
TARGET_LABEL = "TARGET"

# Relative sizes of the train / validation / test splits of the development records.
TRAIN_FRAC = 0.8
VAL_FRAC = 0.1
TEST_FRAC = 0.1

# A feature is dropped when its share of missing values in train is >= this value.
HIGHLY_MISSING_THRESHOLD = 0.6
# A feature is dropped when its most frequent value covers >= this share of train rows.
HIGHLY_CONCENTRATED_THRESHOLD = 0.9

# 2. Create helper classes & functions

In [12]:
def get_missing_percentage(df):
    """Return the share of missing values per column, highest first.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    pandas.DataFrame
        Two columns, 'variable' (column name) and 'perc_missing' (fraction of
        null entries in that column), sorted by 'perc_missing' in descending
        order with a fresh 0..n-1 index.
    """
    # The mean of a boolean null-mask is the fraction of nulls in each column.
    null_share = df.isnull().mean()
    summary = pd.DataFrame({
        'variable': null_share.index,
        'perc_missing': null_share.values
    })
    summary = summary.sort_values('perc_missing', ascending=False)
    return summary.reset_index(drop=True)


def get_concentration(df, exclude_cols=None):
    """Measure how concentrated each column is on its most frequent value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.
    exclude_cols : list, optional
        Column names to skip. Defaults to no exclusions.

    Returns
    -------
    pandas.DataFrame
        One row per inspected column with:
        - 'variable': the column name,
        - 'values_with_most_concentration': list of the value(s) that reach the
          highest relative frequency (missing values count as a value),
        - 'concentration': that highest relative frequency (0 for a column
          with no rows).
        Rows are sorted by 'concentration' in descending order. When no column
        is inspected the result is an empty frame that still has these three
        columns.
    """
    if exclude_cols is None:
        exclude_cols = []

    output_columns = ['variable', 'values_with_most_concentration', 'concentration']

    results = []
    for col in df.columns:
        if col in exclude_cols:
            continue
        # Relative frequency of every distinct value, NaN included; value_counts
        # returns them sorted from most to least frequent.
        vc = df[col].value_counts(normalize=True, dropna=False)
        max_conc = vc.iloc[0] if len(vc) > 0 else 0
        # Keep every value tied for the top frequency, not just the first one.
        most_common_vals = vc[vc == max_conc].index.tolist()
        results.append({
            'variable': col,
            'values_with_most_concentration': most_common_vals,
            'concentration': max_conc
        })

    # Passing the columns explicitly keeps the schema stable for an empty result,
    # so the sort below cannot fail with a KeyError on 'concentration'.
    result_df = pd.DataFrame(results, columns=output_columns)
    return result_df.sort_values('concentration', ascending=False).reset_index(drop=True)


def print_dataset_info(df, name, target_col=None):
    """Print the shape of a dataset and the mean of its target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to describe.
    name : str
        Label printed in front of the size.
    target_col : str, optional
        Name of the target column. When it is not given or is not a column of
        `df`, the target rate is reported as nan.

    The line always has the form
    "<name> size = <shape>. Target rate = <rate>." so that datasets with and
    without a usable target are reported in exactly the same format.
    """
    has_target = bool(target_col) and target_col in df.columns
    target_rate = df[target_col].mean() if has_target else float("nan")
    print(f"{name} size = {df.shape}. Target rate = {target_rate}.")


def convert_to_numeric(df, exclude_cols=None):
    """
    Convert columns to numeric where possible.

    - Boolean columns are converted to 0/1.
    - In object columns, the strings 'True'/'False' are mapped to 1/0 and the
      column is then parsed as numeric.
    - An object column is only replaced by its numeric version when every
      non-missing value could be parsed. Otherwise it is left untouched (still
      of object dtype), so the caller can recognise it as truly categorical
      instead of receiving a column silently filled with NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is modified in place.
    exclude_cols : list, optional
        Column names that must not be touched. Defaults to no exclusions.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, returned for convenience.
    """
    if exclude_cols is None:
        exclude_cols = []

    for col in df.columns:
        if col in exclude_cols:
            continue

        series = df[col]

        # Convert bool columns to int
        if series.dtype == 'bool':
            df[col] = series.astype(int)

        # Convert object columns
        elif series.dtype == 'object':
            # First, replace 'True'/'False' strings with 1/0
            candidate = series.replace({'True': 1, 'False': 0})
            # Then try to parse; unparseable entries become NaN
            converted = pd.to_numeric(candidate, errors='coerce')
            # A value that was present before but is NaN now could not be parsed:
            # the column is not numeric-like, so keep the original values.
            lost_values = converted.isna() & series.notna()
            if not lost_values.any():
                df[col] = converted

    return df

# 3. Import data

In [13]:
# Read the extraction file ("data_extraction.csv") from the input directory.
# NOTE(review): the preview below shows a leading "Unnamed: 0" column — presumably a
# row index saved with the CSV; confirm whether it should be kept as a feature.
extraction_file = f"{INPUT_PATH}/data_extraction.csv"
df = pd.read_csv(extraction_file)

print(f"Dataset shape: {df.shape}")
print(f"\nFirst few rows:")
df.head()

Dataset shape: (29718, 315)

First few rows:


Unnamed: 0,SK_ID_CURR,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY_x,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,...,NUM_TIMES_12M_STATUS_0,NUM_TIMES_12M_IS_MOST_RECENT_STATUS_1,NUM_TIMES_12M_STATUS_1,NUM_TIMES_12M_IS_MOST_RECENT_STATUS_C,NUM_TIMES_12M_STATUS_C,NUM_TIMES_12M_IS_MOST_RECENT_STATUS_X,NUM_TIMES_12M_STATUS_X,NUM_TIMES_12M_IS_MOST_RECENT_STATUS_nan,NUM_TIMES_12M_STATUS_nan,IS_CREDIT_ENDDATE_MISSING
0,100004,0.0,0,67500.0,135000.0,6750.0,135000.0,0.010032,-19046,-225,...,,,,,,,,,,
1,100012,0.0,0,135000.0,405000.0,20250.0,405000.0,0.019689,-14469,-2019,...,,,,,,,,,,
2,100021,0.0,1,81000.0,270000.0,13500.0,270000.0,0.010966,-9776,-191,...,,,,,,,,,,
3,100022,0.0,0,112500.0,157500.0,7875.0,157500.0,0.04622,-17718,-7804,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,100024,0.0,0,135000.0,427500.0,21375.0,427500.0,0.015221,-18252,-4286,...,,,,,,,,,,


# 4. Steps

## 4.1. Step 1: Treat Boolean and categorical variables

In [14]:
# Step 1: make every non-ID column numeric.
# Boolean columns become 0/1, object columns are passed through convert_to_numeric,
# and any column that is still of object dtype afterwards is treated as truly
# categorical and dropped.

# Dtype overview before touching anything
print("Data types distribution (before treatment):")
print(df.dtypes.value_counts())

# How many Boolean (True/False) and object (string/categorical) columns are there?
bool_cols = list(df.select_dtypes(include=['bool']).columns)
print(f"\nBoolean columns: {len(bool_cols)}")
object_cols = list(df.select_dtypes(include=['object']).columns)
print(f"Object/categorical columns: {len(object_cols)}")

# Run the conversion on everything except the identifier column(s)
df = convert_to_numeric(df, exclude_cols=ID_LABELS)

print(f"\nData types distribution (after treatment):")
print(df.dtypes.value_counts())

# Whatever is still object-typed (identifiers aside) could not be made numeric
remaining_object_cols = [
    c
    for c in df.select_dtypes(include=['object']).columns.tolist()
    if c not in ID_LABELS
]

if remaining_object_cols:
    print(f"\nRemoving {len(remaining_object_cols)} true categorical columns: {remaining_object_cols}")
    df = df.drop(columns=remaining_object_cols)

print(f"\nDataset shape after treatment: {df.shape}")

Data types distribution (before treatment):
bool       155
float64    114
int64       41
object       5
Name: count, dtype: int64

Boolean columns: 155
Object/categorical columns: 5

Data types distribution (after treatment):
int64      196
float64    119
Name: count, dtype: int64

Dataset shape after treatment: (29718, 315)


## 4.2. Step 2: Split into impact, training, validation, and testing

In [15]:
# Splits are stratified on the target, see:
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
# Impact records are the rows flagged with is_test == 1; they carry no target value.
# All rows with is_test == 0 are development records and are split further.

# Report the full extraction first
print_dataset_info(df, "Extraction", TARGET_LABEL)

# Impact vs. development records; the flag itself is dropped from both
df_impact = df.loc[df['is_test'] == 1].drop(columns=['is_test']).reset_index(drop=True)
df_dev = df.loc[df['is_test'] == 0].drop(columns=['is_test']).reset_index(drop=True)

# Target of the development records, used to stratify the first split
y_dev = df_dev[TARGET_LABEL]

# Split 1: carve the test set out of the development records.
# Its share is TEST_FRAC relative to the sum of the three fractions.
test_size = TEST_FRAC / (TRAIN_FRAC + VAL_FRAC + TEST_FRAC)

df_train_val, df_test = train_test_split(
    df_dev,
    test_size=test_size,
    stratify=y_dev,
    random_state=42
)

# Split 2: divide what is left into train and validation.
# The validation share is expressed relative to train + validation only.
val_size = VAL_FRAC / (TRAIN_FRAC + VAL_FRAC)

df_train, df_val = train_test_split(
    df_train_val,
    test_size=val_size,
    stratify=df_train_val[TARGET_LABEL],
    random_state=42
)

# Give every split a clean 0..n-1 index
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

# Report size and target rate of each split
for split_name, split_df in (
    ("Train", df_train),
    ("Val", df_val),
    ("Test", df_test),
    ("Impact", df_impact),
):
    print_dataset_info(split_df, split_name, TARGET_LABEL)

# Output example.
# Extraction size = (29718, 315). Target rate = 0.05478329177909082.
# Train size = (23423, 314). Target rate = 0.054775220936686166.
# Val size = (2928, 314). Target rate = 0.054986338797814206.
# Test size = (2928, 314). Target rate = 0.0546448087431694
# Impact size = (439, 314). Target rate = nan

Extraction size = (29718, 315). Target rate = 0.05478329177909082.
Train size = (23423, 314). Target rate = 0.05481791401613798.
Val size = (2928, 314). Target rate = 0.0546448087431694.
Test size = (2928, 314). Target rate = 0.0546448087431694.
Impact size = (439, 314). Target rate = nan.


## 4.3. Step 3: Treat missing values (based on training)

### 4.3.1. Remove features at or above highly_missing_threshold

In [16]:
# Missing-value shares are measured on the training features only
# (identifier and target columns are left out).
non_feature_cols = ID_LABELS + [TARGET_LABEL]
feature_cols = [col for col in df_train.columns if col not in non_feature_cols]
missing_df = get_missing_percentage(df_train[feature_cols])

# Keep the rows whose missing share reaches the threshold
highly_missing = missing_df.loc[missing_df['perc_missing'].ge(HIGHLY_MISSING_THRESHOLD)]

print("Features with high missing rates:")
print(highly_missing.head().to_string())

# Names of the features to drop
cols_to_remove = list(highly_missing['variable'])

# The same columns are dropped from every dataset so that they stay aligned
df_train, df_val, df_test, df_impact = (
    dataset.drop(columns=cols_to_remove)
    for dataset in (df_train, df_val, df_test, df_impact)
)

print(f"\nSize of train after removal = {df_train.shape}.")
print(f"# features removed = {len(cols_to_remove)}.")

# Output example:
# variable 	perc_missing
# 0 	COMMONAREA_AVG 	0.668830
# 1 	COMMONAREA_MEDI 	0.668830
# 2 	COMMONAREA_MODE 	0.668830
# 3 	NONLIVINGAPARTMENTS_AVG 	0.662938
# 4 	NONLIVINGAPARTMENTS_MEDI 	0.662938

# Size of train after removal = (23423, 298).
# # features removed = 16.

Features with high missing rates:
                   variable  perc_missing
0            COMMONAREA_AVG      0.670196
1           COMMONAREA_MEDI      0.670196
2           COMMONAREA_MODE      0.670196
3  NONLIVINGAPARTMENTS_MEDI      0.664902
4   NONLIVINGAPARTMENTS_AVG      0.664902

Size of train after removal = (23423, 298).
# features removed = 16.


### 4.3.2. Create a binary flag for every variable that has a missing value

In [17]:
# Identify columns with missing values in training set (excluding ID and target)
feature_cols = [col for col in df_train.columns if col not in ID_LABELS + [TARGET_LABEL]]

# Find columns with any missing values
cols_with_missing = [col for col in feature_cols if df_train[col].isnull().any()]

print(f"Number of columns with missing values: {len(cols_with_missing)}")


def _with_missing_flags(frame, cols):
    """Return `frame` with one 0/1 `<col>_missing` column appended per entry of `cols`.

    All flags are built in a single step and concatenated once, instead of
    inserting them one column at a time, which fragments the frame's storage.
    Assumes no existing column is already named `<col>_missing`.
    """
    flags = frame[cols].isnull().astype(int).add_suffix("_missing")
    return pd.concat([frame, flags], axis=1)


# The columns to flag are decided on train only and applied to every dataset,
# so all four frames end up with the same set of flag columns.
df_train = _with_missing_flags(df_train, cols_with_missing)
df_val = _with_missing_flags(df_val, cols_with_missing)
df_test = _with_missing_flags(df_test, cols_with_missing)
df_impact = _with_missing_flags(df_impact, cols_with_missing)

print(f"Size of train after adding missing flags = {df_train.shape}.")
print(f"# features added = {len(cols_with_missing)}.")

# Output example:
# Size of train after adding missing flags = (23423, 393).
# # features added = 95.

Number of columns with missing values: 95
Size of train after adding missing flags = (23423, 393).
# features added = 95.


### 4.3.3. Impute missing values with median value and apply results on validation and testing

In [18]:
# Reference: https://scikit-learn.org/stable/modules/impute.html

# Features = every column except identifier(s) and target
non_feature_cols = ID_LABELS + [TARGET_LABEL]
feature_cols = [col for col in df_train.columns if col not in non_feature_cols]

# Show the most incomplete training features before imputing
remaining_missing = get_missing_percentage(df_train[feature_cols])
print(remaining_missing.head())

# Medians are learned from the training data only ...
imputer = SimpleImputer(strategy='median')
df_train[feature_cols] = imputer.fit_transform(df_train[feature_cols])
# ... and then applied unchanged to the other datasets
for other_dataset in (df_val, df_test, df_impact):
    other_dataset[feature_cols] = imputer.transform(other_dataset[feature_cols])

# Sanity check on train: no feature should have missing values any more
remaining_missing = get_missing_percentage(df_train[feature_cols])
remaining_missing = remaining_missing[remaining_missing['perc_missing'] > 0]

if remaining_missing.empty:
    print("All missing values have been imputed successfully.")
else:
    print("Columns still with missing values after imputation:")
    print(remaining_missing.head())

print(f"\nTrain shape after imputation: {df_train.shape}")

# Output example:
# variable 	perc_missing
# 0 	NUM_TIMES_12M_IS_MOST_RECENT_STATUS_0 	0.502455
# 1 	NUM_TIMES_12M_IS_MOST_RECENT_STATUS_1 	0.502455
# 2 	NUM_TIMES_12M_IS_MOST_RECENT_STATUS_C 	0.502455
# 3 	NUM_TIMES_12M_IS_MOST_RECENT_STATUS_X 	0.502455
# 4 	NUM_TIMES_12M_IS_MOST_RECENT_STATUS_nan 	0.502455

            variable  perc_missing
0       LANDAREA_AVG      0.554156
1      LANDAREA_MEDI      0.554156
2      LANDAREA_MODE      0.554156
3  BASEMENTAREA_MEDI      0.544508
4   BASEMENTAREA_AVG      0.544508
All missing values have been imputed successfully.

Train shape after imputation: (23423, 393)


## 4.4. Step 4: Remove features at or above highly_concentrated_threshold (based on training)

In [19]:
# Features = every column except identifier(s) and target
non_feature_cols = ID_LABELS + [TARGET_LABEL]
feature_cols = [col for col in df_train.columns if col not in non_feature_cols]

# Share of the most frequent value of each training feature
concentration_df = get_concentration(df_train[feature_cols])

# A feature is "highly concentrated" when that share reaches the threshold
is_highly_concentrated = concentration_df['concentration'].ge(HIGHLY_CONCENTRATED_THRESHOLD)
highly_concentrated = concentration_df.loc[is_highly_concentrated]

print("Features with high concentration:")
print(highly_concentrated.head().to_string())

# Names of the features to drop
cols_to_remove_concentrated = list(highly_concentrated['variable'])

# Drop them from every dataset so that all frames keep the same columns
df_train, df_val, df_test, df_impact = (
    dataset.drop(columns=cols_to_remove_concentrated)
    for dataset in (df_train, df_val, df_test, df_impact)
)

print(f"\nSize of train after removal = {df_train.shape}.")
print(f"# features removed = {len(cols_to_remove_concentrated)}.")

# Output example:
# variable 	values_with_most_concentration 	concentration
# 0 	CODE_GENDER_nan 	[0.0] 	1.0
# 1 	FLAG_DOCUMENT_12 	[0.0] 	1.0
# 2 	FLAG_DOCUMENT_2 	[0.0] 	1.0
# 3 	FLAG_DOCUMENT_20 	[0.0] 	1.0
# 4 	FLAG_MOBIL 	[1.0] 	1.0

# Size of train after removal = (23423, 202).
# # features removed = 191.

Features with high concentration:
                                     variable values_with_most_concentration  concentration
0     NUM_TIMES_12M_IS_MOST_RECENT_STATUS_nan                          [0.0]            1.0
1                    NUM_TIMES_12M_STATUS_nan                          [0.0]            1.0
2           NUM_TIMES_12M_CREDIT_CURRENCY_nan                          [0.0]            1.0
3             NUM_TIMES_12M_CREDIT_ACTIVE_nan                          [0.0]            1.0
4  NUM_TIMES_12M_CREDIT_TYPE_Real estate loan                          [0.0]            1.0

Size of train after removal = (23423, 202).
# features removed = 191.


# 5. Store final treated datasets

In [20]:
# Create output directory if it doesn't exist
os.makedirs(OUTPUT_PATH, exist_ok=True)

# One entry per treated dataset: (label used in the shape report, file-name suffix, frame).
# Saving and reporting both read from this table, so the reported paths always
# match the files that were actually written.
treated_datasets = [
    ("Train", "train", df_train),
    ("Val", "val", df_val),
    ("Test", "test", df_test),
    ("Impact", "impact", df_impact),
]

# Save treated datasets
saved_paths = []
for dataset_label, file_suffix, dataset in treated_datasets:
    dataset_path = f"{OUTPUT_PATH}/data_treatment_{file_suffix}.csv"
    dataset.to_csv(dataset_path, index=False)
    saved_paths.append(dataset_path)

print("Saved treated datasets to:")
for dataset_path in saved_paths:
    print(f"  - {dataset_path}")

print("\nFinal dataset shapes:")
for dataset_label, file_suffix, dataset in treated_datasets:
    print(f"  {dataset_label}: {dataset.shape}")

Saved treated datasets to:
  - data_out/data_treatment_train.csv
  - data_out/data_treatment_val.csv
  - data_out/data_treatment_test.csv
  - data_out/data_treatment_impact.csv

Final dataset shapes:
  Train: (23423, 202)
  Val: (2928, 202)
  Test: (2928, 202)
  Impact: (439, 202)
