In [1]:
import polars as pl
import pandas as pd
import numpy as np
import gc
import time

from autogluon.tabular import TabularPredictor
from autogluon.core.metrics import make_scorer

from sklearn.preprocessing import StandardScaler


def reduce_mem_usage_pandas(df: pd.DataFrame) -> pd.DataFrame:
    """
    Redukcja zużycia pamięci w Pandas poprzez downcast typów numerycznych i konwersję do kategorii.
    """
    for col in df.columns:
        col_type = df[col].dtype

        if pd.api.types.is_integer_dtype(col_type):
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                df[col] = df[col].astype(np.int8)
            elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                df[col] = df[col].astype(np.int16)
            elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                df[col] = df[col].astype(np.int32)
            else:
                df[col] = df[col].astype(np.int64)
        
        elif pd.api.types.is_float_dtype(col_type):
            df[col] = df[col].astype(np.float32)
        
        elif pd.api.types.is_object_dtype(col_type):
            num_unique_values = df[col].nunique()
            num_total_values = len(df[col])
            if num_unique_values / num_total_values < 0.5:
                df[col] = df[col].astype('category')
    
    return df

def reduce_mem_usage_polars(df: pl.DataFrame) -> pl.DataFrame:
    """
    Redukcja pamięci w Polars przez downcast typów numerycznych.
    """
    for col in df.columns:
        dtype = df[col].dtype
        # Sprawdź typy
        if dtype == pl.Int64:
            col_min = df[col].min()
            col_max = df[col].max()
            if col_min >= np.iinfo(np.int8).min and col_max <= np.iinfo(np.int8).max:
                df = df.with_columns(pl.col(col).cast(pl.Int8))
            elif col_min >= np.iinfo(np.int16).min and col_max <= np.iinfo(np.int16).max:
                df = df.with_columns(pl.col(col).cast(pl.Int16))
            elif col_min >= np.iinfo(np.int32).min and col_max <= np.iinfo(np.int32).max:
                df = df.with_columns(pl.col(col).cast(pl.Int32))
        elif dtype == pl.Float64:
            col_min = df[col].min()
            col_max = df[col].max()
            # Można zdecydować się np. tylko na Float32
            df = df.with_columns(pl.col(col).cast(pl.Float32))
    return df

class CONFIG:
    target_col = "responder_6"
    lag_cols_original = ["date_id", "symbol_id"] + [f"responder_{idx}" for idx in range(9)]
    valid_ratio = 0.1
    start_dt = 1100
    lags_cols = [f"responder_{i}_lag_1" for i in range(9)]

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
features = [f'feature_{i:02d}' for i in range(79)]
target = CONFIG.target_col
weight_col = 'weight'

train_path1 = 'train.parquet'

print("Reading training data with Polars...")
df_polars = pl.read_parquet(train_path1)

Reading training data with Polars...


In [3]:
print("Reducing memory usage in Polars (df_polars)...")
df_polars = reduce_mem_usage_polars(df_polars)

Reducing memory usage in Polars (df_polars)...


In [4]:
def preprocess_data_polars(
    df: pl.DataFrame,
    features: list,
    target: str,
    weight_col: str,
    fallback_value: float = 3.0
) -> pl.DataFrame:
    required_columns = set(features + [target, weight_col])
    missing_columns = required_columns - set(df.columns)
    if missing_columns:
        raise ValueError(f"Missing columns in dataset: {missing_columns}")

    print("Missing values per column before imputation (Polars):")
    missing_before = df.select(
        [pl.col(col).is_null().sum().alias(f"{col}_missing") for col in required_columns]
    )
    print(missing_before)
    fill_expressions = []
    for col in features:
        fill_expressions.append(pl.col(col).fill_null(fallback_value))
    fill_expressions.append(pl.col(weight_col).fill_null(1.9))
    df = df.with_columns(fill_expressions)

    weight_array = df[weight_col].to_numpy().reshape(-1, 1)
    scaler = StandardScaler()
    weight_scaled = scaler.fit_transform(weight_array).flatten()
    
    df = df.with_columns([
        pl.Series(weight_col, weight_scaled).alias(weight_col).cast(pl.Float32)
    ])

    # Logowanie brakujących wartości po imputacji
    print("Missing values per column after imputation (Polars):")
    missing_after = df.select(
        [pl.col(col).is_null().sum().alias(f"{col}_missing") for col in required_columns]
    )
    print(missing_after)

    return df

In [5]:
df_polars = preprocess_data_polars(
    df_polars,
    features=features,
    target=target,
    weight_col=weight_col,
    fallback_value=3.0
)

print("Tworzenie lagów w Polars za pomocą window functions...")

df_polars = df_polars.sort(["symbol_id", "date_id"])

lag_expressions = [
    pl.col(f"responder_{i}").shift(1).over("symbol_id").alias(f"responder_{i}_lag_1")
    for i in range(9)
]

df_polars = df_polars.with_columns(lag_expressions)

Missing values per column before imputation (Polars):
shape: (1, 81)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ feature_4 ┆ feature_0 ┆ feature_0 ┆ feature_1 ┆ … ┆ feature_3 ┆ feature_0 ┆ feature_4 ┆ feature_ │
│ 8_missing ┆ 3_missing ┆ 4_missing ┆ 8_missing ┆   ┆ 6_missing ┆ 9_missing ┆ 3_missing ┆ 46_missi │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ng       │
│ u32       ┆ u32       ┆ u32       ┆ u32       ┆   ┆ u32       ┆ u32       ┆ u32       ┆ ---      │
│           ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆ u32      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ 0         ┆ 0         ┆ 0         ┆ 0         ┆ … ┆ 0         ┆ 0         ┆ 40        ┆ 1248     │
└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘
Missing values per col

In [6]:

print("Reducing memory usage in Polars after adding lags...")
df_polars = reduce_mem_usage_polars(df_polars)

print("Podział na zestawy treningowe i walidacyjne...")

df = df_polars.to_pandas()

df = reduce_mem_usage_pandas(df)

del df_polars
gc.collect()

Reducing memory usage in Polars after adding lags...
Podział na zestawy treningowe i walidacyjne...


0

In [7]:
categorical_cols = ['symbol_id']
for col in categorical_cols:
    if col in df.columns:
        df[col] = df[col].astype('category')

len_train = len(df)
valid_records = int(len_train * CONFIG.valid_ratio)
len_ofl_mdl = len_train - valid_records

df = df.sort_values('date_id').reset_index(drop=True)

last_tr_dt = df['date_id'].iloc[len_ofl_mdl]
print(f"\n len_train = {len_train}")
print(f" len_ofl_mdl = {len_ofl_mdl}")
print(f"---> Last offline train date = {last_tr_dt}\n")

validation_data = df[df['date_id'] > last_tr_dt]

del df

gc.collect()
train_features = features + CONFIG.lags_cols


 len_train = 4239026
 len_ofl_mdl = 3815124
---> Last offline train date = 1604

Zapis danych do plików Parquet...
Rozpoczynam trenowanie modelu AutoGluon...


In [10]:
from autogluon.common.features.feature_metadata import FeatureMetadata

def generate_feature_metadata_pandas(df: pd.DataFrame, features: list, categorical_threshold: int = 50) -> FeatureMetadata:
    """
    Generuje obiekt FeatureMetadata na podstawie typów danych w Pandas DataFrame.

    Args:
        df (pd.DataFrame): Pandas DataFrame.
        features (list): Lista nazw cech.
        categorical_threshold (int): Próg liczby unikalnych wartości, poniżej którego cecha jest traktowana jako kategoryczna.

    Returns:
        FeatureMetadata: Obiekt FeatureMetadata.
    """
    type_map_raw = {}
    type_group_map_special = {}

    for feature in features:
        dtype = df[feature].dtype
        if pd.api.types.is_integer_dtype(dtype):
            type_map_raw[feature] = 'int'
        elif pd.api.types.is_float_dtype(dtype):
            type_map_raw[feature] = 'float'
        elif pd.api.types.is_datetime64_any_dtype(dtype):
            type_map_raw[feature] = 'datetime'
        elif pd.api.types.is_categorical_dtype(dtype):
            type_map_raw[feature] = 'category'
        elif pd.api.types.is_object_dtype(dtype):
            unique_count = df[feature].nunique()
            if unique_count <= categorical_threshold:
                type_map_raw[feature] = 'category'
            else:
                type_map_raw[feature] = 'object'
        else:
            type_map_raw[feature] = 'object'

    feature_metadata = FeatureMetadata(type_map_raw=type_map_raw, type_group_map_special=type_group_map_special)
    return feature_metadata

feature_metadata = generate_feature_metadata_pandas(training_data, train_features)


In [12]:
predictor = TabularPredictor(
    label=target,
    sample_weight=weight_col,
    verbosity=3,
    log_to_file=True,
    path=f'AutogluonModels_{int(time.time())}/',
    problem_type='regression',
    
).fit(
    train_data=training_data[train_features + [target, weight_col]],
    time_limit=12 * 3600,
    ag_args_fit={
        'num_gpus': 1,
        'num_stack_levels': 2
    },
    presets='medium_quality',
    feature_metadata=feature_metadata,
    auto_stack=True,
    num_bag_folds=5,
)

Verbosity: 3 (Detailed Logging)
AutoGluon Version:  1.2
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Fri Mar 29 23:14:13 UTC 2024
CPU Count:          32
GPU Count:          1
Memory Avail:       23.77 GB / 31.34 GB (75.8%)
Disk Space Avail:   68.62 GB / 481.95 GB (14.2%)
Presets specified: ['medium_quality']
User Specified kwargs:
{'ag_args_fit': {'num_gpus': 1, 'num_stack_levels': 2},
 'auto_stack': True,
 'num_bag_folds': 5}
Full kwargs:
{'_feature_generator_kwargs': None,
 '_save_bag_folds': None,
 'ag_args': None,
 'ag_args_ensemble': None,
 'ag_args_fit': {'num_gpus': 1, 'num_stack_levels': 2},
 'auto_stack': True,
 'calibrate': 'auto',
 'delay_bag_sets': False,
 'ds_args': {'clean_up_fits': True,
             'detection_time_frac': 0.25,
             'enable_callbacks': False,
             'enable_ray_logging': True,
             'holdout_data': None,
             'holdout_frac': 0.1111111111111111,
             'memo

In [9]:
predictor = TabularPredictor.load("/mnt/h/Studia/magisterskie/1 sem/ProjektSemestralny/AutogluonModels_1736707301")

In [14]:
leaderboard = predictor.leaderboard(validation_data, extra_metrics=[weighted_r2_scorer], extra_info=True)
leaderboard

Unnamed: 0,model,score_test,weighted_r2,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,...,hyperparameters,hyperparameters_fit,ag_args_fit,features,compile_time,child_hyperparameters,child_hyperparameters_fit,child_ag_args_fit,ancestors,descendants
0,LightGBMXT_BAG_L1,-0.778313,0.018066,-0.890442,root_mean_squared_error,61.672315,1390.907716,57686.332518,61.672315,1390.907716,...,"{'use_orig_features': True, 'valid_stacker': T...",{},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...","[feature_48, feature_03, feature_04, feature_1...",,"{'learning_rate': 0.05, 'extra_trees': True}",{'num_boost_round': 9105},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...",[],"[WeightedEnsemble_L2, WeightedEnsemble_L3]"
1,WeightedEnsemble_L3,-0.778313,0.018066,-0.890442,root_mean_squared_error,61.6805,1390.941053,57686.467386,0.008186,0.033336,...,"{'use_orig_features': False, 'valid_stacker': ...",{},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...",[LightGBMXT_BAG_L1],,"{'ensemble_size': 25, 'subsample_size': 1000000}",{'ensemble_size': 1},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...",[LightGBMXT_BAG_L1],[]
2,WeightedEnsemble_L2,-0.778313,0.018066,-0.890442,root_mean_squared_error,61.685861,1390.960119,57686.473538,0.013547,0.052403,...,"{'use_orig_features': False, 'valid_stacker': ...",{},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...",[LightGBMXT_BAG_L1],,"{'ensemble_size': 25, 'subsample_size': 1000000}",{'ensemble_size': 1},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...",[LightGBMXT_BAG_L1],[]
