In [26]:
import os, gc, warnings, random, pickle
from pathlib import Path #文件系统路径的对象化处理，比直接用字符串更安全、方便的路径操作。
import numpy as np
import pandas as pd
import polars as pl
from tqdm.auto import tqdm
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import KFold
# import torch

In [27]:
class Config:
    VERSION=2
    SEED=42
    N_FOLDS=3
    BOOSTERS=['lgbm','xgb','cat']
    MAX_ROUNDS=2500
    EARLY_STOP=100
    VERBOSE=1
    Data_dir=r'./'
    MODEL_DIR=Path('./models');os.makedirs(Path('./models'),exist_ok=True) # exist_ok =True则不抛出异常
    OOF_DIR=Path('./oof');os.makedirs(Path('./oof'),exist_ok=True)
    TARGET_COUNT=424
    FEATURES_TO_ADD=['target']
    
    XGB_PARAMS={'objective': 'reg:squarederror'
        , 'learning_rate': 0.005,'max_depth':4,'random_state':SEED
        ,'tree_method':'hist'}
    
    LGBM_PARAMS = {
        'objective': 'regression', 'metric': 'rmse',
        'learning_rate': 0.005, 'num_leaves': 8, 'seed': SEED,
        'device': 'gpu', 'gpu_platform_id': 0, 'gpu_device_id': 0,
    }
    
    CATBOOST_PARAMS = {        'loss_function': 'RMSE', 'learning_rate': 0.005,
        'iterations': MAX_ROUNDS, 'depth': 4,
        'random_seed': SEED, 'verbose': False,
        'task_type': 'GPU', 'devices': '0:1',}

In [28]:
# ===================
# seed control
# ===================
def set_seed(seed=Config.SEED):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)##Python 的哈希种子（hash seed）。 有些操作基于hash的随机操作
set_seed()

In [None]:
def reduce_mem_usage(props):
    start_mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = [] # Keeps track of columns that have missing values filled in. 
    for col in props.columns:
        if props[col].dtype != object:  # Exclude strings
            
            # Print current column type
            print("******************************")
            print("Column: ",col)
            print("dtype before: ",props[col].dtype)
            
            # make variables for Int, max and min
            IsInt = False
            mx = props[col].max()
            mn = props[col].min()
            
            # Integer does not support NA, therefore, NA needs to be filled
            if not np.isfinite(props[col]).all(): 
                NAlist.append(col)
                props[col].fillna(mn-1,inplace=True)  
                   
            # test if column can be converted to an integer
            asint = props[col].fillna(0).astype(np.int64)
            result = (props[col] - asint)
            result = result.sum()
            if result > -0.01 and result < 0.01:
                IsInt = True

            
            # Make Integer/unsigned Integer datatypes
            if IsInt:
                if mn >= 0:
                    if mx < 255:
                        props[col] = props[col].astype(np.uint8)
                    elif mx < 65535:
                        props[col] = props[col].astype(np.uint16)
                    elif mx < 4294967295:
                        props[col] = props[col].astype(np.uint32)
                    else:
                        props[col] = props[col].astype(np.uint64)
                else:
                    if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:
                        props[col] = props[col].astype(np.int8)
                    elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:
                        props[col] = props[col].astype(np.int16)
                    elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:
                        props[col] = props[col].astype(np.int32)
                    elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:
                        props[col] = props[col].astype(np.int64)    
            
            # Make float datatypes 32 bit
            else:
                props[col] = props[col].astype(np.float32)
            
            # Print new column type
            print("dtype after: ",props[col].dtype)
            print("******************************")
    
    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props, NAlist

In [29]:
# ==================
# Feature engineering
# ==================
def add_features(df):
    df['dayofweek'] = df['date_id'] % 7
    df['month'] = (df['date_id'] // 30) % 12
    df['quarter'] = df['month'] // 3
    df['day_of_month'] = df['date_id'] % 30

    df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype(int)
    df['is_month_start'] = (df['day_of_month'] == 0).astype(int)
    df['is_month_end'] = (df['day_of_month'] == 29).astype(int)

    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.fillna(0, inplace=True)
    return df

In [30]:
# ====================================================
# Train Booster
# ====================================================
def train_model(booster,x_tr,y_tr,x_val,y_val):
    y_tr=np.nan_to_num(y_tr)
    y_val=np.nan_to_num(y_val)
    if booster =='lgbm':
        train_set=lgb.Dataset(x_tr,y_tr)
        val_set=lgb.Dataset(x_val,y_val)
        model=lgb.train(
            Config.LGBM_PARAMS
            ,train_set,valid_sets=val_set,
            num_boost_round=Config.MAX_ROUNDS,
            callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=False),
                       lgb.log_evaluation(Config.VERBOSE)]
                        )
        return model,model.predict(x_val)
    if booster =='xgb':
        train_set=xgb.DMatrix(data=x_tr,label=y_tr)
        val_set=xgb.DMatrix(data=x_val,label=y_val)
        xgb_model=xgb.train(params=Config.XGB_PARAMS,dtrain=train_set,evals=[(train_set, 'train'), (val_set, 'test')]
                  ,num_boost_round=Config.MAX_ROUNDS
                  ,early_stopping_rounds=Config.EARLY_STOP,
                    verbose_eval=Config.VERBOSE
                            )
        return xgb_model, xgb_model.predict(val_set)
    if booster =='cat':
        train_set=Pool(data=x_tr,label=y_tr)
        val_set=Pool(data=x_val,label=y_val)
        cat_model=CatBoostRegressor(*Config.CATBOOST_PARAMS)
        cat_model.fit(train_set,eval_set=val_set)
        return cat_model,cat_model.predict(val_set)
    

In [31]:
# ====================================================
# Training CV Wrapper
# ====================================================
def run_cv(booster,df,features):
    df = df.copy()
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.dropna(subset=['target'],inplace=True)
    oof_preds=np.zeros(len(df))
    kf=KFold(n_splits=Config.N_FOLDS,shuffle=True,random_state=Config.SEED)
    for fold, (train_index, val_idx) in enumerate(kf.split(df)):
        x_tr=df.iloc[train_index][features]
        y_tr=df.iloc[train_index]['target']
        x_val=df.iloc[val_idx][features]
        y_val=df.iloc[val_idx]['target']
        
        model,val_preds=train_model(booster,x_tr, y_tr, x_val, y_val)
        oof_preds[val_idx]=val_preds
        
        with open(Config.MODEL_DIR/f'{booster}_fold{fold}.pkl','wb') as f:
            pickle.dump(model,f)
        del model,x_tr,y_tr,x_val,y_val
        gc.collect()
    np.save(Config.OOF_DIR/f'oof_{booster}.npy',oof_preds)

In [32]:
# ====================================================
# Load Data
# ====================================================
train_df=pl.read_csv(Config.Data_dir+r'/train.csv',infer_schema_length=10000).to_pandas()
label_df=pl.read_csv(Config.Data_dir+'train_labels.csv').to_pandas()

In [33]:
train_df.shape,label_df.shape

((1917, 558), (1917, 425))

In [34]:
features=list(train_df.columns[1:])+Config.FEATURES_TO_ADD

In [45]:
label_df,_=reduce_mem_usage(label_df)
train_df,_=reduce_mem_usage(train_df)

Memory usage of properties dataframe is : 6.215980529785156  MB
******************************
Column:  date_id
dtype before:  int64
dtype after:  uint16
******************************
******************************
Column:  target_0
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  target_1
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  target_2
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  target_3
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  target_4
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  target_5
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  target_6
dtype before:  f

In [46]:
df_all=[]
for j,col in tqdm(enumerate(label_df.columns[1:])):
    temp = train_df.copy()
    temp['target'] = label_df[col]
    temp=add_features(temp)
    temp=temp.dropna(subset=['target'])
    df_all.append(temp)

train_full = pd.concat(df_all, axis=0).reset_index(drop=True)

0it [00:00, ?it/s]

In [47]:
train_full,_=reduce_mem_usage(train_full)

Memory usage of properties dataframe is : 1711.5404052734375  MB
******************************
Column:  date_id
dtype before:  uint16
dtype after:  uint16
******************************
******************************
Column:  LME_AH_Close
dtype before:  float32
dtype after:  float32
******************************
******************************
Column:  LME_CA_Close
dtype before:  float32
dtype after:  float32
******************************
******************************
Column:  LME_PB_Close
dtype before:  float32
dtype after:  float32
******************************
******************************
Column:  LME_ZS_Close
dtype before:  float32
dtype after:  float32
******************************
******************************
Column:  JPX_Gold_Mini_Futures_Open
dtype before:  float32
dtype after:  float32
******************************
******************************
Column:  JPX_Gold_Rolling-Spot_Futures_Open
dtype before:  uint16
dtype after:  uint16
******************************
*****

In [48]:
gc.collect()

22

In [38]:
features_all=features + [
        'dayofweek', 'month', 'quarter', 'day_of_month',
        'is_weekend', 'is_month_start', 'is_month_end']

In [49]:
train_full

Unnamed: 0,date_id,LME_AH_Close,LME_CA_Close,LME_PB_Close,LME_ZS_Close,JPX_Gold_Mini_Futures_Open,JPX_Gold_Rolling-Spot_Futures_Open,JPX_Gold_Standard_Futures_Open,JPX_Platinum_Mini_Futures_Open,JPX_Platinum_Standard_Futures_Open,...,FX_NOKJPY,FX_ZARGBP,target,dayofweek,month,quarter,day_of_month,is_weekend,is_month_start,is_month_end
0,0,2264.5,7205.0,2570.0,3349.0,4170.0,4215,4170,2199.0,2163,...,13.822740,0.059163,0.005948,0,0,0,0,0,1,0
1,1,2228.0,7147.0,2579.0,3327.0,4170.0,4215,4170,2199.0,2163,...,13.888146,0.059895,0.005783,1,0,0,1,0,0,0
2,2,2250.0,7188.5,2587.0,3362.0,4684.0,4691,4684,3363.0,3367,...,13.983675,0.060037,0.001048,2,0,0,2,0,0,0
3,3,2202.5,7121.0,2540.0,3354.0,4728.0,4737,4729,3430.0,3426,...,14.035571,0.059983,0.001700,3,0,0,3,0,0,0
4,4,2175.0,7125.0,2604.0,3386.0,4170.0,4215,4170,2199.0,2163,...,14.013760,0.059503,-0.003272,4,0,0,4,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
812803,1912,2450.0,9523.5,1961.5,2676.5,15086.0,15440,15085,4461.5,4467,...,14.058107,0.041388,-1.212359,1,3,1,22,0,0,0
812804,1913,2471.5,9519.5,1980.5,2710.5,15165.0,15509,15162,4495.0,4490,...,14.082236,0.041630,-0.141053,2,3,1,23,0,0,0
812805,1914,2471.5,9533.5,1974.0,2693.0,15040.0,15477,15044,4544.5,4555,...,14.126606,0.041457,-0.127688,3,3,1,24,0,0,0
812806,1915,2456.0,9500.5,1970.0,2697.5,15420.0,15752,15420,4670.0,4685,...,14.095322,0.041368,-0.012187,4,3,1,25,0,0,0


In [50]:
# ====================================================
# Train All Boosters
# ====================================================
for booster in Config.BOOSTERS:
    print(f"Training {booster}")
    run_cv(booster,train_full,features_all)

Training lgbm


LightGBMError: bad allocation