In [1]:
import os, gc, warnings, random, pickle
from pathlib import Path #文件系统路径的对象化处理，比直接用字符串更安全、方便的路径操作。
import numpy as np
import pandas as pd
import polars as pl
from tqdm.auto import tqdm
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import KFold
# import torch

In [29]:
class Config:
    VERSION=2
    SEED=42
    N_FOLDS=53
    BOOSTERS=['lgbm','xgb','cat']
    MAX_ROUNDS=2500
    EARLY_STOPP=100
    VERBOSE=1
    Data_dir=r'./'
    MODEL_DIR=Path('./models');os.makedirs(Path('./models'),exist_ok=True) # exist_ok =True则不抛出异常
    OOF_DIR=Path('./oof');os.makedirs(Path('./oof'),exist_ok=True)
    TARGET_COUNT=424
    FEATURES_TO_ADD=['target_id']
    
    XGB_PARAMS={'objective': 'regression','metric': 'rmse'
        , 'learning_rate': 0.005,'max_depth':4,'random_state':SEED
        ,'tree_method':'hist','predictor':'predictor'}
    
    LGBM_PARAMS = {
        'objective': 'regression', 'metric': 'rmse',
        'learning_rate': 0.005, 'num_leaves': 8, 'seed': SEED,
        'device': 'gpu', 'gpu_platform_id': 0, 'gpu_device_id': 0,
    }

In [17]:
# ===================
# seed control
# ===================
def set_seed(seed=Config.SEED):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)##Python 的哈希种子（hash seed）。 有些操作基于hash的随机操作
set_seed()

In [None]:
# ==================
# Feature engineering
# ==================
def add_features(df):
    df['dayofweek'] = df['date_id'] % 7
    df['month'] = (df['date_id'] // 30) % 12
    df['quarter'] = df['month'] // 3
    df['day_of_month'] = df['date_id'] % 30

    df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype(int)
    df['is_month_start'] = (df['day_of_month'] == 0).astype(int)
    df['is_month_end'] = (df['day_of_month'] == 29).astype(int)

    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.fillna(0, inplace=True)
    return df

In [40]:
# ====================================================
# Train Booster
# ====================================================
def train_model(booster,x_tr,y_tr,x_val,y_val):
    if booster =='lgbm':
        train_set=lgb.Dataset(x_tr,y_tr)
        val_set=lgb.Dataset(x_val,y_val)
        model=lgb.train(
            Config.LGBM_PARAMS
            ,train_set,valid_sets=val_set,
            num_boost_round=Config.MAX_ROUNDS,
            callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=False),
                       lgb.log_evaluation(Config.VERBOSE)]
                        )
        return model,model.predict(x_val)
    if booster =='xgb':
        

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 560
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 20
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: Intel(R) Iris(R) Xe Graphics, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 64 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 20 dense feature groups (0.00 MB) transferred to GPU in 0.000769 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 992.390186
[1]	valid_0's rmse: 98.5844
[2]	valid_0's rmse: 98.5964
[3]	valid_0's rmse: 98.6094
[4]	valid_0's rmse: 98.6231
[5]	valid_0's rmse: 98.6378
[6]	valid_0's rmse: 98.6532
[7]	valid_0's rmse: 98.6694
[8]	valid_0's rmse: 98.6865
[9]	valid_0's rmse: 98.7043
[10]	valid_0's rmse: 98.7229
[11]	valid_0's rmse: 98.7422


In [26]:
booster='lgbm'
X=np.random.rand(100,20)
y=100*X.sum(axis=1)+np.random.rand(100)*0.01
x_tr=X[:80,:]
y_tr=y[:80]
x_val=X[80:,:]
y_val=y[80:]

In [None]:
xgb_model=xgb.Booster(model_file=Config.MODEL_DIR+'/xgb.model')

In [5]:
train=pl.read_csv(Config.Data_dir+r'/train.csv',infer_schema_length=10000)
test=pl.read_csv(Config.Data_dir+'test.csv')
train_labels=pl.read_csv(Config.Data_dir+'train_labels.csv')

In [6]:
train.shape,train_labels.shape,test.shape

((1917, 558), (1917, 425), (90, 559))