In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and echo the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings

# Silence every FutureWarning for the rest of the session.
warnings.filterwarnings('ignore', category=FutureWarning)

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load the raw training data.
train_set = pd.read_csv('/kaggle/input/optiver-trading-at-the-close/train.csv')

# Rows without a target cannot be used for supervised training, so drop them first.
train_set = train_set.dropna(subset=['target'])

# Impute every numeric column with its own median.
numerical_cols = train_set.select_dtypes(include=[np.number]).columns
median_values = train_set[numerical_cols].median()
train_set[numerical_cols] = train_set[numerical_cols].fillna(median_values)

# Tame the long right tail: cap at the 99th percentile, then add a
# log-transformed copy of the capped column.
for col in ['imbalance_size']:
    upper_limit = train_set[col].quantile(0.99)
    train_set[col] = train_set[col].clip(upper=upper_limit)
    train_set[col + '_log'] = np.log(train_set[col] + 1e-9)

# Drop the identifier columns that are not used as features.
train_set = train_set.drop(columns=['row_id', 'time_id', 'date_id'])

# Standardise the continuous columns only; the buy/sell flag, the stock id
# and the target keep their raw values.
non_scaled = ['imbalance_buy_sell_flag', 'stock_id', 'target']
continuous_features = train_set.select_dtypes(include=['float64', 'int64']).columns.drop(non_scaled)
scaler = StandardScaler()
train_set[continuous_features] = scaler.fit_transform(train_set[continuous_features])

# Persist the cleaned frame so the modelling cell can read it back.
train_set.to_csv('/kaggle/working/train_modified.csv', index=False)

print('Modified train set has been saved successfully.')

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

# Define preprocessing function
def pre_process_LGBM(df):
    """Add the size-imbalance features used by the LightGBM model.

    The frame is modified in place and is also returned, so both
    ``pre_process_LGBM(df)`` and ``df = pre_process_LGBM(df)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``imbalance_size``, ``matched_size``,
        ``bid_size`` and ``ask_size``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with exact zeros in the three size columns
        replaced by ``1e-9`` and three new columns added:
        ``imbalance_ratio``, ``imbl_size1`` and ``imbl_size2``.
    """
    # Replace exact zeros with a tiny value so a zero size cannot be a zero
    # divisor. (Sums of opposite-signed values can still cancel to zero.)
    for size_col in ('matched_size', 'bid_size', 'ask_size'):
        df[size_col] = df[size_col].replace(0, 1e-9)

    # Imbalance relative to the matched volume.
    df['imbalance_ratio'] = df['imbalance_size'] / df['matched_size']
    # Normalised bid/ask size difference.
    df['imbl_size1'] = (df['bid_size'] - df['ask_size']) / (df['bid_size'] + df['ask_size'])
    # Normalised imbalance/matched size difference.
    df['imbl_size2'] = (df['imbalance_size'] - df['matched_size']) / (df['imbalance_size'] + df['matched_size'])
    return df

# Read back the cleaned training data written by the preprocessing cell.
data = pd.read_csv('/kaggle/working/train_modified.csv')

# Add the engineered imbalance features.
data = pre_process_LGBM(data)

# Split into the regression target and the feature matrix.
y = data['target']
X = data.drop(columns='target')

# Impute any remaining gaps with per-column medians.
X = X.fillna(X.median())
y = y.fillna(y.median())

# Hold out 20% of the rows for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# LightGBM regressor with fixed, preset hyper-parameters.
lgbm_params = dict(
    num_leaves=70,
    learning_rate=0.1,
    feature_fraction=1.0,
    bagging_fraction=1.0,
    bagging_freq=5,
    metric='rmse',
    verbose=-1,
)
best_LGBM = lgb.LGBMRegressor(**lgbm_params)
best_LGBM.fit(X_train, y_train)

# Predict on the hold-out rows.
y_pred = best_LGBM.predict(X_test)

In [None]:
import optiver2023
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Full preprocessing pipeline: a cleaning/scaling stage followed by feature engineering.
def full_preprocess(df, is_train=True, scaler=None):
    """Clean, scale and feature-engineer a raw frame.

    The frame is modified in place and is also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame. Must contain ``imbalance_size``, ``matched_size``,
        ``bid_size`` and ``ask_size``; when ``is_train`` is true it must also
        contain ``target``.
    is_train : bool, default True
        When true, ``target`` is left out of median imputation and scaling.
    scaler : object with a ``transform`` method, optional
        A scaler that is assumed to be already fitted. When given, its
        ``transform`` is applied to the continuous columns. When omitted, a
        fresh ``StandardScaler`` is fitted on ``df`` itself.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object after both preprocessing stages.

    Raises
    ------
    KeyError
        If ``is_train`` is true and ``df`` has no ``target`` column.
    """
    # Drop 'currently_scored' if the frame carries it.
    if 'currently_scored' in df.columns:
        df.drop('currently_scored', axis=1, inplace=True)

    # ---- Stage 1: cleaning ----
    # Median-impute the numeric columns; for training frames the target is
    # left untouched.
    numerical_cols = df.select_dtypes(include=[np.number]).columns
    if is_train:
        numerical_cols = numerical_cols.drop('target')
    median_values = df[numerical_cols].median()
    df[numerical_cols] = df[numerical_cols].fillna(median_values)

    # Cap the long right tail at the 99th percentile, then add a log copy.
    for col in ['imbalance_size']:
        upper_limit = df[col].quantile(0.99)
        df[col] = df[col].clip(upper=upper_limit)
        df[col + '_log'] = np.log(df[col] + 1e-9)

    df.drop(['row_id', 'time_id', 'date_id'], axis=1, errors='ignore', inplace=True)

    # Standardise continuous columns only. The categorical-like columns are
    # excluded for BOTH train and test frames; scaling them only at test time
    # would hand the model z-scored ids/flags instead of raw ones.
    excluded = {'imbalance_buy_sell_flag', 'stock_id'}
    if is_train:
        excluded.add('target')
    continuous_features = [
        name
        for name in df.select_dtypes(include=['float64', 'int64']).columns
        if name not in excluded
    ]

    if scaler is None:
        scaled_values = StandardScaler().fit_transform(df[continuous_features])
    else:
        scaled_values = scaler.transform(df[continuous_features])
    df[continuous_features] = scaled_values

    # ---- Stage 2: feature engineering ----
    # Replace exact zeros so a zero size cannot be a zero divisor.
    for size_col in ('matched_size', 'bid_size', 'ask_size'):
        df[size_col] = df[size_col].replace(0, 1e-9)

    df['imbalance_ratio'] = df['imbalance_size'] / df['matched_size']
    df['imbl_size1'] = (df['bid_size'] - df['ask_size']) / (df['bid_size'] + df['ask_size'])
    df['imbl_size2'] = (df['imbalance_size'] - df['matched_size']) / (df['imbalance_size'] + df['matched_size'])

    return df

# Fetch the hidden test set through the competition API.
env = optiver2023.make_env()  # make sure this is only called once
iter_test = env.iter_test()

# `best_LGBM` is the model fitted in the training cell and `X` is the feature
# frame it was trained on.
# The per-column medians of X do not change between batches, so compute them
# once instead of rescanning the full training frame on every iteration.
train_feature_medians = X.median()

for (test_df, revealed_targets, sample_prediction_df) in iter_test:
    # Run the complete preprocessing pipeline on this batch.
    test_df_processed = full_preprocess(test_df, is_train=False)

    # Keep exactly the columns the model was trained on (same order) and fill
    # any remaining NaNs with the training medians. Assigning the result
    # avoids an in-place fillna on a column-subset frame.
    X_test = test_df_processed[X.columns].fillna(train_feature_medians)

    # Predict with the trained model.
    y_pred_test = best_LGBM.predict(X_test)

    # Write the predictions into the submission frame.
    sample_prediction_df['target'] = y_pred_test

    # Submit this batch's predictions.
    env.predict(sample_prediction_df)