In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory (top-down walk order), then print one path per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/optiver-trading-at-the-close/public_timeseries_testing_util.py
/kaggle/input/optiver-trading-at-the-close/train.csv
/kaggle/input/optiver-trading-at-the-close/example_test_files/sample_submission.csv
/kaggle/input/optiver-trading-at-the-close/example_test_files/revealed_targets.csv
/kaggle/input/optiver-trading-at-the-close/example_test_files/test.csv
/kaggle/input/optiver-trading-at-the-close/optiver2023/competition.cpython-310-x86_64-linux-gnu.so
/kaggle/input/optiver-trading-at-the-close/optiver2023/__init__.py


In [2]:
import warnings

# Ignore FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load the competition training set.
train_set = pd.read_csv('/kaggle/input/optiver-trading-at-the-close/train.csv')

# Rows without a target value are removed before anything else is computed.
train_set = train_set.dropna(subset=['target'])

# --- Data cleaning ---
# Impute every numeric column with that column's own median.
numerical_cols = train_set.select_dtypes(include=[np.number]).columns
median_values = train_set[numerical_cols].median()
train_set[numerical_cols] = train_set[numerical_cols].fillna(median_values)

# --- Outlier handling and variable transformation ---
for col in ['imbalance_size']:
    # Cap the column at its 99th percentile.
    upper_limit = train_set[col].quantile(0.99)
    train_set[col] = train_set[col].clip(upper=upper_limit)
    # Log-transform the long-tailed feature; the small epsilon keeps the
    # logarithm finite when the value is zero.
    train_set[f'{col}_log'] = np.log(train_set[col] + 1e-9)

# --- Feature selection ---
# Identifier columns are not used as model inputs.
train_set = train_set.drop(columns=['row_id', 'time_id', 'date_id'])

# --- Feature construction ---
# Bid/ask spread.
train_set['bid_ask_spread'] = train_set['ask_price'].sub(train_set['bid_price'])
# Interaction between the imbalance direction flag and the log imbalance size.
train_set['imbalance_flag_size'] = train_set['imbalance_buy_sell_flag'].mul(
    train_set['imbalance_size_log']
)

# Persist the engineered table so later cells can reload it.
train_set.to_csv('/kaggle/working/train_modified.csv', index=False)

print('Modified train set has been saved successfully.')



In [None]:
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error

# Reload the engineered training table from the working directory.
train_data = pd.read_csv('/kaggle/working/train_modified.csv')

# Separate the regression target from the predictor columns.
y = train_data['target']
X = train_data.drop(columns=['target'])

# Hold out 20% of the rows for validation; the seed fixes the shuffle.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Candidate values for Ridge's regularization strength.
parameters = dict(alpha=[0.001, 0.01, 0.1, 1, 10, 100])

# Base estimator handed to the grid search.
ridge = Ridge()

# 5-fold cross-validated search over alpha, scored by negative MSE.
grid_search = GridSearchCV(
    estimator=ridge,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Winning estimator and its hyper-parameters.
best_model_ridge = grid_search.best_estimator_
best_params_ridge = grid_search.best_params_

# Score the winning estimator on the held-out validation rows.
y_pred_ridge = best_model_ridge.predict(X_val)
mse_ridge = mean_squared_error(y_val, y_pred_ridge)
print(f'Best Parameters: {best_params_ridge}')
print(f'Mean Squared Error: {mse_ridge}')

# Refit the selected model on all rows (training + validation) before inference.
best_model_ridge.fit(X, y)


In [None]:
# Medians of the two price columns, read from the `train_set` frame built by
# the preprocessing cell; they serve as fill values for missing prices.
far_price_median, near_price_median = (
    train_set[price_col].median() for price_col in ('far_price', 'near_price')
)

# Inference-time preprocessing: mirrors the cleaning and feature-construction
# steps applied to the training set, using statistics taken from training data.
def pre_process_ridge(df, far_price_fill=None, near_price_fill=None, imbalance_size_cap=None):
    """Prepare one batch of test rows for ``best_model_ridge``.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw test batch. Must contain ``far_price``, ``near_price``,
        ``imbalance_size``, ``imbalance_buy_sell_flag``, ``ask_price`` and
        ``bid_price``.
    far_price_fill, near_price_fill : float, optional
        Values used to impute missing ``far_price`` / ``near_price``.
        Default to the module-level ``far_price_median`` /
        ``near_price_median`` computed from the training set.
    imbalance_size_cap : float, optional
        Upper bound applied to ``imbalance_size``. Defaults to the maximum of
        ``train_set['imbalance_size']``; that column was clipped at its 99th
        percentile when the training set was prepared, so its maximum is the
        cap used for training. Pass the value explicitly to avoid recomputing
        it on every call.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the identifier columns and with ``imbalance_size_log``,
        ``bid_ask_spread`` and ``imbalance_flag_size`` added.
    """
    # Resolve defaults at call time so the training statistics only need to
    # exist when the caller does not supply explicit values.
    if far_price_fill is None:
        far_price_fill = far_price_median
    if near_price_fill is None:
        near_price_fill = near_price_median
    if imbalance_size_cap is None:
        imbalance_size_cap = train_set['imbalance_size'].max()

    # Drop the identifier columns. 'time_id' is included because the training
    # frame drops it as well; leaving it in would hand the model a column it
    # was never fitted on. errors='ignore' tolerates columns that are absent
    # (e.g. 'currently_scored').
    df.drop(['date_id', 'row_id', 'time_id', 'currently_scored'],
            axis=1, errors='ignore', inplace=True)

    # Impute with training-set statistics. Assign the result back instead of
    # calling fillna(inplace=True) on a column selection: the chained in-place
    # form does not update `df` under pandas Copy-on-Write.
    df['far_price'] = df['far_price'].fillna(far_price_fill)
    df['near_price'] = df['near_price'].fillna(near_price_fill)
    # NOTE(review): training imputes every numeric column with its median,
    # while only the two price columns are imputed here — confirm that other
    # columns cannot be missing in test batches.

    # Cap outliers with the training-time threshold rather than a quantile
    # re-estimated on each (small) test batch, so inference only transforms.
    df['imbalance_size'] = df['imbalance_size'].clip(upper=imbalance_size_cap)

    # Log-transform the long-tailed feature; epsilon keeps log() finite at 0.
    df['imbalance_size_log'] = np.log(df['imbalance_size'] + 1e-9)

    # Feature construction
    # Bid/ask spread.
    df['bid_ask_spread'] = df['ask_price'] - df['bid_price']
    # Interaction between the imbalance direction flag and the log imbalance size.
    df['imbalance_flag_size'] = df['imbalance_buy_sell_flag'] * df['imbalance_size_log']

    return df

In [None]:
import pandas as pd
import optiver2023

# Assumes `pre_process_ridge` and the fitted `best_model_ridge` were defined
# by earlier cells.

# Initialise the Kaggle competition environment.
env = optiver2023.make_env()
# Iterator over the test batches.
iter_test = env.iter_test()

# Column names (in order) recorded by scikit-learn when the model was fitted
# on a DataFrame. Predicting with extra, missing or reordered columns is
# rejected by scikit-learn's feature-name check, so every batch is aligned to
# this list. Falls back to None if the estimator did not record feature names.
trained_features = getattr(best_model_ridge, 'feature_names_in_', None)

# For each batch of the test set
for (test_df, revealed_targets, sample_prediction_df) in iter_test:
    # Preprocess the batch.
    # Note: do not fit any preprocessing step here; only transform.
    X_test = pre_process_ridge(test_df)

    # Keep exactly the columns the model was trained on, in training order.
    # This removes identifier columns the preprocessing may leave behind
    # (training drops 'time_id') and raises a KeyError naming any trained
    # feature that is missing from the batch.
    if trained_features is not None:
        X_test = X_test[list(trained_features)]
    # NOTE(review): a NaN left in X_test makes Ridge.predict fail — confirm
    # that test batches cannot contain missing values outside the imputed
    # price columns.

    # Predict with the fitted model.
    y_pred = best_model_ridge.predict(X_test)

    # Write the predictions into the submission frame for this batch.
    sample_prediction_df['target'] = y_pred

    # Submit the predictions for this batch.
    env.predict(sample_prediction_df)
