In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,mean_absolute_error

file = pd.read_csv('train.csv')
file.drop(["date_id","seconds_in_bucket","row_id"], axis=1, inplace=True)
stock_id_list = []
for stock_id, df in file.groupby(['stock_id']):
    ## fill the missing far_price values with the mean of the far_price
    df['far_price'].fillna(df['far_price'].mean(), inplace=True)
    df['near_price'].fillna(df['near_price'].mean(), inplace=True)
    df = df[['stock_id', 'imbalance_size', 'imbalance_buy_sell_flag','reference_price', 'matched_size', 'far_price', 'near_price','bid_price', 'bid_size', 'ask_price', 'ask_size', 'wap', 'time_id','target']]
    
    if len(df) < 26455:
        continue
    else:
        stock_id_list.append(df)

## change the stock_id_list dimension to (time,stock_id, 13)

stock_id_list = np.array(stock_id_list).astype(np.float32) ## (stock_id,time,13) 
features = stock_id_list[:,:,:-1]
target = stock_id_list[:,:,-1]

In [9]:
def xgmodel(features=features,target=target):
    mask = ~np.isnan(target).any(axis=1)
    features = features[mask]
    target = target[mask]

    def break_into_subsequences_with_padding(data, target, length):
        subsequences = []
        subtargets = []
        total_sequences = data.shape[1] // length
        for i in range(total_sequences):
            subsequences.append(data[:, i*length:(i+1)*length])
            subtargets.append(target[:, i*length:(i+1)*length])
        
        # Handle the last subsequence which might need padding
        if data.shape[1] % length != 0:
            padding_size = length - data.shape[1] % length
            start_idx = total_sequences * length  # start index for the last subsequence
            last_subseq = np.pad(data[:, start_idx:], ((0,0), (0, padding_size), (0,0)), mode='constant')
            last_target = np.pad(target[:, start_idx:], ((0,0), (0, padding_size)), mode='constant')
            subsequences.append(last_subseq)
            subtargets.append(last_target)
        
        return np.concatenate(subsequences, axis=0), np.concatenate(subtargets, axis=0)

    # Define the subsequence length
    subseq_length = 256  # for example

    # Break sequences into smaller subsequences and pad if necessary
    features_subseq, target_subseq = break_into_subsequences_with_padding(features, target, subseq_length)
    features = features_subseq
    target = target_subseq
    
    features_2d = features.reshape(-1, features.shape[-1])
    target_2d = target.reshape(-1)

    # Combine into a single DataFrame
    data = pd.DataFrame(features_2d)
    data['target'] = target_2d

    # Drop rows where 'target' is NaN
    data = data.dropna(subset=['target'])

    # Split data into training and testing sets
    X = data.drop(columns=['target'])
    y = data['target']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train XGBoost model, using MAE as the objective function
    model = xgb.XGBRegressor(objective='reg:squarederror', reg_alpha=0.5)
    

    model.fit(X_train, y_train)

    # Predict
    y_pred = model.predict(X_test)

    # Calculate MAE
    mae = mean_absolute_error(y_test, y_pred)
    print(f'MAE: {mae:.4f}')
    

    test_features_2d = test_features.reshape(-1, test_features.shape[-1])
    test_data = pd.DataFrame(test_features_2d)
    # Predict using the trained model
    y_pred = model.predict(test_data)
    return y_pred

In [5]:
xg_pred = xgmodel()

MAE: 6.1146


In [8]:
xg_pred

array([-0.4845071 ,  0.04586141,  1.0194033 , ..., -0.01175968,
        0.00138574, -0.14717986], dtype=float32)