In [3]:
import pandas as pd
import gc
# Build a working sample from the leading rows of every training partition.
# The location and sizes are named so they can be tuned in one place.
DATA_ROOT = "/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet"
N_PARTITIONS = 10
ROWS_PER_PARTITION = 500_000  # rows kept from the start of each partition

samples = []
for i in range(N_PARTITIONS):
    file_path = f"{DATA_ROOT}/partition_id={i}/part-0.parquet"
    chunk = pd.read_parquet(file_path)

    # Take the first rows in file order (a deterministic head, not a random
    # draw). The explicit copy detaches the sample from the full partition:
    # a bare row slice may share memory with `chunk`, which would keep every
    # partition alive until the final concat.
    sample_chunk = chunk.iloc[:ROWS_PER_PARTITION].copy()
    samples.append(sample_chunk)

    # Release the full partition before loading the next one.
    del chunk
    gc.collect()

# Concatenate all samples into one DataFrame with a fresh 0..n-1 index.
sample_df = pd.concat(samples, ignore_index=True)
del samples
gc.collect()

0

In [4]:
# Preview the first rows of the concatenated sample as a quick sanity check.
sample_df.head()


Unnamed: 0,date_id,time_id,symbol_id,weight,feature_00,feature_01,feature_02,feature_03,feature_04,feature_05,...,feature_78,responder_0,responder_1,responder_2,responder_3,responder_4,responder_5,responder_6,responder_7,responder_8
0,0,0,1,3.889038,,,,,,0.851033,...,-0.281498,0.738489,-0.069556,1.380875,2.005353,0.186018,1.218368,0.775981,0.346999,0.095504
1,0,0,7,1.370613,,,,,,0.676961,...,-0.302441,2.965889,1.190077,-0.523998,3.849921,2.626981,5.0,0.703665,0.216683,0.778639
2,0,0,9,2.285698,,,,,,1.056285,...,-0.096792,-0.864488,-0.280303,-0.326697,0.375781,1.271291,0.099793,2.109352,0.670881,0.772828
3,0,0,10,0.690606,,,,,,1.139366,...,-0.296244,0.408499,0.223992,2.294888,1.097444,1.225872,1.225376,1.114137,0.775199,-1.379516
4,0,0,14,0.44057,,,,,,0.9552,...,3.418133,-0.373387,-0.502764,-0.348021,-3.928148,-1.591366,-5.0,-3.57282,-1.089123,-5.0


In [5]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
# Split the sample into model inputs, responder columns and row weights.
features = sample_df.filter(regex='^feature_')
responders = sample_df.filter(regex='^responder_')
weights = sample_df['weight']

# NaN and +/-inf entries are all replaced by 0.0 in both arrays.
_fill_values = dict(nan=0.0, posinf=0.0, neginf=0.0)

# X: every feature_* column as a 2-D array.
X = np.nan_to_num(features.to_numpy(), **_fill_values)

# y: only responder_6, selected with a list so the result stays 2-D (n, 1).
y = np.nan_to_num(responders[['responder_6']].to_numpy(), **_fill_values)

In [6]:
# Module-level flag; nothing in the visible cells reads it.
# NOTE(review): presumably meant to select a Keras code path — confirm, or remove if unused.
Is_keras = True


In [7]:
# Sequential (non-shuffled) 80/20 split: the first 80% of rows train the
# models, the remaining rows are held out for validation.
train_size = int(len(X) * 0.8)

X_train, X_val = X[:train_size], X[train_size:]
y_train, y_val = y[:train_size], y[train_size:]
weights_train, weights_val = weights[:train_size], weights[train_size:]

# Report the shapes of both partitions.
print(f"Train shapes: {X_train.shape}, {y_train.shape}, {weights_train.shape}")
print(f"Validation shapes: {X_val.shape}, {y_val.shape}, {weights_val.shape}")

Train shapes: (4000000, 79), (4000000, 1), (4000000,)
Validation shapes: (1000000, 79), (1000000, 1), (1000000,)


In [8]:
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
# Dense autoencoder over the feature columns: 128 -> 64 -> bottleneck -> 64 -> 128.
input_dim = X_train.shape[1]  # number of input features
latent_dim = 32               # width of the bottleneck layer

# Encoder: two ReLU layers followed by a linear bottleneck.
encoder_input = layers.Input(shape=(input_dim,))
x = encoder_input
for units in (128, 64):
    x = layers.Dense(units, activation='relu')(x)
bottleneck = layers.Dense(latent_dim, activation='linear', name='bottleneck')(x)

# Decoder: mirror of the encoder, ending in a linear reconstruction layer.
x = bottleneck
for units in (64, 128):
    x = layers.Dense(units, activation='relu')(x)
decoder_output = layers.Dense(input_dim, activation='linear')(x)

# Assemble and compile; the model is trained to reproduce its own input.
autoencoder = models.Model(encoder_input, decoder_output, name="Autoencoder")
autoencoder.compile(optimizer="adam", loss="mse")
autoencoder.summary()

# Callbacks watch the validation loss. With epochs=1 below, neither patience
# window can elapse, so they only matter if the epoch count is raised.
early_stopping = EarlyStopping(
    monitor="val_loss",
    patience=10,
    restore_best_weights=True,
    min_delta=1e-5,
)
reduce_lr = ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    patience=3,
    min_lr=1e-6,
)

# Fit on the training features, validating reconstruction on the held-out rows.
history = autoencoder.fit(
    X_train,
    X_train,
    validation_data=(X_val, X_val),
    epochs=1,
    batch_size=32,
    callbacks=[early_stopping, reduce_lr],
)

# Keep only the input -> bottleneck half and save it for reuse.
encoder = models.Model(encoder_input, bottleneck, name="Encoder")
encoder.save("/kaggle/working/pretrained_encoder.keras")

[1m125000/125000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m244s[0m 2ms/step - loss: 1.1913 - val_loss: 0.1222 - learning_rate: 0.0010


In [9]:
# Define a learning rate schedule
def learning_rate_scheduler_xgb(epoch):
    """Return the learning rate for a given boosting round.

    The rate starts at 0.3 and decays as ``0.3 * 0.999 ** ln(epoch)``.

    Parameters
    ----------
    epoch : int
        Round index. Values below 1 (including the 0 used for the first
        round by 0-based callers) are treated as 1.

    Returns
    -------
    float
        A finite, positive learning rate.
    """
    initial_rate = 0.3
    decay_rate = 0.999
    # ln(0) is -inf, which would turn the rate into +inf on round 0;
    # clamping to 1 yields ln(1) = 0, i.e. exactly the initial rate.
    safe_epoch = max(epoch, 1)
    return float(initial_rate * (decay_rate ** np.log(safe_epoch)))

In [10]:
from xgboost import XGBRegressor
# Create an XGBoost model.
# `learning_rate` is XGBoost's eta and must be a plain float; a callable passed
# there is not applied as a per-round schedule. 0.3 matches both the library
# default and the initial rate of learning_rate_scheduler_xgb. To actually decay
# the rate per round, pass xgboost.callback.LearningRateScheduler via `callbacks`.
# `eval_metric` and `early_stopping_rounds` are estimator parameters; passing
# them to fit() is deprecated and rejected by newer XGBoost releases.
model_xgb = XGBRegressor(
    n_estimators=5000,
    learning_rate=0.3,
    tree_method='hist',
    max_depth=6,
    random_state=42,
    eval_metric='rmse',
    early_stopping_rounds=10,
)
# Fit with the validation set driving early stopping.
# To weight rows, uncomment both weight arguments together; the eval-set weight
# list needs exactly one entry per eval_set entry.
model_xgb.fit(
    X_train,
    y_train,
    # sample_weight=weights_train,
    eval_set=[(X_val, y_val)],
    # sample_weight_eval_set=[weights_val],
    verbose=False,
)



In [11]:
# Predict responder_6 for the held-out validation rows with the fitted XGBoost model.
y_pred = model_xgb.predict(X_val)


In [12]:
from sklearn.metrics import mean_squared_error, r2_score
# Score the validation predictions.
# RMSE is taken as the square root of the plain MSE: the `squared=False`
# shortcut of mean_squared_error is deprecated and removed in newer
# scikit-learn releases, while this form works on every version.
rmse = mean_squared_error(y_val, y_pred) ** 0.5
mse = rmse  # legacy name kept for compatibility; it holds the RMSE, not the MSE
r2 = r2_score(y_val, y_pred)
print(f"RMSE: {rmse}")
print(f"R²: {r2}")

RMSE: 0.8626458048820496
R²: 0.003791198326100864
