# Crypto Forecasting Kaggle Project – Neural Network Modelling

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.feature_selection import VarianceThreshold, mutual_info_regression
from sklearn.metrics import r2_score
from scipy.stats import pearsonr
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, GaussianNoise
from optuna.integration import TFKerasPruningCallback
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.layers import Activation
from tensorflow.keras.optimizers import Adam
import warnings
import optuna
warnings.filterwarnings('ignore')


## Load Data with Memory Optimisation and Set Explicit dtypes for All Columns

In [12]:
# Read the raw training table; column dtypes are narrowed right after loading.
train_df = pd.read_parquet("train.parquet")

# Columns whose name starts with 'X', plus the 'label' column, become float32.
float_cols = [name for name in train_df.columns
              if name.startswith('X') or name == 'label']
dtypes = dict.fromkeys(float_cols, 'float32')
# 'timestamp' / 'asset_id' get integer dtypes, but only when the column exists.
dtypes.update({
    name: int_kind
    for name, int_kind in (('timestamp', 'int64'), ('asset_id', 'int32'))
    if name in train_df.columns
})
train_df = train_df.astype(dtypes)

# Same treatment for the test table; it is additionally checked for an 'id' column.
test_df = pd.read_parquet("test.parquet")
test_dtypes = dict.fromkeys(
    (name for name in test_df.columns if name.startswith('X')),
    'float32',
)
test_dtypes.update({
    name: int_kind
    for name, int_kind in (('timestamp', 'int64'), ('asset_id', 'int32'), ('id', 'int64'))
    if name in test_df.columns
})
test_df = test_df.astype(test_dtypes)

## Handle infinity and NaN

In [13]:
# Turn +/-inf into NaN in both tables so one imputation step covers them.
for frame in (train_df, test_df):
    frame.replace([np.inf, -np.inf], np.nan, inplace=True)

# Impute the training table with its own per-column medians (numeric columns only).
train_df.fillna(train_df.median(numeric_only=True), inplace=True)

# The test table is imputed with medians taken from the (already filled)
# training table, never from the test table itself.
train_medians = train_df.median(numeric_only=True)
test_df.fillna(train_medians, inplace=True)

## Split Features and Target

In [14]:
# Feature subset chosen by the earlier feature-selection step.
selected_features = [
    'X758', 'X778', 'X611', 'X344', 'X345',
    'X465', 'X614', 'X385', 'X610', 'X445',
    'X752', 'X759', 'X444', 'X757', 'X751',
]

# Model inputs / target for training, and the matching test inputs.
X = train_df[selected_features]
y = train_df['label']
X_test_feat = test_df[selected_features]

# Hold out 20% of the rows for validation (shuffled, fixed seed).
# `train_test_split` comes from the notebook's top-level import cell.
# NOTE(review): this is a random split; if the rows are time-ordered, a
# chronological hold-out would avoid look-ahead leakage — confirm.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Summary statistics for each split, printed in a fixed order.
for heading, split_part in (
    ("Train features stats:", X_train),
    ("\nTrain target stats:", y_train),
    ("\nValid features stats:", X_valid),
    ("\nValid target stats:", y_valid),
):
    print(heading)
    print(split_part.describe())

Train features stats:
                X758           X778           X611           X344  \
count  420708.000000  420708.000000  420708.000000  420708.000000   
mean        0.002617      -0.003382       0.314378      -0.051555   
std         1.034226       1.002533       0.913698       0.851696   
min        -2.987701      -0.807793      -4.075869      -4.814550   
25%        -0.818695      -0.760120      -0.118007      -0.483976   
50%         0.044913      -0.605572       0.268551      -0.144311   
75%         0.868943       1.012918       0.667088       0.314403   
max         2.544050       1.765409       5.781154       5.227328   

                X345           X465           X614           X385  \
count  420708.000000  420708.000000  420708.000000  420708.000000   
mean       -0.066188       0.056253       0.018427       0.078397   
std         0.978137       0.852683       0.734919       0.779965   
min       -14.317780      -6.590180      -2.863041      -3.870314   
25%        

## Preprocessing

In [None]:
# Fit the outlier-robust scaler on the training split only, then apply it
# unchanged to the validation split.
scaler = RobustScaler().fit(X_train.to_numpy())
X_train_scaled = scaler.transform(X_train.to_numpy())
X_valid_scaled = scaler.transform(X_valid.to_numpy())

# Optional: drop features whose scaled training variance is at most 1e-4.
selector = VarianceThreshold(threshold=1e-4).fit(X_train_scaled)
X_train_reduced = selector.transform(X_train_scaled)
X_valid_reduced = selector.transform(X_valid_scaled)

# Test rows go through the same fitted scaler and selector,
# using only the selected feature columns.
X_test_feat = test_df[selected_features]
X_test_scaled = scaler.transform(X_test_feat.to_numpy())
X_test_reduced = selector.transform(X_test_scaled)

# Number of features that survive selection = network input width.
input_dim = X_train_reduced.shape[-1]


## Hyperparameter Tuning

In [16]:
# Optuna search for the network hyperparameters. The search itself is kept
# only as an inert string below; nothing in it is executed.
"""
def objective(trial):
    # Suggest hyperparameters
    hidden_units1 = trial.suggest_int('hidden_units1', 64, 256)
    hidden_units2 = trial.suggest_int('hidden_units2', 32, 128)
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128])
    l2_reg = trial.suggest_float('l2_reg', 1e-5, 1e-2, log=True)

    # Build and train model...
    # Evaluate with Pearson correlation
    return -score

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=60)
best_params = study.best_trial.params
"""

# Hard-coded values carried over from a previous tuning run.
# NOTE(review): learning_rate=5e-5 is below the 1e-4..1e-2 range in the
# disabled objective above, so it was presumably adjusted by hand — confirm.
best_params = dict(
    hidden_units1=237,
    hidden_units2=40,
    dropout_rate=0.1025,
    learning_rate=5e-5,
    batch_size=32,
    l2_reg=0.00195,
)


## Building Our Neural Network and Creating the Submission

In [18]:
# Train the final network on the FULL training table (no hold-out) and
# produce clipped predictions for the test table.

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout,
# Gaussian noise and batch shuffling repeat across Restart & Run All.
# Uses the same seed value as the earlier train/validation split.
# NOTE(review): bit-exact reproducibility on GPU may need extra settings.
tf.keras.utils.set_random_seed(42)

# Split train into features and target
X_full = train_df[selected_features]
y_full = train_df['label']
X_test = test_df[selected_features]

# Fit preprocessing on the full training data
scaler = RobustScaler()
X_full_scaled = scaler.fit_transform(X_full.values)

selector = VarianceThreshold(threshold=1e-4)
X_full_reduced = selector.fit_transform(X_full_scaled)

# Transform test set with the same fitted objects
X_test_scaled = scaler.transform(X_test.values)
X_test_reduced = selector.transform(X_test_scaled)

# Build final model.
# The input width is declared with an explicit Input as the FIRST entry.
# The original passed `input_dim` to the second layer (after GaussianNoise),
# where Sequential does not treat it as the model's input specification.
l2_strength = best_params['l2_reg']
model = Sequential([
    tf.keras.Input(shape=(X_full_reduced.shape[1],)),
    GaussianNoise(0.01),
    Dense(best_params['hidden_units1'], activation='swish',
          kernel_regularizer=l2(l2_strength)),
    BatchNormalization(),
    Dropout(best_params['dropout_rate']),
    Dense(best_params['hidden_units2'], activation='swish',
          kernel_regularizer=l2(l2_strength)),
    BatchNormalization(),
    Dropout(best_params['dropout_rate']),
    Dense(32, activation='swish', kernel_regularizer=l2(l2_strength)),
    Dense(1)
])

model.compile(optimizer=Adam(learning_rate=best_params['learning_rate']), loss='mse')

print("Training final model on FULL data...")
# verbose=2 prints one summary line per epoch instead of a per-step progress
# bar, which keeps the saved notebook output readable over 150 epochs.
model.fit(X_full_reduced, y_full.values,
          epochs=150,
          batch_size=best_params['batch_size'],
          verbose=2)

# Predict, then clip to the 1st-99th percentile range of the training target
preds = model.predict(X_test_reduced, verbose=0).flatten()
preds = np.clip(preds, y_full.quantile(0.01), y_full.quantile(0.99))


Training final model on FULL data...
Epoch 1/150
[1m16434/16434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 2ms/step - loss: 1.3821
Epoch 2/150
[1m16434/16434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 2ms/step - loss: 1.1540
Epoch 3/150
[1m16434/16434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 2ms/step - loss: 1.0966
Epoch 4/150
[1m16434/16434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 2ms/step - loss: 1.0484
Epoch 5/150
[1m16434/16434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 2ms/step - loss: 1.0222
Epoch 6/150
[1m16434/16434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 2ms/step - loss: 1.0108
Epoch 7/150
[1m16434/16434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 2ms/step - loss: 0.9755
Epoch 8/150
[1m16434/16434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 2ms/step - loss: 0.9711
Epoch 9/150
[1m16434/16434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 2ms/step - loss: 0.9464


In [21]:
# Ensure predictions are a flat 1-D array before building the table.
preds = preds.flatten()
n_rows = len(preds)

# IDs are assigned by row position and start at 1 (1, 2, 3, ..., not 0-based).
submission = pd.DataFrame({
    'ID': np.arange(1, n_rows + 1),
    'prediction': preds,
})

# Write the submission file without the DataFrame index column.
submission.to_csv("submission.csv", index=False)