# 📘 Keras Neural-Network Regression Pipeline

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.feature_selection import VarianceThreshold, mutual_info_regression
from sklearn.metrics import r2_score
from scipy.stats import pearsonr
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, GaussianNoise
from optuna.integration import TFKerasPruningCallback
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.layers import Activation
from tensorflow.keras.optimizers import AdamW
import warnings
import optuna
warnings.filterwarnings('ignore')


## Step 1: Load data with memory optimisation (explicit dtypes for all columns)

In [11]:
# Read the training set and cast its columns to compact, explicit dtypes.
train_df = pd.read_parquet("train.parquet")

# Feature columns (prefixed with 'X') and the target are stored as float32.
float_cols = [
    name for name in train_df.columns
    if name.startswith('X') or name == 'label'
]
dtypes = {
    **dict.fromkeys(float_cols, 'float32'),
    # Key columns are only cast when they are actually present.
    **{
        name: kind
        for name, kind in (('timestamp', 'int64'), ('asset_id', 'int32'))
        if name in train_df.columns
    },
}
train_df = train_df.astype(dtypes)

# Same treatment for the test set, which may additionally carry an 'id' column.
test_df = pd.read_parquet("test.parquet")
test_dtypes = {
    **dict.fromkeys(
        [name for name in test_df.columns if name.startswith('X')], 'float32'
    ),
    **{
        name: kind
        for name, kind in (
            ('timestamp', 'int64'),
            ('asset_id', 'int32'),
            ('id', 'int64'),
        )
        if name in test_df.columns
    },
}
test_df = test_df.astype(test_dtypes)

## Step 2: Handle infinity and NaN

In [12]:
# Treat +/-inf as missing so they are imputed together with the real NaNs.
train_df = train_df.replace([np.inf, -np.inf], np.nan)
test_df = test_df.replace([np.inf, -np.inf], np.nan)

# Impute both frames with the *training* medians, computed once.
# Filling a column's gaps with its own median does not move that median, so
# the test set receives the same fill values as when the medians were
# recomputed after the training fill.
train_medians = train_df.median(numeric_only=True)
train_df = train_df.fillna(train_medians)
test_df = test_df.fillna(train_medians)

## Step 3: Split Features and Target

In [13]:
# Separate the predictors from the regression target.
X = train_df.drop(columns=["label"])
y = train_df["label"]
has_timestamp = 'timestamp' in X.columns
has_asset_id = 'asset_id' in X.columns

# --- Prune highly correlated features ---------------------------------------
# Looking only at the strict upper triangle means each column is compared with
# the columns that precede it; a column becomes a removal candidate when any
# of those absolute correlations exceeds 0.9.
corr_matrix = X.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]

# Among the candidates, keep those whose mutual information with the target is
# above the candidates' median MI and drop the rest. The guard avoids calling
# mutual_info_regression with zero feature columns when nothing is correlated.
if to_drop:
    temp_mi = mutual_info_regression(X[to_drop], y, random_state=42)
    keep_mask = temp_mi > np.median(temp_mi)  # True -> candidate is retained
    to_drop = [col for col, keep in zip(to_drop, keep_mask) if not keep]
X = X.drop(columns=to_drop)
X_test_feat = test_df.drop(columns=to_drop + ['id', 'label'], errors='ignore')  # Apply to test

# --- Subsample and hold-out split -------------------------------------------
# Work on a random 50% of the rows. The generator is seeded so that the
# subsample (and everything fitted on it) is reproducible across runs.
rng = np.random.default_rng(42)
subsample_idx = rng.choice(X.index.to_numpy(), size=int(0.5 * len(X)), replace=False)
X_sub = X.loc[subsample_idx]
y_sub = y.loc[subsample_idx]

X_train, X_valid, y_train, y_valid = train_test_split(X_sub, y_sub, test_size=0.1, random_state=42)
print("Train features stats:")
print(X_train.describe())
print("\nTrain target stats:")
print(y_train.describe())
print("\nValid features stats:")
print(X_valid.describe())
print("\nValid target stats:")
print(y_valid.describe())

Train features stats:
             bid_qty        ask_qty        buy_qty       sell_qty  \
count  236648.000000  236648.000000  236648.000000  236648.000000   
mean        9.924902      10.169776     131.973107     132.800539   
std        15.325980      16.146150     311.790785     315.958964   
min         0.001000       0.001000       0.000000       0.000000   
25%         2.643000       2.691000      26.378000      26.986000   
50%         6.401000       6.557000      57.066000      57.810000   
75%        13.060000      13.332250     127.312000     128.858000   
max      1114.932000    1176.689000   17609.567000   17685.503000   

              volume             X1             X2             X9  \
count  236648.000000  236648.000000  236648.000000  236648.000000   
mean      264.773646      -0.033790      -0.002266      -0.036379   
std       598.387023       0.910371       0.933545       0.888094   
min         0.000000      -4.321375      -8.953153      -4.645874   
25%        

## Step 4: Preprocessing

In [14]:
# Fit the scaler and the variance filter on the training split only, then
# apply the fitted transforms to the validation split.
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train.values)  # Use .values to avoid column name issues
selector = VarianceThreshold(threshold=1e-4)
X_train_reduced = selector.fit_transform(X_train_scaled)

X_valid_scaled = scaler.transform(X_valid.values)
X_valid_reduced = selector.transform(X_valid_scaled)

# Build the test matrix with exactly the training columns, in training order.
# The scaler was fitted on a bare array, so columns are matched purely by
# position; deriving the column list from a set intersection would yield an
# arbitrary order and silently feed features into the wrong slots.
X_test_feat = test_df.drop(columns=['id', 'label'], errors='ignore')
missing_cols = [col for col in X.columns if col not in X_test_feat.columns]
if missing_cols:
    raise ValueError(f"Test data is missing training features: {missing_cols}")
common_cols = list(X.columns)
X_test_feat = X_test_feat[common_cols]
X_test_scaled = scaler.transform(X_test_feat.values)
X_test_reduced = selector.transform(X_test_scaled)

# Keep only the features whose mutual information with the target is above
# the 20th percentile (scores are estimated on the training split).
mi_scores = mutual_info_regression(X_train_reduced, y_train, random_state=42)
mi_mask = mi_scores > np.percentile(mi_scores, 20)
X_train_reduced = X_train_reduced[:, mi_mask]
X_valid_reduced = X_valid_reduced[:, mi_mask]
X_test_reduced = X_test_reduced[:, mi_mask]

input_dim = X_train_reduced.shape[1]  # For NN input layer

## Hyperparameter Tuning

In [15]:

from tensorflow.keras import regularizers


# Inputs expected from the preprocessing step:
#   X_train_reduced, X_valid_reduced, y_train, y_valid
# The input-layer width is the number of feature columns that were kept.
input_dim = X_train_reduced.shape[-1]

def objective(trial):
    """Optuna objective: train one candidate network and score it on the validation split.

    Args:
        trial: Object providing ``suggest_int`` / ``suggest_float`` /
            ``suggest_categorical``; it is also handed to
            ``TFKerasPruningCallback``.

    Returns:
        The negated Pearson correlation between ``y_valid`` and the model's
        predictions on ``X_valid_reduced`` (so that lower is better), or
        ``inf`` when the correlation is NaN.

    Reads the module-level ``input_dim``, ``X_train_reduced``, ``y_train``,
    ``X_valid_reduced`` and ``y_valid``.
    """
    # Release Keras state left over from earlier trials so memory does not
    # keep growing while many models are built in a loop.
    tf.keras.backend.clear_session()

    # Suggest hyperparameters
    hidden_units1 = trial.suggest_int('hidden_units1', 64, 256)
    hidden_units2 = trial.suggest_int('hidden_units2', 32, 128)
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128])
    l2_reg = trial.suggest_float('l2_reg', 1e-5, 1e-2, log=True)

    # Three ReLU hidden layers with L2 weight penalties, dropout after the
    # first two, and a single linear output unit for regression.
    model = Sequential()
    model.add(Dense(hidden_units1, input_dim=input_dim, activation='relu',
                    kernel_regularizer=l2(l2_reg)))
    model.add(Dropout(dropout_rate))
    model.add(Dense(hidden_units2, activation='relu',
                    kernel_regularizer=l2(l2_reg)))
    model.add(Dropout(dropout_rate))
    model.add(Dense(32, activation='relu',
                    kernel_regularizer=l2(l2_reg)))
    model.add(Dense(1))  # Regression output

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])

    # Callbacks
    # NOTE(review): pruning and early stopping watch val_loss (MSE), while the
    # value returned to Optuna is the negated Pearson correlation — confirm
    # that mixing the two metrics within one study is intended.
    es = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    pruning_callback = TFKerasPruningCallback(trial, 'val_loss')

    # Train model (the returned History object is not needed here)
    model.fit(X_train_reduced, y_train,
              validation_data=(X_valid_reduced, y_valid),
              epochs=100,
              batch_size=batch_size,
              callbacks=[es, pruning_callback],
              verbose=0)

    # Evaluation
    preds = model.predict(X_valid_reduced).flatten()

    # Pearson correlation (negated because Optuna minimizes)
    score = pearsonr(y_valid, preds)[0]
    if np.isnan(score):
        return float('inf')  # Penalize if Pearson is NaN

    return -score

# Run the optimization
# The study below is wrapped in a string literal, so it is never executed;
# the hard-coded best_params that follow are what the later cells use.
""" study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=60)

# Extract best parameters
trial = study.best_trial
best_params = trial.params

# Output best result
print("\nBest Trial:")
print(f"  Value (negative Pearson): {trial.value}")
print("  Params:")
for key, value in best_params.items():
    print(f"    {key}: {value}")
 """
# Hyperparameters read by the cross-validation and final-training cells.
# NOTE(review): learning_rate (~5e-05) lies below the 1e-4..1e-2 range that
# objective() searches — confirm which study produced these values.
best_params = {
    'hidden_units1': 237,
    'hidden_units2': 40,
    'dropout_rate': 0.1025043950801565,
    'learning_rate': 5.0006641823893714026e-05,
    'batch_size': 32,
    'l2_reg': 0.0019536930681741563
}

## Optuna gave these results

In [16]:


# Walk-forward cross-validation of the tuned architecture on the full
# (correlation-pruned) feature frame X.
# NOTE(review): TimeSeriesSplit splits by row position, so this assumes the
# rows of X are already in chronological order — confirm.
# NOTE(review): this loop uses StandardScaler, GaussianNoise(0.01) and a fixed
# l2(0.001), whereas preprocessing uses RobustScaler, the final model uses
# GaussianNoise(0.05) and best_params carries an unused 'l2_reg' — confirm the
# CV is meant to differ from the final pipeline.
cv = TimeSeriesSplit(n_splits=5)  # Force TimeSeriesSplit, increased splits

# One Pearson correlation per fold.
cv_scores = []

for train_idx, valid_idx in cv.split(X):
    # Drop Keras state left by the previous fold's model so memory does not
    # accumulate across folds.
    tf.keras.backend.clear_session()

    X_tr, X_val = X.iloc[train_idx], X.iloc[valid_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[valid_idx]

    # Preprocessing is refit inside each fold so nothing from the validation
    # rows leaks into the scaler, variance filter or MI mask.
    scaler_fold = StandardScaler()
    X_tr_scaled = scaler_fold.fit_transform(X_tr)
    selector_fold = VarianceThreshold(threshold=1e-4)
    X_tr_reduced = selector_fold.fit_transform(X_tr_scaled)

    X_val_scaled = scaler_fold.transform(X_val)
    X_val_reduced = selector_fold.transform(X_val_scaled)

    # MI per fold to match dimensions
    mi_scores_fold = mutual_info_regression(X_tr_reduced, y_tr, random_state=42)
    mi_mask_fold = mi_scores_fold > np.percentile(mi_scores_fold, 20)
    X_tr_reduced = X_tr_reduced[:, mi_mask_fold]
    X_val_reduced = X_val_reduced[:, mi_mask_fold]

    # Build and fit NN
    model = Sequential()
    model.add(GaussianNoise(0.01))
    model.add(Dense(best_params['hidden_units1'], input_dim=X_tr_reduced.shape[1], activation='swish', kernel_regularizer=l2(0.001)))
    model.add(BatchNormalization())
    model.add(Dropout(best_params['dropout_rate']))
    model.add(Dense(best_params['hidden_units2'], activation='swish', kernel_regularizer=l2(0.001)))
    model.add(BatchNormalization())
    model.add(Dropout(best_params['dropout_rate']))
    model.add(Dense(32, activation='swish', kernel_regularizer=l2(0.001)))
    model.add(Dense(1))

    optimizer = tf.keras.optimizers.AdamW(
        learning_rate=best_params['learning_rate'],
        weight_decay=0.001
        )
    model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])

    # NOTE(review): early stopping selects weights on the same fold that is
    # scored below, so the fold scores are likely optimistic.
    es = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
    model.fit(X_tr_reduced, y_tr, validation_data=(X_val_reduced, y_val),
              epochs=150, batch_size=best_params['batch_size'], callbacks=[es], verbose=1)

    preds = model.predict(X_val_reduced).flatten()
    corr = pearsonr(y_val, preds)[0]
    cv_scores.append(corr)


print("CV Pearson Scores:", cv_scores)
print("Mean CV Pearson:", np.mean(cv_scores))

Epoch 1/150
[1m2740/2740[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 5ms/step - loss: 1.4919 - mae: 0.7096 - val_loss: 1.6726 - val_mae: 0.7740
Epoch 2/150
[1m2740/2740[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 5ms/step - loss: 1.0011 - mae: 0.5375 - val_loss: 1.7903 - val_mae: 0.8246
Epoch 3/150
[1m2740/2740[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 5ms/step - loss: 0.8412 - mae: 0.4764 - val_loss: 1.8248 - val_mae: 0.8423
Epoch 4/150
[1m2740/2740[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 5ms/step - loss: 0.7520 - mae: 0.4422 - val_loss: 1.8305 - val_mae: 0.8454
Epoch 5/150
[1m2740/2740[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 5ms/step - loss: 0.6862 - mae: 0.4114 - val_loss: 1.9183 - val_mae: 0.8793
Epoch 6/150
[1m2740/2740[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 4ms/step - loss: 0.6378 - mae: 0.3934 - val_loss: 1.8647 - val_mae: 0.8688
Epoch 7/150
[1m2740/2740[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

## Cross-validation with optimised Params

In [17]:
# Refit the scaler on every training row for the final model. The column
# filters chosen during preprocessing (selector, mi_mask) are reused as-is.
X_full_scaled = scaler.fit_transform(X.values)
X_full_reduced = selector.transform(X_full_scaled)[:, mi_mask]

# Because the scaler was just refit, the test matrix is rebuilt here with the
# same scaler and in the training column order. Reusing a test matrix scaled
# before the refit would put train and test on different scales.
X_test_final_feat = test_df[list(X.columns)]
X_test_reduced = selector.transform(
    scaler.transform(X_test_final_feat.values)
)[:, mi_mask]

# Same architecture as in cross-validation, with a larger input-noise level.
model = Sequential()
model.add(GaussianNoise(0.05))
model.add(Dense(best_params['hidden_units1'], input_dim=X_full_reduced.shape[1], activation='swish', kernel_regularizer=l2(0.001)))
model.add(BatchNormalization())
model.add(Dropout(best_params['dropout_rate']))
model.add(Dense(best_params['hidden_units2'], activation='swish', kernel_regularizer=l2(0.001)))
model.add(BatchNormalization())
model.add(Dropout(best_params['dropout_rate']))
model.add(Dense(32, activation='swish', kernel_regularizer=l2(0.001)))
model.add(Dense(1))

optimizer = tf.keras.optimizers.AdamW(
    learning_rate=best_params['learning_rate'],
    weight_decay=0.001
)
model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])

# No validation data or early stopping here: the model trains for the full
# 150 epochs on all rows.
model.fit(X_full_reduced, y, epochs=150, batch_size=best_params['batch_size'], verbose=1)

# Predict on Test set; predictions are clipped to the 1st-99th percentile
# range of the training target.
test_preds = model.predict(X_test_reduced).flatten()
test_preds = np.clip(test_preds, y.quantile(0.01), y.quantile(0.99))

# Submission (falls back to positional ids when the test set has no 'id')
ids = test_df['id'] if 'id' in test_df.columns else range(len(test_preds))
submission = pd.DataFrame({"id": ids, "prediction": test_preds})
submission.to_csv("submission.csv", index=False)
print("✅ Submission saved to submission.csv")

Epoch 1/150
[1m16434/16434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 4ms/step - loss: 1.4342 - mae: 0.6748
Epoch 2/150
[1m16434/16434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 4ms/step - loss: 1.1309 - mae: 0.6343
Epoch 3/150
[1m16434/16434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 4ms/step - loss: 1.0269 - mae: 0.6199
Epoch 4/150
[1m16434/16434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 4ms/step - loss: 0.9737 - mae: 0.6103
Epoch 5/150
[1m16434/16434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 4ms/step - loss: 0.9167 - mae: 0.5966
Epoch 6/150
[1m16434/16434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 4ms/step - loss: 0.8785 - mae: 0.5878
Epoch 7/150
[1m16434/16434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 4ms/step - loss: 0.8264 - mae: 0.5739
Epoch 8/150
[1m16434/16434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 4ms/step - loss: 0.7991 - mae: 0.5662
Epoch 9/150
[1m16434/16434[0m 

KeyboardInterrupt: 

In [None]:
print(submission.head())

# Reload the submission that was written to disk.
df = pd.read_csv('submission.csv')

# Shift every value in the first column up by one.
# (This presumes the first column is numeric; if it is the id column, all ids
# are incremented.)
df.iloc[:, 0] += 1

# Write the shifted copy to its own file, leaving submission.csv untouched.
df.to_csv('updated_s.csv', index=False)

print("\n Updated DataFrame:")
print(df.head())