In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import optuna
import xgboost as xgb
import numpy as np
import gc # Garbage Collection
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error, root_mean_squared_error, r2_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, regularizers
from tensorflow.keras.optimizers import Adam, AdamW


In [2]:
X = np.load("D:/PythonDataSci/cdc project/data/processed/X_train.npy")
y = np.load("D:/PythonDataSci/cdc project/data/processed/y_train.npy")

X_test  = np.load("D:/PythonDataSci/cdc project/data/processed/X_test.npy")
test_ids = np.load("D:/PythonDataSci/cdc project/data/processed/test_ids.npy")

In [3]:
X_train, X_valid, y_train, y_valid = train_test_split(X,y,test_size=0.2,random_state=7)

In [4]:
y_train_log = np.log1p(y_train)
y_valid_log = np.log1p(y_valid)

In [None]:
xgb = XGBRegressor(n_estimators=1000, learning_rate=0.02, max_depth=5, device='cuda',n_jobs=-1)
xgb.fit(X_train, y_train_log)
y_prd_log = xgb.predict(X_valid)

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


In [None]:
try:
    import torch
    def clear_gpu():
        torch.cuda.empty_cache()
except ImportError:
    def clear_gpu():
        pass

def objective(trial):
    gc.collect()
    clear_gpu()
    
    param = {
        'n_estimators': 2000,
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 7), 
        'max_bin': 128, 
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 0.9),
        'subsample': trial.suggest_float('subsample', 0.6, 0.9),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),  
        'device': 'cuda', 
        'n_jobs': -1,
        'random_state': 42,
        'verbosity': 0
    }
    
    try:
        model = xgb.XGBRegressor(**param)
        
        model.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            verbose=False
        )
        
        preds = model.predict(X_valid)
        rmse = np.sqrt(mean_squared_error(y_valid, preds))
        
        del model
        gc.collect()
        clear_gpu()
        
        return rmse

    except Exception as e:
        print(f"Trial failed with error: {e}")
        return float('inf') 

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

[I 2025-12-30 16:49:40,764] A new study created in memory with name: no-name-b9859f81-bad0-4a9e-89bb-45be942e8353
[I 2025-12-30 16:50:08,298] Trial 0 finished with value: 117098.85710800084 and parameters: {'learning_rate': 0.02335863963375386, 'max_depth': 5, 'colsample_bytree': 0.6935916160955015, 'subsample': 0.8613529447924582, 'reg_alpha': 1.1012454434131733, 'reg_lambda': 0.10474540040495997}. Best is trial 0 with value: 117098.85710800084.
[I 2025-12-30 16:51:06,869] Trial 1 finished with value: 115783.07551624287 and parameters: {'learning_rate': 0.0060297511801735825, 'max_depth': 7, 'colsample_bytree': 0.5864705011664011, 'subsample': 0.6295304002577121, 'reg_alpha': 1.916964949702383e-07, 'reg_lambda': 0.47407472304305526}. Best is trial 1 with value: 115783.07551624287.
[I 2025-12-30 16:51:50,897] Trial 2 finished with value: 115058.13487102944 and parameters: {'learning_rate': 0.005126653039481932, 'max_depth': 6, 'colsample_bytree': 0.7196040589241918, 'subsample': 0.6165

[I 2025-12-30 17:02:20,131] Trial 16 finished with value: 112277.49395137033 and parameters: {'learning_rate': 0.011178982956450563, 'max_depth': 5, 'colsample_bytree': 0.6034854651325706, 'subsample': 0.6854569745901181, 'reg_alpha': 0.09426738682137052, 'reg_lambda': 6.156082790201138e-07}. Best is trial 16 with value: 112277.49395137033.


In [7]:
X_tabular = X_train[:, :22]
X_embed   = X_train[:, 22:]

scaler = StandardScaler()
X_embed_scaled = scaler.fit_transform(X_embed)

pca = PCA(n_components=100) 
X_embed_pca = pca.fit_transform(X_embed_scaled)

print(f"Reduced dimensions from {X_embed.shape[1]} to {X_embed_pca.shape[1]}")

X_train_final = np.hstack([X_tabular, X_embed_pca])

X_valid_tabular = X_valid[:, :22]
X_valid_embed   = X_valid[:, 22:]

X_valid_embed_scaled = scaler.transform(X_valid_embed) 
X_valid_embed_pca    = pca.transform(X_valid_embed_scaled)

X_valid_final = np.hstack([X_valid_tabular, X_valid_embed_pca])

Reduced dimensions from 1280 to 100


Using log preds with low dim pca embeddings

In [None]:
# --- OPTIONAL: Clear CUDA Cache ---
try:
    import torch
    def clear_gpu():
        torch.cuda.empty_cache()
except ImportError:
    def clear_gpu():
        pass

def objective(trial):
    # SAFETY: Aggressively clean memory
    gc.collect()
    clear_gpu()
    
    # --- STEP 1: LOG TRANSFORM TARGETS ---
    # We use np.log1p (log(1+x)) to be safe against 0s, 
    # but np.log is fine if all y > 0.
    y_train_log = np.log(y_train) 
    y_valid_log = np.log(y_valid)
    
    param = {
        'n_estimators': 2000,
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 7), 
        'max_bin': 128, 
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 0.9),
        'subsample': trial.suggest_float('subsample', 0.6, 0.9),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),  
        'device': 'cuda', 
        'n_jobs': -1,
        'random_state': 42,
        'verbosity': 0
    }
    
    try:
        model = xgb.XGBRegressor(**param)
        
        # --- STEP 2: TRAIN ON LOG TARGETS ---
        model.fit(
            X_train_final, y_train_log,
            # Validation set must also be log-transformed for internal metrics to make sense
            eval_set=[(X_valid_final, y_valid_log)], 
            verbose=False
        )
        
        # --- STEP 3: PREDICT (Returns Log Values) ---
        preds_log = model.predict(X_valid_final)
        
        # --- STEP 4: INVERSE TRANSFORM (Exp) ---
        # Convert predictions back to original currency/scale
        preds_original = np.exp(preds_log)
        
        # --- STEP 5: CALCULATE RMSE ON ORIGINAL SCALE ---
        # We compare 'preds_original' vs 'y_valid' (the raw data)
        rmse = np.sqrt(mean_squared_error(y_valid, preds_original))
        
        del model
        gc.collect()
        clear_gpu()
        
        return rmse

    except Exception as e:
        print(f"Trial failed with error: {e}")
        return float('inf') 

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=60)

[I 2026-01-01 20:32:12,397] A new study created in memory with name: no-name-e824f5b6-d027-47b5-9b81-b025eedef0ee
[I 2026-01-01 20:32:19,595] Trial 0 finished with value: 116552.13151204056 and parameters: {'learning_rate': 0.05918354950458916, 'max_depth': 5, 'colsample_bytree': 0.8722430587827523, 'subsample': 0.6540008601981402, 'reg_alpha': 7.77403451533842, 'reg_lambda': 2.1880436000663848e-06}. Best is trial 0 with value: 116552.13151204056.
[I 2026-01-01 20:32:23,620] Trial 1 finished with value: 109098.43775233447 and parameters: {'learning_rate': 0.07900024150662363, 'max_depth': 3, 'colsample_bytree': 0.7831561338906451, 'subsample': 0.8373152944220122, 'reg_alpha': 0.003135473024115725, 'reg_lambda': 1.2477268748837542}. Best is trial 1 with value: 109098.43775233447.
[I 2026-01-01 20:32:33,483] Trial 2 finished with value: 113221.69346905212 and parameters: {'learning_rate': 0.042056582591707865, 'max_depth': 6, 'colsample_bytree': 0.8250959362916875, 'subsample': 0.6392477

Trial 15 finished with value: 112267.19670500373 and parameters: {'learning_rate': 0.0486074461648618, 'max_depth': 4, 'colsample_bytree': 0.6148933024076417, 'subsample': 0.7056132339243473, 'reg_alpha': 0.00032494489089217554, 'reg_lambda': 0.0013207991543530502}. Best is trial 15 with value: 112267.19670500373.


In [None]:
best_params = study.best_params

# Add the fixed parameters back (since Optuna only holds the variable ones)
best_params['n_estimators'] = 5000
best_params['device'] = 'cuda'
best_params['n_jobs'] = -1
best_params['random_state'] = 42

final_model = xgb.XGBRegressor(**best_params)
final_model.fit(
    X_train_final, y_train_log, 
    eval_set=[(X_valid_final, y_valid_log)], 
    verbose=100
)
preds_log = final_model.predict(X_valid_final)
prd = np.exp(preds_log)

[0]	validation_0-rmse:0.51004
[100]	validation_0-rmse:0.18793
[200]	validation_0-rmse:0.17174
[300]	validation_0-rmse:0.16783
[400]	validation_0-rmse:0.16560
[500]	validation_0-rmse:0.16417
[600]	validation_0-rmse:0.16310
[700]	validation_0-rmse:0.16256
[800]	validation_0-rmse:0.16200
[900]	validation_0-rmse:0.16165
[1000]	validation_0-rmse:0.16127
[1100]	validation_0-rmse:0.16106
[1200]	validation_0-rmse:0.16077
[1300]	validation_0-rmse:0.16084
[1400]	validation_0-rmse:0.16082
[1500]	validation_0-rmse:0.16085
[1600]	validation_0-rmse:0.16087
[1700]	validation_0-rmse:0.16094
[1800]	validation_0-rmse:0.16077
[1900]	validation_0-rmse:0.16085
[2000]	validation_0-rmse:0.16097
[2100]	validation_0-rmse:0.16096
[2200]	validation_0-rmse:0.16100
[2300]	validation_0-rmse:0.16097
[2400]	validation_0-rmse:0.16095
[2500]	validation_0-rmse:0.16111
[2600]	validation_0-rmse:0.16112
[2700]	validation_0-rmse:0.16120
[2800]	validation_0-rmse:0.16114
[2900]	validation_0-rmse:0.16120
[3000]	validation_0-rm

In [None]:
xgb_rmse = root_mean_squared_error(y_valid, prd)
xgb_r2 = r2_score(y_true=y_valid, y_pred=prd)
print("XGBoost")
print("RMSE: ", xgb_rmse)
print("R2 score: ", xgb_r2)

XGBoost
RMSE:  107008.3203125
R2 score:  0.9055907130241394


In [135]:
from sklearn.metrics import root_mean_squared_error, r2_score
xgb_rmse = root_mean_squared_error(y_valid, prd)
xgb_r2 = r2_score(y_true=y_valid, y_pred=prd)
print("XGBoost")
print("RMSE: ", xgb_rmse)
print("R2 score: ", xgb_r2)

XGBoost
RMSE:  108720.703125
R2 score:  0.9025450348854065


Lets try ANN

In [8]:
input_tab = layers.Input(shape=(22,), name='Tabular_Input')
input_emb = layers.Input(shape=(100,), name='Embedding_Input')

x_tab = layers.Dense(32, kernel_regularizer=regularizers.l2(0.001))(input_tab)
x_tab = layers.BatchNormalization()(x_tab)
x_tab = layers.Activation('relu')(x_tab)
x_tab = layers.Dropout(0.2)(x_tab) 

x_emb = layers.Dense(64, kernel_regularizer=regularizers.l2(0.001))(input_emb)
x_emb = layers.BatchNormalization()(x_emb)
x_emb = layers.Activation('relu')(x_emb)
x_emb = layers.Dropout(0.4)(x_emb)

concat = layers.Concatenate()([x_tab, x_emb])

x = layers.Dense(64, kernel_regularizer=regularizers.l2(0.001))(concat)
x = layers.BatchNormalization()(x)
x = layers.Activation('relu')(x)
x = layers.Dropout(0.4)(x)
x = layers.Dense(32, kernel_regularizer=regularizers.l2(0.001))(x)
x = layers.Activation('relu')(x)

output = layers.Dense(1, name='output')(x)

model = models.Model(inputs=[input_tab, input_emb], outputs=output)

optimizer = Adam(learning_rate=0.001) 
model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])

my_callbacks = [
    callbacks.EarlyStopping(patience=20, restore_best_weights=True, monitor='val_loss'),
    callbacks.ReduceLROnPlateau(factor=0.5, patience=5, monitor='val_loss')
]

history = model.fit(
    [X_tabular, X_embed_pca], y_train,
    validation_data=([X_valid_tabular, X_valid_embed_pca], y_valid),
    epochs=200,
    batch_size=32,
    callbacks=my_callbacks,
    verbose=1
)

Epoch 1/200
[1m406/406[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 419791536128.0000 - mae: 536569.3750 - val_loss: 411199700992.0000 - val_mae: 538634.6875 - learning_rate: 0.0010
Epoch 2/200
[1m406/406[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 417533591552.0000 - mae: 535234.6250 - val_loss: 410343473152.0000 - val_mae: 538639.0625 - learning_rate: 0.0010
Epoch 3/200
[1m406/406[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 410690519040.0000 - mae: 531011.3125 - val_loss: 403659128832.0000 - val_mae: 534504.6875 - learning_rate: 0.0010
Epoch 4/200
[1m406/406[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 397624442880.0000 - mae: 522674.9375 - val_loss: 381140893696.0000 - val_mae: 520352.2500 - learning_rate: 0.0010
Epoch 5/200
[1m406/406[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - loss: 377707003904.0000 - mae: 509510.2500 - val_loss: 346579927040.0000 - 

In [9]:
y_pred_train = model.predict([X_tabular, X_embed_pca]).flatten()
y_pred_valid = model.predict([X_valid_tabular, X_valid_embed_pca]).flatten()

train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
valid_rmse = np.sqrt(mean_squared_error(y_valid, y_pred_valid))

train_r2 = r2_score(y_train, y_pred_train)
valid_r2 = r2_score(y_valid, y_pred_valid)

print("-" * 30)
print("ANN PERFORMANCE REPORT")
print("-" * 30)
print(f"Train RMSE : {train_rmse:,.0f}")
print(f"Valid RMSE : {valid_rmse:,.0f}")
print("-" * 30)
print(f"Train R²   : {train_r2:.4f}")
print(f"Valid R²   : {valid_r2:.4f}")
print("-" * 30)

[1m406/406[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
------------------------------
ANN PERFORMANCE REPORT
------------------------------
Train RMSE : 160,921
Valid RMSE : 185,616
------------------------------
Train R²   : 0.8037
Valid R²   : 0.7159
------------------------------


Lets try with full embeddings now

In [12]:
input_dim_emb = X_embed.shape[1]
input_dim_tab = X_tabular.shape[1] 

# --- INPUTS ---
input_emb = layers.Input(shape=(input_dim_emb,), name='Full_Embeddings')
input_tab = layers.Input(shape=(input_dim_tab,), name='Tabular_Data')

x_emb = layers.Dense(512)(input_emb)
x_emb = layers.BatchNormalization()(x_emb) 
x_emb = layers.Activation('swish')(x_emb)  # 'swish' often beats 'relu' for deep nets
x_emb = layers.Dropout(0.3)(x_emb) 

x_emb = layers.Dense(256)(x_emb)
x_emb = layers.BatchNormalization()(x_emb)
x_emb = layers.Activation('swish')(x_emb)
x_emb = layers.Dropout(0.3)(x_emb)

x_emb = layers.Dense(128)(x_emb)
x_emb = layers.BatchNormalization()(x_emb)
x_emb = layers.Activation('swish')(x_emb)

x_tab = layers.Dense(64)(input_tab)
x_tab = layers.BatchNormalization()(x_tab)
x_tab = layers.Activation('swish')(x_tab)
x_tab = layers.Dropout(0.1)(x_tab) # Low dropout, these features are precious

concat = layers.Concatenate()([x_tab, x_emb])

x = layers.Dense(256)(concat)
x = layers.BatchNormalization()(x)
x = layers.Activation('swish')(x)
x = layers.Dropout(0.4)(x) # Higher dropout here to prevent memorization

x = layers.Dense(128)(x)
x = layers.BatchNormalization()(x)
x = layers.Activation('swish')(x)
x = layers.Dropout(0.3)(x)

x = layers.Dense(64)(x)
x = layers.Activation('swish')(x)

output = layers.Dense(1, name='output')(x)

model = models.Model(inputs=[input_emb, input_tab], outputs=output)

optimizer = AdamW(learning_rate=0.001, weight_decay=0.004)

model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])

my_callbacks = [
    callbacks.EarlyStopping(patience=15, restore_best_weights=True, monitor='val_loss'),
    callbacks.ReduceLROnPlateau(factor=0.2, patience=5, monitor='val_loss', verbose=1)
]

history = model.fit(
    [X_embed, X_tabular], y_train,
    validation_data=([X_valid_embed, X_valid_tabular], y_valid),
    epochs=150, 
    batch_size=64,
    callbacks=my_callbacks,
    verbose=1
)

Epoch 1/150
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 17ms/step - loss: 419799105536.0000 - mae: 536602.3750 - val_loss: 413168631808.0000 - val_mae: 540466.8750 - learning_rate: 0.0010
Epoch 2/150
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - loss: 418310848512.0000 - mae: 535853.7500 - val_loss: 411260354560.0000 - val_mae: 539407.1250 - learning_rate: 0.0010
Epoch 3/150
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - loss: 413518200832.0000 - mae: 533283.1250 - val_loss: 408183570432.0000 - val_mae: 538097.5000 - learning_rate: 0.0010
Epoch 4/150
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - loss: 403593592832.0000 - mae: 527560.4375 - val_loss: 402980044800.0000 - val_mae: 535557.4375 - learning_rate: 0.0010
Epoch 5/150
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - loss: 387250814976.0000 - mae: 517904.3750 - val_loss: 398591426560.00

In [13]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

# 1. Generate Predictions
# Pass both inputs: [Full Embeddings, Tabular Data]
print("Generating predictions...")
y_pred_train = model.predict([X_embed, X_tabular], verbose=0).flatten()
y_pred_valid = model.predict([X_valid_embed, X_valid_tabular], verbose=0).flatten()

train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
valid_rmse = np.sqrt(mean_squared_error(y_valid, y_pred_valid))

train_r2 = r2_score(y_train, y_pred_train)
valid_r2 = r2_score(y_valid, y_pred_valid)

train_mae = mean_absolute_error(y_train, y_pred_train)
valid_mae = mean_absolute_error(y_valid, y_pred_valid)

print("\n" + "="*40)
print(f"   COMPLEX ANN EVALUATION REPORT")
print("="*40)

print(f"{'Metric':<10} | {'Train':<12} | {'Validation':<12}")
print("-" * 40)
print(f"{'RMSE':<10} | {train_rmse:,.2f}    | {valid_rmse:,.2f}")
print(f"{'MAE':<10} | {train_mae:,.2f}    | {valid_mae:,.2f}")
print(f"{'R²':<10} | {train_r2:.4f}      | {valid_r2:.4f}")
print("-" * 40)

Generating predictions...

   COMPLEX ANN EVALUATION REPORT
Metric     | Train        | Validation  
----------------------------------------
RMSE       | 96,440.93    | 172,524.42
MAE        | 70,165.66    | 108,924.51
R²         | 0.9295      | 0.7546
----------------------------------------
