In [1]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/amlc2025-dataset/test_combined_embeddings.npy
/kaggle/input/amlc2025-dataset/sample_test.csv
/kaggle/input/amlc2025-dataset/sample_test_out.csv
/kaggle/input/amlc2025-dataset/combined_embeddings.npy
/kaggle/input/amlc2025-dataset/train.csv
/kaggle/input/amlc2025-dataset/test.csv
/kaggle/input/amlc2025-dataset/test_out_final.csv
/kaggle/input/amlc2025-dataset/train_out.csv


In [2]:
import pandas as pd
import numpy as np
import scipy.sparse
import gc

In [3]:
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge

In [4]:
import xgboost as xgb
import lightgbm as lgb

In [5]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Concatenate, Dropout, Bidirectional, LSTM, GRU
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

2025-10-13 14:26:32.313789: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-10-13 14:26:32.335063: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-10-13 14:26:32.341644: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [6]:
from sentence_transformers import SentenceTransformer

In [7]:
import faiss

In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

In [9]:
class CFG:
    N_SPLITS = 5
    RANDOM_STATE = 42
    DATA_PATH = '/kaggle/input/amlc2025-dataset/train.csv'
    TEST_DATA_PATH = '/kaggle/input/amlc2025-dataset/test.csv'
    
    # Text Embedding Models
    # Using a diverse pair: one for general semantics, one optimized for similarity
    ST_MODELS = [
        'sentence-transformers/all-MiniLM-L12-v2',
        'BAAI/bge-base-en-v1.5',                     
        'sentence-transformers/all-distilroberta-v1',
        'sentence-transformers/paraphrase-mpnet-base-v2'
    ]
    BGE_MODEL_INDEX = 1 # The BGE model is the second in the list, at index 1
    
    # KNN Feature Generation
    KNN_N_NEIGHBORS = 10
    
    # DenseNet (1D CNN) Config
    # CNN_VOCAB_SIZE = 30000
    # CNN_MAX_LEN = 60
    # CNN_EMBEDDING_DIM = 128
    DL_VOCAB_SIZE = 30000
    DL_MAX_LEN = 60

In [10]:
# Define the SMAPE metric function
def smape(y_true, y_pred):
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    ratio = np.where(denominator == 0, 0, numerator / denominator)
    return np.mean(ratio) * 100

In [11]:
def create_extreme_engineered_features(df):
    df_out = pd.DataFrame(index=df.index)
    text_col = 'catalog_content'
    
    # Meta Features
    df_out[f'{text_col}_length'] = df[text_col].str.len()
    df_out[f'{text_col}_word_count'] = df[text_col].str.split().str.len()
    df_out[f'{text_col}_capital_ratio'] = df[text_col].apply(lambda t: sum(1 for c in t if c.isupper()) / (len(t) + 1e-9))
    
    # Entity Extraction (Example lists - expand these!)
    BRANDS = ['sony', 'samsung', 'nike', 'apple', 'kitchenaid']
    MATERIALS = ['cotton', 'leather', 'silk', 'gold', 'nylon', 'polyester', 'silver']
    
    text_lower = df[text_col].str.lower()
    df_out['has_brand'] = text_lower.apply(lambda t: any(brand in t for brand in BRANDS)).astype(int)
    for material in MATERIALS:
        df_out[f'has_{material}'] = text_lower.str.contains(material).astype(int)
        
    # Regex Features
    df_out['extracted_inch'] = text_lower.str.extract(r'(\d+\.?\d*)\s*(inch|\"|in\b)').iloc[:, 0].astype(float)
    df_out['extracted_gb'] = text_lower.str.extract(r'(\d+)\s*gb').iloc[:, 0].astype(float)
    
    # Fill NaNs from regex with a neutral value like 0
    return df_out.fillna(0)

In [12]:
# Replace your old KNN function with this improved one.
# It's now located in SECTION 4 for clarity.

# in SECTION 4.3
def generate_knn_features(index_embeds, y_index, query_embeds, k):
    """
    Generates KNN features.
    'index_embeds' are used to build the search space.
    'query_embeds' are the items we want to find neighbors for.
    """
    d = index_embeds.shape[1]
    index = faiss.IndexFlatL2(d)
    
    # Check if GPU is available
    try:
        res = faiss.StandardGpuResources()
        gpu_index = faiss.index_cpu_to_gpu(res, 0, index)
        gpu_index.add(index_embeds.astype(np.float32))
        search_index = gpu_index
    except AttributeError:
        # Fallback to CPU if faiss-gpu is not installed or fails
        print("   FAISS-GPU not found or failed, falling back to CPU.")
        index.add(index_embeds.astype(np.float32))
        search_index = index

    # If query is the same as index, we are doing self-search. K+1 and ignore first result.
    is_self_search = np.array_equal(index_embeds, query_embeds)
    if is_self_search:
        _, I = search_index.search(query_embeds.astype(np.float32), k + 1)
        I = I[:, 1:] # Exclude the first column which is the item itself
    else:
        _, I = search_index.search(query_embeds.astype(np.float32), k)
    
    neighbor_prices = y_index[I]
    
    knn_feats = np.zeros((len(query_embeds), 3))
    knn_feats[:, 0] = np.mean(neighbor_prices, axis=1)
    knn_feats[:, 1] = np.median(neighbor_prices, axis=1)
    knn_feats[:, 2] = np.std(neighbor_prices, axis=1)
    
    return knn_feats

print("3/3: KNN feature generation function is ready.")

3/3: KNN feature generation function is ready.


In [13]:
df = pd.read_csv(CFG.DATA_PATH)

In [14]:
extreme_engineered_features = create_extreme_engineered_features(df)

In [15]:
from tensorflow.keras.layers import Bidirectional, LSTM, GRU, Attention

In [16]:
# Define the custom Attention layer
class Attention(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super(Attention, self).__init__(**kwargs)
    def build(self, input_shape):
        self.W = self.add_weight(name="att_weight", shape=(input_shape[-1], 1), initializer="normal")
        self.b = self.add_weight(name="att_bias", shape=(input_shape[1], 1), initializer="zeros")
        super(Attention, self).build(input_shape)
    def call(self, x):
        et = tf.keras.backend.squeeze(tf.keras.backend.tanh(tf.keras.backend.dot(x, self.W) + self.b), axis=-1)
        at = tf.keras.backend.softmax(et)
        at = tf.keras.backend.expand_dims(at, axis=-1)
        output = x * at
        return tf.keras.backend.sum(output, axis=1)

# Model 3: LSTM with Attention
def create_lstm_attention_model(vocab_size, max_len, embedding_dim):
    inputs = Input(shape=(max_len,))
    embedding = Embedding(vocab_size, embedding_dim)(inputs)
    bilstm = Bidirectional(LSTM(128, return_sequences=True))(embedding)
    attention = Attention()(bilstm)
    dense = Dense(64, activation='relu')(attention)
    outputs = Dense(1)(dense)
    return Model(inputs, outputs)

# Model 4: 1D CNN with Multiple Kernels
def create_multikernel_cnn_model(vocab_size, max_len, embedding_dim):
    inputs = Input(shape=(max_len,))
    embedding = Embedding(vocab_size, embedding_dim)(inputs)
    
    conv_layers = []
    for kernel_size in [2, 3, 5]:
        conv = Conv1D(filters=64, kernel_size=kernel_size, activation='relu')(embedding)
        pool = GlobalMaxPooling1D()(conv)
        conv_layers.append(pool)
    
    concatenated = Concatenate()(conv_layers)
    dense = Dense(128, activation='relu')(concatenated)
    dropout = Dropout(0.5)(dense)
    outputs = Dense(1)(dropout)
    return Model(inputs, outputs)

# Model 5: Multi-Input Neural Network
def create_multi_input_model(vocab_size, max_len, embedding_dim, num_tabular_feats):
    # Text input branch
    text_input = Input(shape=(max_len,), name='text_input')
    embedding = Embedding(vocab_size, embedding_dim)(text_input)
    gru = GRU(64)(embedding)
    text_branch = Dense(32, activation='relu')(gru)
    
    # Tabular input branch
    tabular_input = Input(shape=(num_tabular_feats,), name='tabular_input')
    tabular_branch = Dense(32, activation='relu')(tabular_input)
    
    # Merge branches
    concatenated = Concatenate()([text_branch, tabular_branch])
    dense = Dense(64, activation='relu')(concatenated)
    outputs = Dense(1)(dense)
    
    return Model(inputs=[text_input, tabular_input], outputs=outputs)

print("New model architectures are defined.")

New model architectures are defined.


In [17]:
combined_embeddings = np.load('/kaggle/input/amlc2025-dataset/' + 'combined_embeddings.npy').astype(np.float32)

In [18]:
oof_xgb = np.zeros(len(df))
oof_lgbm = np.zeros(len(df))
oof_lstm_att = np.zeros(len(df))
oof_cnn_multi = np.zeros(len(df))
oof_nn_multi_input = np.zeros(len(df))
X_text = df['catalog_content']
y = df['price'].values

In [19]:
kf = KFold(n_splits=CFG.N_SPLITS, shuffle=True, random_state=CFG.RANDOM_STATE)

In [20]:
# Pre-calculate the dimensions of each sentence transformer model to find the BGE slice
print("Calculating embedding dimensions for KNN slicing...")
model_dims = [SentenceTransformer(m).get_sentence_embedding_dimension() for m in CFG.ST_MODELS]
bge_start_index = sum(model_dims[:CFG.BGE_MODEL_INDEX])
bge_end_index = bge_start_index + model_dims[CFG.BGE_MODEL_INDEX]
print(f"BGE embeddings will be sliced from column {bge_start_index} to {bge_end_index}.")


Calculating embedding dimensions for KNN slicing...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

BGE embeddings will be sliced from column 384 to 1152.


In [21]:
for fold, (train_idx, val_idx) in enumerate(kf.split(df)):
    print(f"\n===== FOLD {fold+1} / {CFG.N_SPLITS} =====")
    
    # --- Split All Data for this Fold ---
    X_train_text, X_val_text = df['catalog_content'].iloc[train_idx], df['catalog_content'].iloc[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    X_train_eng, X_val_eng = extreme_engineered_features.iloc[train_idx].values.astype(np.float32), extreme_engineered_features.iloc[val_idx].values.astype(np.float32)
    X_train_embed, X_val_embed = combined_embeddings[train_idx], combined_embeddings[val_idx]

    # --- Model 1: XGBoost (Keyword Expert) ---
    print("1/5: Training XGBoost...")
    tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=25000)
    X_train_tfidf = tfidf.fit_transform(X_train_text)
    X_val_tfidf = tfidf.transform(X_val_text)
    
    X_train_xgb = scipy.sparse.hstack((X_train_tfidf, X_train_eng))
    X_val_xgb = scipy.sparse.hstack((X_val_tfidf, X_val_eng))

    # Using default but robust parameters
    xgb_model = xgb.XGBRegressor(tree_method='hist', device='cuda')
    xgb_model.fit(X_train_xgb, y_train, eval_set=[(X_val_xgb, y_val)], early_stopping_rounds=50, verbose=False)
    oof_xgb[val_idx] = xgb_model.predict(X_val_xgb)
    print(f"   Fold {fold+1} XGB SMAPE: {smape(y_val, oof_xgb[val_idx]):.4f}%")

    # --- Model 2: LightGBM (Semantic & Market Expert) ---
    print("2/5: Training LightGBM...")
    bge_train_embeds = X_train_embed[:, bge_start_index:bge_end_index]
    bge_val_embeds = X_val_embed[:, bge_start_index:bge_end_index]
    
    knn_feats_train = generate_knn_features(bge_train_embeds, y_train, bge_train_embeds, CFG.KNN_N_NEIGHBORS)
    knn_feats_val = generate_knn_features(bge_train_embeds, y_train, bge_val_embeds, CFG.KNN_N_NEIGHBORS)

    X_train_lgbm = np.hstack([X_train_embed, X_train_eng, knn_feats_train])
    X_val_lgbm = np.hstack([X_val_embed, X_val_eng, knn_feats_val])
    
    # Using default but robust parameters
    lgbm_model = lgb.LGBMRegressor(n_estimators=1000, learning_rate=0.05, num_leaves=40, subsample=0.8, colsample_bytree=0.8, device='gpu', random_state=CFG.RANDOM_STATE)
    lgbm_model.fit(X_train_lgbm, y_train, eval_set=[(X_val_lgbm, y_val)], callbacks=[lgb.early_stopping(50, verbose=False)])
    oof_lgbm[val_idx] = lgbm_model.predict(X_val_lgbm)
    print(f"   Fold {fold+1} LGBM SMAPE: {smape(y_val, oof_lgbm[val_idx]):.4f}%")

    # --- Tokenize text for all DL models (do this once per fold) ---
    print("   Tokenizing text for DL models...")
    tokenizer = Tokenizer(num_words=CFG.DL_VOCAB_SIZE, oov_token="<OOV>")
    tokenizer.fit_on_texts(X_train_text)
    X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train_text), maxlen=CFG.DL_MAX_LEN)
    X_val_seq = pad_sequences(tokenizer.texts_to_sequences(X_val_text), maxlen=CFG.DL_MAX_LEN)

    # Common settings for DL model training
    es = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, verbose=0)
    dl_batch_size = 256
    dl_epochs = 30

    # --- Model 3: LSTM with Attention ---
    print("3/5: Training LSTM w/ Attention...")
    model_lstm = create_lstm_attention_model(CFG.DL_VOCAB_SIZE, CFG.DL_MAX_LEN, 128)
    model_lstm.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')
    model_lstm.fit(X_train_seq, y_train, validation_data=(X_val_seq, y_val),
                   epochs=dl_epochs, batch_size=dl_batch_size, callbacks=[es], verbose=0)
    oof_lstm_att[val_idx] = model_lstm.predict(X_val_seq, batch_size=dl_batch_size*2).squeeze()
    print(f"   Fold {fold+1} LSTM w/ Attention SMAPE: {smape(y_val, oof_lstm_att[val_idx]):.4f}%")
    del model_lstm; gc.collect()

    # --- Model 4: CNN with Multi-Kernel ---
    print("4/5: Training Multi-Kernel CNN...")
    model_cnn = create_multikernel_cnn_model(CFG.DL_VOCAB_SIZE, CFG.DL_MAX_LEN, 128)
    model_cnn.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')
    model_cnn.fit(X_train_seq, y_train, validation_data=(X_val_seq, y_val),
                  epochs=dl_epochs, batch_size=dl_batch_size, callbacks=[es], verbose=0)
    oof_cnn_multi[val_idx] = model_cnn.predict(X_val_seq, batch_size=dl_batch_size*2).squeeze()
    print(f"   Fold {fold+1} Multi-Kernel CNN SMAPE: {smape(y_val, oof_cnn_multi[val_idx]):.4f}%")
    del model_cnn; gc.collect()
    
    # --- Model 5: Multi-Input NN ---
    print("5/5: Training Multi-Input NN...")
    # Assemble the specific tabular features for this model
    tabular_train_feats = np.hstack([X_train_eng, knn_feats_train])
    tabular_val_feats = np.hstack([X_val_eng, knn_feats_val])
    
    model_multi_input = create_multi_input_model(CFG.DL_VOCAB_SIZE, CFG.DL_MAX_LEN, 128, tabular_train_feats.shape[1])
    model_multi_input.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')
    
    model_multi_input.fit([X_train_seq, tabular_train_feats], y_train,
                          validation_data=([X_val_seq, tabular_val_feats], y_val),
                          epochs=dl_epochs, batch_size=dl_batch_size, callbacks=[es], verbose=0)
    
    preds = model_multi_input.predict([X_val_seq, tabular_val_feats], batch_size=dl_batch_size*2)
    oof_nn_multi_input[val_idx] = preds.squeeze()
    print(f"   Fold {fold+1} Multi-Input NN SMAPE: {smape(y_val, oof_nn_multi_input[val_idx]):.4f}%")
    del model_multi_input; gc.collect()

    # --- Clean up at the end of the fold ---
    print("--- Fold Complete. Cleaning memory. ---")
    del X_train_text, X_val_text, y_train, y_val, X_train_eng, X_val_eng, X_train_embed, X_val_embed
    del X_train_tfidf, X_val_tfidf, X_train_xgb, X_val_xgb, xgb_model
    del bge_train_embeds, bge_val_embeds, knn_feats_train, knn_feats_val, X_train_lgbm, X_val_lgbm, lgbm_model
    del tokenizer, X_train_seq, X_val_seq, tabular_train_feats, tabular_val_feats
    gc.collect()


===== FOLD 1 / 5 =====
1/5: Training XGBoost...




   Fold 1 XGB SMAPE: 63.3805%
2/5: Training LightGBM...
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 687204
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 2703
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...




[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 2694 dense feature groups (154.27 MB) transferred to GPU in 0.146713 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 23.598634
   Fold 1 LGBM SMAPE: 61.3546%
   Tokenizing text for DL models...
3/5: Training LSTM w/ Attention...


I0000 00:00:1760365928.275846     235 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1760365928.276543     235 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355


[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 206ms/step
   Fold 1 LSTM w/ Attention SMAPE: 61.3010%
4/5: Training Multi-Kernel CNN...
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 31ms/step
   Fold 1 Multi-Kernel CNN SMAPE: 61.5148%
5/5: Training Multi-Input NN...
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 54ms/step
   Fold 1 Multi-Input NN SMAPE: nan%
--- Fold Complete. Cleaning memory. ---

===== FOLD 2 / 5 =====
1/5: Training XGBoost...




   Fold 2 XGB SMAPE: 62.2772%
2/5: Training LightGBM...
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 687204
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 2703
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 2694 dense feature groups (154.27 MB) transferred to GPU in 0.149166 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 23.620979
   Fold 2 LGBM SMAPE: 60.5203%
   Tokenizing text for DL models...
3/5: Training LSTM w/ Attention...
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 207ms/step
   Fold 2 LSTM w/ Attention SMAPE: 67.8961%
4/5: Training Multi-Kernel CNN...
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 31ms/step
   Fold 2 Multi-Kernel CNN SMAPE: 61.6807



   Fold 3 XGB SMAPE: 62.6807%
2/5: Training LightGBM...
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 687203
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 2703
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 2694 dense feature groups (154.27 MB) transferred to GPU in 0.148882 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 23.709702
   Fold 3 LGBM SMAPE: 60.6524%
   Tokenizing text for DL models...
3/5: Training LSTM w/ Attention...
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 189ms/step
   Fold 3 LSTM w/ Attention SMAPE: 62.2922%
4/5: Training Multi-Kernel CNN...
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 31ms/step
   Fold 3 Multi-Kernel CNN SMAPE: 61.2121



   Fold 4 XGB SMAPE: 61.9190%
2/5: Training LightGBM...
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 687204
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 2703
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 2694 dense feature groups (154.27 MB) transferred to GPU in 0.148355 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 23.677128
   Fold 4 LGBM SMAPE: 62.5231%
   Tokenizing text for DL models...
3/5: Training LSTM w/ Attention...
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 188ms/step
   Fold 4 LSTM w/ Attention SMAPE: 61.3268%
4/5: Training Multi-Kernel CNN...
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 34ms/step
   Fold 4 Multi-Kernel CNN SMAPE: 60.1618



   Fold 5 XGB SMAPE: 73.7448%
2/5: Training LightGBM...
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 687202
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 2703
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 2694 dense feature groups (154.27 MB) transferred to GPU in 0.147139 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 23.631827
   Fold 5 LGBM SMAPE: 60.4405%
   Tokenizing text for DL models...
3/5: Training LSTM w/ Attention...
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 189ms/step
   Fold 5 LSTM w/ Attention SMAPE: 64.2732%
4/5: Training Multi-Kernel CNN...
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 31ms/step
   Fold 5 Multi-Kernel CNN SMAPE: 60.8016

In [22]:
print("\n--- Generating Final Submission File ---")

EXTERNAL_TRAIN_PREDS_PATH = '/kaggle/input/amlc2025-dataset/train_out.csv'
EXTERNAL_PRED_COLUMN_NAME = 'price' # The name of the prediction column in your CSV
ID_COLUMN = 'sample_id'

df_train_out = pd.read_csv(EXTERNAL_TRAIN_PREDS_PATH)

merged_train_preds = pd.merge(df[[ID_COLUMN]], df_train_out[[ID_COLUMN, EXTERNAL_PRED_COLUMN_NAME]], on=ID_COLUMN, how='left')
merged_train_preds[EXTERNAL_PRED_COLUMN_NAME] = merged_train_preds[EXTERNAL_PRED_COLUMN_NAME].fillna(0)

# Extract the predictions as a NumPy array in the correct order
external_oof_preds = merged_train_preds[EXTERNAL_PRED_COLUMN_NAME].values

print(f"External predictions merged successfully. Shape: {external_oof_preds.shape}")

X_meta = np.column_stack((
    oof_xgb, 
    oof_lgbm, 
    oof_lstm_att, 
    oof_cnn_multi, 
    oof_nn_multi_input,
    external_oof_preds  # Adding the 6th feature
))
print(f"Final meta-features shape: {X_meta.shape}")


--- Generating Final Submission File ---
External predictions merged successfully. Shape: (75000,)
Final meta-features shape: (75000, 6)


In [24]:
# print(f"Meta-features shape: {X_meta.shape}")

# # --- 2. Train the Ridge Regressor ---
# # A simple, robust Ridge model is an excellent choice for a meta-model.
# # It learns the optimal linear combination of the base model predictions.
# meta_model = Ridge(alpha=1.0)
# meta_model.fit(X_meta, y)

# print("Meta-model trained successfully.")

# # --- 3. Evaluate the Full Ensemble ---
# # Predict on the OOFs to get the final, blended cross-validation score.
# # This is the most reliable estimate of your leaderboard score.
# final_oof_preds = meta_model.predict(X_meta)
# final_oof_smape = smape(y, final_oof_preds)

# print(f"\nOverall OOF SMAPE of the full 5-model ensemble: {final_oof_smape:.4f}%")

# # Print the weights to see which models were most important
# model_names = ['XGB', 'LGBM', 'LSTM_Att', 'CNN_Multi', 'NN_Multi_Input', 'image_model']
# print("Meta-Model Weights:")
# for name, coef in zip(model_names, meta_model.coef_):
#     print(f"   {name}: {coef:.4f}")



In [26]:
# --- 4. Train the Ridge Regressor ---
# --- NEW: DIAGNOSTIC AND FIX BLOCK ---
print("\n--- Checking for NaNs in OOF predictions ---")

# Diagnostic: Find out which model is causing the problem
model_names = ['XGB', 'LGBM', 'LSTM_Att', 'CNN_Multi', 'NN_Multi_Input', 'External']
for i, name in enumerate(model_names):
    nan_count = np.isnan(X_meta[:, i]).sum()
    if nan_count > 0:
        print(f"WARNING: Found {nan_count} NaN values in OOF predictions for model: {name}")

# Immediate Fix: Impute NaNs with the column median
if np.isnan(X_meta).any():
    print("Imputing NaN values with the median of each column...")
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(strategy='median')
    X_meta = imputer.fit_transform(X_meta)
    print("NaNs imputed successfully.")

# --- 2. Train the Ridge Regressor (this will now work) ---
meta_model = Ridge(alpha=1.0)
meta_model.fit(X_meta, y)
print("Meta-model trained successfully.")



--- Checking for NaNs in OOF predictions ---
Imputing NaN values with the median of each column...
NaNs imputed successfully.
Meta-model trained successfully.


In [27]:
# --- 5. Evaluate the Full Ensemble ---
final_oof_preds = meta_model.predict(X_meta)
final_oof_smape = smape(y, final_oof_preds)
print(f"\nOverall OOF SMAPE of the full 6-model ensemble: {final_oof_smape:.4f}%")

# Print the weights to see the importance of the external model
model_names = ['XGB', 'LGBM', 'LSTM_Att', 'CNN_Multi', 'NN_Multi_Input', 'External_Model']
print("Meta-Model Weights:")
for name, coef in zip(model_names, meta_model.coef_):
    print(f"   {name}: {coef:.4f}")


Overall OOF SMAPE of the full 6-model ensemble: 47.3280%
Meta-Model Weights:
   XGB: 0.3567
   LGBM: 0.2251
   LSTM_Att: 0.0009
   CNN_Multi: 0.1921
   NN_Multi_Input: -0.0000
   External_Model: 0.4778


In [28]:
df_test = pd.read_csv(CFG.TEST_DATA_PATH)
test_extreme_engineered_features = create_extreme_engineered_features(df_test)

In [29]:
test_combined_embeddings = np.load('/kaggle/input/amlc2025-dataset/' + 'test_combined_embeddings.npy').astype(np.float32)

In [30]:
full_train_engineered = create_extreme_engineered_features(df)
test_engineered = create_extreme_engineered_features(df_test)

In [31]:
# TF-IDF Features (CRITICAL: Fit on full train, then transform both)
final_tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=25000)
full_train_tfidf = final_tfidf_vectorizer.fit_transform(df['catalog_content'])
X_test_tfidf = final_tfidf_vectorizer.transform(df_test['catalog_content'])


In [32]:
# KNN Features (Index on full train, query for both train and test)
full_train_bge_embeds = combined_embeddings[:, bge_start_index:bge_end_index]
test_bge_embeds = test_combined_embeddings[:, bge_start_index:bge_end_index]
final_train_knn_feats = generate_knn_features(full_train_bge_embeds, y, full_train_bge_embeds, CFG.KNN_N_NEIGHBORS).astype(np.float32)
test_knn_feats = generate_knn_features(full_train_bge_embeds, y, test_bge_embeds, CFG.KNN_N_NEIGHBORS).astype(np.float32)

In [33]:
# Tokenizer for DL Models (CRITICAL: Fit on full train, then transform both)
final_tokenizer = Tokenizer(num_words=CFG.DL_VOCAB_SIZE, oov_token="<OOV>")
final_tokenizer.fit_on_texts(df['catalog_content'])
full_train_seq = pad_sequences(final_tokenizer.texts_to_sequences(df['catalog_content']), maxlen=CFG.DL_MAX_LEN)
X_test_seq = pad_sequences(final_tokenizer.texts_to_sequences(df_test['catalog_content']), maxlen=CFG.DL_MAX_LEN)


# --- 3. Re-train Base Models on 100% of the Training Data and Predict on Test ---
print("\n3/4: Re-training base models on full data and generating test predictions...")


3/4: Re-training base models on full data and generating test predictions...


In [35]:
# --- Final Model 1: XGBoost ---
print("   Training final XGBoost model...")
full_train_xgb = scipy.sparse.hstack((full_train_tfidf, full_train_engineered.values))
X_test_xgb = scipy.sparse.hstack((X_test_tfidf, test_engineered.values))
# Use the tuned hyperparameters from your CFG class
# final_xgb_model = xgb.XGBRegressor(**CFG.XGB_PARAMS)
final_xgb_model = xgb.XGBRegressor(tree_method='hist', device='cuda')
# We don't use early stopping here, we want the model to train for all n_estimators
final_xgb_model.fit(full_train_xgb, y)
test_preds_xgb = final_xgb_model.predict(X_test_xgb)
print("   ...done.")


   Training final XGBoost model...
   ...done.


In [37]:
# --- Final Model 2: LightGBM ---
print("   Training final LightGBM model...")
full_train_lgbm = np.hstack([combined_embeddings, full_train_engineered.values, final_train_knn_feats])
X_test_lgbm = np.hstack([test_combined_embeddings, test_engineered.values, test_knn_feats])
# final_lgbm_model = lgb.LGBMRegressor(**CFG.LGBM_PARAMS)
final_lgbm_model = lgb.LGBMRegressor(device='gpu')
final_lgbm_model.fit(full_train_lgbm, y)
test_preds_lgbm = final_lgbm_model.predict(X_test_lgbm)
print("   ...done.")

   Training final LightGBM model...
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 687208
[LightGBM] [Info] Number of data points in the train set: 75000, number of used features: 2703
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 2694 dense feature groups (192.83 MB) transferred to GPU in 0.185837 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 23.647654
   ...done.


In [39]:
# # --- Final Model 3: LSTM w/ Attention ---
# print("   Training final LSTM w/ Attention model...")
# # lstm_params = CFG.LSTM_ATT_PARAMS.copy()
# # lr = lstm_params.pop('learning_rate')
# lr = 0.001
# final_lstm_model = create_lstm_attention_model(CFG.DL_VOCAB_SIZE, CFG.DL_MAX_LEN)
# final_lstm_model.compile(optimizer=Adam(learning_rate=lr), loss='mean_squared_error')
# # Train on full data for a fixed number of epochs (e.g., the average from your OOF runs)
# final_lstm_model.fit(full_train_seq, y, epochs=18, batch_size=256, verbose=0)
# test_preds_lstm_att = final_lstm_model.predict(X_test_seq, batch_size=512).squeeze()
# print("   ...done.")

# --- Final Model 3: LSTM w/ Attention ---
print("   Training final LSTM w/ Attention model...")

# Set the learning rate (as you were doing)
lr = 0.001

# --- THIS IS THE CORRECTED LINE ---
# We now provide the missing third argument for embedding_dim. 128 is a robust choice.
final_lstm_model = create_lstm_attention_model(CFG.DL_VOCAB_SIZE, CFG.DL_MAX_LEN, 128) 

# The rest of the code remains the same
final_lstm_model.compile(optimizer=Adam(learning_rate=lr), loss='mean_squared_error')
# Train on full data for a fixed number of epochs
final_lstm_model.fit(full_train_seq, y, epochs=18, batch_size=256, verbose=0)
test_preds_lstm_att = final_lstm_model.predict(X_test_seq, batch_size=512).squeeze()
print("   ...done.")

   Training final LSTM w/ Attention model...
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 180ms/step
   ...done.


In [None]:
print("hello")

hello


In [44]:
# --- Final Model 4: Multi-Kernel CNN ---
print("   Training final Multi-Kernel CNN model...")
# cnn_params = CFG.CNN_MULTI_PARAMS.copy()
# lr = cnn_params.pop('learning_rate')
lr = 0.001
final_cnn_model = create_multikernel_cnn_model(CFG.DL_VOCAB_SIZE, CFG.DL_MAX_LEN, 128)
final_cnn_model.compile(optimizer=Adam(learning_rate=lr), loss='mean_squared_error')
final_cnn_model.fit(full_train_seq, y, epochs=10, batch_size=256, verbose=1)
test_preds_cnn_multi = final_cnn_model.predict(X_test_seq, batch_size=512).squeeze()
print("   ...done.")

   Training final Multi-Kernel CNN model...
Epoch 1/10
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 85ms/step - loss: 1088.2666
Epoch 2/10
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 84ms/step - loss: 714.2386
Epoch 3/10
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 84ms/step - loss: 685.3861
Epoch 4/10
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 85ms/step - loss: 560.0074
Epoch 5/10
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 84ms/step - loss: 497.7874
Epoch 6/10
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 85ms/step - loss: 372.4983
Epoch 7/10
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 85ms/step - loss: 351.5409
Epoch 8/10
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 86ms/step - loss: 278.7570
Epoch 9/10
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 85ms/step - loss: 275.6773
Epoch 10/1

In [45]:
# # --- Final Model 5: Multi-Input NN ---
# print("   Training final Multi-Input NN model...")
# full_tabular_feats = np.hstack([full_train_engineered.values, final_train_knn_feats])
# test_tabular_feats = np.hstack([test_engineered.values, test_knn_feats])
# # multi_input_params = CFG.NN_MULTI_INPUT_PARAMS.copy()
# # lr = multi_input_params.pop('learning_rate')
# lr = 0.001
# final_multi_input_model = create_multi_input_model(CFG.DL_VOCAB_SIZE, CFG.DL_MAX_LEN, full_tabular_feats.shape[1])
# final_multi_input_model.compile(optimizer=Adam(learning_rate=lr), loss='mean_squared_error')
# final_multi_input_model.fit([full_train_seq, full_tabular_feats], y, epochs=10, batch_size=256, verbose=1)
# test_preds_nn_multi_input = final_multi_input_model.predict([X_test_seq, test_tabular_feats], batch_size=512).squeeze()
# print("   ...done.")

# --- Final Model 5: Multi-Input NN ---
print("   Training final Multi-Input NN model...")

# First, assemble the tabular features for both train and test sets
full_tabular_feats = np.hstack([full_train_engineered.values, final_train_knn_feats])
test_tabular_feats = np.hstack([test_engineered.values, test_knn_feats])

# Set the learning rate
lr = 0.001

# --- THIS IS THE CORRECTED LINE ---
# We now provide all four required arguments in the correct order:
# 1. vocab_size
# 2. max_len
# 3. embedding_dim (the missing one)
# 4. num_tabular_feats
final_multi_input_model = create_multi_input_model(
    CFG.DL_VOCAB_SIZE, 
    CFG.DL_MAX_LEN, 
    128,  # The missing embedding_dim
    full_tabular_feats.shape[1]
)

# The rest of the code remains the same
final_multi_input_model.compile(optimizer=Adam(learning_rate=lr), loss='mean_squared_error')
final_multi_input_model.fit([full_train_seq, full_tabular_feats], y, epochs=10, batch_size=256, verbose=1)
test_preds_nn_multi_input = final_multi_input_model.predict([X_test_seq, test_tabular_feats], batch_size=512).squeeze()
print("   ...done.")

   Training final Multi-Input NN model...
Epoch 1/10
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 99ms/step - loss: nan
Epoch 2/10
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 100ms/step - loss: nan
Epoch 3/10
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 101ms/step - loss: nan
Epoch 4/10
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 102ms/step - loss: nan
Epoch 5/10
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 101ms/step - loss: nan
Epoch 6/10
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 99ms/step - loss: nan
Epoch 7/10
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 98ms/step - loss: nan
Epoch 8/10
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 98ms/step - loss: nan
Epoch 9/10
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 98ms/step - loss: nan
Epoch 10/10
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━

In [46]:
# --- 4. Final Sanity Check ---
print("\n4/4: All test predictions have been generated successfully.")
print(f"Shape of test_preds_xgb: {test_preds_xgb.shape}")
print(f"Shape of test_preds_lgbm: {test_preds_lgbm.shape}")
print(f"Shape of test_preds_lstm_att: {test_preds_lstm_att.shape}")
print(f"Shape of test_preds_cnn_multi: {test_preds_cnn_multi.shape}")
print(f"Shape of test_preds_nn_multi_input: {test_preds_nn_multi_input.shape}")


4/4: All test predictions have been generated successfully.
Shape of test_preds_xgb: (75000,)
Shape of test_preds_lgbm: (75000,)
Shape of test_preds_lstm_att: (75000,)
Shape of test_preds_cnn_multi: (75000,)
Shape of test_preds_nn_multi_input: (75000,)


In [51]:
# ==============================================================================
# SECTION 8: FINAL SUBMISSION (WITH ROBUST NaN HANDLING)
# ==============================================================================
print("\n--- Generating Final Submission File with Robust NaN Handling ---")

# --- (This assumes you have already run the code to generate all test_preds_... variables) ---
# --- (And that you have the trained `meta_model` in memory) ---

# --- 1. Load and Merge External Test Predictions ---
EXTERNAL_TEST_PREDS_PATH = '/kaggle/input/amlc2025-dataset/test_out_final.csv'
ID_COLUMN = 'sample_id'
EXTERNAL_PRED_COLUMN_NAME = 'price'

print(f"Loading external test predictions from: {EXTERNAL_TEST_PREDS_PATH}")
df_test_out = pd.read_csv(EXTERNAL_TEST_PREDS_PATH)
merged_test_preds = pd.merge(df_test[[ID_COLUMN]], df_test_out[[ID_COLUMN, EXTERNAL_PRED_COLUMN_NAME]], on=ID_COLUMN, how='left')
# Your .fillna(0) here is perfect for handling IDs missing from the external file
merged_test_preds[EXTERNAL_PRED_COLUMN_NAME] = merged_test_preds[EXTERNAL_PRED_COLUMN_NAME].fillna(0)
external_test_preds = merged_test_preds[EXTERNAL_PRED_COLUMN_NAME].values
print(f"External test predictions merged successfully. Shape: {external_test_preds.shape}")


# --- 2. Create the Final Meta-Dataset for the Test Set ---
# The order MUST be identical to the training meta-dataset
X_test_meta = np.column_stack((
    test_preds_xgb, 
    test_preds_lgbm, 
    test_preds_lstm_att, 
    test_preds_cnn_multi, 
    test_preds_nn_multi_input,
    external_test_preds
))


# --- 3. DIAGNOSE AND FIX NANS in the Final Stacked Array ---
print("\n--- Checking for NaNs in FINAL test predictions ---")
# Diagnostic: Find out which model is causing the problem on the test set
model_names = ['XGB', 'LGBM', 'LSTM_Att', 'CNN_Multi', 'NN_Multi_Input', 'External_Model']
for i, name in enumerate(model_names):
    nan_count = np.isnan(X_test_meta[:, i]).sum()
    if nan_count > 0:
        print(f"WARNING: Found {nan_count} NaN values in TEST predictions for model: {name}")

# --- THIS IS THE CRITICAL FIX ---
# Check if any NaNs exist in the entire array and replace them with 0.
# np.nan_to_num() is the most efficient way to do this for NumPy arrays.
if np.isnan(X_test_meta).any():
    print("Replacing NaN values with 0...")
    X_test_meta = np.nan_to_num(X_test_meta, nan=0.0)
    print("NaNs replaced successfully.")


# --- 4. Make Final Predictions with the Cleaned Data ---
print("\nMaking final predictions with the meta-model...")
# This will now work because X_test_meta is guaranteed to be free of NaNs.
final_predictions = meta_model.predict(X_test_meta)


# --- 5. Create the Submission File ---
print("\nCreating submission.csv...")
submission_df = pd.DataFrame({'id': df_test[ID_COLUMN], 'price': final_predictions})
submission_df['price'] = submission_df['price'].clip(0) # Final safety net
submission_df.to_csv('submission.csv', index=False)

print("\nSubmission file created successfully!")
print("Top 5 rows of submission.csv:")
print(submission_df.head())


--- Generating Final Submission File with Robust NaN Handling ---
Loading external test predictions from: /kaggle/input/amlc2025-dataset/test_out_final.csv
External test predictions merged successfully. Shape: (75000,)

--- Checking for NaNs in FINAL test predictions ---
Replacing NaN values with 0...
NaNs replaced successfully.

Making final predictions with the meta-model...

Creating submission.csv...

Submission file created successfully!
Top 5 rows of submission.csv:
       id      price
0  100179  17.905004
1  245611  16.870453
2  146263  20.393772
3   95658   9.268154
4   36806  56.891477


In [None]:
print("hello")