# Stacking
- ### Models
    - xgboost
    - lightgbm
    - LSTM
    - Ridge Regression for met model

In [1]:
# !pip install -r /kaggle/input/requirements-amlc/req.txt

In [2]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/amlc2025-dataset/sample_test.csv
/kaggle/input/amlc2025-dataset/sample_test_out.csv
/kaggle/input/amlc2025-dataset/train.csv
/kaggle/input/amlc2025-dataset/test.csv
/kaggle/input/requirements-amlc/req2.txt


In [3]:
import pandas as pd
import numpy as np
import scipy.sparse
import gc

In [4]:
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error # As a proxy for NN loss

In [5]:
import xgboost as xgb
import lightgbm as lgb

In [6]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

2025-10-12 04:50:28.215325: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-10-12 04:50:28.236851: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-10-12 04:50:28.243326: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [7]:
from sentence_transformers import SentenceTransformer

In [8]:
import sentence_transformers
sentence_transformers.__version__

'5.1.1'

In [9]:
class CFG:
    N_SPLITS = 5
    RANDOM_STATE = 42
    DATA_PATH = '/kaggle/input/amlc2025-dataset/train.csv'
    
    # LSTM Config
    LSTM_VOCAB_SIZE = 20000
    LSTM_MAX_LEN = 60
    LSTM_EMBEDDING_DIM = 100
    
    # Sentence Transformer Model Name
    ST_MODEL_NAME = 'all-MiniLM-L6-v2'

In [10]:
# Define the SMAPE metric function
def smape(y_true, y_pred):
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    ratio = np.where(denominator == 0, 0, numerator / denominator)
    return np.mean(ratio) * 100

In [11]:
df = pd.read_csv(CFG.DATA_PATH)

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75000 entries, 0 to 74999
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   sample_id        75000 non-null  int64  
 1   catalog_content  75000 non-null  object 
 2   image_link       75000 non-null  object 
 3   price            75000 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 2.3+ MB


In [12]:
def create_engineered_features(df):
    df_out = pd.DataFrame(index=df.index)
    text_col = 'catalog_content'
    
    df_out[f'{text_col}_length'] = df[text_col].str.len()
    df_out[f'{text_col}_word_count'] = df[text_col].str.split().str.len()
    df_out[f'{text_col}_digit_count'] = df[text_col].apply(lambda t: sum(1 for c in t if c.isdigit()))
    
    return df_out

In [13]:
print("Creating engineered features...")
engineered_features = create_engineered_features(df)
print(f"Engineered features shape: {engineered_features.shape}")

Creating engineered features...
Engineered features shape: (75000, 3)


In [14]:
X = df[['catalog_content']]
y = df['price'].values
X_engineered = engineered_features.values

In [15]:
oof_xgb_tfidf = np.zeros(len(df))
oof_lgbm_st = np.zeros(len(df))
oof_lstm = np.zeros(len(df))


In [16]:
kf = KFold(n_splits=CFG.N_SPLITS, shuffle=True, random_state=CFG.RANDOM_STATE)

In [17]:
st_model = SentenceTransformer(CFG.ST_MODEL_NAME)

In [18]:
# physical_devices = tf.config.list_physical_devices('GPU')
# print(physical_devices)
# tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)
# tf.config.experimental.set_memory_growth(physical_devices[1], enable=True)

In [19]:
for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    print(f"\n===== FOLD {fold+1} / {CFG.N_SPLITS} =====")
    
    # Split data for this fold
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    X_eng_train, X_eng_val = X_engineered[train_idx], X_engineered[val_idx]

    # --- Model 1: XGBoost on TF-IDF + Engineered Features ---
    print("Training XGBoost on TF-IDF...")
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=25000)
    
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['catalog_content'])
    X_val_tfidf = tfidf_vectorizer.transform(X_val['catalog_content'])
    
    X_train_xgb = scipy.sparse.hstack((X_train_tfidf, X_eng_train))
    X_val_xgb = scipy.sparse.hstack((X_val_tfidf, X_eng_val))

    xgb_model = xgb.XGBRegressor(tree_method='hist', device='cuda', random_state=CFG.RANDOM_STATE)
    xgb_model.fit(X_train_xgb, y_train)
    
    preds_xgb = xgb_model.predict(X_val_xgb)
    oof_xgb_tfidf[val_idx] = preds_xgb
    print(f"Fold {fold+1} XGB TF-IDF SMAPE: {smape(y_val, preds_xgb):.4f}%")
    
    # --- Model 2: LightGBM on SentenceTransformer + Engineered Features ---
    print("Training LightGBM on SentenceTransformer...")
    X_train_st = st_model.encode(X_train['catalog_content'].tolist(), show_progress_bar=False)
    X_val_st = st_model.encode(X_val['catalog_content'].tolist(), show_progress_bar=False)
    
    X_train_lgbm = np.hstack((X_train_st, X_eng_train))
    X_val_lgbm = np.hstack((X_val_st, X_eng_val))
    
    lgbm_model = lgb.LGBMRegressor(device='gpu', random_state=CFG.RANDOM_STATE)
    lgbm_model.fit(X_train_lgbm, y_train)
    
    preds_lgbm = lgbm_model.predict(X_val_lgbm)
    oof_lgbm_st[val_idx] = preds_lgbm
    print(f"Fold {fold+1} LGBM ST SMAPE: {smape(y_val, preds_lgbm):.4f}%")

    # --- Model 3: LSTM with internal Embedding Layer ---
    print("Training LSTM...")
    tokenizer = Tokenizer(num_words=CFG.LSTM_VOCAB_SIZE, oov_token="<OOV>")
    tokenizer.fit_on_texts(X_train['catalog_content'])
    
    X_train_seq = tokenizer.texts_to_sequences(X_train['catalog_content'])
    X_val_seq = tokenizer.texts_to_sequences(X_val['catalog_content'])
    
    X_train_pad = pad_sequences(X_train_seq, maxlen=CFG.LSTM_MAX_LEN, padding='post', truncating='post')
    X_val_pad = pad_sequences(X_val_seq, maxlen=CFG.LSTM_MAX_LEN, padding='post', truncating='post')
    
    lstm_model = Sequential([
        Input(shape=(CFG.LSTM_MAX_LEN,)),
        Embedding(input_dim=CFG.LSTM_VOCAB_SIZE, output_dim=CFG.LSTM_EMBEDDING_DIM),
        # LSTM(64, return_sequences=False),
        # changing the above line since due to a kaggle env issue cuDNN kernel not supported for the LSTM Layer
        LSTM(64, return_sequences=False, activation='tanh'), 
        Dense(32, activation='relu'),
        Dense(1)
    ])
    lstm_model.compile(loss='mean_squared_error', optimizer='adam')
    
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    
    lstm_model.fit(X_train_pad, y_train,
                   validation_data=(X_val_pad, y_val),
                   epochs=20,
                   batch_size=128,
                   callbacks=[early_stopping],
                   verbose=0) # Set to 1 to see epoch progress
    
    preds_lstm = lstm_model.predict(X_val_pad, batch_size=512).squeeze()
    oof_lstm[val_idx] = preds_lstm
    print(f"Fold {fold+1} LSTM SMAPE: {smape(y_val, preds_lstm):.4f}%")
    
    # Clean up memory
    gc.collect()


===== FOLD 1 / 5 =====
Training XGBoost on TF-IDF...
Fold 1 XGB TF-IDF SMAPE: 62.9343%
Training LightGBM on SentenceTransformer...
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 98512
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 387
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 387 dense feature groups (22.20 MB) transferred to GPU in 0.022692 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 23.598634
Fold 1 LGBM ST SMAPE: 70.2821%
Training LSTM...


I0000 00:00:1760244840.744436     565 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1760244840.744932     565 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355


[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 60ms/step
Fold 1 LSTM SMAPE: 60.1558%

===== FOLD 2 / 5 =====
Training XGBoost on TF-IDF...
Fold 2 XGB TF-IDF SMAPE: 62.1370%
Training LightGBM on SentenceTransformer...
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 98513
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 387
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 387 dense feature groups (22.20 MB) transferred to GPU in 0.021506 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 23.620979
Fold 2 LGBM ST SMAPE: 69.5084%
Training LSTM...
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 63ms/step
Fold 2 LSTM SMAPE: 58.0344%

===== FOLD 3 / 5 =====
Training XGBoost on TF

In [20]:
print("\n--- Training Meta-Model ---")
X_meta = np.column_stack((oof_xgb_tfidf, oof_lgbm_st, oof_lstm))
meta_model = Ridge(alpha=1.0)
meta_model.fit(X_meta, y)
print("Meta-model trained successfully.")
print(f"Final blended OOF predictions shape: {X_meta.shape}")
final_oof_preds = meta_model.predict(X_meta)
final_oof_smape = smape(y, final_oof_preds)
print(f"\nOverall OOF SMAPE of the full ensemble: {final_oof_smape:.4f}%")
print("Weights of the meta-model (XGB, LGBM, LSTM):", meta_model.coef_)


--- Training Meta-Model ---
Meta-model trained successfully.
Final blended OOF predictions shape: (75000, 3)

Overall OOF SMAPE of the full ensemble: 58.3435%
Weights of the meta-model (XGB, LGBM, LSTM): [0.40070418 0.22180141 0.58282787]


In [21]:
TEST_DATA_PATH = '/kaggle/input/amlc2025-dataset/test.csv'

In [22]:
df_test = pd.read_csv(TEST_DATA_PATH)

In [24]:
test_ids = df_test['sample_id']
X_test = df_test[['catalog_content']]

In [26]:
print("Re-training models on the full training dataset...")
print("Training final XGBoost model...")
# Use the same vectorizer, but fit on ALL training data
final_tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=25000)
full_train_tfidf = final_tfidf_vectorizer.fit_transform(df['catalog_content'])
X_test_tfidf = final_tfidf_vectorizer.transform(X_test['catalog_content'])

# Combine with engineered features
full_train_engineered = create_engineered_features(df)
test_engineered = create_engineered_features(df_test)
full_train_xgb = scipy.sparse.hstack((full_train_tfidf, full_train_engineered.values))
X_test_xgb = scipy.sparse.hstack((X_test_tfidf, test_engineered.values))

# Train the final XGB model
final_xgb_model = xgb.XGBRegressor(tree_method='hist', device='cuda', random_state=CFG.RANDOM_STATE)
final_xgb_model.fit(full_train_xgb, y)
test_preds_xgb = final_xgb_model.predict(X_test_xgb)


Re-training models on the full training dataset...
Training final XGBoost model...


In [27]:
# --- Model 2: LightGBM on SentenceTransformer ---
print("Training final LightGBM model...")
full_train_st = st_model.encode(df['catalog_content'].tolist(), show_progress_bar=False)
X_test_st = st_model.encode(X_test['catalog_content'].tolist(), show_progress_bar=False)

full_train_lgbm = np.hstack((full_train_st, full_train_engineered.values))
X_test_lgbm = np.hstack((X_test_st, test_engineered.values))

# Train the final LGBM model
final_lgbm_model = lgb.LGBMRegressor(device='gpu', random_state=CFG.RANDOM_STATE)
final_lgbm_model.fit(full_train_lgbm, y)
test_preds_lgbm = final_lgbm_model.predict(X_test_lgbm)

Training final LightGBM model...
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 98517
[LightGBM] [Info] Number of data points in the train set: 75000, number of used features: 387
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 387 dense feature groups (27.75 MB) transferred to GPU in 0.026546 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 23.647654


In [28]:
# --- Model 3: LSTM ---
print("Training final LSTM model...")
# Use the same tokenizer, but fit on ALL training data
final_tokenizer = Tokenizer(num_words=CFG.LSTM_VOCAB_SIZE, oov_token="<OOV>")
final_tokenizer.fit_on_texts(df['catalog_content'])

full_train_seq = final_tokenizer.texts_to_sequences(df['catalog_content'])
X_test_seq = final_tokenizer.texts_to_sequences(X_test['catalog_content'])

full_train_pad = pad_sequences(full_train_seq, maxlen=CFG.LSTM_MAX_LEN, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=CFG.LSTM_MAX_LEN, padding='post', truncating='post')


# Define and train the final LSTM model
final_lstm_model = Sequential([
    Input(shape=(CFG.LSTM_MAX_LEN,)),
    Embedding(input_dim=CFG.LSTM_VOCAB_SIZE, output_dim=CFG.LSTM_EMBEDDING_DIM),
    LSTM(64, return_sequences=False, activation='tanh'),
    Dense(32, activation='relu'),
    Dense(1)
])
final_lstm_model.compile(loss='mean_squared_error', optimizer='adam')
# Note: No validation data here, as we use all data for training
final_lstm_model.fit(full_train_pad, y, epochs=10, batch_size=128, verbose=0) # Train for a reasonable number of epochs
test_preds_lstm = final_lstm_model.predict(X_test_pad, batch_size=512).squeeze()

Training final LSTM model...
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 57ms/step


In [29]:
# --- 3. Make Final Predictions with the Meta-Model ---
print("Making final predictions with the meta-model...")
# Stack the test predictions from the base models
X_test_meta = np.column_stack((test_preds_xgb, test_preds_lgbm, test_preds_lstm))

# Use the already-trained meta_model to predict on the test meta-features
final_predictions = meta_model.predict(X_test_meta)

Making final predictions with the meta-model...


In [32]:
print("Creating submission.csv...")
submission_df = pd.DataFrame({
    'sample_id': test_ids,
    'price': final_predictions
})


Creating submission.csv...


In [33]:
submission_df['price'] = submission_df['price'].clip(0)

submission_df.to_csv('submission.csv', index=False)

print("\nSubmission file created successfully!")
print("Top 5 rows of submission.csv:")
print(submission_df.head())


Submission file created successfully!
Top 5 rows of submission.csv:
   sample_id      price
0     100179  20.975271
1     245611  12.173563
2     146263  32.372255
3      95658  11.906809
4      36806  96.555306


kaggle environment

use tf 2.17.1
and
keras 3.5.0

In [34]:
print(type(test_ids[0]))

<class 'numpy.int64'>


In [35]:
print(type(final_predictions[0]))

<class 'numpy.float64'>
