In [1]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/requirements-amlc/req2.txt
/kaggle/input/amlc2025-dataset/sample_test.csv
/kaggle/input/amlc2025-dataset/sample_test_out.csv
/kaggle/input/amlc2025-dataset/train.csv
/kaggle/input/amlc2025-dataset/test.csv


In [2]:
import pandas as pd
import numpy as np
import scipy.sparse
import gc

In [3]:
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge

In [4]:
import xgboost as xgb
import lightgbm as lgb

In [5]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Concatenate, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

2025-10-12 09:27:18.696434: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-10-12 09:27:18.718090: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-10-12 09:27:18.724684: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [6]:
from sentence_transformers import SentenceTransformer

In [7]:
import faiss

In [15]:
class CFG:
    N_SPLITS = 5
    RANDOM_STATE = 42
    DATA_PATH = '/kaggle/input/amlc2025-dataset/train.csv'
    TEST_DATA_PATH = '/kaggle/input/amlc2025-dataset/test.csv'
    
    # Text Embedding Models
    # Using a diverse pair: one for general semantics, one optimized for similarity
    ST_MODELS = [
        'sentence-transformers/all-MiniLM-L12-v2',
        'BAAI/bge-base-en-v1.5',                     
        'sentence-transformers/all-distilroberta-v1',
        'sentence-transformers/paraphrase-mpnet-base-v2'
    ]
    BGE_MODEL_INDEX = 1 # The BGE model is the second in the list, at index 1
    
    # KNN Feature Generation
    KNN_N_NEIGHBORS = 10
    
    # DenseNet (1D CNN) Config
    CNN_VOCAB_SIZE = 30000
    CNN_MAX_LEN = 60
    CNN_EMBEDDING_DIM = 128

In [16]:
df = pd.read_csv(CFG.DATA_PATH)

In [17]:
# Define the SMAPE metric function
def smape(y_true, y_pred):
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    ratio = np.where(denominator == 0, 0, numerator / denominator)
    return np.mean(ratio) * 100

In [18]:
def create_engineered_features(df):
    df_out = pd.DataFrame(index=df.index)
    text_col = 'catalog_content'
    df_out[f'{text_col}_length'] = df[text_col].str.len()
    df_out[f'{text_col}_word_count'] = df[text_col].str.split().str.len()
    df_out[f'{text_col}_digit_count'] = df[text_col].apply(lambda t: sum(1 for c in t if c.isdigit()))
    return df_out

In [19]:
print("1/3: Creating engineered features...")
engineered_features = create_engineered_features(df)
print(f"Engineered features shape: {engineered_features.shape}")

1/3: Creating engineered features...
Engineered features shape: (75000, 3)


In [20]:
def generate_transformer_embeddings(df, models):
    all_embeddings = []
    for model_name in models:
        print(f"   Generating embeddings for: {model_name}")
        model = SentenceTransformer(model_name)
        embeddings = model.encode(df['catalog_content'].tolist(), show_progress_bar=True)
        all_embeddings.append(embeddings)
    return np.concatenate(all_embeddings, axis=1)


In [21]:
print("2/3: Creating text embeddings...")
combined_embeddings = generate_transformer_embeddings(df, CFG.ST_MODELS)
print(f"Combined text embeddings shape: {combined_embeddings.shape}")

2/3: Creating text embeddings...
   Generating embeddings for: sentence-transformers/all-MiniLM-L12-v2


Batches:   0%|          | 0/2344 [00:00<?, ?it/s]

   Generating embeddings for: BAAI/bge-base-en-v1.5


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/2344 [00:00<?, ?it/s]

   Generating embeddings for: sentence-transformers/all-distilroberta-v1


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/2344 [00:00<?, ?it/s]

   Generating embeddings for: sentence-transformers/paraphrase-mpnet-base-v2


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/2344 [00:00<?, ?it/s]

Combined text embeddings shape: (75000, 2688)


In [26]:
# Replace your old KNN function with this improved one.
# It's now located in SECTION 4 for clarity.

# in SECTION 4.3
def generate_knn_features(index_embeds, y_index, query_embeds, k):
    """
    Generates KNN features.
    'index_embeds' are used to build the search space.
    'query_embeds' are the items we want to find neighbors for.
    """
    d = index_embeds.shape[1]
    index = faiss.IndexFlatL2(d)
    
    # Check if GPU is available
    try:
        res = faiss.StandardGpuResources()
        gpu_index = faiss.index_cpu_to_gpu(res, 0, index)
        gpu_index.add(index_embeds.astype(np.float32))
        search_index = gpu_index
    except AttributeError:
        # Fallback to CPU if faiss-gpu is not installed or fails
        print("   FAISS-GPU not found or failed, falling back to CPU.")
        index.add(index_embeds.astype(np.float32))
        search_index = index

    # If query is the same as index, we are doing self-search. K+1 and ignore first result.
    is_self_search = np.array_equal(index_embeds, query_embeds)
    if is_self_search:
        _, I = search_index.search(query_embeds.astype(np.float32), k + 1)
        I = I[:, 1:] # Exclude the first column which is the item itself
    else:
        _, I = search_index.search(query_embeds.astype(np.float32), k)
    
    neighbor_prices = y_index[I]
    
    knn_feats = np.zeros((len(query_embeds), 3))
    knn_feats[:, 0] = np.mean(neighbor_prices, axis=1)
    knn_feats[:, 1] = np.median(neighbor_prices, axis=1)
    knn_feats[:, 2] = np.std(neighbor_prices, axis=1)
    
    return knn_feats

print("3/3: KNN feature generation function is ready.")

3/3: KNN feature generation function is ready.


In [23]:
print("\n--- Phase 2: Training Base Models with Cross-Validation ---")

# Prepare data
X_text = df['catalog_content']
y = df['price'].values
X_engineered = engineered_features.values

# OOF arrays
oof_xgb = np.zeros(len(df))
oof_lgbm = np.zeros(len(df))
oof_densenet = np.zeros(len(df))



--- Phase 2: Training Base Models with Cross-Validation ---


In [24]:
kf = KFold(n_splits=CFG.N_SPLITS, shuffle=True, random_state=CFG.RANDOM_STATE)


In [27]:
# Pre-calculate the dimensions of each sentence transformer model to find the BGE slice
print("Calculating embedding dimensions for KNN slicing...")
model_dims = [SentenceTransformer(m).get_sentence_embedding_dimension() for m in CFG.ST_MODELS]
bge_start_index = sum(model_dims[:CFG.BGE_MODEL_INDEX])
bge_end_index = bge_start_index + model_dims[CFG.BGE_MODEL_INDEX]
print(f"BGE embeddings will be sliced from column {bge_start_index} to {bge_end_index}.")

for fold, (train_idx, val_idx) in enumerate(kf.split(df)):
    print(f"\n===== FOLD {fold+1} / {CFG.N_SPLITS} =====")
    
    # --- Split Data for this Fold ---
    X_train_text, X_val_text = X_text.iloc[train_idx], X_text.iloc[val_idx]
    X_train_eng, X_val_eng = X_engineered[train_idx], X_engineered[val_idx]
    X_train_embed, X_val_embed = combined_embeddings[train_idx], combined_embeddings[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # --- Model 1: XGBoost (Keyword Expert) ---
    print("1/3: Training XGBoost...")
    tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=30000)
    X_train_tfidf = tfidf.fit_transform(X_train_text)
    X_val_tfidf = tfidf.transform(X_val_text)
    
    X_train_xgb = scipy.sparse.hstack((X_train_tfidf, X_train_eng))
    X_val_xgb = scipy.sparse.hstack((X_val_tfidf, X_val_eng))

    xgb_model = xgb.XGBRegressor(tree_method='hist', device='cuda', random_state=CFG.RANDOM_STATE)
    xgb_model.fit(X_train_xgb, y_train)
    oof_xgb[val_idx] = xgb_model.predict(X_val_xgb)
    print(f"   Fold {fold+1} XGB SMAPE: {smape(y_val, oof_xgb[val_idx]):.4f}%")
    # trained on tfidf + engineered features

    # --- Model 2: LightGBM (Semantic & Market Expert) ---
    print("2/3: Training LightGBM...")

    # Slice out the BGE embeddings, which are best for similarity search, using our pre-calculated indices
    bge_train_embeds = X_train_embed[:, bge_start_index:bge_end_index]
    bge_val_embeds = X_val_embed[:, bge_start_index:bge_end_index]

    # Generate KNN features for BOTH the training and validation sets
    # This ensures the number of features is identical for both fit() and predict()
    print("   Generating KNN features for training set...")
    # For the training set, we find neighbors within itself
    knn_feats_train = generate_knn_features(bge_train_embeds, y_train, bge_train_embeds, CFG.KNN_N_NEIGHBORS)

    print("   Generating KNN features for validation set...")
    # For the validation set, we find neighbors from the training set (leakage-free for OOF)
    knn_feats_val = generate_knn_features(bge_train_embeds, y_train, bge_val_embeds, CFG.KNN_N_NEIGHBORS)

    # Assemble the final feature matrices, now with identical structures
    X_train_lgbm = np.hstack([X_train_embed, X_train_eng, knn_feats_train])
    X_val_lgbm = np.hstack([X_val_embed, X_val_eng, knn_feats_val])
    
    # This print statement is a great sanity check to confirm the fix
    print(f"   Train features shape: {X_train_lgbm.shape}, Val features shape: {X_val_lgbm.shape}")
    
    # Train the model and make predictions
    lgbm_model = lgb.LGBMRegressor(device='gpu', random_state=CFG.RANDOM_STATE)
    lgbm_model.fit(X_train_lgbm, y_train)
    oof_lgbm[val_idx] = lgbm_model.predict(X_val_lgbm)
    print(f"   Fold {fold+1} LGBM SMAPE: {smape(y_val, oof_lgbm[val_idx]):.4f}%")
    
    # --- Model 3: DenseNet (Deep Learning Specialist) ---
    print("3/3: Training DenseNet (1D CNN)...")
    tokenizer = Tokenizer(num_words=CFG.CNN_VOCAB_SIZE, oov_token="<OOV>")
    tokenizer.fit_on_texts(X_train_text)
    
    X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train_text), maxlen=CFG.CNN_MAX_LEN)
    X_val_seq = pad_sequences(tokenizer.texts_to_sequences(X_val_text), maxlen=CFG.CNN_MAX_LEN)

    # Simple 1D CNN model
    input_text = Input(shape=(CFG.CNN_MAX_LEN,))
    embedding = Embedding(CFG.CNN_VOCAB_SIZE, CFG.CNN_EMBEDDING_DIM)(input_text)
    conv1 = Conv1D(filters=64, kernel_size=3, activation='relu')(embedding)
    pool1 = GlobalMaxPooling1D()(conv1)
    dense1 = Dense(64, activation='relu')(pool1)
    dropout1 = Dropout(0.5)(dense1)
    output = Dense(1)(dropout1)
    cnn_model = Model(inputs=input_text, outputs=output)
    cnn_model.compile(optimizer='adam', loss='mean_squared_error')
    
    es = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    cnn_model.fit(X_train_seq, y_train, validation_data=(X_val_seq, y_val),
                  epochs=20, batch_size=128, callbacks=[es], verbose=0)
    
    oof_densenet[val_idx] = cnn_model.predict(X_val_seq, batch_size=512).squeeze()
    print(f"   Fold {fold+1} DenseNet SMAPE: {smape(y_val, oof_densenet[val_idx]):.4f}%")

    gc.collect()

Calculating embedding dimensions for KNN slicing...
BGE embeddings will be sliced from column 384 to 1152.

===== FOLD 1 / 5 =====
1/3: Training XGBoost...
   Fold 1 XGB SMAPE: 63.0302%
2/3: Training LightGBM...
   Generating KNN features for training set...
   Generating KNN features for validation set...
   Train features shape: (60000, 2694), Val features shape: (15000, 2694)
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 686797
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 2694
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 2694 dense feature groups (154.27 MB) transferred to GPU in 0.145207 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 23.598634
   Fold 1 LGBM SMAPE: 62.5634%
3/3: Training

I0000 00:00:1760266008.065772     253 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1760266008.066434     253 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355


[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step
   Fold 1 DenseNet SMAPE: 61.0471%

===== FOLD 2 / 5 =====
1/3: Training XGBoost...
   Fold 2 XGB SMAPE: 62.0947%
2/3: Training LightGBM...
   Generating KNN features for training set...
   Generating KNN features for validation set...
   Train features shape: (60000, 2694), Val features shape: (15000, 2694)
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 686798
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 2694
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 2694 dense feature groups (154.27 MB) transferred to GPU in 0.143671 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 23.620979
   Fold 2 LGBM SMAPE: 61.8145%
3/3: Traini

In [28]:
print("\n--- Phase 3: Training Meta-Model ---")

# Create the training data for the meta-model
X_meta = np.column_stack((oof_xgb, oof_lgbm, oof_densenet))

# Train the meta-model
meta_model = Ridge(alpha=1.0)
meta_model.fit(X_meta, y)

# Evaluate the final OOF predictions from the blended model
final_oof_preds = meta_model.predict(X_meta)
final_oof_smape = smape(y, final_oof_preds)

print(f"\nOverall OOF SMAPE of the full ensemble: {final_oof_smape:.4f}%")
print(f"Meta-Model Weights (XGB, LGBM, DenseNet): {meta_model.coef_}")
print("\nTraining pipeline complete. Ready for final model training and submission generation.")


--- Phase 3: Training Meta-Model ---

Overall OOF SMAPE of the full ensemble: 57.1667%
Meta-Model Weights (XGB, LGBM, DenseNet): [0.38743498 0.45836708 0.36707637]

Training pipeline complete. Ready for final model training and submission generation.


In [29]:
df_test = pd.read_csv(CFG.TEST_DATA_PATH)

In [30]:
test_ids = df_test['sample_id'] 
X_test_text = df_test['catalog_content']

In [32]:
print("1/4: Generating features for the full train and test sets...")
full_train_engineered = create_engineered_features(df)
test_engineered = create_engineered_features(df_test)

1/4: Generating features for the full train and test sets...


In [33]:
final_tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=30000)
full_train_tfidf = final_tfidf_vectorizer.fit_transform(df['catalog_content'])
X_test_tfidf = final_tfidf_vectorizer.transform(X_test_text)

In [35]:
print("   Generating embeddings for test set...")
test_embeddings = generate_transformer_embeddings(df_test, CFG.ST_MODELS)

   Generating embeddings for test set...
   Generating embeddings for: sentence-transformers/all-MiniLM-L12-v2


Batches:   0%|          | 0/2344 [00:00<?, ?it/s]

   Generating embeddings for: BAAI/bge-base-en-v1.5


Batches:   0%|          | 0/2344 [00:00<?, ?it/s]

   Generating embeddings for: sentence-transformers/all-distilroberta-v1


Batches:   0%|          | 0/2344 [00:00<?, ?it/s]

   Generating embeddings for: sentence-transformers/paraphrase-mpnet-base-v2


Batches:   0%|          | 0/2344 [00:00<?, ?it/s]

In [36]:
print("   Generating KNN features for final models...")
full_train_bge_embeds = combined_embeddings[:, bge_start_index:bge_end_index]
test_bge_embeds = test_embeddings[:, bge_start_index:bge_end_index]

   Generating KNN features for final models...


In [None]:
# KNN features for the final LGBM training set
final_train_knn_feats = generate_knn_features(full_train_bge_embeds, y, full_train_bge_embeds, CFG.KNN_N_NEIGHBORS)
# KNN features for the test set (searching within the training data)
test_knn_feats = generate_knn_features(full_train_bge_embeds, y, test_bge_embeds, CFG.KNN_N_NEIGHBORS)



In [None]:
print(type(final_train_knn_feats))
print(final_train_knn_feats.shape)
print(final_train_knn_feats[:5])

<class 'numpy.ndarray'>
(75000, 3)
[[ 8.7405      4.745      13.03754318]
 [44.5675     17.745      63.02194702]
 [ 8.8295      6.73        5.21788868]
 [61.4225     26.1475     97.51395606]
 [24.7875     20.34       19.72900482]]


AttributeError: 'numpy.ndarray' object has no attribute 'values'

In [38]:
print("\n2/4: Re-training base models on 100% of the training data...")

# --- Final Model 1: XGBoost ---
print("   Training final XGBoost model...")
full_train_xgb = scipy.sparse.hstack((full_train_tfidf, full_train_engineered.values))
X_test_xgb = scipy.sparse.hstack((X_test_tfidf, test_engineered.values))

final_xgb_model = xgb.XGBRegressor(tree_method='gpu_hist', random_state=CFG.RANDOM_STATE)
final_xgb_model.fit(full_train_xgb, y)
test_preds_xgb = final_xgb_model.predict(X_test_xgb)

# --- Final Model 2: LightGBM ---
print("   Training final LightGBM model...")
full_train_lgbm = np.hstack([combined_embeddings, full_train_engineered.values, final_train_knn_feats])
X_test_lgbm = np.hstack([test_embeddings, test_engineered.values, test_knn_feats])

final_lgbm_model = lgb.LGBMRegressor(device='gpu', random_state=CFG.RANDOM_STATE)
final_lgbm_model.fit(full_train_lgbm, y)
test_preds_lgbm = final_lgbm_model.predict(X_test_lgbm)

# --- Final Model 3: DenseNet (1D CNN) ---
print("   Training final DenseNet model...")
final_tokenizer = Tokenizer(num_words=CFG.CNN_VOCAB_SIZE, oov_token="<OOV>")
final_tokenizer.fit_on_texts(df['catalog_content'])

full_train_seq = pad_sequences(final_tokenizer.texts_to_sequences(df['catalog_content']), maxlen=CFG.CNN_MAX_LEN)
X_test_seq = pad_sequences(final_tokenizer.texts_to_sequences(X_test_text), maxlen=CFG.CNN_MAX_LEN)

# Define the model architecture again (for a clean run)
input_text = Input(shape=(CFG.CNN_MAX_LEN,))
embedding = Embedding(CFG.CNN_VOCAB_SIZE, CFG.CNN_EMBEDDING_DIM)(input_text)
conv1 = Conv1D(filters=64, kernel_size=3, activation='relu')(embedding)
pool1 = GlobalMaxPooling1D()(conv1)
dense1 = Dense(64, activation='relu')(pool1)
dropout1 = Dropout(0.5)(dense1)
output = Dense(1)(dropout1)
final_cnn_model = Model(inputs=input_text, outputs=output)
final_cnn_model.compile(optimizer='adam', loss='mean_squared_error')

# Train on the full data. No validation or early stopping needed here.
final_cnn_model.fit(full_train_seq, y, epochs=15, batch_size=128, verbose=0)
test_preds_densenet = final_cnn_model.predict(X_test_seq, batch_size=512).squeeze()



2/4: Re-training base models on 100% of the training data...
   Training final XGBoost model...



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



   Training final LightGBM model...
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 686802
[LightGBM] [Info] Number of data points in the train set: 75000, number of used features: 2694
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 2694 dense feature groups (192.83 MB) transferred to GPU in 0.179793 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 23.647654
   Training final DenseNet model...
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step


In [39]:
# --- 4. Make Final Predictions with the Meta-Model ---
print("\n3/4: Making final predictions with the meta-model...")
# Stack the test predictions from the base models
X_test_meta = np.column_stack((test_preds_xgb, test_preds_lgbm, test_preds_densenet))

# CRUCIALLY, we DO NOT re-train the meta-model. We use the one trained on our OOF predictions.
final_predictions = meta_model.predict(X_test_meta)



3/4: Making final predictions with the meta-model...


In [41]:


# --- 5. Create the Submission File ---
print("\n4/4: Creating submission.csv...")
submission_df = pd.DataFrame({
    'sample_id': test_ids,
    'price': final_predictions
})

# Apply the safety net to ensure no negative prices
submission_df['price'] = submission_df['price'].clip(0)

submission_df.to_csv('submission1.csv', index=False)

print("\nSubmission file created successfully!")
print("Top 5 rows of submission.csv:")
print(submission_df.head())


4/4: Creating submission.csv...

Submission file created successfully!
Top 5 rows of submission.csv:
   sample_id      price
0     100179  20.190771
1     245611  15.532209
2     146263  29.531922
3      95658  10.189304
4      36806  51.957387


In [None]:
print("Hello")

Hello


In [2]:
print("Saving combined embeddings to a file...")
np.save('combined_embeddings.npy', combined_embeddings)
print("Embeddings saved!")

Saving combined embeddings to a file...


NameError: name 'np' is not defined