This notebook is intended to be run in a non-interactive Kaggle session.

In [None]:
import pandas as pd
import numpy as np
import scipy.sparse
import gc
import optuna

# Scikit-learn, Boosters, TF, Transformers, FAISS...
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
import lightgbm as lgb
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Concatenate, Dropout, Bidirectional, LSTM, GRU
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from sentence_transformers import SentenceTransformer
import faiss

In [None]:
class CFG:
    """Central configuration namespace for the tuning notebook.

    All values are plain class attributes; the class is never instantiated
    in the code shown here.
    """

    # Cross-validation setup.
    N_SPLITS = 5
    RANDOM_STATE = 42

    # Location of the training CSV.
    DATA_PATH = '/kaggle/input/amlc2025-dataset/train.csv'

    # --- Text Embedding Models ---
    # Sentence-transformer checkpoints whose embeddings are concatenated.
    ST_MODELS = [
        'sentence-transformers/all-MiniLM-L12-v2',
        'BAAI/bge-base-en-v1.5',
        'sentence-transformers/all-distilroberta-v1',
        'sentence-transformers/paraphrase-mpnet-base-v2',
    ]
    # Position of the BGE checkpoint inside ST_MODELS.
    BGE_MODEL_INDEX = 1

    # --- Feature Generation Config ---
    KNN_N_NEIGHBORS = 10

    # --- Deep Learning Model Config ---
    DL_VOCAB_SIZE = 30000
    DL_MAX_LEN = 60

    # --- Default Hyperparameters (Placeholders for Tuning) ---
    LGBM_PARAMS = {
        'objective': 'regression_l1',
        'metric': 'mae',
        'n_estimators': 2000,
        'learning_rate': 0.05,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'num_leaves': 31,
        'verbose': -1,
        'n_jobs': -1,
        'seed': 42,
    }
    XGB_PARAMS = {
        'objective': 'reg:squarederror',
        'eval_metric': 'mae',
        'n_estimators': 2000,
        'learning_rate': 0.05,
        'max_depth': 7,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'tree_method': 'gpu_hist',
        'seed': 42,
    }
    LSTM_ATT_PARAMS = {
        'embedding_dim': 128,
        'lstm_units': 64,
        'dense_units': 64,
        'dropout_rate': 0.3,
        'learning_rate': 0.001,
    }
    CNN_MULTI_PARAMS = {
        'embedding_dim': 128,
        'cnn_filters': 64,
        'dense_units': 128,
        'dropout_rate': 0.5,
        'learning_rate': 0.001,
    }
    NN_MULTI_INPUT_PARAMS = {
        'embedding_dim': 64,
        'gru_units': 64,
        'text_branch_dense_units': 32,
        'tabular_branch_dense_units': 32,
        'final_dense_units': 64,
        'dropout_rate': 0.4,
        'learning_rate': 0.001,
    }

In [None]:
# Define the SMAPE metric function
def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Each element contributes ``|y_pred - y_true| / ((|y_true| + |y_pred|) / 2)``.
    Elements whose denominator is zero (both values are zero) contribute 0.

    Args:
        y_true: Array-like of ground-truth values.
        y_pred: Array-like of predicted values, broadcastable against y_true.

    Returns:
        The mean of the per-element ratios multiplied by 100.
    """
    # Accept lists/Series as well as arrays, and force float arithmetic.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2

    # Divide only where the denominator is non-zero. np.where would evaluate
    # the full division first and emit divide-by-zero / invalid warnings.
    ratio = np.zeros_like(numerator)
    np.divide(numerator, denominator, out=ratio, where=denominator != 0)
    return np.mean(ratio) * 100

In [None]:
# Load the training table and pull the 'price' column out as the target array.
df = pd.read_csv(CFG.DATA_PATH)
y = df['price'].values

In [None]:
def create_extreme_engineered_features(df):
    """Build hand-crafted numeric features from the 'catalog_content' column.

    Produces text-length meta features, brand/material keyword flags and two
    regex-extracted quantities (inches, gigabytes). Missing text is treated
    as an empty string so that rows with NaN content yield zeros instead of
    raising.

    Args:
        df: DataFrame containing a 'catalog_content' text column.

    Returns:
        A DataFrame with the same index as ``df`` and one column per
        feature; values that could not be extracted are filled with 0.
    """
    text_col = 'catalog_content'

    # Normalise once: NaN/None -> '' and everything else to str, so the
    # per-row lambdas and the .str accessors never see a non-string value.
    text = df[text_col].fillna('').astype(str)
    text_lower = text.str.lower()

    df_out = pd.DataFrame(index=df.index)

    # Meta Features
    df_out[f'{text_col}_length'] = text.str.len()
    df_out[f'{text_col}_word_count'] = text.str.split().str.len()
    # The small epsilon keeps the ratio defined for empty strings.
    df_out[f'{text_col}_capital_ratio'] = text.apply(
        lambda t: sum(1 for c in t if c.isupper()) / (len(t) + 1e-9)
    )

    # Entity Extraction (Example lists - expand these!)
    BRANDS = ['sony', 'samsung', 'nike', 'apple', 'kitchenaid']
    MATERIALS = ['cotton', 'leather', 'silk', 'gold', 'nylon', 'polyester', 'silver']

    df_out['has_brand'] = text_lower.apply(
        lambda t: any(brand in t for brand in BRANDS)
    ).astype(int)
    for material in MATERIALS:
        # The keywords are literals, so match them as plain substrings.
        df_out[f'has_{material}'] = text_lower.str.contains(material, regex=False).astype(int)

    # Regex Features: only the numeric part is captured; the unit is matched
    # with a non-capturing group so extract() returns a single Series.
    df_out['extracted_inch'] = text_lower.str.extract(
        r'(\d+\.?\d*)\s*(?:inch|\"|in\b)', expand=False
    ).astype(float)
    df_out['extracted_gb'] = text_lower.str.extract(
        r'(\d+)\s*gb', expand=False
    ).astype(float)

    # Fill NaNs from regex with a neutral value like 0
    return df_out.fillna(0)

In [None]:
def generate_transformer_embeddings(df, models):
    """Encode 'catalog_content' with each model and join the results column-wise.

    Args:
        df: DataFrame with a 'catalog_content' column.
        models: Iterable of SentenceTransformer model names, processed in order.

    Returns:
        The per-model embedding matrices concatenated along axis 1, in the
        same order as ``models``.
    """

    def _embed_with(name):
        # Load one checkpoint and encode the full text column with it.
        print(f"   Generating embeddings for: {name}")
        encoder = SentenceTransformer(name)
        return encoder.encode(df['catalog_content'].tolist(), show_progress_bar=True)

    per_model = [_embed_with(name) for name in models]
    return np.concatenate(per_model, axis=1)

In [None]:
def generate_knn_features(index_embeds, y_index, query_embeds, k):
    """
    Generates KNN target-statistics features with an exact L2 FAISS index.

    'index_embeds' are used to build the search space.
    'query_embeds' are the items we want to find neighbors for.

    Args:
        index_embeds: 2-D array of embeddings to index.
        y_index: Target values aligned row-by-row with ``index_embeds``.
        query_embeds: 2-D array of embeddings to look up.
        k: Number of neighbors used for the statistics.

    Returns:
        Array of shape (len(query_embeds), 3) holding the mean, median and
        standard deviation of the neighbors' targets.
    """
    # FAISS works on contiguous float32 matrices; convert once up front.
    index_vectors = np.ascontiguousarray(index_embeds, dtype=np.float32)
    query_vectors = np.ascontiguousarray(query_embeds, dtype=np.float32)
    y_index = np.asarray(y_index)

    index = faiss.IndexFlatL2(index_vectors.shape[1])

    # Prefer the GPU. A CPU-only faiss build lacks the GPU symbols
    # (AttributeError); a GPU build that cannot use the device is assumed to
    # surface the failure as RuntimeError. Both fall back to the CPU index.
    try:
        res = faiss.StandardGpuResources()
        gpu_index = faiss.index_cpu_to_gpu(res, 0, index)
        gpu_index.add(index_vectors)
        search_index = gpu_index
    except (AttributeError, RuntimeError):
        print("   FAISS-GPU not found or failed, falling back to CPU.")
        index.add(index_vectors)
        search_index = index

    # If query is the same as index, we are doing self-search: ask for k + 1
    # neighbors and remove each row's own entry.
    is_self_search = np.array_equal(index_vectors, query_vectors)
    if is_self_search:
        _, I = search_index.search(query_vectors, k + 1)
        # Drop the row's own id wherever it appears, not blindly column 0.
        # With exact duplicate embeddings the item itself is not guaranteed
        # to be ranked first, and dropping column 0 would then keep the
        # row's own target among its "neighbors".
        own_row = np.arange(I.shape[0])[:, None]
        drop = I == own_row
        # If a row's own id did not make it into the k + 1 results (many
        # tied duplicates), drop the farthest neighbor instead.
        drop[~drop.any(axis=1), -1] = True
        I = I[~drop].reshape(I.shape[0], k)
    else:
        _, I = search_index.search(query_vectors, k)

    neighbor_prices = y_index[I]

    knn_feats = np.zeros((len(query_embeds), 3))
    knn_feats[:, 0] = np.mean(neighbor_prices, axis=1)
    knn_feats[:, 1] = np.median(neighbor_prices, axis=1)
    knn_feats[:, 2] = np.std(neighbor_prices, axis=1)

    return knn_feats


In [None]:
print("Generating all features...")
# Hand-crafted meta / keyword / regex features, one row per row of df.
extreme_engineered_features = create_extreme_engineered_features(df)
# 1- and 2-gram TF-IDF capped at 50,000 features.
# NOTE(review): the vectorizer is fit on every row of df, i.e. before the
# train/validation split made further down — confirm this is intended.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=50000)
tfidf_features = tfidf_vectorizer.fit_transform(df['catalog_content'])
# Embeddings from every model in CFG.ST_MODELS, concatenated column-wise.
combined_embeddings = generate_transformer_embeddings(df, CFG.ST_MODELS)

In [None]:
print("Splitting data to get a single fold for tuning...")
kf = KFold(
    n_splits=CFG.N_SPLITS,
    shuffle=True,
    random_state=CFG.RANDOM_STATE,
)
# Tuning uses only the first (train, validation) index pair of the K-fold split.
train_idx, val_idx = next(iter(kf.split(df)))

In [None]:
# Slice every feature source with the tuning fold's row indices.

# Raw text.
X_train_text = df['catalog_content'].iloc[train_idx]
X_val_text = df['catalog_content'].iloc[val_idx]

# Regression target.
y_train = y[train_idx]
y_val = y[val_idx]

# Engineered features, converted to plain arrays.
X_train_eng = extreme_engineered_features.iloc[train_idx].values
X_val_eng = extreme_engineered_features.iloc[val_idx].values

# TF-IDF matrix rows.
X_train_tfidf = tfidf_features[train_idx]
X_val_tfidf = tfidf_features[val_idx]

# Concatenated transformer embeddings.
X_train_embed = combined_embeddings[train_idx]
X_val_embed = combined_embeddings[val_idx]

In [None]:
# Generate KNN features for the tuning fold
# Each model's embedding width is needed to locate the columns of the model at
# CFG.BGE_MODEL_INDEX inside the column-wise concatenated embedding matrix.
# NOTE(review): this instantiates every SentenceTransformer in CFG.ST_MODELS
# again only to read its embedding dimension.
model_dims = [SentenceTransformer(m).get_sentence_embedding_dimension() for m in CFG.ST_MODELS]
# Columns [bge_start_index, bge_end_index) belong to the selected model.
bge_start_index = sum(model_dims[:CFG.BGE_MODEL_INDEX])
bge_end_index = bge_start_index + model_dims[CFG.BGE_MODEL_INDEX]


In [None]:
# Column window of the selected model inside the concatenated embeddings.
bge_columns = slice(bge_start_index, bge_end_index)
bge_train_embeds = X_train_embed[:, bge_columns]
bge_val_embeds = X_val_embed[:, bge_columns]

# Both calls index the training rows only: train rows query themselves,
# validation rows query the training index.
knn_feats_train = generate_knn_features(
    bge_train_embeds,
    y_train,
    bge_train_embeds,
    CFG.KNN_N_NEIGHBORS,
)
knn_feats_val = generate_knn_features(
    bge_train_embeds,
    y_train,
    bge_val_embeds,
    CFG.KNN_N_NEIGHBORS,
)

print("\n--- COMMON SETUP COMPLETE. ALL DATA FOR TUNING IS READY. ---")

In [None]:
# Define the custom Attention layer
class Attention(tf.keras.layers.Layer):
    """Attention pooling that collapses axis 1 of its input.

    A score per position is computed as tanh(x . W + b), the scores are
    passed through a softmax, and the input is summed over axis 1 weighted
    by those scores. The input is presumably (batch, steps, features);
    the bias is sized from ``input_shape[1]``, so that dimension must be
    known when the layer is built.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        feature_dim = input_shape[-1]
        num_steps = input_shape[1]
        # One projection weight per feature and one bias per position.
        self.W = self.add_weight(name="att_weight", shape=(feature_dim, 1), initializer="normal")
        self.b = self.add_weight(name="att_bias", shape=(num_steps, 1), initializer="zeros")
        super().build(input_shape)

    def call(self, x):
        K = tf.keras.backend
        # Score every position, then drop the trailing singleton axis.
        scores = K.squeeze(K.tanh(K.dot(x, self.W) + self.b), axis=-1)
        # Normalise the scores and restore the axis so they broadcast over features.
        weights = K.expand_dims(K.softmax(scores), axis=-1)
        weighted = x * weights
        return K.sum(weighted, axis=1)

# Model 3: LSTM with Attention
def create_lstm_attention_model(vocab_size, max_len, embedding_dim,
                                lstm_units=128, dense_units=64, dropout_rate=0.0):
    """Build a BiLSTM + attention-pooling regression model (uncompiled).

    The defaults reproduce the previously hard-coded architecture
    (128 LSTM units per direction, a 64-unit dense layer, no dropout).

    Args:
        vocab_size: Size of the embedding vocabulary.
        max_len: Length of the padded input sequences.
        embedding_dim: Width of the token embeddings.
        lstm_units: Units of each LSTM direction.
        dense_units: Units of the dense layer after attention pooling.
        dropout_rate: Dropout applied after the dense layer; a Dropout layer
            is added only when this is greater than 0.

    Returns:
        A Keras ``Model`` mapping a (max_len,) token sequence to one value.
    """
    inputs = Input(shape=(max_len,))
    embedding = Embedding(vocab_size, embedding_dim)(inputs)
    # return_sequences=True keeps every timestep for the attention layer.
    bilstm = Bidirectional(LSTM(lstm_units, return_sequences=True))(embedding)
    attention = Attention()(bilstm)
    dense = Dense(dense_units, activation='relu')(attention)
    if dropout_rate > 0:
        dense = Dropout(dropout_rate)(dense)
    outputs = Dense(1)(dense)
    return Model(inputs, outputs)

# Model 4: 1D CNN with Multiple Kernels
def create_multikernel_cnn_model(vocab_size, max_len, embedding_dim,
                                 cnn_filters=64, dense_units=128, dropout_rate=0.5,
                                 kernel_sizes=(2, 3, 5)):
    """Build a multi-kernel 1D-CNN regression model (uncompiled).

    One Conv1D + global-max-pool branch is created per kernel size and the
    pooled outputs are concatenated. The defaults reproduce the previously
    hard-coded architecture (64 filters, kernels 2/3/5, 128 dense units,
    dropout 0.5).

    Args:
        vocab_size: Size of the embedding vocabulary.
        max_len: Length of the padded input sequences.
        embedding_dim: Width of the token embeddings.
        cnn_filters: Number of filters in every convolution branch.
        dense_units: Units of the dense layer after concatenation.
        dropout_rate: Dropout rate applied after the dense layer.
        kernel_sizes: Kernel size of each convolution branch.

    Returns:
        A Keras ``Model`` mapping a (max_len,) token sequence to one value.
    """
    inputs = Input(shape=(max_len,))
    embedding = Embedding(vocab_size, embedding_dim)(inputs)

    # Every branch reads the same embedding with a different kernel width.
    pooled_branches = [
        GlobalMaxPooling1D()(
            Conv1D(filters=cnn_filters, kernel_size=kernel_size, activation='relu')(embedding)
        )
        for kernel_size in kernel_sizes
    ]

    concatenated = Concatenate()(pooled_branches)
    dense = Dense(dense_units, activation='relu')(concatenated)
    dropout = Dropout(dropout_rate)(dense)
    outputs = Dense(1)(dropout)
    return Model(inputs, outputs)

# Model 5: Multi-Input Neural Network
def create_multi_input_model(vocab_size, max_len, embedding_dim, num_tabular_feats,
                             gru_units=64, text_branch_dense_units=32,
                             tabular_branch_dense_units=32, final_dense_units=64,
                             dropout_rate=0.0):
    """Build a two-branch (text + tabular) regression model (uncompiled).

    The defaults reproduce the previously hard-coded architecture
    (GRU 64, branch dense layers of 32 units, final dense of 64, no dropout).

    Args:
        vocab_size: Size of the embedding vocabulary.
        max_len: Length of the padded text sequences.
        embedding_dim: Width of the token embeddings.
        num_tabular_feats: Number of columns of the tabular input.
        gru_units: Units of the GRU in the text branch.
        text_branch_dense_units: Units of the dense layer after the GRU.
        tabular_branch_dense_units: Units of the tabular branch dense layer.
        final_dense_units: Units of the dense layer after the merge.
        dropout_rate: Dropout applied after the final dense layer; a Dropout
            layer is added only when this is greater than 0.

    Returns:
        A Keras ``Model`` whose inputs are, in order, 'text_input' and
        'tabular_input', and whose output is a single value.
    """
    # Text input branch
    text_input = Input(shape=(max_len,), name='text_input')
    embedding = Embedding(vocab_size, embedding_dim)(text_input)
    gru = GRU(gru_units)(embedding)
    text_branch = Dense(text_branch_dense_units, activation='relu')(gru)

    # Tabular input branch
    tabular_input = Input(shape=(num_tabular_feats,), name='tabular_input')
    tabular_branch = Dense(tabular_branch_dense_units, activation='relu')(tabular_input)

    # Merge branches
    concatenated = Concatenate()([text_branch, tabular_branch])
    dense = Dense(final_dense_units, activation='relu')(concatenated)
    if dropout_rate > 0:
        dense = Dropout(dropout_rate)(dense)
    outputs = Dense(1)(dense)

    return Model(inputs=[text_input, tabular_input], outputs=outputs)

print("New model architectures are defined.")

In [None]:
# --- Tokenize text for all DL models ---
# The vocabulary is learned from the training texts only.
tokenizer = Tokenizer(num_words=CFG.DL_VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train_text)

# Convert each split to integer sequences padded to CFG.DL_MAX_LEN.
X_train_seq, X_val_seq = (
    pad_sequences(tokenizer.texts_to_sequences(split_texts), maxlen=CFG.DL_MAX_LEN)
    for split_texts in (X_train_text, X_val_text)
)

In [None]:
# --- Assemble tabular features for Multi-Input NN ---
# Engineered features and KNN statistics are placed side by side, per split.
tabular_train_feats = np.hstack((X_train_eng, knn_feats_train))
tabular_val_feats = np.hstack((X_val_eng, knn_feats_val))

def nn_multi_input_objective(trial):
    """Optuna objective: train the multi-input network once and score it.

    Samples a hyperparameter set, builds and fits the model on the tuning
    fold with early stopping on validation loss, and evaluates it on the
    validation rows.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        SMAPE (in percent) of the validation predictions; lower is better.
    """
    import inspect

    params = {
        'embedding_dim': trial.suggest_categorical('embedding_dim', [64, 128]),
        'gru_units': trial.suggest_categorical('gru_units', [64, 128]),
        'text_branch_dense_units': trial.suggest_categorical('text_branch_dense_units', [32, 64]),
        'tabular_branch_dense_units': trial.suggest_categorical('tabular_branch_dense_units', [32, 64]),
        'final_dense_units': trial.suggest_categorical('final_dense_units', [64, 128]),
        'dropout_rate': trial.suggest_float('dropout_rate', 0.2, 0.6),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)
    }

    # Release the Keras state left behind by the previous trial so memory
    # does not accumulate across trials.
    tf.keras.backend.clear_session()

    # Forward only the hyperparameters the builder's signature accepts.
    # Passing the tabular width positionally and then **params handed the
    # builder 'embedding_dim' twice plus keywords it may not know (such as
    # 'learning_rate', which belongs to the optimizer), raising TypeError.
    accepted = inspect.signature(create_multi_input_model).parameters
    builder_kwargs = {name: value for name, value in params.items() if name in accepted}
    model = create_multi_input_model(
        vocab_size=CFG.DL_VOCAB_SIZE,
        max_len=CFG.DL_MAX_LEN,
        num_tabular_feats=tabular_train_feats.shape[1],
        **builder_kwargs,
    )
    model.compile(optimizer=Adam(params['learning_rate']), loss='mean_squared_error')

    # Stop once validation loss stalls for 5 epochs and keep the best weights.
    es = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    model.fit(
        [X_train_seq, tabular_train_feats],
        y_train,
        validation_data=([X_val_seq, tabular_val_feats], y_val),
        epochs=30,
        batch_size=256,
        callbacks=[es],
        verbose=0,
    )
    # verbose=0 matches fit() and keeps the non-interactive log free of progress bars.
    preds = model.predict([X_val_seq, tabular_val_feats], batch_size=512, verbose=0).squeeze()
    return smape(y_val, preds)

# Run 20 Optuna trials minimising the objective's return value, then report
# the best hyperparameter set found.
study = optuna.create_study(direction='minimize')
study.optimize(nn_multi_input_objective, n_trials=20)
print("\nBest Multi-Input NN Parameters:", study.best_params)