In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sentence_transformers import SentenceTransformer
import re
import os
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import joblib # For saving the model

print("Libraries imported.")

# --- 1. Data Loading and Preprocessing ---
def load_and_preprocess_data(data_path):
    """Read the dataset CSV and add the engineered feature columns.

    Three columns are added to the frame read from ``data_path``:

    * ``log_price``    -- ``np.log1p`` of the ``price`` column.
    * ``ipq``          -- pack quantity parsed from ``catalog_content``;
                          ``1`` when no quantity pattern is found.
    * ``cleaned_text`` -- ``catalog_content`` with field labels removed,
                          lower-cased and reduced to ``[a-z0-9 ]``.

    Rows whose ``catalog_content`` is missing (NaN after ``read_csv``) get
    ``ipq == 1`` and an empty ``cleaned_text`` instead of raising.

    Parameters
    ----------
    data_path : str or path-like
        CSV file that contains at least ``price`` and ``catalog_content``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with the three extra columns.
    """
    print("Loading and preprocessing data...")
    df = pd.read_csv(data_path)

    # --- Feature Engineering from previous notebook ---
    df['log_price'] = np.log1p(df['price'])

    # Compiled once instead of on every row. Order matters: the first pattern
    # that matches anywhere in the text wins, not the left-most match.
    ipq_patterns = [
        re.compile(pattern)
        for pattern in (
            r'pack of (\d+)',
            r'(\d+)\s*per case',
            r'\((\d+)\s*count\)',
            r'pack\s*(\d+)',
            r'(\d+)\s*pack',
        )
    ]

    # Field labels stripped from the catalog text. They are applied one after
    # another (not as a single alternation) to keep the original semantics.
    label_patterns = [
        re.compile(pattern, flags=re.IGNORECASE)
        for pattern in (r'item name:', r'bullet point \d+:', r'value:', r'unit:')
    ]

    def as_text(value):
        # Blank CSV fields arrive as NaN (a float); calling str methods on
        # them would raise AttributeError, so treat them as empty text.
        if isinstance(value, str):
            return value
        return '' if pd.isna(value) else str(value)

    def extract_ipq(text):
        text_lower = as_text(text).lower()
        for pattern in ipq_patterns:
            match = pattern.search(text_lower)
            if match:
                return int(match.group(1))
        return 1
    df['ipq'] = df['catalog_content'].apply(extract_ipq)

    def clean_text(text):
        text = as_text(text)
        for pattern in label_patterns:
            text = pattern.sub('', text)
        text = text.lower().replace('\n', ' ')
        # NOTE: this also removes '.', so a decimal such as "11.25" becomes
        # "1125". Left unchanged so the text matches any cached embeddings.
        text = re.sub(r'[^a-z0-9 ]', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text
    df['cleaned_text'] = df['catalog_content'].apply(clean_text)

    print("Data preprocessing complete.")
    return df

# Build the working frame from the training CSV. Later cells read `df`
# (cleaned_text, image_link, ipq, log_price) and add columns to it.
df = load_and_preprocess_data('student_resource/dataset/train.csv')
# Preview the first rows, including the derived columns.
display(df.head())

  from .autonotebook import tqdm as notebook_tqdm
2025-10-12 23:09:52.950353: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Libraries imported.
Loading and preprocessing data...
Data preprocessing complete.


Unnamed: 0,sample_id,catalog_content,image_link,price,log_price,ipq,cleaned_text
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89,1.773256,6,la victoria green taco sauce mild 12 ounce pac...
1,198967,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12,2.647592,4,salerno cookies the original butter cookies 8 ...
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.97,1.088562,6,bear creek hearty soup bowl creamy chicken wit...
3,55858,Item Name: Judee’s Blue Cheese Powder 11.25 oz...,https://m.media-amazon.com/images/I/41mu0HAToD...,30.34,3.444895,1,judees blue cheese powder 1125 oz glutenfree a...
4,292686,"Item Name: kedem Sherry Cooking Wine, 12.7 Oun...",https://m.media-amazon.com/images/I/41sA037+Qv...,66.49,4.211979,12,kedem sherry cooking wine 127 ounce 12 per cas...


In [2]:
# --- 2. Generate Text Embeddings ---

# Sentence-transformers checkpoint used to embed `cleaned_text`, and the file
# in which the resulting matrix is cached between runs.
TEXT_EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
TEXT_EMBEDDINGS_FILE = 'artifacts/text_embeddings.npy'

# Create artifacts directory if it doesn't exist
os.makedirs('artifacts', exist_ok=True)

# Try the cache first. `text_embeddings` stays None when there is no usable
# cache, which triggers regeneration below.
text_embeddings = None
if os.path.exists(TEXT_EMBEDDINGS_FILE):
    print(f"Loading pre-computed text embeddings from {TEXT_EMBEDDINGS_FILE}...")
    text_embeddings = np.load(TEXT_EMBEDDINGS_FILE)
    print(f"Text embeddings loaded with shape: {text_embeddings.shape}")
    # A cache built from a different dataset would be silently misaligned with
    # `df`. Only the row count can be checked here; if the text changed but the
    # number of rows did not, delete the file manually to force a rebuild.
    if text_embeddings.shape[0] != len(df):
        print(
            f"Cached text embeddings have {text_embeddings.shape[0]} rows but the "
            f"dataframe has {len(df)}. Regenerating..."
        )
        text_embeddings = None
else:
    print("Text embeddings file not found. Generating new embeddings...")

if text_embeddings is None:
    print(f"Loading model: {TEXT_EMBEDDING_MODEL}")
    # Load the pre-trained model
    model = SentenceTransformer(TEXT_EMBEDDING_MODEL)

    # Generate embeddings. This may take a few minutes.
    print(f"Generating text embeddings for all {len(df):,} items...")
    text_embeddings = model.encode(
        df['cleaned_text'].tolist(),
        show_progress_bar=True,
        batch_size=128 # Adjust batch size based on your RAM
    )

    # Save the embeddings to a file for future use
    print(f"Embeddings generated with shape: {text_embeddings.shape}")
    np.save(TEXT_EMBEDDINGS_FILE, text_embeddings)
    print(f"Text embeddings saved to {TEXT_EMBEDDINGS_FILE}")

Loading pre-computed text embeddings from artifacts/text_embeddings.npy...
Text embeddings loaded with shape: (75000, 384)


In [3]:
import tensorflow as tf
from pathlib import Path
import sys
import os
from tqdm import tqdm

# Add src directory to path to import utils
src_path = os.path.abspath('student_resource/src')
if src_path not in sys.path:
    sys.path.insert(0, src_path)
from challenge_utils import download_images

# Cache file for the image feature matrix, the local image folder, and the
# (height, width) every image is resized to before it is fed to ResNet50.
IMAGE_EMBEDDINGS_FILE = 'artifacts/image_embeddings.npy'
IMAGES_DIR = 'student_resource/images'
IMAGE_SIZE = (128, 128)

# Add image_path column to dataframe
# NOTE(review): assumes download_images stores each file under the last path
# component of its URL -- confirm against challenge_utils.
df['image_path'] = df['image_link'].apply(lambda url: os.path.join(IMAGES_DIR, Path(url).name))

# Try the cache first. `image_embeddings` stays None when there is no usable
# cache, which triggers regeneration below.
image_embeddings = None
if os.path.exists(IMAGE_EMBEDDINGS_FILE):
    print(f"Loading pre-computed image embeddings from {IMAGE_EMBEDDINGS_FILE}...")
    image_embeddings = np.load(IMAGE_EMBEDDINGS_FILE)
    print(f"Image embeddings loaded with shape: {image_embeddings.shape}")
    # A cache built from a different dataset would be misaligned with `df`.
    if image_embeddings.shape[0] != len(df):
        print(
            f"Cached image embeddings have {image_embeddings.shape[0]} rows but the "
            f"dataframe has {len(df)}. Regenerating..."
        )
        image_embeddings = None
else:
    print("Image embeddings file not found. Generating new embeddings...")

if image_embeddings is None:
    # 1. Download all images.
    print("Verifying all images are downloaded...")
    os.makedirs(IMAGES_DIR, exist_ok=True)
    image_urls = df['image_link'].tolist()
    download_images(image_urls, IMAGES_DIR)
    print("Image download process complete.")

    # 2. Define the feature extractor model: frozen ImageNet ResNet50 without
    #    its classifier head, followed by global average pooling.
    print("Building image feature extractor model...")
    base_model = tf.keras.applications.ResNet50(
        include_top=False, weights='imagenet', input_shape=(IMAGE_SIZE[0], IMAGE_SIZE[1], 3)
    )
    base_model.trainable = False
    image_input = tf.keras.layers.Input(shape=(IMAGE_SIZE[0], IMAGE_SIZE[1], 3))
    x = base_model(image_input, training=False)
    pooled_output = tf.keras.layers.GlobalAveragePooling2D()(x)
    feature_extractor = tf.keras.Model(image_input, pooled_output, name="image_feature_extractor")
    print("Model built.")

    # Only files that are actually on disk are sent through the model; rows
    # without a file keep an all-zero embedding.
    print("Verifying downloaded images...")
    all_paths = df['image_path'].tolist()
    existing_paths = [p for p in all_paths if os.path.exists(p)]
    print(f"Found {len(existing_paths)} existing images out of {len(all_paths)} total.")

    if len(existing_paths) > 0:
        # 3. Create a tf.data pipeline ONLY for existing images
        def _decode_and_preprocess(path):
            """Load one image eagerly; return a zero image if it cannot be decoded."""
            try:
                image = tf.io.read_file(path)
                image = tf.io.decode_jpeg(image, channels=3)
                image = tf.image.resize(image, IMAGE_SIZE)
                return tf.keras.applications.resnet50.preprocess_input(image)
            except (tf.errors.InvalidArgumentError, tf.errors.NotFoundError):
                # Corrupt / truncated / vanished file: keep the element so the
                # output stays aligned with `existing_paths`.
                return tf.zeros((IMAGE_SIZE[0], IMAGE_SIZE[1], 3), dtype=tf.float32)

        def load_image_for_prediction(path):
            # Dataset.map traces its function into a graph, so a Python
            # try/except placed directly here never sees decode errors (they
            # surface later, during predict). tf.py_function runs the loader
            # eagerly, which makes the except clause above effective.
            image = tf.py_function(func=_decode_and_preprocess, inp=[path], Tout=tf.float32)
            # py_function drops static shape information; restore it for batching.
            image.set_shape((IMAGE_SIZE[0], IMAGE_SIZE[1], 3))
            return image

        print("Creating a tf.data.Dataset for existing image paths...")
        image_path_ds = tf.data.Dataset.from_tensor_slices(existing_paths)
        image_ds = image_path_ds.map(load_image_for_prediction, num_parallel_calls=tf.data.AUTOTUNE)
        image_ds = image_ds.batch(128).prefetch(tf.data.AUTOTUNE)

        # 4. Generate embeddings for existing images
        print("Generating image embeddings for existing items...")
        partial_image_embeddings = feature_extractor.predict(image_ds, verbose=1)

        # 5. Map embeddings back to the original dataframe order
        print(f"Mapping generated embeddings back to the original {len(df):,}-item order...")
        embedding_dim = partial_image_embeddings.shape[1]
        embedding_map = dict(zip(existing_paths, partial_image_embeddings))

        # Allocate in the model's output dtype instead of NumPy's default
        # float64; the values are identical and the buffer is not inflated.
        final_image_embeddings = np.zeros(
            (len(df), embedding_dim), dtype=partial_image_embeddings.dtype
        )
        for i, path in enumerate(df['image_path']):
            if path in embedding_map:
                final_image_embeddings[i] = embedding_map[path]

        print(f"Final image embeddings generated with shape: {final_image_embeddings.shape}")
        # Do not rely on another cell having created the output directory.
        os.makedirs(os.path.dirname(IMAGE_EMBEDDINGS_FILE) or '.', exist_ok=True)
        np.save(IMAGE_EMBEDDINGS_FILE, final_image_embeddings)
        print(f"Image embeddings saved to {IMAGE_EMBEDDINGS_FILE}")

        # Assign to the main variable for the next cell
        image_embeddings = final_image_embeddings
    else:
        print("Error: No images were found. Cannot generate embeddings.")
        # Create a dummy zero array if no images are found. It is deliberately
        # not written to the cache file, so the next run tries again.
        image_embeddings = np.zeros((len(df), 2048)) # 2048 is ResNet50's output dim


Loading pre-computed image embeddings from artifacts/image_embeddings.npy...
Image embeddings loaded with shape: (75000, 2048)


In [4]:
# --- 4. Combine Features and Train LightGBM Model ---

# Reshape ipq to be a 2D array for concatenation
ipq_features = df['ipq'].values.reshape(-1, 1)

print(f"Text embedding shape: {text_embeddings.shape}")
print(f"Image embedding shape: {image_embeddings.shape}")
print(f"IPQ features shape: {ipq_features.shape}")

# Combine all features into a single matrix X (columns: text, image, ipq);
# the target is the log1p-transformed price.
X = np.concatenate([text_embeddings, image_embeddings, ipq_features], axis=1)
y = df['log_price'].values

print(f"Final feature matrix X shape: {X.shape}")
print(f"Target vector y shape: {y.shape}")

# --- Cross-Validation Training ---
N_SPLITS = 5
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# Out-of-fold predictions (each row is predicted by the model that did not
# train on it), plus the fitted model and RMSE of every fold.
oof_predictions = np.zeros(X.shape[0])
oof_models = []
oof_scores = []

print(f"\nStarting {N_SPLITS}-fold cross-validation with LightGBM...")

for fold, (train_index, val_index) in enumerate(kf.split(X, y)):
    print(f"--- Fold {fold+1}/{N_SPLITS} ---")
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # Define the LightGBM model
    lgbm = lgb.LGBMRegressor(
        random_state=42,
        n_estimators=1000, # High number of estimators, will be stopped by early stopping
        learning_rate=0.05,
        num_leaves=31,
        n_jobs=-1
    )

    # Train the model.
    # NOTE: the validation fold is used both to pick the stopping round and to
    # compute the score below, so the reported RMSE is slightly optimistic.
    lgbm.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='rmse',
        callbacks=[lgb.early_stopping(100, verbose=False)] # Stop if validation score doesn't improve for 100 rounds
    )

    # Predict on the validation set
    val_preds = lgbm.predict(X_val)
    oof_predictions[val_index] = val_preds

    # Evaluate the model (RMSE on the log1p(price) scale)
    rmse = np.sqrt(mean_squared_error(y_val, val_preds))
    oof_scores.append(rmse)
    oof_models.append(lgbm)
    print(f"Fold {fold+1} RMSE: {rmse}")

# --- Final Performance ---
mean_oof_rmse = np.mean(oof_scores)
print(f"\nAverage RMSE across all folds: {mean_oof_rmse}")

# RMSE over the pooled out-of-fold predictions; `oof_predictions` was
# previously filled but never used.
overall_oof_rmse = np.sqrt(mean_squared_error(y, oof_predictions))
print(f"Overall out-of-fold RMSE: {overall_oof_rmse}")

# --- Save the trained models ---
# The count comes from the list itself so the message stays correct when
# N_SPLITS changes.
print(f"Saving the {len(oof_models)} trained models...")
# The loop variable is not called `model`, so the global `model` bound by the
# text-embedding cell is not overwritten.
for i, fold_model in enumerate(oof_models):
    joblib.dump(fold_model, f'artifacts/lgbm_model_fold_{i+1}.pkl')
print("Models saved successfully.")


Text embedding shape: (75000, 384)
Image embedding shape: (75000, 2048)
IPQ features shape: (75000, 1)
Final feature matrix X shape: (75000, 2433)
Target vector y shape: (75000,)

Starting 5-fold cross-validation with LightGBM...
--- Fold 1/5 ---
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.559961 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 620242
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 2433
[LightGBM] [Info] Start training from score 2.740904




Fold 1 RMSE: 0.7443431458757918
--- Fold 2/5 ---
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.977747 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 620247
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 2433
[LightGBM] [Info] Start training from score 2.738173




Fold 2 RMSE: 0.7227761915172778
--- Fold 3/5 ---
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.662333 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 620245
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 2433
[LightGBM] [Info] Start training from score 2.741725




Fold 3 RMSE: 0.7224129549366619
--- Fold 4/5 ---
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.201671 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 620244
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 2433
[LightGBM] [Info] Start training from score 2.737836




Fold 4 RMSE: 0.7145844044843473
--- Fold 5/5 ---
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.924996 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 620241
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 2433
[LightGBM] [Info] Start training from score 2.737449




Fold 5 RMSE: 0.7291847929954292

Average RMSE across all folds: 0.7266602979619016
Saving the 5 trained models...
Models saved successfully.
