In [None]:
# Cell 2: Install dependencies
# BeautifulSoup for parsing, lightgbm, transformers (optional), ftfy if using CLIP
!pip install -q lightgbm beautifulsoup4 joblib transformers ftfy accelerate

In [None]:
# Cell 3: Imports, path detection, helper functions
import os, sys, re, json, joblib
import numpy as np, pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMRegressor
from bs4 import BeautifulSoup

# Path handling: support both 'datasets' and 'dataset' folder names
BASE = "datasets" if os.path.exists("/content/drive/MyDrive/Colab Notebooks/amazon_ml_challenge") else ("dataset" if os.path.exists("amazon_ml_challenge") else None)
if BASE is None:
    raise FileNotFoundError("No 'datasets' or 'dataset' folder found in the current directory. Upload train/test there.")
TRAIN_PATH = os.path.join("/content/drive/MyDrive/Colab Notebooks/amazon_ml_challenge/", "train.csv")
TEST_PATH  = os.path.join("/content/drive/MyDrive/Colab Notebooks/amazon_ml_challenge/", "test.csv")
OUT_PATH   = os.path.join("/content/drive/MyDrive/Colab Notebooks/amazon_ml_challenge/", "test_out23.csv")
OUT_PATH_2   = os.path.join("/content/drive/MyDrive/Colab Notebooks/amazon_ml_challenge/", "train_out.csv")
ART_DIR    = "/content/drive/MyDrive/Colab Notebooks/amazon_ml_challenge/artifacts"
os.makedirs(ART_DIR, exist_ok=True)

print("BASE:", BASE)
print("TRAIN_PATH exists:", os.path.exists(TRAIN_PATH))
print("TEST_PATH exists:", os.path.exists(TEST_PATH))
print("ART_DIR:", ART_DIR)

BASE: datasets
TRAIN_PATH exists: True
TEST_PATH exists: True
ART_DIR: /content/drive/MyDrive/Colab Notebooks/amazon_ml_challenge/artifacts


In [None]:
# Cell 4: Optimized parsing helpers and SMAPE metric
import re, json
from bs4 import BeautifulSoup
import numpy as np

# ----------------------------
# Parsing Helpers (optimized)
# ----------------------------

def parse_catalog_content(raw: str):
    """
    Fast universal parser for JSON / HTML / plain text product catalogs.
    Returns: dict with keys {'items': [{'name': str, 'features': dict}], 'raw_text': str}
    """
    out = {'items': [], 'raw_text': ''}
    if not isinstance(raw, str) or not raw.strip():
        return out

    s = raw.strip()
    out['raw_text'] = s

    # Try JSON parsing first (fastest)
    if s[0] in ['{', '[']:
        try:
            j = json.loads(s)
            candidates = j if isinstance(j, list) else j.get('items', [j])
            for c in candidates or []:
                if isinstance(c, dict):
                    name = str(c.get('name', '')).strip()
                    feats = {}
                    f = c.get('features', {})
                    if isinstance(f, dict):
                        feats = {str(k).strip(): str(v).strip() for k, v in f.items() if isinstance(k, str)}
                    elif isinstance(f, list):
                        for item in f:
                            if isinstance(item, str) and ':' in item:
                                k, v = item.split(':', 1)
                                feats[k.strip()] = v.strip()
                    out['items'].append({'name': name, 'features': feats})
            return out
        except Exception:
            pass

    # Try HTML (slower path)
    if '<' in s and '>' in s:
        try:
            soup = BeautifulSoup(s, 'lxml')  # lxml is faster than html.parser
            lis = soup.find_all('li')
            for li in lis[:30]:  # hard cap: only first 30 <li> to avoid heavy parsing
                txt = li.get_text(" ", strip=True)
                feats = {k.strip(): v.strip() for k, v in re.findall(r'([\w \-/]+?)[:\-]\s*([\w .,]+)', txt)}
                out['items'].append({'name': txt.split(':')[0].strip(), 'features': feats})
            return out
        except Exception:
            pass

    # Fallback: plain text
    tokens = [t.strip() for t in re.split(r'[\n;|\t]', s) if t.strip()]
    for t in tokens[:50]:  # limit to 50 tokens to save time
        feats = {k.strip(): v.strip() for k, v in re.findall(r'([\w \-/]+?)[:\-]\s*([\w ./,]+)', t)}
        out['items'].append({'name': t.split(':')[0].strip(), 'features': feats})
    return out


# ----------------------------
# Extract quantity/pack info
# ----------------------------
def extract_ipq(text):
    if not isinstance(text, str):
        return None
    t = text.lower()
    # Vectorized regex cascade
    for pat in [
        r'(?:pack of\s*|of\s*)?(\d+(?:[.,]\d+)?)\s*(?:pack|packs|pcs|pc|ct|count)\b',
        r'(\d+(?:[.,]\d+)?)\s*(kg|g|gram|grams|ml|l|ltr|oz)\b',
        r'^(\d+)'
    ]:
        m = re.search(pat, t)
        if m:
            val = float(m.group(1).replace(',', '.'))
            return (val, m.group(2)) if len(m.groups()) > 1 else val
    return None


# ----------------------------
# SMAPE Metric (NumPy optimized)
# ----------------------------
def smape(y_true, y_pred):
    """
    Symmetric Mean Absolute Percentage Error (SMAPE)
    Faster version using pure NumPy vectorization.
    """
    y_true = np.asarray(y_true, dtype=np.float32)
    y_pred = np.asarray(y_pred, dtype=np.float32)
    denom = np.abs(y_true) + np.abs(y_pred)
    diff = np.abs(y_pred - y_true)
    return np.mean(np.divide(diff, np.maximum(denom / 2.0, 1e-8))) * 100


In [None]:
# Cell 5: Load train and test, add lightweight text features
import pandas as pd
import numpy as np
from joblib import Parallel, delayed

# ----------------------------
# 1️⃣ Load data (use low_memory=False for large CSVs)
# ----------------------------
df_train = pd.read_csv(TRAIN_PATH, low_memory=False)
df_test  = pd.read_csv(TEST_PATH, low_memory=False)

print(f"✅ Train rows: {len(df_train):,} | Test rows: {len(df_test):,}")
display(df_train.head(3))

# ----------------------------
# 2️⃣ Optimize dtypes to save memory
# ----------------------------
def optimize_dtypes(df):
    for col in df.select_dtypes(include=['int64', 'float64']).columns:
        df[col] = pd.to_numeric(df[col], downcast='float')
    return df

df_train = optimize_dtypes(df_train)
df_test  = optimize_dtypes(df_test)

# ----------------------------
# 3️⃣ Fast text preprocessing (vectorized operations)
# ----------------------------
def preprocess_catalog(df):
    cat_col = df['catalog_content'].fillna('')
    df['word_count'] = cat_col.str.split().str.len().astype('int32')
    df['char_count'] = cat_col.str.len().astype('int32')
    df['has_image']  = df['image_link'].notna().astype('int8')
    return df

df_train = preprocess_catalog(df_train)
df_test  = preprocess_catalog(df_test)

# ----------------------------
# 4️⃣ Parallel IPQ extraction (uses all CPU cores safely)
# ----------------------------
def parallel_apply_extract_ipq(series):
    return Parallel(n_jobs=-1, backend='loky', verbose=0)(
        delayed(extract_ipq)(x) for x in series
    )

print("⏳ Extracting ipq_raw in parallel...")
df_train['ipq_raw'] = parallel_apply_extract_ipq(df_train['catalog_content'])
df_test['ipq_raw']  = parallel_apply_extract_ipq(df_test['catalog_content'])

# ----------------------------
# 5️⃣ Quick summary
# ----------------------------
print(
    f"💰 Price range: min={df_train['price'].min():.2f}, "
    f"mean={df_train['price'].mean():.2f}, "
    f"median={df_train['price'].median():.2f}, "
    f"max={df_train['price'].max():.2f}"
)
print(f"✅ Memory usage after optimization: "
      f"Train={df_train.memory_usage(deep=True).sum()/1e6:.2f} MB | "
      f"Test={df_test.memory_usage(deep=True).sum()/1e6:.2f} MB")


✅ Train rows: 75,000 | Test rows: 75,000


Unnamed: 0,sample_id,catalog_content,image_link,price
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89
1,198967,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.97


⏳ Extracting ipq_raw in parallel...
💰 Price range: min=0.13, mean=23.65, median=14.00, max=2796.00
✅ Memory usage after optimization: Train=151.57 MB | Test=151.10 MB


In [None]:
!pip uninstall -y torch torchvision torchaudio transformers accelerate
!pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2
!pip install transformers==4.39.3 accelerate==0.28.0

Found existing installation: torch 2.8.0
Uninstalling torch-2.8.0:
  Successfully uninstalled torch-2.8.0
[0mFound existing installation: transformers 4.39.3
Uninstalling transformers-4.39.3:
  Successfully uninstalled transformers-4.39.3
Found existing installation: accelerate 0.28.0
Uninstalling accelerate-0.28.0:
  Successfully uninstalled accelerate-0.28.0
[31mERROR: Could not find a version that satisfies the requirement torch==2.1.2 (from versions: 2.2.0, 2.2.1, 2.2.2, 2.3.0, 2.3.1, 2.4.0, 2.4.1, 2.5.0, 2.5.1, 2.6.0, 2.7.0, 2.7.1, 2.8.0)[0m[31m
[0m[31mERROR: No matching distribution found for torch==2.1.2[0m[31m
[0mCollecting transformers==4.39.3
  Using cached transformers-4.39.3-py3-none-any.whl.metadata (134 kB)
Collecting accelerate==0.28.0
  Using cached accelerate-0.28.0-py3-none-any.whl.metadata (18 kB)
Collecting torch>=1.10.0 (from accelerate==0.28.0)
  Using cached torch-2.8.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (30 kB)
Using cached transformers-4.39

In [None]:
# @title Run cell `df4ddb89` to resume embedding generation
import os, io, requests
import numpy as np
import torch
from PIL import Image
from tqdm import tqdm
from transformers import CLIPProcessor, CLIPModel
from concurrent.futures import ThreadPoolExecutor

device = 'cuda' if torch.cuda.is_available() else 'cpu'
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def fetch_image(url):
    try:
        r = requests.get(url, timeout=5)
        img = Image.open(io.BytesIO(r.content)).convert('RGB')
        return img
    except:
        return None

def fetch_images_parallel(urls, max_workers=16):
    imgs = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for img in executor.map(fetch_image, urls):
            imgs.append(img)
    return imgs

def get_clip_emb_batch(img_list):
    valid_imgs = [img for img in img_list if img is not None]
    if not valid_imgs:
        return np.zeros((len(img_list), 512), dtype=np.float32)

    # Remove padding=True as it might cause warnings or issues with newer transformers versions
    inputs = processor(images=valid_imgs, return_tensors="pt").to(device)
    with torch.no_grad():
        emb = clip_model.get_image_features(**inputs)
    emb = emb.cpu().numpy()
    emb = emb / (np.linalg.norm(emb, axis=1, keepdims=True) + 1e-8)

    result = np.zeros((len(img_list), 512), dtype=np.float32)
    j = 0
    for i, img in enumerate(img_list):
        if img is not None:
            result[i] = emb[j]
            j += 1
    return result

def build_embs_memmap(df, save_path, batch_size=32):
    num_images = len(df)
    emb_shape = (num_images, 512)
    urls = df['image_link'].fillna('').tolist()

    # Check if the memmap file exists and get the number of completed embeddings
    if os.path.exists(save_path):
        try:
            embs_memmap = np.memmap(save_path, dtype=np.float32, mode='r+', shape=emb_shape)
            completed_batches = int(embs_memmap.shape[0] / batch_size)
            print(f"✅ Found existing memory-mapped file. Resuming from batch {completed_batches}.")
        except Exception as e:
            print(f"Error loading existing memmap file: {e}. Starting fresh.")
            if os.path.exists(save_path):
                os.remove(save_path)
            embs_memmap = np.memmap(save_path, dtype=np.float32, mode='w+', shape=emb_shape)
            completed_batches = 0
    else:
        embs_memmap = np.memmap(save_path, dtype=np.float32, mode='w+', shape=emb_shape)
        completed_batches = 0


    for start in tqdm(range(completed_batches * batch_size, num_images, batch_size)):
        end = min(start + batch_size, num_images)
        batch_urls = urls[start:end]
        img_batch = fetch_images_parallel(batch_urls)
        batch_embs = get_clip_emb_batch(img_batch)
        embs_memmap[start:end] = batch_embs

    # Ensure all data is written to disk
    embs_memmap.flush()
    print("✅ Saved memory-mapped embeddings to:", save_path)
    return embs_memmap

# Build embeddings for train and test data using memmap
train_emb_path = os.path.join(ART_DIR, "train_clip_embs_memmap.npy")
test_emb_path = os.path.join(ART_DIR, "test_clip_embs_memmap.npy")

# Remove existing memmap files if they exist before creating new ones
# Keep this section to allow starting fresh if needed, but the build_embs_memmap
# function now handles resuming if the file exists.
# if os.path.exists(train_emb_path):
#     os.remove(train_emb_path)
# if os.path.exists(test_emb_path):
#     os.remove(test_emb_path)


train_img_emb = build_embs_memmap(df_train, train_emb_path)
test_img_emb  = build_embs_memmap(df_test,  test_emb_path)

# You can now access train_img_emb and test_img_emb as numpy arrays,
# but the data will be loaded from the memory-mapped file as needed.
# For example: train_img_emb[:10] to access the first 10 embeddings.

# Note: When you are done with the memmap arrays, it's good practice to delete them
# or close the file if you opened it manually. In this case, the memmap object
# handles the file.
# del train_img_emb
# del test_img_emb
# os.remove(train_emb_path) # To delete the file later
# os.remove(test_emb_path) # To delete the file later

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

✅ Found existing memory-mapped file. Resuming from batch 2343.



  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:12<00:00, 12.95s/it]


✅ Saved memory-mapped embeddings to: /content/drive/MyDrive/Colab Notebooks/amazon_ml_challenge/artifacts/train_clip_embs_memmap.npy
✅ Found existing memory-mapped file. Resuming from batch 2343.


100%|██████████| 1/1 [00:12<00:00, 12.97s/it]

✅ Saved memory-mapped embeddings to: /content/drive/MyDrive/Colab Notebooks/amazon_ml_challenge/artifacts/test_clip_embs_memmap.npy





In [None]:
from scipy import sparse
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
import joblib
import numpy as np

# ----------------------------
# 1️⃣ TF-IDF + SVD (sparse-friendly)
# ----------------------------
tf = TfidfVectorizer(ngram_range=(1,2), max_features=50000, min_df=3)
all_text = pd.concat([df_train['catalog_content'], df_test['catalog_content']], axis=0)
X_all_sparse = tf.fit_transform(all_text)

svd = TruncatedSVD(n_components=200, random_state=42)
X_all_svd = svd.fit_transform(X_all_sparse)

X_train_text = X_all_svd[:len(df_train)]
X_test_text  = X_all_svd[len(df_train):]

# ----------------------------
# 2️⃣ Numeric features
# ----------------------------
num_cols = ['word_count','char_count','has_image']
scaler = StandardScaler()
X_train_num = scaler.fit_transform(df_train[num_cols].fillna(0))
X_test_num  = scaler.transform(df_test[num_cols].fillna(0))

# ----------------------------
# 3️⃣ Combine text + numeric + image embeddings (all dense)
# ----------------------------
X_train = np.hstack([X_train_text, X_train_num, train_img_emb])
X_test  = np.hstack([X_test_text,  X_test_num,  test_img_emb])

y = df_train['price'].values
y_log = np.log1p(y)  # log-transform improves regression stability

# ----------------------------
# 4️⃣ Stratified bins for CV
# ----------------------------
df_train['_bin'] = pd.qcut(df_train['price'].rank(method='first'), q=10, labels=False)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

oof_log = np.zeros(len(df_train), dtype=np.float32)
test_preds_log = np.zeros((len(df_test), 5), dtype=np.float32)

# ----------------------------
# 5️⃣ LightGBM parameters
# ----------------------------
params = dict(
    n_estimators=2000,
    learning_rate=0.03,
    num_leaves=128,
    subsample=0.8,
    colsample_bytree=0.6,
    random_state=42,
    n_jobs=-1
)

# ----------------------------
# 6️⃣ Training loop with early stopping & SMAPE-friendly tweaks
# ----------------------------
for f, (tr_idx, va_idx) in enumerate(skf.split(X_train, df_train['_bin'])):
    print(f"Fold {f+1}/5")
    model = lgb.LGBMRegressor(**params)
    model.fit(
        X_train[tr_idx], y_log[tr_idx],
        eval_set=[(X_train[va_idx], y_log[va_idx])],
        eval_metric='rmse',
        callbacks=[lgb.early_stopping(stopping_rounds=200, verbose=100)]
    )
    oof_log[va_idx] = model.predict(X_train[va_idx])
    test_preds_log[:, f] = model.predict(X_test)

    # Save model fold
    joblib.dump(model, os.path.join(ART_DIR, f"lgb_fold{f}.pkl"))

# ----------------------------
# 7️⃣ Convert log predictions back & evaluate SMAPE
# ----------------------------
oof_price = np.expm1(oof_log)
print("✅ OOF SMAPE =", smape(df_train['price'], oof_price))

# ----------------------------
# 8️⃣ Save preprocessing objects
# ----------------------------
joblib.dump(tf, os.path.join(ART_DIR, "tfidf2.pkl"))
joblib.dump(svd, os.path.join(ART_DIR, "svd2.pkl"))
joblib.dump(scaler, os.path.join(ART_DIR, "scaler2.pkl"))

# --- End of changes to incorporate new features ---


y = df_train['price'].values
y_log = np.log1p(y)

df_train['_bin'] = pd.qcut(df_train['price'].rank(method='first'), q=10, labels=False)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof = np.zeros(len(df_train))
test_preds_log = np.zeros((len(df_test), 5)) # Corrected variable name

# Tuned parameters (example - these may need further optimization)
params = dict(
    n_estimators=1000,  # Reduced estimators
    learning_rate=0.05, # Adjusted learning rate
    num_leaves=128,     # Reduced number of leaves
    subsample=0.8,      # Adjusted subsample
    colsample_bytree=0.8, # Adjusted colsample_bytree
    random_state=42,
    n_jobs=-1,
    reg_alpha=0.1,      # Added L1 regularization
    reg_lambda=0.1      # Added L2 regularization
)


for f, (tr, va) in enumerate(skf.split(X_train, df_train['_bin'])):
    print(f"Fold {f+1}/5")
    model = LGBMRegressor(**params)
    model.fit(X_train[tr], y_log[tr],
              eval_set=[(X_train[va], y_log[va])],
              eval_metric='rmse',
              callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=50)]) # Adjusted early stopping
    oof[va] = model.predict(X_train[va])
    test_preds_log[:, f] = model.predict(X_test) # Corrected variable name
    joblib.dump(model, os.path.join(ART_DIR, f"lgb_fold{f}.pkl"))

oof_price = np.expm1(oof)
print("OOF SMAPE = ", smape(df_train['price'], oof_price))
joblib.dump(tf, os.path.join(ART_DIR, "tfidf2.pkl"))
joblib.dump(svd, os.path.join(ART_DIR, "svd2.pkl"))
joblib.dump(scaler, os.path.join(ART_DIR, "scaler2.pkl"))
# Optionally save the new feature transformers as well
if categorical_cols:
    joblib.dump(ohe, os.path.join(ART_DIR, "onehotencoder.pkl"))
if numerical_new_cols:
    joblib.dump(scaler_new_num, os.path.join(ART_DIR, "scaler_new_num.pkl"))

Fold 1/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.126030 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 182070
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 714
[LightGBM] [Info] Start training from score 2.738630
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[1998]	valid_0's rmse: 0.697365	valid_0's l2: 0.486317




Fold 2/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.479187 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 182070
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 714
[LightGBM] [Info] Start training from score 2.739837
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[1998]	valid_0's rmse: 0.688358	valid_0's l2: 0.473836




Fold 3/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.145563 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 182070
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 714
[LightGBM] [Info] Start training from score 2.738748
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[2000]	valid_0's rmse: 0.688587	valid_0's l2: 0.474152




Fold 4/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.263179 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 182070
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 714
[LightGBM] [Info] Start training from score 2.739459
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[1996]	valid_0's rmse: 0.695044	valid_0's l2: 0.483086




Fold 5/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.178424 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 182070
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 714
[LightGBM] [Info] Start training from score 2.739412
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[2000]	valid_0's rmse: 0.688877	valid_0's l2: 0.474551




✅ OOF SMAPE = 53.57626
Fold 1/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.207072 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 182070
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 714
[LightGBM] [Info] Start training from score 2.738630
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 0.701347	valid_0's l2: 0.491888




Fold 2/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.209868 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 182070
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 714
[LightGBM] [Info] Start training from score 2.739837
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 0.696639	valid_0's l2: 0.485306




Fold 3/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.413566 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 182070
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 714
[LightGBM] [Info] Start training from score 2.738748
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 0.694557	valid_0's l2: 0.482409




Fold 4/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.202984 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 182070
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 714
[LightGBM] [Info] Start training from score 2.739459
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[991]	valid_0's rmse: 0.699672	valid_0's l2: 0.489541




Fold 5/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.185645 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 182070
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 714
[LightGBM] [Info] Start training from score 2.739412
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 0.696858	valid_0's l2: 0.485611




OOF SMAPE =  54.04146


NameError: name 'categorical_cols' is not defined

In [None]:
# Cell 5: Load train and test, add quick features
df_train = pd.read_csv(TRAIN_PATH)
df_test  = pd.read_csv(TEST_PATH)

print("Train rows:", len(df_train), "Test rows:", len(df_test))
display(df_train.head(3))

# basic text features
for df in (df_train, df_test):
    df['catalog_content'] = df['catalog_content'].fillna('').astype(str)
    df['word_count'] = df['catalog_content'].apply(lambda x: len(x.split()))
    df['char_count'] = df['catalog_content'].apply(lambda x: len(x))
    df['has_image'] = df['image_link'].notnull().astype(int)
    # quick ipq extraction from raw text (may return tuple for weight)
    df['ipq_raw'] = df['catalog_content'].apply(extract_ipq)

# preview distribution
print("Price: min, mean, median, max:", df_train['price'].min(), df_train['price'].mean(), df_train['price'].median(), df_train['price'].max())

Train rows: 75000 Test rows: 75000


Unnamed: 0,sample_id,catalog_content,image_link,price
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89
1,198967,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.97


Price: min, mean, median, max: 0.13 23.647654 14.0 2796.0


In [None]:
# Cell 7: Generate optimized submission file

# 1️⃣ Aggregate predictions across folds using median (more robust to outliers)
preds_log = np.median(test_preds_log, axis=1)

# 2️⃣ Convert from log scale back to price
preds = np.expm1(preds_log)

# 3️⃣ Clip to minimum reasonable price
preds = np.maximum(preds, 0.01)

# 4️⃣ Prepare submission dataframe
out_df = pd.DataFrame({
    'sample_id': df_test['sample_id'],
    'price': np.round(preds, 3)
})

# 5️⃣ Save CSV
out_df.to_csv(OUT_PATH, index=False)
print(f"✅ Submission saved to: {OUT_PATH}")

# 6️⃣ Display preview
display(out_df.head())


✅ Submission saved to: /content/drive/MyDrive/Colab Notebooks/amazon_ml_challenge/test_out.csv


Unnamed: 0,sample_id,price
0,100179.0,20.493
1,245611.0,20.16
2,146263.0,26.451
3,95658.0,10.286
4,36806.0,19.292


# Task
Improve the SMAPE score of the price prediction model on the test data from 59 to around 37 by implementing advanced feature engineering, data cleaning, model selection, and hyperparameter tuning techniques.

## Feature engineering

### Subtask:
Extract more relevant features from the `catalog_content` and `image_link` columns. This could include:
    - More detailed parsing of `catalog_content` to extract specific item attributes (e.g., brand, size, color, material).
    - Using more advanced text embedding techniques (e.g., Sentence-BERT, TF-IDF with more features or different n-gram ranges) for `catalog_content`.
    - Exploring image features beyond simple CLIP embeddings, potentially using other pre-trained models or fine-tuning CLIP.
    - Creating interaction features between different types of features (e.g., text features and numerical features).
    - Investigating the `ipq_raw` column and how to best incorporate it into the model, potentially separating count and weight/volume information.


**Reasoning**:
Apply the `parse_catalog_content` function to the `catalog_content` column and extract item attributes and process the `ipq_raw` column as per instructions 1, 2, 3, and 4.



**Reasoning**:
The previous command failed because the dataframes `df_train` and `df_test` were not defined in the current session. I need to reload the dataframes first. Then I will re-execute the feature extraction code.



**Reasoning**:
The previous command failed because the `pandas` library was not imported. I need to import `pandas` and then re-execute the code to load data, extract basic features, parse catalog content, extract specific attributes, and process `ipq_raw`.



In [1]:
%cd /content/drive/MyDrive/Colab Notebooks/amazon_ml_challenge

/content/drive/MyDrive/Colab Notebooks/amazon_ml_challenge


In [2]:
!git init

Reinitialized existing Git repository in /content/drive/MyDrive/Colab Notebooks/amazon_ml_challenge/.git/


In [7]:
!git add .
!git commit -m "Upload my amazon project"

[main 897149b] Upload my amazon project


In [5]:
!git config --global user.email "piyushchauhan200417@gmail.com"

In [6]:
!git config --global user.name "ThakurNishant2004"

In [None]:
!git branch

* [32mmaster[m


In [None]:
!git branch -M main

In [8]:
!git branch

* [32mmain[m


In [11]:
!git push -u origin main

fatal: cannot exec '.git/hooks/pre-push': Permission denied
^C


In [3]:
!git remote add origin https://ThakurNishant2004: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@github.com/ThakurNishant2004/Amazon_ml_problem_smape_score.git

error: remote origin already exists.


In [13]:
!git push -u origin main

Enumerating objects: 53, done.
Counting objects:   1% (1/53)Counting objects:   3% (2/53)Counting objects:   5% (3/53)Counting objects:   7% (4/53)Counting objects:   9% (5/53)Counting objects:  11% (6/53)Counting objects:  13% (7/53)Counting objects:  15% (8/53)Counting objects:  16% (9/53)Counting objects:  18% (10/53)Counting objects:  20% (11/53)Counting objects:  22% (12/53)Counting objects:  24% (13/53)Counting objects:  26% (14/53)Counting objects:  28% (15/53)Counting objects:  30% (16/53)Counting objects:  32% (17/53)Counting objects:  33% (18/53)Counting objects:  35% (19/53)Counting objects:  37% (20/53)Counting objects:  39% (21/53)Counting objects:  41% (22/53)Counting objects:  43% (23/53)Counting objects:  45% (24/53)Counting objects:  47% (25/53)Counting objects:  49% (26/53)Counting objects:  50% (27/53)Counting objects:  52% (28/53)Counting objects:  54% (29/53)Counting objects:  56% (30/53)Counting objects:  58% (31/53)Counting objects:

In [None]:
!git remote -v


origin	https://ThakurNishant2004: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@github.com/ThakurNishant2004/https://github.com/ThakurNishant2004/Amazon_ml_problem_smape_score.git (fetch)
origin	https://ThakurNishant2004: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@github.com/ThakurNishant2004/https://github.com/ThakurNishant2004/Amazon_ml_problem_smape_score.git (push)


In [None]:
!git remote remove origin


In [None]:
!git pull origin main --allow-unrelated-histories --no-rebase


From https://github.com/ThakurNishant2004/Amazon_ml_problem_smape_score
 * branch            main       -> FETCH_HEAD
<books/amazon_ml_challenge/.git/MERGE_MSG" 6L, 316B[2;1H▽[6n[2;1H  [3;1HPzz\[0%m[6n[3;1H           [1;1H[>c]10;?]11;?[1;1H[33mMerge branch 'main' of https://github.com/ThakurNi[mshant2004/Amazon_ml_problem_smm[2;1Hape_score[2;10H[K[3;1H[34m# Please enter a commit message to explain why this merge is necessary,[m[3;72H[K[4;1H[34m# especially if it merges an updated upstream into a topic branch.
#
# Lines starting with '#' will be ignored, and an empty message aborts
# the commit.[m
[1m[34m~                                                                               [9;1H~                                                                               [10;1H~                                                                               [11;1H~                                                                               [12;1H~          

In [10]:
!apt-get install git-lfs
!git lfs install
!git lfs track "*.npy"
!git lfs track "*.pkl"
!git add .gitattributes
!git add artifacts/*.npy artifacts/*.pkl
!git commit -m "Add large files via Git LFS"

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.3).
0 upgraded, 0 newly installed, 0 to remove and 38 not upgraded.
Updated git hooks.
Git LFS initialized.
Tracking "*.npy"
Tracking "*.pkl"
fatal: cannot exec '.git/hooks/post-commit': Permission denied
[main 32d75a7] Add large files via Git LFS
 12 files changed, 2 insertions(+)
 create mode 100644 .gitattributes
 rewrite artifacts/lgb_fold0.pkl (99%)
 rewrite artifacts/lgb_fold1.pkl (99%)
 rewrite artifacts/lgb_fold2.pkl (99%)
 rewrite artifacts/lgb_fold3.pkl (99%)
 rewrite artifacts/lgb_fold4.pkl (99%)
 rewrite artifacts/scaler2.pkl (100%)
 rewrite artifacts/svd2.pkl (99%)
 rewrite artifacts/test_clip_embs_memmap.npy (99%)
 rewrite artifacts/tfidf2.pkl (100%)
 rewrite artifacts/train_clip_embs.npy (99%)
 rewrite artifacts/train_clip_embs_memmap.npy (99%)


In [12]:
!rm .git/hooks/pre-push

In [14]:
!chmod +x .git/hooks/pre-push

chmod: cannot access '.git/hooks/pre-push': No such file or directory
