In [None]:
!pip install -q sentence-transformers hnswlib timm torchvision lightgbm xgboost tqdm

import os
import re
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from PIL import Image
import torch
import torchvision.transforms as T
import timm
import hnswlib
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import lightgbm as lgb
import joblib

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for hnswlib (pyproject.toml) ... [?25l[?25hdone


In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print("Using device:", device)

Using device: cuda


In [None]:
# Colab-only step: mount Google Drive at /content/drive so that the
# /content/drive/MyDrive/... paths used by this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Input locations on the mounted Drive, plus the folder used to cache
# per-image embeddings between runs.
train_path = '/content/drive/MyDrive/DataSets/train.csv'
test_path = '/content/drive/MyDrive/DataSets/test.csv'
image_folder = '/content/drive/MyDrive/DataSets/images/'
cache_folder = '/content/drive/MyDrive/DataSets/image_embeddings/'
os.makedirs(cache_folder, exist_ok=True)

# Both CSVs are parsed with the same options.
# NOTE(review): on_bad_lines='skip' silently drops malformed rows; for the
# test file that would mean rows without a prediction, so check the shapes.
csv_options = dict(sep=',', quotechar='"', engine='python', on_bad_lines='skip')
train = pd.read_csv(train_path, **csv_options)
test = pd.read_csv(test_path, **csv_options)

for label, frame in (("Train", train), ("Test", test)):
    print(f"{label} shape:", frame.shape)

Train shape: (75000, 4)
Test shape: (75000, 3)


In [None]:
def clean_text(text):
    """Normalise a catalog string for text embedding.

    Missing values (as reported by ``pd.isna``) become the empty string.
    Otherwise the text is lower-cased, every run of whitespace (newlines
    included) is collapsed to one space, and the ends are trimmed.
    """
    if pd.isna(text):
        return ""
    single_line = text.lower().replace('\n',' ')
    collapsed = re.sub(r'\s+', ' ', single_line)
    return collapsed.strip()

# Add the normalised catalog text to both frames.
for frame in (train, test):
    frame['clean_text'] = frame['catalog_content'].apply(clean_text)

In [None]:
# Extract numeric value and unit
def extract_value_unit(text):
    """Pull the numeric "Value" and the "Unit" token out of a catalog string.

    Both lookups are case-insensitive. The value is the run of digits/dots
    that follows ``Value`` and at least one ``:`` or space; the unit is the
    first run of ASCII letters that follows ``Unit`` and at least one ``:``
    or space.

    Parameters
    ----------
    text : str or missing
        Raw catalog text. Anything that is not a ``str`` (``None``, ``NaN``,
        numbers, ...) is treated as missing instead of raising.

    Returns
    -------
    tuple
        ``(value, unit)``. ``value`` is a ``float``, or ``None`` when absent
        or not parseable (e.g. ``"1.2.3"``). ``unit`` is a lower-cased
        ``str``, or ``None`` when absent.
    """
    # re.search raises TypeError on None/NaN, so treat non-strings as missing.
    if not isinstance(text, str):
        return None, None

    value, unit = None, None
    value_match = re.search(r'Value[: ]+\s*([\d\.]+)', text, re.IGNORECASE)
    if value_match:
        try:
            value = float(value_match.group(1))
        except ValueError:
            # The pattern also admits strings such as "." or "1.2.3".
            value = None
    unit_match = re.search(r'Unit[: ]+\s*([a-zA-Z]+)', text, re.IGNORECASE)
    if unit_match:
        unit = unit_match.group(1).lower()
    return value, unit

# Expand the (value, unit) tuple into two new columns on both frames.
for frame in (train, test):
    frame[['value_num', 'unit']] = frame['catalog_content'].apply(
        lambda content: pd.Series(extract_value_unit(content))
    )

In [None]:
# Extract brand
def extract_brand(text):
    """Return the text before the first comma, stripped and lower-cased.

    Returns ``None`` for missing input and when the string is empty or
    starts with a comma (i.e. there is nothing before the first comma).
    """
    if pd.isna(text):
        return None
    leading = re.match(r'([^,]+)', text)
    return leading.group(1).strip().lower() if leading else None

# Derive the brand column on both frames from the raw catalog text.
for frame in (train, test):
    frame['brand'] = frame['catalog_content'].apply(extract_brand)

In [None]:
#Categorical features
# Keep the TOP_K_BRANDS most frequent training brands; every other value
# (missing brands included) is folded into the single bucket 'other'.
TOP_K_BRANDS = 50
top_brands = train['brand'].value_counts().nlargest(TOP_K_BRANDS).index.tolist()
train_top_brands = train['brand'].where(train['brand'].isin(top_brands), 'other')
test_top_brands = test['brand'].where(test['brand'].isin(top_brands), 'other')

# One-hot encode, then force the test matrix onto the training columns:
# columns absent from test are added as zeros, extra ones are dropped.
onehot_options = dict(prefix='brand', dummy_na=True)
train_brands_onehot = pd.get_dummies(train_top_brands, **onehot_options)
test_brands_onehot = pd.get_dummies(test_top_brands, **onehot_options).reindex(
    columns=train_brands_onehot.columns, fill_value=0
)

In [None]:
# Target mean encoding of brand, blended with the global mean price using a
# fixed weight alpha (the weight does not depend on how rare a brand is).
#
# Train rows are encoded OUT-OF-FOLD: a row's brand mean is computed only from
# rows in the other folds, so a row's own price never enters its own feature.
# Encoding train rows with the full-train mean would put the target inside the
# feature (almost verbatim for brands that occur once). It would also inflate
# any validation score computed on rows split off from `train`.
# Test rows use brand means from the whole training set.
brand_target_mean = train.groupby('brand')['price'].mean().to_dict()
global_mean = train['price'].mean()
alpha = 0.7  # weight of the brand mean; (1 - alpha) goes to the global mean

N_BRAND_TE_FOLDS = 5
# Deterministic, roughly equal-sized random fold assignment for every train row.
brand_fold_of_row = np.random.RandomState(42).permutation(len(train)) % N_BRAND_TE_FOLDS
# Rows whose brand is missing or unseen in the other folds keep the global mean.
brand_oof_mean = np.full(len(train), global_mean, dtype=np.float64)
for fold in range(N_BRAND_TE_FOLDS):
    held_out = brand_fold_of_row == fold
    fold_mean = train.loc[~held_out].groupby('brand')['price'].mean()
    brand_oof_mean[held_out] = (
        train.loc[held_out, 'brand'].map(fold_mean).fillna(global_mean).to_numpy()
    )

train['brand_target'] = alpha*brand_oof_mean + (1-alpha)*global_mean
test['brand_target']  = test['brand'].apply(lambda x: alpha*brand_target_mean.get(x, global_mean) + (1-alpha)*global_mean)

In [None]:
# Units
# Keep the TOP_K_UNITS most frequent training units; everything else
# (missing units included) is folded into the bucket 'other'.
TOP_K_UNITS = 20
top_units = train['unit'].value_counts().nlargest(TOP_K_UNITS).index.tolist()
train_top_units = train['unit'].where(train['unit'].isin(top_units), 'other')
test_top_units = test['unit'].where(test['unit'].isin(top_units), 'other')

# One-hot encode and align the test columns to the training columns.
unit_onehot_options = dict(prefix='unit', dummy_na=True)
train_units_onehot = pd.get_dummies(train_top_units, **unit_onehot_options)
test_units_onehot = pd.get_dummies(test_top_units, **unit_onehot_options).reindex(
    columns=train_units_onehot.columns, fill_value=0
)

# Mean price per unit, blended with the global mean using the fixed weight
# alpha. Units without a training mean fall back to the global mean.
# NOTE(review): the per-unit mean is computed on all of `train`, so each
# train row's feature includes that row's own price.
unit_target_mean = train.groupby('unit')['price'].mean().to_dict()


def _blended_unit_mean(unit):
    """Blend the training mean price of ``unit`` with the global mean."""
    return alpha*unit_target_mean.get(unit, global_mean) + (1-alpha)*global_mean


train['unit_target'] = train['unit'].apply(_blended_unit_mean)
test['unit_target'] = test['unit'].apply(_blended_unit_mean)

In [None]:
# Structured arrays
# Categorical block, in column order:
# brand one-hot | brand target mean | unit one-hot | unit target mean.
# np.column_stack turns the 1-D target-mean vectors into single columns.
X_train_cat = np.column_stack([
    train_brands_onehot.values,
    train['brand_target'].values,
    train_units_onehot.values,
    train['unit_target'].values,
]).astype(np.float32)
X_test_cat = np.column_stack([
    test_brands_onehot.values,
    test['brand_target'].values,
    test_units_onehot.values,
    test['unit_target'].values,
]).astype(np.float32)

# Numeric block: the parsed "Value" field, with missing values set to 0.
numeric_columns = ['value_num']
X_train_num = train[numeric_columns].fillna(0).astype(np.float32).values
X_test_num = test[numeric_columns].fillna(0).astype(np.float32).values

In [None]:
# Sentence-embedding models whose outputs are concatenated feature-wise.
model_names = [
    'sentence-transformers/paraphrase-MiniLM-L6-v2',
    'sentence-transformers/all-MiniLM-L6-v2',
    'sentence-transformers/distiluse-base-multilingual-cased-v2'
]
X_train_text_list, X_test_text_list = [], []

# Build the sentence lists once; every model encodes the same inputs.
train_sentences = train['clean_text'].tolist()
test_sentences = test['clean_text'].tolist()
encode_options = dict(convert_to_tensor=False, batch_size=64, show_progress_bar=True)

for mname in model_names:
    print(f"Encoding with {mname}...")
    model = SentenceTransformer(mname, device=device)
    X_train_text_list.append(model.encode(train_sentences, **encode_options))
    X_test_text_list.append(model.encode(test_sentences, **encode_options))

# Place the per-model embeddings side by side: one row per sample.
X_train_text = np.concatenate(X_train_text_list, axis=1)
X_test_text  = np.concatenate(X_test_text_list, axis=1)
print("Text embeddings shape:", X_train_text.shape)

Encoding with sentence-transformers/paraphrase-MiniLM-L6-v2...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1172 [00:00<?, ?it/s]

Batches:   0%|          | 0/1172 [00:00<?, ?it/s]

Encoding with sentence-transformers/all-MiniLM-L6-v2...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1172 [00:00<?, ?it/s]

Batches:   0%|          | 0/1172 [00:00<?, ?it/s]

Encoding with sentence-transformers/distiluse-base-multilingual-cased-v2...


modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/610 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/539M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

2_Dense/model.safetensors:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

Batches:   0%|          | 0/1172 [00:00<?, ?it/s]

Batches:   0%|          | 0/1172 [00:00<?, ?it/s]

Text embeddings shape: (75000, 1280)


In [None]:
def generate_ann_features(train_emb, test_emb, k=10, prices=None):
    """Build "prices of the k nearest training neighbours" features.

    An HNSW index (cosine space) is built over ``train_emb``. For every train
    and test row the prices of its ``k`` approximate nearest training
    neighbours are returned, nearest first.

    Train rows are queried against an index that contains themselves, so the
    closest hit is normally the row itself and its "neighbour price" would be
    the row's own target. To avoid that leak, ``k + 1`` neighbours are
    requested for train rows and the self hit is removed. If the approximate
    search did not return the row itself, the farthest hit is dropped instead.

    Parameters
    ----------
    train_emb : np.ndarray, shape (n_train, dim)
        Embeddings used to build the index.
    test_emb : np.ndarray, shape (n_test, dim)
        Embeddings to look up against the training index.
    k : int, default 10
        Number of neighbour prices per row. The train query asks for
        ``k + 1`` hits, so ``n_train`` must be greater than ``k``.
    prices : array-like of length n_train, optional
        Target aligned with the rows of ``train_emb``. Defaults to the
        notebook-level ``train['price']`` for backward compatibility.

    Returns
    -------
    (ann_train, ann_test) : tuple of np.ndarray
        Arrays of shape ``(n_train, k)`` and ``(n_test, k)``.
    """
    if prices is None:
        prices = train['price'].values
    prices = np.asarray(prices)
    n_train, dim = train_emb.shape

    index = hnswlib.Index(space='cosine', dim=dim)
    index.init_index(max_elements=n_train, ef_construction=200, M=50)
    # Explicit ids make label i refer to row i of train_emb / prices.
    index.add_items(train_emb, np.arange(n_train))
    # The query-time ef must not be smaller than the number of hits requested.
    index.set_ef(max(50, k + 1))

    # Train features: ask for one extra neighbour and drop the self match.
    labels_train, _ = index.knn_query(train_emb, k=k + 1)
    labels_train = labels_train.astype(np.int64)
    is_self = labels_train == np.arange(n_train)[:, None]
    # Rows where the search did not return the row itself: drop the last hit.
    self_not_found = ~is_self.any(axis=1)
    is_self[self_not_found, -1] = True
    # Exactly one entry per row is masked out, so k labels remain per row.
    neighbours_train = labels_train[~is_self].reshape(n_train, k)
    ann_train = prices[neighbours_train]

    # Test features: test rows are not in the index, so plain k-NN is fine.
    labels_test, _ = index.knn_query(test_emb, k=k)
    ann_test = prices[labels_test.astype(np.int64)]
    return ann_train, ann_test

# Neighbour-price features computed on the concatenated text embeddings.
ann_feature_pair = generate_ann_features(X_train_text, X_test_text, k=10)
ANN_train, ANN_test = ann_feature_pair
print("ANN features shape:", ANN_train.shape)

ANN features shape: (75000, 10)


In [None]:
# Per-channel normalisation constants (these match the statistics commonly
# used with ImageNet-pretrained backbones).
_NORM_MEAN = [0.485,0.456,0.406]
_NORM_STD = [0.229,0.224,0.225]

# Image preprocessing: resize to 224x224, convert to a tensor, normalise.
_preprocess_steps = [
    T.Resize((224,224)),
    T.ToTensor(),
    T.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
]
transform = T.Compose(_preprocess_steps)

In [None]:
def batch_image_embeddings(image_names, backbones=('resnet50',), batch_size=32):
    """Embed images with one or more timm backbones, with an on-disk cache.

    For each backbone, every image is mapped to a pooled feature vector
    (``num_classes=0, global_pool='avg'``). Vectors are cached in
    ``cache_folder`` as ``"<backbone>_<file stem>.npy"`` and reused on later
    calls. Images that cannot be opened or preprocessed get an all-zero vector,
    which is not cached.

    NOTE: cache entries written by an earlier revision of this function may be
    stored under the wrong file name (paths were paired with embeddings by
    batch position). Clear ``cache_folder`` if it was filled by that revision.

    Parameters
    ----------
    image_names : sequence of str
        Image references; each is joined onto ``image_folder`` to load the
        file, and its base name (without extension) keys the cache.
    backbones : iterable of str, default ``('resnet50',)``
        timm model names. Their embeddings are concatenated column-wise in
        the given order.
    batch_size : int, default 32
        Number of images examined per step; only uncached, readable images
        of a step are sent through the model.

    Returns
    -------
    np.ndarray
        Shape ``(len(image_names), sum of backbone feature sizes)``.
    """
    all_embeddings = []
    for backbone_name in backbones:
        print(f"Using backbone: {backbone_name}")
        model = timm.create_model(backbone_name, pretrained=True, num_classes=0, global_pool='avg').to(device)
        model.eval()
        emb_list = []
        for start in tqdm(range(0, len(image_names), batch_size)):
            batch_files = image_names[start:start+batch_size]
            # `pending` pairs each image tensor in `imgs` with the slot it
            # fills in emb_list and the cache file it must be saved to. Both
            # are recorded when the image is loaded, so cached or unreadable
            # neighbours in the same batch cannot shift the pairing.
            imgs, pending = [], []
            for img_name in batch_files:
                safe_name = os.path.splitext(os.path.basename(img_name))[0]
                cache_path = os.path.join(cache_folder, f"{backbone_name}_{safe_name}.npy")
                if os.path.exists(cache_path):
                    emb_list.append(np.load(cache_path))
                    continue
                # NOTE(review): if image_names holds URLs rather than file
                # names relative to image_folder, this join will not resolve
                # and every image falls into the zero-vector branch - confirm.
                img_path = os.path.join(image_folder, img_name)
                try:
                    with Image.open(img_path) as img:
                        tensor = transform(img.convert("RGB"))
                except Exception:
                    # Best effort: a missing or corrupt image becomes a zero
                    # vector. float32 matches the model output dtype so one
                    # bad image does not upcast the whole matrix.
                    emb_list.append(np.zeros(model.num_features, dtype=np.float32))
                    continue
                imgs.append(tensor)
                pending.append((len(emb_list), cache_path))
                emb_list.append(None)  # placeholder, filled after inference
            if imgs:
                batch = torch.stack(imgs).to(device)
                with torch.no_grad():
                    batch_emb = model(batch).cpu().numpy()
                for (slot, cache_path), emb in zip(pending, batch_emb):
                    emb_list[slot] = emb
                    np.save(cache_path, emb)
        if emb_list:
            all_embeddings.append(np.vstack(emb_list))
        else:
            # No images at all: keep a correctly shaped, empty block.
            all_embeddings.append(np.zeros((0, model.num_features), dtype=np.float32))
    return np.hstack(all_embeddings)

# Embed train and test images with the same backbones, in the same order,
# so the resulting feature columns line up.
image_backbones = ['resnet50','efficientnet_b0']
X_train_img = batch_image_embeddings(
    train['image_link'].tolist(), backbones=image_backbones, batch_size=32
)
X_test_img = batch_image_embeddings(
    test['image_link'].tolist(), backbones=image_backbones, batch_size=32
)
print("Image embeddings shape:", X_train_img.shape)

Using backbone: resnet50


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

  0%|          | 0/2344 [00:00<?, ?it/s]

Using backbone: efficientnet_b0


model.safetensors:   0%|          | 0.00/21.4M [00:00<?, ?B/s]

  0%|          | 0/2344 [00:00<?, ?it/s]

Using backbone: resnet50


  0%|          | 0/2344 [00:00<?, ?it/s]

Using backbone: efficientnet_b0


  0%|          | 0/2344 [00:00<?, ?it/s]

Image embeddings shape: (75000, 3328)


In [None]:
# Final design matrices. Column order, identical for train and test:
# text embeddings | neighbour prices | numeric | categorical | image embeddings.
train_feature_blocks = [X_train_text, ANN_train, X_train_num, X_train_cat, X_train_img]
test_feature_blocks = [X_test_text, ANN_test, X_test_num, X_test_cat, X_test_img]
X_train_full = np.concatenate(train_feature_blocks, axis=1)
X_test_full  = np.concatenate(test_feature_blocks, axis=1)

# The models are fit on log1p(price); predictions are mapped back with expm1.
y_train = np.log1p(train['price'].values)
print("Full feature matrix shape:", X_train_full.shape)

In [None]:
# Train-validation split
# Hold out 20% of the training rows for early stopping and validation;
# the fixed seed keeps the split reproducible.
split_parts = train_test_split(
    X_train_full,
    y_train,
    test_size=0.2,
    random_state=42,
)
X_tr, X_val, y_tr, y_val = split_parts

In [None]:
# XGBoost
# Gradient-boosted trees on the log1p(price) target.
# Written for XGBoost >= 2.0, which the unpinned install pulls in:
#   * early_stopping_rounds is an estimator parameter; passing it to fit()
#     is no longer accepted;
#   * tree_method='gpu_hist' is replaced by tree_method='hist' plus a
#     `device` parameter. Using the notebook's `device` ('cuda' or 'cpu')
#     also lets this cell run when no GPU is available.
xgb_model = XGBRegressor(
    n_estimators=1000,
    max_depth=7,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method='hist',
    device=device,
    early_stopping_rounds=50,  # stop when the eval metric stalls for 50 rounds
    random_state=42
)
# The single eval_set entry is the validation split that drives early stopping.
xgb_model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=20)

In [None]:
# LightGBM
# LightGBM datasets; the validation set reuses the training set's binning.
lgb_train = lgb.Dataset(X_tr, y_tr)
lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)
lgb_params = {
    'objective':'regression',
    'metric':'rmse',
    'boosting_type':'gbdt',
    'learning_rate':0.05,
    'num_leaves': 256,
    'max_depth': 12,
    'feature_fraction':0.8,
    'bagging_fraction':0.8,
    'bagging_freq':1,
    # Follow the notebook-level device choice instead of always requiring a GPU.
    'device': 'gpu' if device == 'cuda' else 'cpu',
    'seed':42
}
# LightGBM >= 4.0 removed the early_stopping_rounds / verbose_eval keyword
# arguments of lgb.train(); the equivalent behaviour is provided by callbacks.
lgb_model = lgb.train(
    lgb_params,
    lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_train, lgb_val],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),  # stop after 50 rounds without improvement
        lgb.log_evaluation(period=50),           # print metrics every 50 rounds
    ],
)

In [None]:
# Ensemble Predictions
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Computes ``100 * mean(|y_pred - y_true| / ((|y_true| + |y_pred|) / 2))``.
    A pair where both values are zero has a zero denominator; it is counted
    as a perfect prediction (contribution 0) instead of producing NaN.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values with matching (or broadcastable) shapes.

    Returns
    -------
    float
        SMAPE in the range 0 to 200.
    """
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred, dtype=np.float64)
    abs_error = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    # Divide only where the denominator is non-zero; other entries stay at 0.
    ratio = np.divide(abs_error, denominator, out=np.zeros_like(abs_error), where=denominator != 0)
    return 100*np.mean(ratio)

# Map each model's log-space prediction back to a price, then average the
# two models with equal weight.
xgb_val_price = np.expm1(xgb_model.predict(X_val))
lgb_val_price = np.expm1(lgb_model.predict(X_val))
y_val_pred = 0.5*(xgb_val_price + lgb_val_price)

# Score against the validation targets, also mapped back to price space.
val_smape = smape(np.expm1(y_val), y_val_pred)
print(f"Validation SMAPE: {val_smape:.2f}%")

In [None]:
# Test-set prices: equal-weight average of the two models in price space.
xgb_test_price = np.expm1(xgb_model.predict(X_test_full))
lgb_test_price = np.expm1(lgb_model.predict(X_test_full))
y_test_pred = 0.5*(xgb_test_price + lgb_test_price)

submission = pd.DataFrame({'sample_id': test['sample_id'], 'price': y_test_pred})
submission.to_csv('/content/drive/MyDrive/DataSets/submission_winner_level.csv', index=False)
print("Submission saved!")

# Save models
saved_models = (
    (xgb_model, '/content/drive/MyDrive/DataSets/xgb_model_winner.pkl'),
    (lgb_model, '/content/drive/MyDrive/DataSets/lgb_model_winner.pkl'),
)
for fitted_model, model_path in saved_models:
    joblib.dump(fitted_model, model_path)
print("Models saved to Drive!")