In [None]:
# ===============================================================
#  Multimodal Price Prediction – Text + Image
#  Author: Swayam Singh Sikarwar (example)
#  Works on Google Colab (GPU recommended for image embeddings)
# ===============================================================

# --- 0. Setup
!pip install -q pandas numpy lightgbm sentence-transformers timm torchvision Pillow tqdm requests joblib

import os, requests, hashlib
from PIL import Image
from io import BytesIO
from tqdm import tqdm
import numpy as np
import pandas as pd
import torch, timm
import torchvision.transforms as T
from sentence_transformers import SentenceTransformer
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
import joblib

# ===============================================================
# 1.  Upload or load your data
# ===============================================================
# Option 1: Upload files
# from google.colab import files
# print("📂 Please upload train.csv and test.csv")
# uploaded = files.upload()   # upload train/test CSVs
# train = pd.read_csv('train.csv')
# test  = pd.read_csv('test.csv')

# Option 2: Load from file path (replace with your file paths)
train_path = '/content/train.csv'  # Replace with the actual path to your train.csv
test_path = '/content/test.csv'    # Replace with the actual path to your test.csv

# Parser settings shared by both files: the python engine with explicit
# quote handling copes with awkward quoting, and on_bad_lines='skip' drops
# rows that still cannot be parsed.
# NOTE(review): skipped rows vanish silently, so the loaded frames can be
# shorter than the files on disk -- compare the printed shapes with the
# expected row counts.
csv_options = dict(
    engine='python',
    quotechar='"',
    doublequote=True,
    on_bad_lines='skip',
)
train = pd.read_csv(train_path, **csv_options)
test = pd.read_csv(test_path, **csv_options)


print("‚úÖ Loaded:", train.shape, test.shape)
display(train.head())  # notebook-only helper; renders the frame as a table

# ===============================================================
# 2. Image download utility
# ===============================================================
def ensure_dir(path):
    """Create directory ``path`` (with any missing parents); do nothing if it exists."""
    os.makedirs(path, exist_ok=True)

def image_filename_from_url(url):
    """Derive a stable local file name for an image URL.

    The name is the MD5 hex digest of the full URL text followed by an
    extension read from the last segment of the URL path. The query string
    and fragment are removed before the extension is read, so a URL such as
    ``.../a.png?v=1.2`` yields ``.png`` rather than being misread because of
    the dot inside the query.

    Args:
        url: The image URL; it is converted with ``str()`` first.

    Returns:
        ``"<md5-hex>.<ext>"``. ``ext`` keeps the URL's own spelling when it is
        one of jpg/jpeg/png/webp/bmp (compared case-insensitively) and is
        ``"jpg"`` otherwise.
    """
    url_text = str(url)
    digest = hashlib.md5(url_text.encode('utf-8')).hexdigest()

    # Only the path can carry the file extension; drop '#fragment' and
    # '?query' before looking for it.
    path_part = url_text.split('#', 1)[0].split('?', 1)[0]
    last_segment = path_part.rsplit('/', 1)[-1]
    # rpartition returns the whole segment when it has no dot; such a value
    # is not a known extension and falls through to the default below.
    ext = last_segment.rpartition('.')[2]
    if ext.lower() not in ('jpg', 'jpeg', 'png', 'webp', 'bmp'):
        ext = 'jpg'
    return f"{digest}.{ext}"

def download_image(url, out_dir='images'):
    """Download ``url`` into ``out_dir`` and return the local file path.

    The file name comes from ``image_filename_from_url``. A file that already
    exists under that name is reused without touching the network. The image
    is decoded, converted to RGB and re-encoded as JPEG (quality 85) whatever
    extension the file name carries.

    The image is written to a ``.part`` file first and then moved into place
    with ``os.replace``. An interrupted or failed save therefore never leaves
    a truncated file at the final path that a later run would mistake for a
    valid cached image.

    Args:
        url: Image URL. Anything that is not a non-empty ``str`` is rejected.
        out_dir: Directory for downloaded images; created when needed.

    Returns:
        The local path on success or cache hit, ``None`` for an invalid URL
        or when the download, decode or save fails.
    """
    # Validate first so an unusable URL has no filesystem side effects.
    if not isinstance(url, str) or not url:
        return None

    ensure_dir(out_dir)
    path = os.path.join(out_dir, image_filename_from_url(url))
    if os.path.exists(path):
        return path

    tmp_path = f"{path}.part"
    try:
        r = requests.get(url, timeout=10, headers={'User-Agent': 'Mozilla/5.0'})
        r.raise_for_status()
        img = Image.open(BytesIO(r.content)).convert('RGB')
        # The explicit format argument decides the encoding, so the '.part'
        # suffix does not matter to the encoder.
        img.save(tmp_path, 'JPEG', quality=85)
        os.replace(tmp_path, path)
        return path
    except Exception:
        # Deliberately best-effort: one bad URL or undecodable payload must
        # not abort a bulk download. Remove any partial output and report the
        # failure as None.
        if os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError:
                pass
        return None

print("üì• Downloading few sample images (this may take time)...")
# Downloads are network-bound, so they are overlapped with a thread pool
# instead of being fetched one URL at a time. Imported here because this is
# the only place in the script that needs it.
from concurrent.futures import ThreadPoolExecutor

DOWNLOAD_WORKERS = 16

# Fetch each distinct URL exactly once across train and test. Besides saving
# work, this guarantees that no two threads write the same target file at the
# same time.
unique_urls = (
    pd.concat([train['image_link'], test['image_link']], ignore_index=True)
    .drop_duplicates()
    .tolist()
)
with ThreadPoolExecutor(max_workers=DOWNLOAD_WORKERS) as pool:
    # pool.map yields results in input order, so paths line up with URLs.
    local_paths = list(tqdm(pool.map(download_image, unique_urls), total=len(unique_urls)))
url_to_path = dict(zip(unique_urls, local_paths))

# Values with no entry (e.g. missing links) map to None, which is also what
# download_image returns for them.
train['local_image'] = [url_to_path.get(u) for u in train['image_link']]
test['local_image'] = [url_to_path.get(u) for u in test['image_link']]

# ===============================================================
# 3. Text embeddings (SentenceTransformer)
# ===============================================================
def compute_text_embeddings(df, text_col='catalog_content', model_name="all-mpnet-base-v2",
                            model=None, batch_size=64):
    """Encode one text column of ``df`` into an embedding array.

    Missing values are replaced by the empty string and every value is
    converted with ``str`` before encoding.

    Args:
        df: DataFrame holding the text column.
        text_col: Name of the column to encode.
        model_name: SentenceTransformer model to load when ``model`` is not
            supplied.
        model: Optional already-loaded encoder exposing
            ``encode(texts, batch_size=..., show_progress_bar=...,
            convert_to_numpy=...)``. Passing one lets several calls (e.g.
            train and test) share a single loaded model instead of loading it
            again for every call.
        batch_size: Batch size forwarded to ``encode``.

    Returns:
        Whatever ``model.encode`` returns with ``convert_to_numpy=True``; one
        row per row of ``df``.
    """
    if model is None:
        model = SentenceTransformer(model_name)
    texts = df[text_col].fillna('').astype(str).tolist()
    return model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True,
    )

print("üß† Computing text embeddings...")
# Encode the train split first, then the test split, and keep a copy of each
# matrix on disk so the embeddings can be reloaded later.
text_emb_train, text_emb_test = [compute_text_embeddings(split) for split in (train, test)]
np.save('text_emb_train.npy', text_emb_train)
np.save('text_emb_test.npy', text_emb_test)

# ===============================================================
# 4. Image embeddings (EfficientNet from timm)
# ===============================================================
def get_image_model(device='cuda'):
    """Create the pretrained timm ``tf_efficientnet_b0_ns`` backbone for inference.

    The model is built with ``num_classes=0`` and ``global_pool='avg'``, put
    in eval mode and moved to ``device`` before being returned.
    """
    backbone = timm.create_model(
        'tf_efficientnet_b0_ns',
        pretrained=True,
        num_classes=0,
        global_pool='avg',
    )
    backbone.eval()
    backbone.to(device)
    return backbone

def preprocess_image(path):
    """Load the image at ``path`` and turn it into a normalized tensor.

    Pipeline: resize to 256, center-crop to 224, convert to a tensor, then
    normalize with the mean/std constants below. The image is converted to
    RGB before the transforms run.

    Args:
        path: Path of the image file to read.

    Returns:
        The transformed image tensor.

    Raises:
        Whatever ``PIL.Image.open`` / ``convert`` raise for a missing or
        unreadable file.
    """
    transform = T.Compose([
        T.Resize(256),
        T.CenterCrop(224),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    # The context manager closes the underlying file deterministically rather
    # than leaving it to garbage collection; convert() has already read the
    # pixel data by the time the block exits.
    with Image.open(path) as img:
        return transform(img.convert('RGB'))

def extract_image_embeddings(df, img_col='local_image', batch_size=32, device='cuda'):
    """Compute one image-embedding row per row of ``df``.

    Rows whose ``img_col`` value is not an existing file path get an all-zero
    input image. A file that exists but cannot be read or decoded is treated
    the same way, so a single corrupt download does not abort the whole
    extraction.

    Args:
        df: DataFrame with a column of local image paths (or ``None``).
        img_col: Name of that column.
        batch_size: Number of images per forward pass.
        device: Torch device string the model and batches are moved to.

    Returns:
        A 2-D NumPy array with ``len(df)`` rows. For an empty ``df`` the
        array has zero rows and the model's feature width.
    """
    model = get_image_model(device)
    # Stand-in input for missing/unreadable images. torch.stack copies its
    # inputs, so one shared tensor is safe to reuse.
    blank = torch.zeros(3, 224, 224)
    feats = []
    with torch.no_grad():
        for start in tqdm(range(0, len(df), batch_size)):
            batch_paths = df[img_col].iloc[start:start + batch_size]
            imgs = []
            for p in batch_paths:
                tensor = blank
                if isinstance(p, str) and os.path.exists(p):
                    try:
                        tensor = preprocess_image(p)
                    except (OSError, ValueError):
                        # Truncated or undecodable file: fall back to blank.
                        tensor = blank
                imgs.append(tensor)
            batch = torch.stack(imgs).to(device)
            feats.append(model(batch).cpu().numpy())

        if not feats:
            # np.concatenate rejects an empty list; probe the model once to
            # learn the feature width and return a correctly shaped array.
            width = model(blank.unsqueeze(0).to(device)).shape[1]
            return np.zeros((0, width), dtype=np.float32)

    return np.concatenate(feats, axis=0)

print("üñºÔ∏è Extracting image embeddings (small subset demo)...")
# NOTE(review): despite the message above, the calls below receive the full
# train and test frames, not a subset.
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

img_emb_train = extract_image_embeddings(train, device=device)
img_emb_test = extract_image_embeddings(test, device=device)

# Keep the embeddings on disk so they can be reloaded later.
np.save('img_emb_train.npy', img_emb_train)
np.save('img_emb_test.npy', img_emb_test)

# ===============================================================
# 5. Train LightGBM model
# ===============================================================
def prepare_features(df, txt_emb, img_emb):
    """Assemble the model's feature matrix.

    Columns are, in order: the text embedding, the image embedding, and one
    extra column holding the character length of ``catalog_content`` (missing
    values count as length 0).
    """
    content = df['catalog_content'].fillna('').astype(str)
    char_counts = content.map(len).to_numpy()[:, None]
    return np.concatenate([txt_emb, img_emb, char_counts], axis=1)

print("üìà Training model...")
# The model is fit on log1p(price); predictions are mapped back with expm1
# before the metrics are computed. All training rows are used.
y = np.log1p(train['price'].values)
X = prepare_features(train, text_emb_train, img_emb_train)
kf = KFold(n_splits=3, shuffle=True, random_state=42)

# Identical for every fold, so build it once outside the loop.
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.05,
    'num_leaves': 64,
    'seed': 42,
}

preds = np.zeros(len(X))  # out-of-fold predictions, in log1p space
models = []
for fold, (tr_idx, val_idx) in enumerate(kf.split(X)):
    Xtr, Xv = X[tr_idx], X[val_idx]
    ytr, yv = y[tr_idx], y[val_idx]
    trd = lgb.Dataset(Xtr, label=ytr)
    vd = lgb.Dataset(Xv, label=yv)
    # lgb.train no longer accepts early_stopping_rounds= / verbose_eval= in
    # LightGBM 4.x (they raise TypeError); the callbacks below provide the
    # same early stopping after 50 stale rounds and logging every 50 rounds.
    m = lgb.train(
        params,
        trd,
        num_boost_round=500,
        valid_sets=[vd],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(period=50),
        ],
    )
    preds[val_idx] = m.predict(Xv, num_iteration=m.best_iteration)
    models.append(m)

# Metrics are reported on the original price scale.
rmse = np.sqrt(mean_squared_error(np.expm1(y), np.expm1(preds)))
mae = mean_absolute_error(np.expm1(y), np.expm1(preds))
print(f"‚úÖ CV RMSE={rmse:.2f}, MAE={mae:.2f}")

# ===============================================================
# 6. Predict on test set and save submission
# ===============================================================
Xt = prepare_features(test, text_emb_test, img_emb_test)

# Average the per-fold predictions (still in log1p space), then map them back
# to the price scale.
fold_predictions = [
    booster.predict(Xt, num_iteration=booster.best_iteration)
    for booster in models
]
p = np.mean(fold_predictions, axis=0)
preds_final = np.expm1(p)

out = pd.DataFrame({
    'sample_id': test['sample_id'],
    'predicted_price': preds_final,
})
out.to_csv('submission.csv', index=False)
print("üíæ Saved submission.csv")

# Colab-only: hand the file to the browser for download.
from google.colab import files
files.download('submission.csv')

‚úÖ Loaded: (24678, 4) (24738, 3)


Unnamed: 0,sample_id,catalog_content,image_link,price
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89
1,198967,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.97
3,55858,Item Name: Judee’s Blue Cheese Powder 11.25 oz...,https://m.media-amazon.com/images/I/41mu0HAToD...,30.34
4,292686,"Item Name: kedem Sherry Cooking Wine, 12.7 Oun...",https://m.media-amazon.com/images/I/41sA037+Qv...,66.49


üì• Downloading few sample images (this may take time)...


 65%|██████▌   | 16121/24678 [1:03:05<14:41,  9.71it/s]