# Required Libraries

In [9]:
# ======================================================
# ✅ 1️⃣ Install Required Libraries (Fixed Dependencies)
# ======================================================

# Install compatible versions of core packages
!pip install -q transformers==4.38.2 timm==0.9.2 efficientnet_pytorch==0.7.1 torchmetrics==1.4.0

# Install supporting libraries
!pip install -q pandas numpy scikit-learn tqdm matplotlib pillow

# Install SentencePiece (used for tokenization in some models)
!pip install -q sentencepiece

# (Optional but recommended) Install sentence-transformers afterward
!pip install -q sentence-transformers==2.2.2


# Mount Google Drive and Locate Dataset

In [10]:
# ======================================================
# 2️⃣ Mount Google Drive and Locate Dataset
# ======================================================
# All imports for this cell are grouped here (Path was previously imported twice).
from pathlib import Path
import os

from google.colab import drive

print("🔗 Mounting Google Drive ...")
# force_remount=True re-mounts even if the Drive is already mounted.
drive.mount('/content/drive', force_remount=True)

# Root directory under which the folder containing sample_train.csv and
# sample_test.csv is searched for.
DRIVE_ROOT = Path("/content/drive/MyDrive")

def find_dataset_folder(base_dir):
    """Locate the first folder under `base_dir` that holds both sample CSV files.

    Args:
        base_dir: a pathlib.Path whose subtree is searched recursively.

    Returns:
        The parent folder of the first "sample_train.csv" (in rglob order) that
        also contains "sample_test.csv", or None when no such folder exists.
    """
    candidate_dirs = (hit.parent for hit in base_dir.rglob("sample_train.csv"))
    complete_dirs = (d for d in candidate_dirs if (d / "sample_test.csv").exists())
    return next(complete_dirs, None)

# Search the mounted Drive for the folder holding the sample CSV files.
DATA_DIR = find_dataset_folder(DRIVE_ROOT)

if DATA_DIR is None:
    # Nothing was found. DATA_DIR is deliberately left as None instead of being
    # turned into the bogus string "None/", so any later use fails loudly until
    # the path is set by hand.
    print("❌ sample_train.csv and sample_test.csv not found automatically.")
    print("➡  Set DATA_DIR manually below, e.g.:")
    print("DATA_DIR = Path('/content/drive/MyDrive/SmartPricingChallenge/')")
else:
    print(f"✅ Dataset folder found: {DATA_DIR}")
    # Convert to string with trailing slash for easy joining
    DATA_DIR = str(DATA_DIR) + "/"
    print("Using DATA_DIR =", DATA_DIR)

🔗 Mounting Google Drive ...
Mounted at /content/drive
✅ Dataset folder found: /content/drive/MyDrive/student_resource/dataset
Using DATA_DIR = /content/drive/MyDrive/student_resource/dataset/


# Smart Product Pricing Challenge – Optimized Model

In [None]:
# ======================================================
# 3️⃣ Smart Product Pricing Challenge – Optimized Model (Fixed)
# ======================================================

# --- Imports ---
import os, io, re, time, math, random, hashlib, requests
import numpy as np, pandas as pd
from pathlib import Path
from PIL import Image
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import timm
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# ======================================================
# Device
# ======================================================
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# ======================================================
# Reproducibility
# ======================================================
# Seed every random number generator this notebook draws from so that weight
# initialisation, shuffling and augmentation start from a fixed state.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# ======================================================
# Paths
# ======================================================
DATA_DIR = '/content/drive/MyDrive/student_resource/dataset' # change if needed
train_path = Path(DATA_DIR) / "sample_train.csv"
test_path  = Path(DATA_DIR) / "sample_test.csv"

# Load the sample train / test tables.
train_df = pd.read_csv(train_path)
test_df  = pd.read_csv(test_path)

print("Train size:", len(train_df), " Test size:", len(test_df))
display(train_df.head())

# ======================================================
# Preprocessing
# ======================================================
def clean_text(s):
    """Normalise one catalog text value into a single-line plain string.

    Missing values (anything `pd.isna` reports as missing) become "". Every
    other value is stringified, has `<...>` tag-like spans replaced by a space,
    has line breaks replaced by spaces, and has whitespace runs collapsed to a
    single space with leading/trailing whitespace removed.
    """
    if pd.isna(s):
        return ""
    text = re.sub(r"<[^>]+>", " ", str(s))
    for line_break in ("\n", "\r"):
        text = text.replace(line_break, " ")
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()

# clean_text() already maps missing values to "" and stringifies everything
# else. Casting the column with .astype(str) first would turn NaN into the
# literal text "nan" (which the model would then read as real content), so the
# raw column is passed to clean_text() directly.
train_df["catalog_content"] = train_df["catalog_content"].map(clean_text)
test_df["catalog_content"]  = test_df["catalog_content"].map(clean_text)

# ======================================================
# Tokenizer (DistilBERT)
# ======================================================
# Hugging Face model id of the text encoder; also used as the default
# `text_model` argument of MultiModalModel.
MODEL_TEXT = "distilbert-base-uncased"
# Tokenizer loaded for MODEL_TEXT; PricingDataset.__getitem__ calls it per row.
tokenizer = AutoTokenizer.from_pretrained(MODEL_TEXT)
# Fixed token length: texts are padded / truncated to this many tokens.
MAX_LEN = 128

# ======================================================
# Image Transforms
# ======================================================
# Side length (in pixels) of the square image fed to the image backbone.
IMG_SIZE = 192

# Per-channel normalisation statistics shared by both pipelines
# (the widely used ImageNet mean / std values).
_NORM_MEAN = [0.485, 0.456, 0.406]
_NORM_STD = [0.229, 0.224, 0.225]


def _tensor_and_normalize():
    """Return fresh ToTensor + Normalize steps, the tail shared by both pipelines."""
    return [
        transforms.ToTensor(),
        transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
    ]


# Training pipeline: resize, then random crop / flip / colour jitter augmentation.
train_tfms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomResizedCrop(IMG_SIZE, scale=(0.7, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(0.15, 0.15, 0.15),
    *_tensor_and_normalize(),
])

# Validation / inference pipeline: deterministic resize and centre crop only.
valid_tfms = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.CenterCrop(IMG_SIZE),
    *_tensor_and_normalize(),
])

# ======================================================
# Image Downloader
# ======================================================
# Local directory in which downloaded product images are cached as
# "<md5 of url>.jpg" files (see url_to_file / download_image).
CACHE_DIR = Path("/content/image_cache")
# exist_ok=True makes re-running this cell harmless once the folder exists.
CACHE_DIR.mkdir(exist_ok=True)

def url_to_file(url):
    """Return the cache path for `url`: CACHE_DIR / "<md5 hex digest of url>.jpg"."""
    digest = hashlib.md5(url.encode()).hexdigest()
    return CACHE_DIR / f"{digest}.jpg"

def download_image(url, max_retries=3, timeout=5):
    """Fetch the image at `url` as an RGB PIL image, using the on-disk cache.

    Args:
        url: image URL. Anything that is not a non-blank string (None, NaN, "")
            yields None without touching the network.
        max_retries: maximum number of HTTP attempts.
        timeout: per-request timeout in seconds passed to `requests.get`.

    Returns:
        A PIL image in RGB mode, or None when the URL is unusable or every
        attempt failed. Failures are swallowed on purpose (best effort): the
        caller substitutes a placeholder image.
    """
    if not isinstance(url, str) or url.strip() == "":
        return None

    f = url_to_file(url)
    if f.exists():
        try:
            # convert() returns an independent in-memory copy, so the file
            # handle can be closed as soon as the `with` block exits.
            with Image.open(f) as cached:
                return cached.convert("RGB")
        except Exception:
            # Corrupt / unreadable cache entry: discard it and re-download.
            f.unlink(missing_ok=True)

    for attempt in range(max_retries):
        try:
            r = requests.get(url, timeout=timeout)
            if r.status_code == 200:
                img = Image.open(io.BytesIO(r.content)).convert("RGB")
                img.save(f)
                return img
        except Exception:
            # `except Exception` (not a bare `except`) so KeyboardInterrupt and
            # SystemExit still stop the notebook. Back off before the next
            # attempt, but do not sleep after the final one.
            if attempt + 1 < max_retries:
                time.sleep(1.5 * (attempt + 1))
    return None

# ======================================================
# Dataset Class (FIXED)
# ======================================================
class PricingDataset(Dataset):
    """Row-wise dataset pairing tokenized catalog text with the product image.

    Each item is a 4-tuple `(input_ids, attention_mask, image_tensor, last)`
    where `last` is the price as a float32 tensor when `is_train` is true and
    the row's raw `sample_id` otherwise.

    Args:
        df: frame with a "catalog_content" column, optionally "image_link",
            and "price" (train mode) or "sample_id" (inference mode).
        is_train: selects which value is returned as the fourth element.
        transform: image transform; falls back to `valid_tfms` when falsy.
    """

    def __init__(self, df, is_train=True, transform=None):
        # Positional access below relies on a clean 0..n-1 index.
        self.df = df.reset_index(drop=True)
        self.is_train = is_train
        self.transform = transform if transform else valid_tfms

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        record = self.df.iloc[idx]

        # Tokenize to a fixed length; squeeze drops the batch axis added by
        # return_tensors="pt".
        encoded = tokenizer(
            str(record["catalog_content"]),
            padding="max_length",
            truncation=True,
            max_length=MAX_LEN,
            return_tensors="pt",
        )
        token_ids = encoded["input_ids"].squeeze(0)
        attention = encoded["attention_mask"].squeeze(0)

        # A missing or undownloadable image is replaced by a flat grey square.
        picture = download_image(record.get("image_link", None))
        if picture is None:
            picture = Image.new("RGB", (IMG_SIZE, IMG_SIZE), (128, 128, 128))
        pixels = self.transform(picture)

        if not self.is_train:
            return token_ids, attention, pixels, record["sample_id"]
        target = torch.tensor(record["price"], dtype=torch.float32)
        return token_ids, attention, pixels, target

# ======================================================
# Split Train / Validation
# ======================================================
# Rows without a target cannot be used for training or validation.
train_df = train_df.dropna(subset=["price"])

# Price deciles used only to stratify the split. rank(method="first") breaks
# ties so qcut always gets unique bin edges. The bins are kept in a separate
# Series rather than as a temporary column, so nothing has to be dropped
# afterwards and no helper column leaks into val_df.
price_bins = pd.qcut(train_df["price"].rank(method="first"), q=10, labels=False)

# 90 / 10 split with a fixed random_state for a repeatable partition.
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42, stratify=price_bins)

# ======================================================
# DataLoaders
# ======================================================
BATCH_SIZE = 16

# The model head contains nn.BatchNorm1d, which raises in training mode when a
# batch holds a single sample. Drop the trailing training batch only in the one
# case where it would be exactly one sample; otherwise every row is used.
_drop_single_tail = len(train_df) % BATCH_SIZE == 1

# Training loader: shuffled, with augmentation transforms.
train_dl = DataLoader(PricingDataset(train_df, True, train_tfms), batch_size=BATCH_SIZE, shuffle=True,
                      num_workers=2, drop_last=_drop_single_tail)
# Validation loader: fixed order, deterministic transforms. is_train=True here
# only means "return the price as the fourth element".
val_dl   = DataLoader(PricingDataset(val_df, True, valid_tfms),   batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

# ======================================================
# Model (EfficientNet-B0 + DistilBERT)
# ======================================================
class MultiModalModel(nn.Module):
    """Two-branch regressor that fuses a text encoder with an image encoder.

    The text branch takes the hidden state at the first token position of the
    transformer output; the image branch takes the pooled features of a timm
    backbone created with `num_classes=0`. Each is projected to `hidden_dim`,
    the two projections are concatenated, and a small MLP head produces one
    value per sample, passed through softplus so predictions are positive.

    Args:
        text_model: Hugging Face model id for the text backbone.
        img_model: timm model name for the image backbone.
        hidden_dim: width of each projection and of the fusion head.
    """

    def __init__(self, text_model=MODEL_TEXT, img_model="tf_efficientnet_b0_ns", hidden_dim=256):
        super().__init__()
        self.text_backbone = AutoModel.from_pretrained(text_model)
        self.img_backbone = timm.create_model(
            img_model, pretrained=True, num_classes=0, global_pool="avg"
        )

        text_width = self.text_backbone.config.hidden_size
        image_width = self.img_backbone.num_features
        self.text_proj = self._projection(text_width, hidden_dim)
        self.img_proj = self._projection(image_width, hidden_dim)

        # Fusion head over the concatenated (text, image) projections.
        self.fc = nn.Sequential(
            nn.Linear(2 * hidden_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, 1),
        )

    @staticmethod
    def _projection(in_features, out_features):
        """Build a Linear -> ReLU -> Dropout(0.3) projection block."""
        return nn.Sequential(
            nn.Linear(in_features, out_features),
            nn.ReLU(),
            nn.Dropout(0.3),
        )

    def forward(self, ids, mask, img):
        encoded = self.text_backbone(input_ids=ids, attention_mask=mask)
        first_token = encoded.last_hidden_state[:, 0, :]
        text_vec = self.text_proj(first_token)
        image_vec = self.img_proj(self.img_backbone(img))
        fused = torch.cat([text_vec, image_vec], dim=1)
        raw = self.fc(fused).squeeze(1)
        # softplus keeps the predicted price strictly positive
        return F.softplus(raw)

# ======================================================
# SMAPE Metric
# ======================================================
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Computes mean(|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)) * 100.
    Positions where the denominator is exactly zero use 1e-8 instead, so a
    pair of zeros contributes 0 rather than NaN.
    """
    actual = np.array(y_true)
    forecast = np.array(y_pred)
    half_sum = (np.abs(actual) + np.abs(forecast)) / 2
    safe_half_sum = np.where(half_sum == 0, 1e-8, half_sum)
    ratios = np.abs(actual - forecast) / safe_half_sum
    return np.mean(ratios) * 100

# ======================================================
# Training Loop
# ======================================================
model = MultiModalModel().to(device)

# Freeze most transformer & CNN layers first: only text-backbone parameters
# whose name contains "transformer.layer.5", and image-backbone parameters whose
# name contains "blocks.6" or "conv_head", stay trainable. The projection and
# fusion layers are untouched and therefore trainable as well.
for name, param in model.text_backbone.named_parameters():
    if "transformer.layer.5" not in name:
        param.requires_grad = False
for name, param in model.img_backbone.named_parameters():
    if "blocks.6" not in name and "conv_head" not in name:
        param.requires_grad = False

# Single source of truth for the epoch count (loop, progress bar, LR schedule).
NUM_EPOCHS = 8

opt = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-4)
# T_max matches the number of epochs so the cosine curve only decays during the
# run; a shorter T_max makes the learning rate climb again in the last epochs.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=NUM_EPOCHS, eta_min=1e-6)
loss_fn = nn.SmoothL1Loss()
scaler = torch.cuda.amp.GradScaler()

# Early-stopping state: stop after `patience` consecutive epochs without a new
# best validation SMAPE.
best_smape, patience, patience_ctr = 999, 2, 0
model_path = "/content/best_model.pth"

for epoch in range(NUM_EPOCHS):
    model.train()
    tr_loss, n_seen = 0.0, 0
    for ids, mask, img, price in tqdm(train_dl, desc=f"Epoch {epoch+1}/{NUM_EPOCHS}"):
        ids, mask, img, price = ids.to(device), mask.to(device), img.to(device), price.to(device)
        opt.zero_grad()
        with torch.cuda.amp.autocast():
            pred = model(ids, mask, img)
            loss = loss_fn(pred, price)
        # Mixed precision: scale the loss before backward; the scaler unscales
        # the gradients inside step() and then adapts its scale factor.
        scaler.scale(loss).backward()
        scaler.step(opt)
        scaler.update()
        tr_loss += loss.item() * img.size(0)
        n_seen += img.size(0)
    scheduler.step()
    # Average over the samples actually processed, which stays correct even if
    # the loader skips an incomplete batch.
    tr_loss /= max(n_seen, 1)

    # Validation
    model.eval()
    preds, trues = [], []
    with torch.no_grad():
        for ids, mask, img, price in val_dl:
            ids, mask, img, price = ids.to(device), mask.to(device), img.to(device), price.to(device)
            with torch.cuda.amp.autocast():
                pred = model(ids, mask, img)
            preds.extend(pred.cpu().numpy())
            trues.extend(price.cpu().numpy())
    val_smape = smape(trues, preds)
    print(f"Epoch {epoch+1}: TrainLoss={tr_loss:.4f}, ValSMAPE={val_smape:.3f}")

    if val_smape < best_smape:
        # New best: checkpoint the weights and reset the patience counter.
        best_smape = val_smape
        patience_ctr = 0
        torch.save(model.state_dict(), model_path)
        print("✅ New best model saved.")
    else:
        patience_ctr += 1
        if patience_ctr >= patience:
            print("⏹ Early stopping.")
            break

# ======================================================
# Inference on Test Set
# ======================================================
print("\n🔍 Loading best model and running predictions ...")
# Restore the checkpoint written by the training loop (best validation SMAPE).
model.load_state_dict(torch.load(model_path, map_location=device))
model.eval()

test_ds = PricingDataset(test_df, is_train=False, transform=valid_tfms)
test_dl = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

sample_ids, pred_prices = [], []
with torch.no_grad():
    for ids, mask, img, sids in tqdm(test_dl, desc="Predicting"):
        ids, mask, img = ids.to(device), mask.to(device), img.to(device)
        with torch.cuda.amp.autocast():
            preds = model(ids, mask, img)
        # The default collate function packs numeric sample_ids into a tensor
        # (string ids stay a plain sequence). Convert to Python values so the
        # CSV contains e.g. 123 rather than "tensor(123)".
        sample_ids.extend(sids.tolist() if torch.is_tensor(sids) else list(sids))
        pred_prices.extend(preds.cpu().numpy().tolist())

# Clip at 0.01 so no submitted price is zero or negative.
out_df = pd.DataFrame({"sample_id": sample_ids, "price": np.clip(pred_prices, 0.01, None)})
out_path = "/content/test_out.csv"
out_df.to_csv(out_path, index=False)
print(f"✅ Submission file saved: {out_path}")

# Copy to Drive
import shutil
shutil.copy(out_path, DATA_DIR)
print(f"✅ test_out.csv also copied to {DATA_DIR}")


Generate the prediction file by running the trained model on the full official `test.csv` file.


In [None]:
# ======================================================
# 🔮 Inference on Official 75K Test Set
# ======================================================

# Every library name this cell uses is imported here, so the cell does not rely
# on imports leaked from earlier cells. It still needs `model`, `device`,
# `clean_text`, `PricingDataset` and `valid_tfms` from the training cell.
import shutil
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

# --- Directory where dataset is stored ---
DATA_DIR = "/content/drive/MyDrive/student_resource/dataset"

# --- Load your trained model ---
model.load_state_dict(torch.load("/content/best_model.pth", map_location=device))
model.eval()

# --- Load the full official test set ---
official_test_path = Path(DATA_DIR) / "test.csv"  # your real 75k test file
test_df = pd.read_csv(official_test_path)
print("Official Test size:", len(test_df))

# --- Clean text again (same function used before) ---
# No .astype(str) before mapping: it would turn missing values into the literal
# text "nan" before clean_text() could map them to "".
test_df["catalog_content"] = test_df["catalog_content"].map(clean_text)

# --- Create dataset and dataloader ---
test_ds = PricingDataset(test_df, is_train=False, transform=valid_tfms)
test_dl = DataLoader(test_ds, batch_size=16, shuffle=False, num_workers=2)

# --- Run predictions ---
sample_ids, pred_prices = [], []
with torch.no_grad():
    for ids, mask, img, sids in tqdm(test_dl, desc="Predicting 75k"):
        ids, mask, img = ids.to(device), mask.to(device), img.to(device)
        with torch.amp.autocast("cuda"):
            preds = model(ids, mask, img)
        # The default collate function packs numeric sample_ids into a tensor
        # (string ids stay a plain sequence). Convert to Python values so the
        # CSV contains e.g. 123 rather than "tensor(123)".
        sample_ids.extend(sids.tolist() if torch.is_tensor(sids) else list(sids))
        pred_prices.extend(preds.cpu().numpy().tolist())

# --- Save submission ---
# Clip at 0.01 so no submitted price is zero or negative.
out_df = pd.DataFrame({
    "sample_id": sample_ids,
    "price": np.clip(pred_prices, 0.01, None)
})
out_path = "/content/test_out.csv"
out_df.to_csv(out_path, index=False)
print(f"✅ Submission file saved: {out_path}")

# --- Optional: Copy to Drive for backup ---
# shutil.copy instead of a `!cp` shell call: no shell interpolation of the
# path, and a failed copy raises instead of passing silently.
shutil.copy(out_path, DATA_DIR)
print(f"✅ test_out.csv also copied to {DATA_DIR}")
