In [1]:
import pandas as pd

In [2]:
# Load the training data from train.csv (path is relative to the working directory).
df = pd.read_csv("train.csv")

In [3]:
# Preview the first rows to see what each column holds.
df.head()

Unnamed: 0,sample_id,catalog_content,image_link,price
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89
1,198967,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.97
3,55858,Item Name: Judee’s Blue Cheese Powder 11.25 oz...,https://m.media-amazon.com/images/I/41mu0HAToD...,30.34
4,292686,"Item Name: kedem Sherry Cooking Wine, 12.7 Oun...",https://m.media-amazon.com/images/I/41sA037+Qv...,66.49


In [4]:
# Column dtypes, non-null counts and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75000 entries, 0 to 74999
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   sample_id        75000 non-null  int64  
 1   catalog_content  75000 non-null  object 
 2   image_link       75000 non-null  object 
 3   price            75000 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 2.3+ MB


In [5]:
# Summary statistics of the numeric columns (sample_id and price).
df.describe()

Unnamed: 0,sample_id,price
count,75000.0,75000.0
mean,149841.917707,23.647654
std,86585.346513,33.376932
min,0.0,0.13
25%,73845.75,6.795
50%,150129.0,14.0
75%,225040.25,28.625
max,299438.0,2796.0


In [6]:
# Per-column count of missing values
print(df.isna().sum())

sample_id          0
catalog_content    0
image_link         0
price              0
dtype: int64


In [7]:
# Distinct raw catalog strings, to eyeball the "Key: value" line layout
print(pd.unique(df['catalog_content']))

['Item Name: La Victoria Green Taco Sauce Mild, 12 Ounce (Pack of 6)\nValue: 72.0\nUnit: Fl Oz\n'
 'Item Name: Salerno Cookies, The Original Butter Cookies, 8 Ounce (Pack of 4)\nBullet Point 1: Original Butter Cookies: Classic butter cookies made with real butter\nBullet Point 2: Variety Pack: Includes 4 boxes with 32 cookies total\nBullet Point 3: Occasion Perfect: Delicious cookies for birthdays, weddings, anniversaries\nBullet Point 4: Shareable Treats: Fun to give and enjoy with friends and family\nBullet Point 5: Salerno Brand: Trusted brand of delicious butter cookies since 1925\nValue: 32.0\nUnit: Ounce\n'
 'Item Name: Bear Creek Hearty Soup Bowl, Creamy Chicken with Rice, 1.9 Ounce (Pack of 6)\nBullet Point 1: Loaded with hearty long grain wild rice and vegetables\nBullet Point 2: Full of hearty goodness\nBullet Point 3: Single serve bowls\nBullet Point 4: Easy to prepare mix\nBullet Point 5: 0 grams trans fat\nValue: 11.4\nUnit: Ounce\n'
 ...
 'Item Name: Jolly Rancher Hard Ca

Text data cleaning

In [8]:
import re

In [9]:
# Column names as a plain Python list
print("Columns:", list(df.columns))

Columns: ['sample_id', 'catalog_content', 'image_link', 'price']


In [10]:
# Patterns for the "Key: value" lines inside catalog_content.
# After the colon only non-newline whitespace ([^\S\n]*) is skipped: a plain \s*
# would also swallow the newline of an empty field and capture the NEXT line
# as this field's value.
_ITEM_NAME_RE = re.compile(r'Item Name:[^\S\n]*(.*?)(?:\n|$)', flags=re.IGNORECASE)
_BULLET_RE = re.compile(r'Bullet Point \d+:[^\S\n]*(.*?)(?:\n|$)', flags=re.IGNORECASE)
_VALUE_RE = re.compile(r'Value:[^\S\n]*([\d.]+)', flags=re.IGNORECASE)
_UNIT_RE = re.compile(r'Unit:[^\S\n]*(.*?)(?:\n|$)', flags=re.IGNORECASE)


def parse_catalog(text):
    """Split one raw catalog_content string into structured fields.

    Parameters
    ----------
    text : str or missing value
        Raw catalog text made of "Item Name:", "Bullet Point N:", "Value:"
        and "Unit:" lines. Non-string input is converted with str().

    Returns
    -------
    pd.Series
        item_name     : text after the first "Item Name:" ("" if absent/empty)
        bullet_points : all non-empty bullet texts joined by single spaces
        value         : float after "Value:", or None when absent or not a
                        valid number (e.g. "." or "1.2.3")
        unit          : text after "Unit:" ("" if absent/empty)
    """
    # handle missing
    if pd.isna(text):
        return pd.Series({"item_name": "", "bullet_points": "", "value": None, "unit": ""})
    # ensure string
    text = str(text)

    # ITEM NAME
    m_name = _ITEM_NAME_RE.search(text)
    item_name = m_name.group(1).strip() if m_name else ""

    # BULLET POINTS (all); empty bullets are skipped so they add no stray spaces
    bullets = (b.strip() for b in _BULLET_RE.findall(text))
    bullet_points = " ".join(b for b in bullets if b)

    # VALUE (numeric); [\d.]+ can match non-numbers, so a failed conversion
    # is treated as "no value" instead of aborting the whole .apply()
    value = None
    m_value = _VALUE_RE.search(text)
    if m_value:
        try:
            value = float(m_value.group(1))
        except ValueError:
            value = None

    # UNIT
    m_unit = _UNIT_RE.search(text)
    unit = m_unit.group(1).strip() if m_unit else ""

    return pd.Series({"item_name": item_name, "bullet_points": bullet_points, "value": value, "unit": unit})

# Debug: parse a single sample to ensure parser works
sample_idx = 0
print("Sample catalog content (first row):\n", df.loc[sample_idx, "catalog_content"][:1000], "\n")
print("Parsed sample:\n", parse_catalog(df.loc[sample_idx, "catalog_content"]))

# Apply parser to entire column
parsed = df['catalog_content'].apply(parse_catalog)
# Drop parsed columns left by an earlier run of this cell before attaching the
# fresh ones; otherwise a re-run creates duplicate column labels and
# df['item_name'] would return a DataFrame instead of a Series.
df = pd.concat([df.drop(columns=parsed.columns, errors='ignore'), parsed], axis=1)

# Convert price to numeric if needed
df['price'] = pd.to_numeric(df['price'], errors='coerce')

# Show results
print("\nParsed columns head:")
print(df[['item_name','bullet_points','value','unit','price']].head())
print("\nMissing values after parse:")
print(df[['item_name','bullet_points','value','unit','price']].isnull().sum())

Sample catalog content (first row):
 Item Name: La Victoria Green Taco Sauce Mild, 12 Ounce (Pack of 6)
Value: 72.0
Unit: Fl Oz
 

Parsed sample:
 item_name        La Victoria Green Taco Sauce Mild, 12 Ounce (P...
bullet_points                                                     
value                                                         72.0
unit                                                         Fl Oz
dtype: object

Parsed columns head:
                                           item_name  \
0  La Victoria Green Taco Sauce Mild, 12 Ounce (P...   
1  Salerno Cookies, The Original Butter Cookies, ...   
2  Bear Creek Hearty Soup Bowl, Creamy Chicken wi...   
3  Judee’s Blue Cheese Powder 11.25 oz - Gluten-F...   
4  kedem Sherry Cooking Wine, 12.7 Ounce - 12 per...   

                                       bullet_points  value   unit  price  
0                                                     72.00  Fl Oz   4.89  
1  Original Butter Cookies: Classic butter cookie...  32.00 

In [11]:
# Run the parser over every row; report and remember only the first row that raises.
bad_idx = None
for row_pos, raw_text in enumerate(df['catalog_content']):
    try:
        parse_catalog(raw_text)
    except Exception as exc:
        print("Failed at row:", row_pos, "error:", exc)
        bad_idx = row_pos
        break


In [12]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# fetch the NLTK corpora used by the cleaning step
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# shared tools read by clean_text
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalize one text string for modelling.

    Missing input yields "". Otherwise the text is lower-cased, every
    character that is not a-z or whitespace is deleted, tokens found in
    `stop_words` are dropped, and the remaining tokens are lemmatized with
    `lemmatizer` and joined by single spaces.
    """
    if pd.isna(text):
        return ""

    # lowercase, then strip punctuation / digits / symbols in one pass
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())

    # stop-word filtering happens on the raw token, before lemmatization
    kept = []
    for token in letters_only.split():
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)

# merge title and bullet text into a single field (missing parts become "")
name_part = df['item_name'].fillna('')
bullet_part = df['bullet_points'].fillna('')
df['combined_text'] = name_part + ' ' + bullet_part

# normalize the merged text
df['clean_text'] = df['combined_text'].map(clean_text)

# quick look at the cleaned text next to the target
print(df[['clean_text', 'price']].head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rammo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rammo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                          clean_text  price
0       la victoria green taco sauce mild ounce pack   4.89
1  salerno cooky original butter cooky ounce pack...  13.12
2  bear creek hearty soup bowl creamy chicken ric...   1.97
3  judees blue cheese powder oz glutenfree nutfre...  30.34
4           kedem sherry cooking wine ounce per case  66.49


In [13]:
# full_multimodal_pipeline.py
import os
import random
import math
import time
import requests
from io import BytesIO

import numpy as np
import pandas as pd
from PIL import Image

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models

from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
from torch.cuda.amp import autocast, GradScaler

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
# One seed for every RNG used in the notebook (Python, NumPy, Torch CPU and CUDA)
SEED = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(SEED)

# Use the GPU when one is visible, otherwise fall back to the CPU
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print("Device:", DEVICE)

Device: cpu


In [15]:
# ---------------------------
# 1) Preprocessing (log-price + value scaler + unit encoding)
# ---------------------------
# df should already have: 'clean_text', 'value', 'unit', 'price', 'image_link', 'sample_id'
# If price has strings, make numeric:
df['price'] = pd.to_numeric(df['price'], errors='coerce')
# drop rows with nan price; .copy() so the column assignments below write to an
# independent frame rather than a possible view of the previous df
df = df.dropna(subset=['price']).copy()
# Log-transform target for stability
df['log_price'] = np.log1p(df['price'])

# Value: numeric product measurement (we parsed earlier); fill NA with 0
df['value'] = pd.to_numeric(df['value'], errors='coerce').fillna(0.0)

# Encode unit (categorical)
unit_le = LabelEncoder()
df['unit'] = df['unit'].fillna("unknown")
df['unit_idx'] = unit_le.fit_transform(df['unit'])

# Train / validation split
train_df, val_df = train_test_split(df, test_size=0.15, random_state=SEED)
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

# Standard scale value.
# The scaler is fitted on the training rows only and then applied to the
# validation rows, so validation statistics do not leak into the features.
value_scaler = StandardScaler()
train_df['value_scaled'] = value_scaler.fit_transform(train_df[['value']]).ravel()
val_df['value_scaled'] = value_scaler.transform(val_df[['value']]).ravel()
# keep the column on the full frame as well, using the same fitted scaler
df['value_scaled'] = value_scaler.transform(df[['value']]).ravel()

In [16]:
# ---------------------------
# 2) Tokenizer and image transforms
# ---------------------------
TEXT_MODEL = "distilbert-base-uncased"   # lightweight; change to "bert-base-uncased" if you want
MAX_TEXT_LEN = 64
tokenizer = AutoTokenizer.from_pretrained(TEXT_MODEL)

# Image transforms (train vs val)
IMG_SIZE = 224

# channel normalization shared by both pipelines
shared_normalize = transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])

# training: random crop / flip / colour jitter, then tensor + normalize
train_steps = [
    transforms.RandomResizedCrop(IMG_SIZE),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1),
    transforms.ToTensor(),
    shared_normalize,
]
train_img_transforms = transforms.Compose(train_steps)

# validation: deterministic resize only, then tensor + normalize
val_steps = [
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
    shared_normalize,
]
val_img_transforms = transforms.Compose(val_steps)

In [17]:

# ---------------------------
# 3) Dataset (with simple image caching)
# ---------------------------
IMAGE_CACHE_DIR = "./image_cache"
os.makedirs(IMAGE_CACHE_DIR, exist_ok=True)

def download_image(url, save_path):
    """Fetch `url`, convert it to RGB and store it as a JPEG at `save_path`.

    Returns True when the file was written and False when any step
    (request, HTTP status check, decoding, saving) raised an exception.
    The exception itself is not reported.
    """
    try:
        response = requests.get(url, timeout=8)
        response.raise_for_status()
        rgb_image = Image.open(BytesIO(response.content)).convert("RGB")
        rgb_image.save(save_path, format='JPEG', quality=85)
    except Exception:
        # best-effort download: the caller falls back to a placeholder image
        return False
    return True

class ProductDataset(Dataset):
    """Dataset yielding tokenized text, an image tensor, tabular features and the label.

    Each item is a dict with keys 'input_ids', 'attention_mask', 'image',
    'value', 'unit_idx' and 'label'. Images are looked up in `cache_dir` as
    "<sample_id>.jpg" and downloaded on first use; when no usable image can be
    obtained a plain gray IMG_SIZE x IMG_SIZE placeholder is used instead.

    Parameters
    ----------
    df : DataFrame read row-wise via .iloc; columns used are 'sample_id',
        'clean_text', 'image_link', 'value_scaled', 'unit_idx', 'log_price'.
    tokenizer : tokenizer providing encode_plus().
    max_len : fixed token length (padding and truncation target).
    img_transforms : optional callable applied to the PIL image.
    cache_dir : directory holding the cached JPEG files.
    """

    def __init__(self, df, tokenizer, max_len=MAX_TEXT_LEN, img_transforms=None, cache_dir=IMAGE_CACHE_DIR):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.img_transforms = img_transforms
        self.cache_dir = cache_dir

    def __len__(self):
        return len(self.df)

    @staticmethod
    def _open_rgb(path):
        """Return a fully loaded RGB copy of the image at `path`.

        The file handle is closed before returning, so the file can be
        deleted or rewritten afterwards (an open handle blocks that on Windows).
        """
        with Image.open(path) as handle:
            return handle.convert("RGB")

    def _load_image(self, url, sample_id):
        """Return the RGB image for `sample_id`: cache, then download, then gray fallback."""
        # local filename based on sample id
        file_name = f"{sample_id}.jpg"
        path = os.path.join(self.cache_dir, file_name)

        # if already downloaded, open it
        if os.path.exists(path):
            try:
                return self._open_rgb(path)
            except Exception:
                # corrupt cache entry: remove it (best effort) and re-download
                try:
                    os.remove(path)
                except OSError:
                    pass

        # try download
        if download_image(url, path):
            try:
                return self._open_rgb(path)
            except Exception:
                # downloaded file is unreadable; fall through to the placeholder
                pass

        # fallback: return a plain gray image
        return Image.new("RGB", (IMG_SIZE, IMG_SIZE), (127,127,127))

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        sample_id = row['sample_id']
        # a missing text would otherwise be tokenized as the literal word "nan"
        raw_text = row.get('clean_text', "")
        text = "" if pd.isna(raw_text) else str(raw_text)[:1000]
        image_url = row.get('image_link', "")
        value_scaled = float(row.get('value_scaled', 0.0))
        unit_idx = int(row.get('unit_idx', 0))
        log_price = float(row.get('log_price', 0.0))

        # tokenize text (fixed-length)
        toks = self.tokenizer.encode_plus(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        input_ids = toks['input_ids'].squeeze(0)        # (max_len,)
        attention_mask = toks['attention_mask'].squeeze(0)

        # load image
        img = self._load_image(image_url, sample_id)
        if self.img_transforms:
            img = self.img_transforms(img)  # tensor

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'image': img,
            'value': torch.tensor(value_scaled, dtype=torch.float32),
            'unit_idx': torch.tensor(unit_idx, dtype=torch.long),
            'label': torch.tensor(log_price, dtype=torch.float32)
        }

# create datasets & loaders
train_ds = ProductDataset(train_df, tokenizer, img_transforms=train_img_transforms)
val_ds   = ProductDataset(val_df, tokenizer, img_transforms=val_img_transforms)

BATCH_SIZE = 16
# On Windows, DataLoader workers are separate spawned processes that must
# re-import the dataset class; a class defined in a notebook cell cannot be
# imported there, so worker-based loading fails. Load in the main process there.
NUM_WORKERS = 0 if os.name == "nt" else 4
# Pinned host memory only speeds up host-to-GPU copies; skip it on CPU-only runs.
PIN_MEMORY = DEVICE.type == "cuda"
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY)
val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY)

In [18]:

# ---------------------------
# 4) Multimodal model
# ---------------------------
class MultimodalRegressor(nn.Module):
    """Regressor over text, image, a scalar value and a unit category.

    The text is encoded by a pretrained transformer and mean-pooled over
    non-padding tokens; the image is encoded by a pretrained torchvision
    backbone with its final `fc` layer removed; the unit index goes through
    an embedding. All features are concatenated and passed to an MLP head
    that outputs one number per sample.

    Parameters
    ----------
    text_model_name : name passed to AutoModel.from_pretrained.
    image_model_name : name of a constructor in torchvision.models. The
        backbone must expose an `fc` layer as its last child (ResNet-style).
    num_units : number of rows in the unit embedding table.
    unit_emb_dim : size of the unit embedding.
    hidden_dim : width of the first head layer (the second is hidden_dim // 2).
    dropout : dropout probability used in the head.
    """

    def __init__(self, text_model_name=TEXT_MODEL, image_model_name='resnet50',
                 num_units=len(unit_le.classes_), unit_emb_dim=8, hidden_dim=512, dropout=0.2):
        super().__init__()
        # Text encoder
        self.text_encoder = AutoModel.from_pretrained(text_model_name)
        text_hidden = self.text_encoder.config.hidden_size

        # Image encoder: pretrained backbone without final fc.
        # `weights=` is the current torchvision API ("IMAGENET1K_V1" is what
        # pretrained=True selects); older releases only accept pretrained=True.
        backbone_builder = getattr(models, image_model_name)
        try:
            backbone = backbone_builder(weights="IMAGENET1K_V1")
        except TypeError:
            backbone = backbone_builder(pretrained=True)
        # remove final fc
        self.image_encoder = nn.Sequential(*list(backbone.children())[:-1])  # outputs (B, C, 1,1)
        # read the feature width from the removed fc layer (2048 for resnet50)
        image_feat_dim = backbone.fc.in_features

        # unit embedding
        self.unit_emb = nn.Embedding(num_units, unit_emb_dim)

        # final head
        total_dim = text_hidden + image_feat_dim + 1 + unit_emb_dim
        self.head = nn.Sequential(
            nn.Linear(total_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim//2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim//2, 1)
        )

    def forward(self, input_ids, attention_mask, image, value, unit_idx):
        """Return one prediction per sample, shape (B,)."""
        # text -> mean pooling over tokens
        text_out = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        last_hidden = text_out.last_hidden_state          # (B, seq, H)
        # cast the integer mask to the activation dtype so the arithmetic is floating point
        mask = attention_mask.unsqueeze(-1).to(last_hidden.dtype)   # (B, seq, 1)
        sum_hidden = (last_hidden * mask).sum(1)          # (B, H)
        # token counts are whole numbers, so clamping at 1 only guards the all-padding case
        denom = mask.sum(1).clamp(min=1.0)
        text_feat = sum_hidden / denom                    # (B, H)

        # image -> flatten
        img_feat = self.image_encoder(image)              # (B, C, 1,1)
        img_feat = torch.flatten(img_feat, 1)             # (B, C)

        unit_emb = self.unit_emb(unit_idx)                # (B, unit_emb_dim)

        # concat (value is scalar)
        x = torch.cat([text_feat, img_feat, value.unsqueeze(1), unit_emb], dim=1)
        out = self.head(x).squeeze(1)                     # (B,)
        return out

model = MultimodalRegressor().to(DEVICE)

W1013 16:50:32.363000 27440 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


In [19]:

# ---------------------------
# 6) Utility: evaluate
# ---------------------------
def evaluate(model, loader):
    """Score `model` on every batch of `loader` without tracking gradients.

    The model is switched to eval mode and is left in that mode. Predictions
    and labels are mapped back with expm1 before RMSE and MAE are computed;
    the loss is computed by the global `criterion` on the untransformed
    outputs and labels.

    Returns
    -------
    (mean of the per-batch losses, RMSE, MAE)
    """
    batch_keys = ('input_ids', 'attention_mask', 'image', 'value', 'unit_idx', 'label')

    model.eval()
    batch_losses, pred_chunks, true_chunks = [], [], []
    with torch.no_grad():
        for batch in loader:
            on_device = {key: batch[key].to(DEVICE) for key in batch_keys}
            target = on_device['label']

            out = model(
                on_device['input_ids'],
                on_device['attention_mask'],
                on_device['image'],
                on_device['value'],
                on_device['unit_idx'],
            )
            batch_losses.append(criterion(out, target).item())

            pred_chunks.append(out.detach().cpu().numpy())
            true_chunks.append(target.detach().cpu().numpy())

    # convert back to price-space
    price_preds = np.expm1(np.concatenate(pred_chunks))
    price_trues = np.expm1(np.concatenate(true_chunks))

    mae = mean_absolute_error(price_trues, price_preds)
    rmse = math.sqrt(mean_squared_error(price_trues, price_preds))
    return np.mean(batch_losses), rmse, mae

In [None]:

# ---------------------------
# 7) Training loop
# ---------------------------
best_rmse = float('inf')
save_path = "best_multimodal.pth"
EPOCHS = 5

# torch.cuda.amp autocast / GradScaler only apply on CUDA; switch them off
# explicitly on CPU instead of relying on their "CUDA is not available" warnings.
USE_AMP = DEVICE.type == "cuda"

# This loop and evaluate() read `criterion`, `optimizer`, `scheduler` and
# `scaler` as globals, but no cell in this notebook creates them (there is no
# section 5), so the cell would stop with a NameError. Create each one only if
# it is missing, so an existing setup is never overridden.
# NOTE(review): loss choice and hyperparameters below are conventional
# starting points, not tuned values — confirm before a real run.
if 'criterion' not in globals():
    criterion = nn.MSELoss()  # applied to log1p(price) targets
if 'optimizer' not in globals():
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
if 'scheduler' not in globals():
    total_steps = EPOCHS * len(train_loader)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * total_steps),
        num_training_steps=total_steps,
    )
if 'scaler' not in globals():
    scaler = GradScaler(enabled=USE_AMP)

for epoch in range(1, EPOCHS+1):
    model.train()  # evaluate() leaves the model in eval mode, so reset every epoch
    epoch_losses = []
    t0 = time.time()
    for step, batch in enumerate(train_loader, 1):
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        image = batch['image'].to(DEVICE)
        value = batch['value'].to(DEVICE)
        unit_idx = batch['unit_idx'].to(DEVICE)
        label = batch['label'].to(DEVICE)

        optimizer.zero_grad()
        with autocast(enabled=USE_AMP):
            outputs = model(input_ids, attention_mask, image, value, unit_idx)
            loss = criterion(outputs, label)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()  # the schedule is per optimizer step, not per epoch
        epoch_losses.append(loss.item())

        if step % 50 == 0:
            print(f"Epoch {epoch} step {step}/{len(train_loader)} loss={np.mean(epoch_losses):.4f}")

    avg_train_loss = np.mean(epoch_losses)
    val_loss, val_rmse, val_mae = evaluate(model, val_loader)
    elapsed = time.time() - t0
    print(f"Epoch {epoch} finished in {elapsed:.1f}s - train_loss={avg_train_loss:.4f} val_loss={val_loss:.4f} val_rmse={val_rmse:.2f} val_mae={val_mae:.2f}")

    # save best (selected on validation RMSE in price space)
    if val_rmse < best_rmse:
        best_rmse = val_rmse
        # The checkpoint also stores the fitted scaler object and the encoder's
        # class array, i.e. pickled Python objects rather than plain tensors.
        torch.save({
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'value_scaler': value_scaler,
            'unit_le_classes': unit_le.classes_,
            'tokenizer': tokenizer.name_or_path
        }, save_path)
        print("Saved best model:", save_path)

print("Training complete. Best RMSE:", best_rmse)

