In [None]:
import os
import pandas as pd 
import numpy as np

In [None]:
import os
import numpy as np
import pandas as pd

# Machine-specific locations of the raw image folder and the training CSV.
image_dir = r"C:\Users\hp\OneDrive\Desktop\Amazon ML Challenge-20251010T213227Z-1-001\Amazon ML Challenge\student_resource\images"
prices_path = r"C:\Users\hp\OneDrive\Desktop\Amazon ML Challenge-20251010T213227Z-1-001\Amazon ML Challenge\student_resource\dataset\train.csv"

# 1) Keep only *.jpg entries (extension matched case-insensitively); the file
#    name without its extension is used as the sample_id.
image_files = list(filter(lambda name: name.lower().endswith('.jpg'), os.listdir(image_dir)))
image_sample_ids = [os.path.splitext(name)[0] for name in image_files]

# 2) Read the price table, discard the two columns that are not needed here,
#    and store sample_id as text so it compares equal to the file-name stems.
prices_df = pd.read_csv(prices_path).drop(columns=['catalog_content', 'image_link'])
prices_df['sample_id'] = prices_df['sample_id'].astype(str)

# 3) Report how many items each source contributes.
print(f"Found {len(image_sample_ids)} images.")
print(f"Prices dataset has {len(prices_df)} rows.")
print("Block 1 ready: image sample_ids derived and alignment plan established.")


In [None]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

print("DATASET ALIGNMENT")

# 1. Input locations
image_dir = "./images"  # Original images folder
prices_path = r"C:\Users\hp\OneDrive\Desktop\amazon_ml_challenge\Amazon ML Challenge\student_resource\dataset\train.csv"

# 2. sample_ids available on disk: stem of every *.jpg (case-insensitive extension)
jpg_files = list(filter(lambda name: name.lower().endswith('.jpg'), os.listdir(image_dir)))
image_sample_ids = [os.path.splitext(name)[0] for name in jpg_files]
image_set = set(image_sample_ids)

# 3. sample_ids listed in the price table (as text, to match the file stems)
prices_df = pd.read_csv(prices_path)
prices_df['sample_id'] = prices_df['sample_id'].astype(str)
price_set = set(prices_df['sample_id'].tolist())

# 4. Compare the two id sets
common_ids = image_set.intersection(price_set)
missing_in_images = price_set.difference(image_set)   # priced, but no image file
missing_in_prices = image_set.difference(price_set)   # image file, but no price row

# 5. Persist the ids present in both sources, in sorted (string) order
aligned_sample_ids = sorted(common_ids)
np.save("aligned_sample_ids.npy", np.array(aligned_sample_ids, dtype=str))

# Optional: record priced samples that have no image on disk
if len(missing_in_images) > 0:
    pd.Series(list(missing_in_images)).to_csv("missing_images.csv", index=False, header=["sample_id"])

In [None]:
from PIL import Image, ImageFile
import os
import numpy as np
from tqdm import tqdm

# Let Pillow decode truncated files instead of raising on them.
ImageFile.LOAD_TRUNCATED_IMAGES = True


print("IMAGE RESIZING (ONE-TIME)")


# Paths
input_dir = "./images"
output_dir = "./images_resized"
os.makedirs(output_dir, exist_ok=True)

# Load aligned IDs (only resize images we'll use)
aligned_ids = np.load("aligned_sample_ids.npy").astype(str)

# Modes that are written to JPEG unchanged. Any other mode (e.g. RGBA, P, LA)
# is converted to RGB first; otherwise save() raises and a perfectly readable
# image would be reported as "corrupted".
jpeg_safe_modes = ("L", "RGB", "CMYK")

# Resize with error handling
corrupted = []          # sample_ids that could not be processed, in encounter order
failure_reasons = {}    # sample_id -> short description of the exception
successful = 0

for sid in tqdm(aligned_ids, desc="Resizing images"):
    input_path = os.path.join(input_dir, f"{sid}.jpg")
    output_path = os.path.join(output_dir, f"{sid}.jpg")
    try:
        # The context manager closes the source file even when decoding fails.
        with Image.open(input_path) as src:
            src.load()
            img = src if src.mode in jpeg_safe_modes else src.convert("RGB")
            img_resized = img.resize((256, 256), Image.BICUBIC)
        img_resized.save(output_path, quality=95, optimize=True)
        successful += 1
    except Exception as exc:
        # Best-effort on purpose: one bad file must not abort the whole run,
        # but the reason is kept so the failure can be diagnosed afterwards.
        corrupted.append(sid)
        failure_reasons[sid] = f"{type(exc).__name__}: {exc}"

# Summary
print(f"\nsuccessfully resized {successful} images.")
print(f"Skipped {len(corrupted)} corrupted images.")
for sid in corrupted[:5]:
    print(f"  {sid}: {failure_reasons[sid]}")

# Update aligned list (remove corrupted)
if corrupted:
    corrupted_set = set(corrupted)  # set membership instead of a list scan per id
    aligned_clean = [sid for sid in aligned_ids if sid not in corrupted_set]
    np.save("aligned_sample_ids_cleaned.npy", np.array(aligned_clean, dtype=str))
else:
    np.save("aligned_sample_ids_cleaned.npy", aligned_ids)

# Persist corrupted list if any (one sample_id per line)
if corrupted:
    with open("corrupted_images.txt", "w") as f:
        f.write("\n".join(corrupted))

print("BLOCK 1.6 COMPLETE - Use './images_resized' in Block 2")

In [None]:
import os
import time
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from torchvision import transforms
from torchvision.models import ViT_B_16_Weights


print("BLOCK 2 - IMAGE LOADER")


# Preprocessing pipeline taken from the ViT-B/16 ImageNet-1k (V1) weights entry
weights = ViT_B_16_Weights.IMAGENET1K_V1
transform = weights.transforms()

# Inputs: folder written by the resize step and the cleaned id list it saved
image_dir = "./images_resized"
aligned_sample_ids = np.load("aligned_sample_ids_cleaned.npy").astype(str)

# Dataset
class ImageDatasetBySampleID(Dataset):
    """Map-style dataset yielding ``(sample_id, image)`` pairs.

    Each item is read from ``<image_dir>/<sample_id>.jpg``, converted to RGB
    and, when a transform was supplied, passed through it.

    Args:
        sample_ids: Indexable, sized collection of sample identifiers. Each
            entry is converted with ``str`` before the file name is built.
        image_dir: Directory that contains the ``.jpg`` files.
        transform: Optional callable applied to the RGB image. ``None``
            (the default) returns the image as loaded.
    """

    def __init__(self, sample_ids, image_dir, transform=None):
        self.sample_ids = sample_ids
        self.image_dir = image_dir
        self.transform = transform

    def __len__(self):
        """Return the number of sample ids."""
        return len(self.sample_ids)

    def __getitem__(self, idx):
        """Load the image for ``sample_ids[idx]``.

        Returns:
            Tuple ``(sid, image)`` where ``sid`` is the id as ``str`` and
            ``image`` is the RGB image, transformed if a transform is set.

        Raises:
            Whatever ``Image.open`` raises when the file is missing or
            cannot be identified as an image.
        """
        sid = str(self.sample_ids[idx])
        path = os.path.join(self.image_dir, f"{sid}.jpg")
        # Close the file as soon as the pixels have been converted, instead of
        # leaving the handle to the garbage collector.
        with Image.open(path) as src:
            image = src.convert("RGB")
        # Explicit None check: a transform object that happens to be falsy
        # must still be applied.
        if self.transform is not None:
            image = self.transform(image)
        return sid, image

# Wrap the aligned ids in the dataset and iterate it in fixed order,
# 64 images per batch, loading in the main process.
dataset = ImageDatasetBySampleID(
    aligned_sample_ids,
    image_dir,
    transform=transform,
)
loader = DataLoader(dataset, batch_size=64, shuffle=False, num_workers=0, pin_memory=False)

print("DataLoader created.")
print(f"Total batches: {len(loader)}")
print(f"Batch size: {loader.batch_size}")

# Pull a single batch to time the loading path and show the batch shape
print("\nTesting First Batch")
start = time.time()
sids, imgs = next(iter(loader))
elapsed = time.time() - start
print(f"First batch loaded in {elapsed:.2f} seconds.")
print(f"Shape: {imgs.shape}")


print("BLOCK 2 COMPLETE")

In [None]:
import torch
import numpy as np
from torchvision.models import vit_b_16, ViT_B_16_Weights
import torch.nn as nn
from tqdm import tqdm
import time


print("BLOCK 3 - EMBEDDING EXTRACTION (Vision Transformer)")


# Load Model (ViT-B/16)
print("\nLoading Model")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = vit_b_16(weights=ViT_B_16_Weights.IMAGENET1K_V1)
model.eval()
# Replace the final head layer with a pass-through so the forward pass
# returns the features that used to feed that layer.
model.heads.head = nn.Identity()

model = model.to(device)

print(f"Feature dimension: 768 (ViT-B/16 output)")


# Smoke test: run one batch through the model before the long extraction so
# a broken loader or model fails within seconds rather than mid-run.
print("\nTesting First Batch Inference")

start_test = time.time()
with torch.no_grad():
    test_sids, test_imgs = next(iter(loader))
    test_feats = model(test_imgs.to(device))
print(f"First batch: {len(test_sids)} images -> features {tuple(test_feats.shape)} "
      f"in {time.time() - start_test:.2f} seconds.")
del test_imgs, test_feats  # free the test tensors before the full pass


print("\nExtracting Embeddings")
emb_list = []   # one float16 array per batch
sid_list = []   # sample ids in exactly the order their embeddings are appended
start_time = time.time()

with torch.no_grad():
    pbar = tqdm(loader, total=len(loader), desc="ViT: Extracting")
    for sids, imgs in pbar:
        imgs = imgs.to(device, non_blocking=True)
        feats = model(imgs)
        # Stored as float16 to halve the size of the saved array.
        feats_np = feats.cpu().numpy().astype(np.float16)
        emb_list.append(feats_np)
        sid_list.extend(list(sids))
        # Refresh the throughput read-out every 50 batches.
        if len(emb_list) % 50 == 0:
            elapsed = time.time() - start_time
            speed = len(sid_list) / elapsed
            pbar.set_postfix({"Speed": f"{speed:.1f} img/s"})


if not emb_list:
    raise RuntimeError("The loader produced no batches; there is nothing to embed.")

print("\nSaving Embeddings")
embeddings = np.vstack(emb_list)
# The id file is the only record of row order, so both must have the same length.
assert embeddings.shape[0] == len(sid_list), "Embedding rows and sample ids are out of step."
np.save("image_embeddings_vit.npy", embeddings)
np.save("image_sample_ids_order.npy", np.array(sid_list, dtype=str))

total_time = time.time() - start_time
avg_speed = len(sid_list) / total_time

print(f"Embeddings saved!")
print(f"Shape: {embeddings.shape}")  # (N, 768)
print(f"Total time: {total_time/60:.2f} minutes")


print("\nSummary")
print(f"Model: Vision Transformer B/16")
print(f"Feature dimension: {embeddings.shape[1]}")
print(f"Total images processed: {len(sid_list)}")
print(f"Saved to: image_embeddings_vit.npy")




In [None]:
import numpy as np
from sklearn.decomposition import PCA
import joblib


print("BLOCK 4 - PCA ON IMAGE EMBEDDINGS")


# Load embeddings
print("\nLoading Embeddings")
embeddings = np.load("image_embeddings_vit.npy")  # [N, D]
N, D = embeddings.shape
print(f"Loaded embeddings: {embeddings.shape}")

# PCA configuration
D_pca = 512  # number of components kept; also baked into the output file names below
print(f"\nPCA Configuration")
print(f"Target dimension: {D_pca}")
# The arrow was previously emitted as mis-encoded bytes; print the intended character.
print(f"Full dataset: {N} samples, {D} → {D_pca} features")

# Fit PCA on full dataset (unsupervised preprocessing)
print("\nFitting PCA")
pca = PCA(n_components=D_pca, random_state=42)
pca.fit(embeddings)

# Transform
embeddings_pca = pca.transform(embeddings)

# Save results: the projected embeddings and the fitted model (so the same
# projection can be applied to other data later)
print("\nSaving Results")
output_emb_path = f"image_embeddings_pca{D_pca}_full.npy"
np.save(output_emb_path, embeddings_pca)
print(f"Saved embeddings: {output_emb_path} ({embeddings_pca.shape})")

pkl_path = f"image_pca_{D_pca}.pkl"
joblib.dump(pca, pkl_path)
print(f"Saved PCA model: {pkl_path}")

# Diagnostics: share of total variance retained by the kept components
print("\nPCA Diagnostics")
explained_variance = pca.explained_variance_ratio_.sum()
print(f"Explained variance: {explained_variance:.4f} ({explained_variance*100:.2f}%)")

np.save(f"pca_explained_variance_ratio_{D_pca}.npy", pca.explained_variance_ratio_)


In [None]:
import numpy as np
import pandas as pd


print("ALIGN IMAGE + TEXT EMBEDDINGS WITH PRICES")


# 1. Load Image Embeddings
image_sample_ids = np.load("image_sample_ids_order.npy").astype(str)
embeddings_img = np.load("image_embeddings_pca512_full.npy")  # [N, 512]
assert embeddings_img.shape[0] == len(image_sample_ids), "Image IDs and embeddings count mismatch."

# One row per image; the embedding vector is kept as a single object cell so
# it travels through the merges below together with its sample_id.
df_images = pd.DataFrame({
    "sample_id": image_sample_ids,
    "img_emb": list(embeddings_img)
})
df_images["sample_id"] = df_images["sample_id"].astype(str)

# 2. Load Text Embeddings
text_df = pd.read_csv(r"C:\Users\hp\OneDrive\Desktop\amazon_ml_challenge\Amazon ML Challenge\student_resource\text_embeddings_with_id_1.csv")
text_df["sample_id"] = text_df["sample_id"].astype(str)
all_cols = list(text_df.columns)
text_cols = [c for c in all_cols if c != "sample_id"]  # every non-id column is used as a feature

# 3. Load Prices
prices_df = pd.read_csv(
    r"C:\Users\hp\OneDrive\Desktop\amazon_ml_challenge\Amazon ML Challenge\student_resource\dataset\train.csv"
)
prices_df["sample_id"] = prices_df["sample_id"].astype(str)

# Prefer an exact "price" column; otherwise fall back to the first column
# whose name contains "price", and fail with a readable message if none does.
if "price" in prices_df.columns:
    price_col = "price"
else:
    price_candidates = [c for c in prices_df.columns if "price" in c.lower()]
    if not price_candidates:
        raise ValueError(f"No price column found in train.csv; columns are {list(prices_df.columns)}")
    price_col = price_candidates[0]

# 4. Merge by sample_id (inner joins: keep only ids present in all three sources)
df_join = prices_df.merge(df_images, on="sample_id", how="inner")
df_join = df_join.merge(text_df[["sample_id"] + text_cols], on="sample_id", how="inner")

if len(df_join) == 0:
    raise ValueError("No matching sample_ids after merge. Check sample_id formats.")

# 5. Build features: image block first, then text block
X_img = np.vstack(df_join["img_emb"].values).astype(np.float32)
X_txt = df_join[text_cols].to_numpy(dtype=np.float32)
X = np.hstack([X_img, X_txt]).astype(np.float32)
y = df_join[price_col].to_numpy()

# Applying log transform on prices (log(1 + price)); training code inverts it with expm1

y_log = np.log1p(y)
print(f"✅ Log range: {y_log.min():.3f} - {y_log.max():.3f}")


# 6. Save
np.save("X_combined_image_text.npy", X)
np.save("y_price.npy", y_log)
# Save ids as a fixed-width string array, like the other id files in this
# notebook. An object-dtype array would be pickled and could not be read back
# by a plain np.load (allow_pickle defaults to False).
np.save("aligned_sample_ids_final.npy", df_join["sample_id"].to_numpy(dtype=str))

# 7. Validation
print(f"X shape: {X.shape}")
print(f"y shape: {y_log.shape}")


print("FINAL FEATURE SUMMARY")
print(f"Image features: {X_img.shape[1]} dims from ViT embeddings")
# Report the measured width only; the earlier message hard-coded a column count.
print(f"Text features: {X_txt.shape[1]} dims from text feature columns")
print(f"Total features: {X.shape[1]}")
print(f"Training samples: {X.shape[0]}")

In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# SMAPE metric
def calculate_smape(y_true, y_pred):
    """Symmetric Mean Absolute Percentage Error, in percent.

    Per element: ``|y_pred - y_true| / ((|y_true| + |y_pred|) / 2)``; the
    result is the mean of those terms times 100. Elements where both values
    are zero contribute 0.

    Args:
        y_true: Actual values (array-like).
        y_pred: Predicted values (array-like, broadcastable against y_true).

    Returns:
        The SMAPE score as a float (lower is better).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    # Divide only where the denominator is non-zero. np.where would still
    # evaluate 0/0 for the masked entries and emit a floating-point warning.
    smape_values = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator, dtype=float),
        where=denominator != 0,
    )
    return np.mean(smape_values) * 100

def lgb_smape_metric(y_pred, train_data):
    """Custom evaluation hook for LightGBM that reports SMAPE.

    Both the labels read from ``train_data`` and the predictions are passed
    through ``np.expm1`` before scoring, i.e. they are treated as
    ``log1p``-transformed values.

    Returns:
        Tuple ``('smape', value, False)``; the trailing ``False`` marks the
        metric as lower-is-better.
    """
    actual_prices = np.expm1(train_data.get_label())
    predicted_prices = np.expm1(y_pred)
    return 'smape', calculate_smape(actual_prices, predicted_prices), False

# Feature matrix and log1p-transformed prices written by the alignment step
X = np.load("X_combined_image_text.npy")
y = np.load("y_price.npy")

# Hold out 20% of the rows for validation (fixed seed)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

train_set = lgb.Dataset(X_train, label=y_train, free_raw_data=False)
val_set = lgb.Dataset(X_val, label=y_val, reference=train_set, free_raw_data=False)

params = dict(
    objective='regression',
    metric='l1',
    boosting='gbdt',
    learning_rate=0.02,
    num_leaves=30,
    max_depth=8,
    feature_fraction=0.58,
    bagging_fraction=0.6,
    bagging_freq=5,
    min_child_samples=70,
    lambda_l2=5.0,
    lambda_l1=0.3,
    min_gain_to_split=0.4,
    verbosity=-1,
    n_jobs=-1,
    seed=42,
)

# Up to 1200 rounds; stop after 30 rounds without improvement, log every 100.
# SMAPE is reported alongside the built-in l1 metric through feval.
gbm = lgb.train(
    params,
    train_set,
    num_boost_round=1200,
    valid_sets=[train_set, val_set],
    valid_names=['train', 'valid'],
    feval=lgb_smape_metric,
    callbacks=[
        lgb.early_stopping(stopping_rounds=30),
        lgb.log_evaluation(period=100),
    ],
)

# Score the hold-out set at the best iteration, back on the price scale
y_pred_log = gbm.predict(X_val, num_iteration=gbm.best_iteration)
y_pred = np.expm1(y_pred_log)
y_val_actual = np.expm1(y_val)

mae = mean_absolute_error(y_val_actual, y_pred)
smape_score = calculate_smape(y_val_actual, y_pred)

print(f"MAE: ${mae:.2f}")
print(f"SMAPE: {smape_score:.2f}%")

In [None]:
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error


# SMAPE Metric Implementation

def calculate_smape(y_true, y_pred):
    """
    Symmetric Mean Absolute Percentage Error (SMAPE)
    Formula: (1/n) * Σ |predicted - actual| / ((|actual| + |predicted|)/2) * 100

    Terms where both the actual and the predicted value are zero count as 0.

    Args:
        y_true: actual values (array-like)
        y_pred: predicted values (array-like, broadcastable against y_true)

    Returns:
        SMAPE score (0-200%, lower is better)
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2

    # Avoid division by zero: divide only where the denominator is non-zero.
    # np.where would still evaluate 0/0 for the masked entries and emit a
    # floating-point warning.
    smape_values = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator, dtype=float),
        where=denominator != 0,
    )
    smape_score = np.mean(smape_values) * 100

    return smape_score


def lgb_smape_metric(y_pred, train_data):
    """
    SMAPE as a custom LightGBM evaluation function.

    Labels (read from ``train_data``) and predictions are both mapped through
    ``np.expm1`` first, undoing a ``log1p`` transform of the prices.

    Returns: (metric_name, metric_value, is_higher_better)
    """
    actual_prices = np.expm1(train_data.get_label())
    predicted_prices = np.expm1(y_pred)

    # Final element False = lower is better
    return 'smape', calculate_smape(actual_prices, predicted_prices), False


# Load the feature matrix and the log1p-transformed prices

X = np.load("X_combined_image_text.npy")  # [N, D]
y = np.load("y_price.npy")                   # [N]

print(f"Loaded data: X shape = {X.shape}, y shape = {y.shape}")


# Hold out 20% of the rows for validation (fixed seed)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Train set: {X_train.shape[0]} samples")
print(f"Val set: {X_val.shape[0]} samples")


# LightGBM datasets; the validation set references the training set

train_set = lgb.Dataset(X_train, label=y_train, free_raw_data=False)
val_set = lgb.Dataset(X_val, label=y_val, reference=train_set, free_raw_data=False)


# Hyper-parameters

params = dict(
    objective='regression',
    metric='l1',
    boosting='gbdt',
    learning_rate=0.02,
    num_leaves=24,
    max_depth=8,
    feature_fraction=0.58,
    bagging_fraction=0.6,
    bagging_freq=5,
    min_child_samples=100,
    lambda_l2=5.0,
    lambda_l1=0.3,
    min_gain_to_split=0.4,
    verbosity=-1,
    n_jobs=-1,
    seed=42,
)


# Train: up to 1200 rounds, stop after 30 rounds without improvement,
# log every 100 rounds, and report SMAPE through the custom feval

print("\nTraining LightGBM model...")
print("="*60)

gbm = lgb.train(
    params,
    train_set,
    num_boost_round=1200,
    valid_sets=[train_set, val_set],
    valid_names=['train', 'valid'],
    feval=lgb_smape_metric,
    callbacks=[
        lgb.early_stopping(stopping_rounds=30),
        lgb.log_evaluation(period=100),
    ],
)

print("="*60)
print(f"Training completed. Best iteration: {gbm.best_iteration}")


# Evaluate on Validation Set

print("\n" + "="*60)
print("VALIDATION METRICS")
print("="*60)

# Get predictions at the best iteration and map both predictions and labels
# back from the log1p scale to prices
y_pred_log = gbm.predict(X_val, num_iteration=gbm.best_iteration)
y_pred = np.expm1(y_pred_log)
y_val_actual = np.expm1(y_val)

# Calculate MAE
mae = mean_absolute_error(y_val_actual, y_pred)
print(f"MAE (Mean Absolute Error): ${mae:.2f}")

# Calculate SMAPE
smape_score = calculate_smape(y_val_actual, y_pred)
print(f"SMAPE (Symmetric Mean Absolute Percentage Error): {smape_score:.2f}%")
# The arrow and the chart emoji below were previously emitted as mis-encoded
# bytes; print the intended characters.
print(f"  → Lower is better (range: 0-200%)")

# Example interpretation (fixed illustrative numbers, not taken from the model)
print(f"\n📊 Interpretation:")
print(f"   - If actual price = $100 and predicted = $120")
print(f"   - SMAPE = |100-120| / ((|100|+|120|)/2) * 100 = 18.18%")





In [None]:
# Write the trained booster (`gbm`, produced by the training cell) to disk
# and report where it went.
model_path = "model_block7_no_pca.txt"
gbm.save_model(filename=model_path)
print("\n Model saved to: " + model_path)

