In [1]:
import pandas as pd
import numpy as np
from PIL import Image
from io import BytesIO
from tqdm import tqdm
import re
import warnings
import os

# --- Scikit-learn & LightGBM Imports ---
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from scipy.sparse import hstack, csr_matrix
import lightgbm as lgb

# --- PyTorch Imports ---
import torch
import torch.nn as nn
from torchvision import transforms

warnings.filterwarnings('ignore', category=UserWarning, module='sklearn')

In [2]:
def smape_metric(y_true, y_pred):
    """Custom SMAPE metric for LightGBM training."""
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    # Add a small epsilon to avoid division by zero
    denominator[denominator == 0] = 1e-6
    smape_val = np.mean(numerator / denominator) * 100
    # LightGBM requires (metric_name, value, is_higher_better)
    return 'smape', smape_val, False

def calculate_smape(y_true, y_pred):
    """Calculates SMAPE for final evaluation."""
    return smape_metric(y_true, y_pred)[1]

In [3]:
# --- 1. Text Feature Engineering Functions ---
def clean_text(text):
    if not isinstance(text, str): return ""
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def extract_text_features(df):
    df['item_name'] = df['catalog_content'].str.extract(r"Item Name: (.*?)\n").iloc[:, 0].fillna('')
    df['description'] = df['catalog_content'].str.extract(r"Product Description: (.*?)\nValue:").iloc[:, 0].fillna('')
    bullet_points_list = df['catalog_content'].str.findall(r"Bullet Point \d+: (.*)")
    df['bullet_points'] = bullet_points_list.apply(lambda x: ' '.join(x))
    df['quantity_value'] = df['catalog_content'].str.extract(r"Value: ([\d.]+)").astype(float)
    pack_size = df['catalog_content'].str.extract(r"\(Pack of (\d+)\)").iloc[:, 0].astype(float)
    df['normalized_quantity'] = df['quantity_value'].fillna(1) * pack_size.fillna(1)
    df['brand'] = df['item_name'].str.split().str[0]
    df['num_bullet_points'] = bullet_points_list.apply(len)
    df['item_name_length'] = df['item_name'].str.len()
    df['description_length'] = df['description'].str.len()
    full_text = df['item_name'] + ' ' + df['bullet_points'] + ' ' + df['description']
    df['processed_text'] = full_text.apply(clean_text)
    keywords = ['premium', 'organic', 'gluten free', 'vegan', 'natural', 'non-gmo']
    for keyword in keywords:
        df[f'is_{keyword.replace(" ", "_")}'] = df['processed_text'].str.contains(keyword).astype(int)
    df.fillna({'normalized_quantity': 1, 'item_name_length': 0, 'description_length': 0}, inplace=True)
    return df

In [4]:
# --- 2. PyTorch U-Net Encoder for Image Features ---
class ConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels, dropout_rate=0.1):
        super().__init__()
        self.conv_block = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1, bias=False),
            nn.ReLU(inplace=True), nn.Dropout(p=dropout_rate),
            nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1, bias=False),
            nn.ReLU(inplace=True)
        )
    def forward(self, x): return self.conv_block(x)

class UNetEncoder(nn.Module):
    def __init__(self, in_channels=3):
        super().__init__()
        self.c1, self.p1 = ConvBlock(in_channels, 16), nn.MaxPool2d(2)
        self.c2, self.p2 = ConvBlock(16, 32), nn.MaxPool2d(2)
        self.c3, self.p3 = ConvBlock(32, 64, 0.2), nn.MaxPool2d(2)
        self.c4, self.p4 = ConvBlock(64, 128, 0.2), nn.MaxPool2d(2)
        self.c5 = ConvBlock(128, 256, 0.3)
        self.pool = nn.AdaptiveAvgPool2d((1, 1))
    def forward(self, x):
        x = self.p1(self.c1(x)); x = self.p2(self.c2(x)); x = self.p3(self.c3(x))
        x = self.p4(self.c4(x)); x = self.c5(x)
        return torch.flatten(self.pool(x), 1)

def load_and_preprocess_image(image_path, transform):
    try:
        img = Image.open(image_path).convert('RGB')
        return transform(img).unsqueeze(0)
    except (FileNotFoundError, OSError):
        return None

def create_image_features(df, image_folder_path, encoder_model, device):
    encoder_model.eval().to(device)
    transform = transforms.Compose([
        transforms.Resize((128, 128)), transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    all_features = []
    with torch.no_grad():
        for sample_id in tqdm(df['sample_id'], desc=f"Processing Images in {image_folder_path}"):
            image_path = os.path.join(image_folder_path, f"{sample_id}.jpg")
            img_tensor = load_and_preprocess_image(image_path, transform)
            if img_tensor is not None:
                features = encoder_model(img_tensor.to(device))
                all_features.append(features.cpu().numpy()[0])
            else:
                all_features.append(np.zeros(256))
    return np.array(all_features)

In [9]:
TRAIN_IMAGE_DIR = r'C:\Amazon ML Challange\train_images'
TEST_IMAGE_DIR = r'C:\Amazon ML Challange\student_resource\test_images'

print("Loading data...")
train_df = pd.read_csv('C:\\Amazon ML Challange\\student_resource\\dataset\\train.csv')
test_df = pd.read_csv('C:\\Amazon ML Challange\\student_resource\\dataset\\test.csv')
y_train = train_df['price']
test_ids = test_df['sample_id']

print("Processing text features...")
train_df_text = extract_text_features(train_df.copy())
test_df_text = extract_text_features(test_df.copy())

tfidf = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1, 2))
train_text_vec = tfidf.fit_transform(train_df_text['processed_text'])
test_text_vec = tfidf.transform(test_df_text['processed_text'])

num_cols = [c for c in train_df_text.columns if train_df_text[c].dtype != 'object' and c not in ['sample_id', 'price', 'catalog_content']]
train_text_features = hstack([train_text_vec, csr_matrix(train_df_text[num_cols].values)])
test_text_features = hstack([test_text_vec, csr_matrix(test_df_text[num_cols].values)])

print("\nProcessing image features...")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder = UNetEncoder()
train_image_features = create_image_features(train_df, TRAIN_IMAGE_DIR, encoder, device)
test_image_features = create_image_features(test_df, TEST_IMAGE_DIR, encoder, device)

print("\nCombining text and image features...")
X_train_full = hstack([train_text_features, csr_matrix(train_image_features)]).tocsr()
X_test = hstack([test_text_features, csr_matrix(test_image_features)]).tocsr()

Loading data...
Processing text features...

Processing image features...


Processing Images in C:\Amazon ML Challange\train_images: 100%|██████████| 75000/75000 [1:28:13<00:00, 14.17it/s]
Processing Images in C:\Amazon ML Challange\student_resource\test_images: 100%|██████████| 75000/75000 [1:22:46<00:00, 15.10it/s]



Combining text and image features...


In [10]:
print("\nSplitting data for validation...")
X_train, X_val, y_train_split, y_val = train_test_split(
    X_train_full, y_train, test_size=0.2, random_state=42
)
print(f"Training set shape: {X_train.shape}, Validation set shape: {X_val.shape}")

print("\nTraining LightGBM model with validation...")
lgbm = lgb.LGBMRegressor(
    objective='regression_l1',
    n_estimators=2000,
    learning_rate=0.05,
    num_leaves=31,
    max_depth=-1,
    min_child_samples=20,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

lgbm.fit(
    X_train, y_train_split,
    eval_set=[(X_val, y_val)],
    eval_metric=smape_metric,
    callbacks=[
        lgb.early_stopping(100, verbose=True),
        lgb.log_evaluation(period=10)
    ]
)

print("\nEvaluating model on validation set...")
val_preds = lgbm.predict(X_val)
mae = mean_absolute_error(y_val, val_preds)
print(f"Validation Mean Absolute Error: {mae:.4f}")

print("\nRetraining model on full training data for final submission...")
lgbm_final = lgb.LGBMRegressor(
    objective='regression_l1',
    n_estimators=lgbm.best_iteration_,
    learning_rate=0.05,
    num_leaves=31,
    max_depth=-1,
    min_child_samples=20,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

lgbm_final.fit(X_train_full, y_train,
                       eval_set=[(X_train_full, y_train)],
                       eval_metric=smape_metric,
                       callbacks=[lgb.log_evaluation(period=10)])

print("\nPredicting on test data...")
predictions = lgbm_final.predict(X_test)

submission = pd.DataFrame({'sample_id': test_ids, 'price': predictions})
submission['price'] = submission['price'].clip(0)
submission.to_csv('submission.csv', index=False)

print("\n✅ Submission file 'submission.csv' created successfully!")


Splitting data for validation...
Training set shape: (60000, 5267), Validation set shape: (15000, 5267)

Training LightGBM model with validation...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.614596 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 718380
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 5266
[LightGBM] [Info] Start training from score 14.090000
Training until validation scores don't improve for 100 rounds
[10]	valid_0's l1: 15.4516	valid_0's smape: 67.1152
[20]	valid_0's l1: 14.735	valid_0's smape: 64.2166
[30]	valid_0's l1: 14.2696	valid_0's smape: 62.444
[40]	valid_0's l1: 13.9534	valid_0's smape: 61.2182
[50]	valid_0's l1: 13.6907	valid_0's smape: 60.1931
[60]	valid_0's l1: 13.4843	valid_0's smape: 59.394
[70]	valid_0's l1: 13.3359	valid_0's smape: 58.7905
[80]	valid_0's l1: 13.2086	valid_0's smape: 58.261
[90]	valid_0's l1: 13.1101	val