# In [None]:
import pandas as pd
import numpy as np
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, RobustScaler, StandardScaler
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings('ignore')

# ==================== ENHANCED FEATURE EXTRACTION ====================

class OptimizedFeatureExtractor:
    """Build a numeric feature table from ``catalog_content`` text and optional image features.

    The extractor is stateful. Calling :meth:`process_catalog_content` with
    ``is_training=True`` fits the TF-IDF vectorizers, the SVD reducers, the
    category label encoder, the target-encoding tables and (when image
    features are supplied) the image PCA. Calls with ``is_training=False``
    reuse that fitted state.
    """

    # Multiplier that converts one unit of the key into ounces. Weight and
    # volume units share the table, so fluid ounces count 1:1 as ounces.
    # Plural and spelled-out forms are listed because the ``Unit:`` field and
    # the regex fallback look the unit up verbatim (lower-cased) in this table;
    # a missing key would otherwise be treated as "no unit" or as factor 1.
    UNIT_CONVERSIONS = {
        'oz': 1, 'ounce': 1, 'ounces': 1,
        'lb': 16, 'lbs': 16, 'pound': 16, 'pounds': 16,
        'g': 0.035274, 'gram': 0.035274, 'grams': 0.035274,
        'kg': 35.274, 'kilogram': 35.274, 'kilograms': 35.274,
        'ml': 0.033814, 'milliliter': 0.033814, 'milliliters': 0.033814,
        'l': 33.814, 'liter': 33.814, 'liters': 33.814,
        'fl oz': 1, 'fluid ounce': 1, 'fluid ounces': 1, 'fl. oz': 1, 'floz': 1,
    }

    def __init__(self, n_image_components=65):
        """Create the (unfitted) vectorizers, reducers and encoding tables.

        Args:
            n_image_components: Number of PCA components kept from the image
                features. Must be at least 6 because the first six components
                are used for image/text interaction features.
        """
        # Increased TF-IDF dimensions for better text capture
        self.tfidf_name = TfidfVectorizer(
            max_features=220, ngram_range=(1, 3), min_df=2, max_df=0.90,
            sublinear_tf=True, strip_accents='unicode', token_pattern=r'\b\w+\b'
        )
        self.tfidf_bullets = TfidfVectorizer(
            max_features=280, ngram_range=(1, 3), min_df=2, max_df=0.90,
            sublinear_tf=True, strip_accents='unicode'
        )
        self.char_vectorizer = TfidfVectorizer(
            analyzer='char', ngram_range=(3, 4), max_features=50, min_df=2
        )

        # Enhanced SVD for better dimensionality reduction
        self.svd_name = TruncatedSVD(n_components=42, random_state=42)
        self.svd_bullets = TruncatedSVD(n_components=55, random_state=42)
        self.svd_char = TruncatedSVD(n_components=18, random_state=42)

        self.n_image_components = n_image_components
        self.image_pca = None
        self.scaler = RobustScaler()

        # Per-instance copy so callers may extend the table without touching the class default.
        self.unit_conversions = dict(self.UNIT_CONVERSIONS)

        # Target-encoding state; the means stay None until a training call fits them.
        self.brand_stats = {}
        self.category_stats = {}
        self.brand_mean = None
        self.cat_mean = None
        self.le_category = LabelEncoder()

    @staticmethod
    def _first_group(pattern, text, default):
        """Return group 1 of the first *pattern* match in *text*, or *default*."""
        if pd.isna(text):
            return default
        match = re.search(pattern, str(text))
        return match.group(1) if match else default

    def extract_value_field(self, text):
        """Return the number following ``Value:`` as a float, or None when absent."""
        if pd.isna(text):
            return None
        match = re.search(r'Value:\s*(\d+\.?\d*)', str(text), re.IGNORECASE)
        return float(match.group(1)) if match else None

    def extract_unit_field(self, text):
        """Return the lower-cased text following ``Unit:`` on its line, or ``'unknown'``."""
        if pd.isna(text):
            return 'unknown'
        match = re.search(r'Unit:\s*([^\n]+)', str(text), re.IGNORECASE)
        return match.group(1).strip().lower() if match else 'unknown'

    def extract_size_features_advanced(self, text):
        """Derive size, pack-count and volume features from one catalog entry.

        The ``Value:``/``Unit:`` fields are preferred; when they do not yield a
        size, the first matching "<number> <unit>" phrase in the text is used.

        Returns:
            dict with the same keys as :meth:`_empty_size_features`.
        """
        if pd.isna(text):
            return self._empty_size_features()
        text = str(text).lower()
        value = self.extract_value_field(text)
        unit = self.extract_unit_field(text)

        pack_patterns = [
            r'pack\s+of\s+(\d+)', r'\(pack\s+of\s+(\d+)\)', r'(\d+)\s*-?\s*pack',
            r'(\d+)\s+count', r'(\d+)\s*ct\b', r'case\s+of\s+(\d+)',
            r'(\d+)\s+per\s+case', r'(\d+)\s*pk\b', r'(\d+)\s+units?', r'set\s+of\s+(\d+)'
        ]
        pack_count = 1
        for pattern in pack_patterns:
            match = re.search(pattern, text)
            if match:
                pack_count = int(match.group(1))
                break

        size_oz = 0
        if value and unit and unit in self.unit_conversions:
            size_oz = value * self.unit_conversions[unit]

        if size_oz == 0:
            # Fallback: the first pattern (in priority order) that matches anywhere wins.
            size_patterns = [
                r'(\d+\.?\d*)\s*x?\s*(fl\s*oz|fluid\s*ounce|fl\.\s*oz)',
                r'(\d+\.?\d*)\s*-?\s*(oz|ounce)(?!\s*(pack|ct|count))',
                r'(\d+\.?\d*)\s*(lb|lbs|pound|pounds)',
                r'(\d+\.?\d*)\s*(g|gram|grams)(?!\s*(pack|ct))',
                r'(\d+\.?\d*)\s*(kg|kilogram)', r'(\d+\.?\d*)\s*(ml|milliliter)',
                r'(\d+\.?\d*)\s*(l|liter)(?!bs)',
            ]
            for pattern in size_patterns:
                match = re.search(pattern, text)
                if match:
                    size = float(match.group(1))
                    unit_type = match.group(2).strip()
                    # Unknown spellings (e.g. "fl  oz" with extra spaces) fall back to factor 1.
                    size_oz = size * self.unit_conversions.get(unit_type, 1)
                    break

        # Total volume is computed from the uncapped values, then everything is clipped.
        total_volume = size_oz * pack_count
        size_oz = min(size_oz, 500)
        total_volume = min(total_volume, 2000)
        pack_count = min(pack_count, 100)

        return {
            'size_oz': size_oz,
            'pack_count': pack_count,
            'total_volume_oz': total_volume,
            'has_size_info': 1 if size_oz > 0 else 0,
            'log_size': np.log1p(size_oz),
            'log_total_volume': np.log1p(total_volume),
            'log_pack': np.log1p(pack_count),
            'sqrt_size': np.sqrt(size_oz) if size_oz > 0 else 0,
            'is_bulk': 1 if pack_count >= 6 else 0,
            'is_single': 1 if pack_count == 1 else 0,
            'value_field': value if value else 0,
            'has_value_field': 1 if value else 0,
        }

    def _empty_size_features(self):
        """Size features used when the catalog entry is missing."""
        return {
            'size_oz': 0, 'pack_count': 1, 'total_volume_oz': 0, 'has_size_info': 0,
            'log_size': 0, 'log_total_volume': 0, 'log_pack': 0, 'sqrt_size': 0,
            'is_bulk': 0, 'is_single': 1, 'value_field': 0, 'has_value_field': 0
        }

    def extract_premium_advanced(self, text):
        """Count premium/quality/dietary keywords (substring matches) in one catalog entry."""
        if pd.isna(text):
            return self._empty_premium_features()
        text_lower = str(text).lower()

        ultra_premium = ['imported', 'artisan', 'handcrafted', 'reserve', 'aged', 'vintage', 'estate']
        premium = ['organic', 'premium', 'gourmet', 'specialty', 'select', 'finest', 'quality']
        quality = ['natural', 'pure', 'authentic', 'traditional', 'fresh', 'real']
        dietary = ['gluten-free', 'gluten free', 'non-gmo', 'non gmo', 'kosher',
                  'halal', 'vegan', 'vegetarian', 'dairy-free', 'sugar-free']

        ultra_premium_count = sum(1 for w in ultra_premium if w in text_lower)
        premium_count = sum(1 for w in premium if w in text_lower)
        quality_count = sum(1 for w in quality if w in text_lower)
        dietary_count = sum(1 for w in dietary if w in text_lower)

        weighted_premium = (ultra_premium_count * 2.5 + premium_count * 1.5 +
                          quality_count * 0.8 + dietary_count * 1.2)

        return {
            'ultra_premium_count': ultra_premium_count,
            'premium_count': premium_count,
            'quality_count': quality_count,
            'dietary_count': dietary_count,
            'weighted_premium_score': weighted_premium,
            'is_premium': 1 if (ultra_premium_count > 0 or premium_count > 0) else 0,
        }

    def _empty_premium_features(self):
        """Premium features used when the catalog entry is missing."""
        return {k: 0 for k in ['ultra_premium_count', 'premium_count', 'quality_count',
                               'dietary_count', 'weighted_premium_score', 'is_premium']}

    def extract_category_v2(self, text):
        """Assign the first keyword-matched category and its price tier.

        Categories are tested in the order of the ``keywords`` dict, so an
        entry mentioning both "cheese" and "sauce" is labelled ``'cheese'``.
        Entries with no keyword hit are ``'other'`` with tier 1.
        """
        if pd.isna(text):
            return {'category': 'other', 'category_price_tier': 1}
        text_lower = str(text).lower()

        categories = {
            'wine': 4, 'cheese': 3, 'olive_oil': 3, 'chocolate': 3,
            'sauce': 2, 'dressing': 2, 'condiment': 2, 'cookie': 2,
            'soup': 2, 'seasoning': 2, 'snack': 2, 'beverage': 2,
            'powder': 1, 'water': 1, 'basic': 1
        }

        keywords = {
            'wine': ['wine', 'vino', 'merlot', 'cabernet', 'chardonnay'],
            'cheese': ['cheese', 'cheddar', 'mozzarella', 'parmesan', 'brie'],
            'olive_oil': ['olive oil', 'extra virgin'],
            'chocolate': ['chocolate', 'cocoa'],
            'sauce': ['sauce', 'salsa', 'marinara'],
            'dressing': ['dressing', 'vinaigrette'],
            'condiment': ['ketchup', 'mustard', 'mayo', 'relish'],
            'soup': ['soup', 'broth', 'stock'],
            'seasoning': ['seasoning', 'spice', 'herb'],
            'snack': ['chips', 'popcorn', 'crackers', 'pretzels'],
            'beverage': ['juice', 'soda', 'drink', 'tea', 'coffee'],
            'cookie': ['cookie', 'biscuit', 'wafer'],
            'powder': ['powder', 'mix'],
            'water': ['water'],
            'basic': ['salt', 'pepper', 'sugar']
        }

        detected_category = 'other'
        price_tier = 1
        for cat, kws in keywords.items():
            if any(kw in text_lower for kw in kws):
                detected_category = cat
                price_tier = categories.get(cat, 1)
                break

        return {'category': detected_category, 'category_price_tier': price_tier}

    def extract_text_features_advanced(self, text):
        """Return length, word-count and bullet-count statistics for one catalog entry."""
        if pd.isna(text):
            return self._empty_text_features()
        text = str(text)

        words = re.findall(r'\b\w+\b', text)
        unique_words = set(words)
        bullets = re.findall(r'Bullet Point \d+:', text)

        return {
            'char_count': len(text),
            'word_count': len(words),
            'unique_word_count': len(unique_words),
            'bullet_count': len(bullets),
            'log_char_count': np.log1p(len(text)),
            'log_word_count': np.log1p(len(words)),
            # +1 keeps the ratio defined for entries with no words.
            'word_diversity': len(unique_words) / (len(words) + 1),
        }

    def _empty_text_features(self):
        """Text features used when the catalog entry is missing."""
        return {k: 0 for k in ['char_count', 'word_count', 'unique_word_count',
                               'bullet_count', 'log_char_count', 'log_word_count',
                               'word_diversity']}

    def compute_target_encoding(self, df, target_col='price', is_training=True):
        """Encode brand and category as smoothed mean target values.

        The "brand" is the first whitespace/comma-delimited token after
        ``Item Name:``, lower-cased. Unseen brands/categories receive the
        global training mean.

        NOTE: with ``is_training=True`` the statistics are computed on *df*
        and mapped back onto the same rows, so each training row's encoding
        includes its own target value.

        Args:
            df: Frame with a ``catalog_content`` column (and *target_col* when training).
            target_col: Name of the target column used to fit the statistics.
            is_training: Fit the statistics on *df* when True; reuse fitted ones otherwise.

        Returns:
            Tuple ``(brand_encoded, category_encoded)`` of numpy arrays, one value per row.

        Raises:
            RuntimeError: If called with ``is_training=False`` before any training call.
        """
        brands = df['catalog_content'].apply(
            lambda x: self._first_group(r'Item Name:\s*([^,\s]+)', x, 'unknown').lower()
        )
        categories = df['catalog_content'].apply(lambda x: self.extract_category_v2(x)['category'])

        if is_training:
            global_mean = df[target_col].mean()

            # Build the stats frame positionally; an index-based join would
            # duplicate rows whenever df's index contains repeated labels.
            stats_frame = pd.DataFrame({
                'brand': brands.to_numpy(),
                'category': categories.to_numpy(),
                target_col: df[target_col].to_numpy(),
            })
            self.brand_stats = stats_frame.groupby('brand')[target_col].agg(['mean', 'count']).to_dict('index')
            self.brand_mean = global_mean
            self.category_stats = stats_frame.groupby('category')[target_col].agg(['mean', 'count']).to_dict('index')
            self.cat_mean = global_mean
        elif getattr(self, 'brand_mean', None) is None or getattr(self, 'cat_mean', None) is None:
            raise RuntimeError("Target encoding not fitted. Train model first.")

        m = 10  # smoothing strength: weight of the global mean, in pseudo-observations
        brand_encoded = brands.map(lambda x: self._smooth_mean(
            self.brand_stats.get(x, {'mean': self.brand_mean, 'count': 0}),
            self.brand_mean, m
        ))

        cat_encoded = categories.map(lambda x: self._smooth_mean(
            self.category_stats.get(x, {'mean': self.cat_mean, 'count': 0}),
            self.cat_mean, m
        ))

        return brand_encoded.values, cat_encoded.values

    def _smooth_mean(self, stats, global_mean, m):
        """Blend a group mean with *global_mean*, weighting the latter as *m* observations."""
        count = stats['count']
        mean = stats['mean']
        return (count * mean + m * global_mean) / (count + m)

    def process_catalog_content(self, df, is_training=True, prices=None, image_features_df=None):
        """Extract the full feature table for *df*.

        Args:
            df: Frame with a ``catalog_content`` column.
            is_training: Fit all transformers on *df* when True; otherwise reuse fitted ones.
            prices: Target values aligned positionally with *df*; required when training.
            image_features_df: Optional frame of image features, one row per row
                of *df* in the same order; a ``sample_id`` column is dropped if present.

        Returns:
            DataFrame of numeric features with a fresh ``RangeIndex``; infinities
            and NaNs are replaced by 0.

        Raises:
            ValueError: If *df* is empty, prices are missing for training, or the
                image features have a different number of rows than *df*.
            RuntimeError: If inference is requested before the needed state was fitted.
        """
        if is_training and prices is None:
            raise ValueError("Prices required for training")
        if len(df) == 0:
            raise ValueError("Cannot extract features from an empty DataFrame")

        print("="*70)
        print(" " * 20 + "FEATURE EXTRACTION")
        print("="*70)

        catalog = df['catalog_content']
        item_names = catalog.apply(lambda x: self._first_group(r'Item Name:\s*([^\n]+)', x, ''))
        bullet_text = catalog.apply(
            lambda x: ' '.join(re.findall(r'Bullet Point \d+:\s*([^\n]+)', str(x)))
            if pd.notna(x) else ''
        )

        # Hand-crafted per-row features; each extractor returns a dict with fixed keys.
        features_dict = {}
        for func in (
            self.extract_size_features_advanced,
            self.extract_premium_advanced,
            self.extract_category_v2,
            self.extract_text_features_advanced,
        ):
            records = [func(x) for x in catalog]
            for k in records[0]:
                features_dict[k] = [record[k] for record in records]

        numerical_df = pd.DataFrame(features_dict)

        if prices is not None:
            temp_df = df.copy()
            # Assign positionally so a Series with a different index cannot misalign.
            temp_df['price'] = np.asarray(prices)
            brand_encoded, cat_encoded = self.compute_target_encoding(temp_df, is_training=is_training)
        else:
            brand_encoded, cat_encoded = self.compute_target_encoding(df, is_training=False)

        numerical_df['brand_target_enc'] = brand_encoded
        numerical_df['category_target_enc'] = cat_encoded

        # TF-IDF features
        if is_training:
            tfidf_name = self.tfidf_name.fit_transform(item_names)
            tfidf_bullets = self.tfidf_bullets.fit_transform(bullet_text)
            char_features = self.char_vectorizer.fit_transform(item_names)
        else:
            tfidf_name = self.tfidf_name.transform(item_names)
            tfidf_bullets = self.tfidf_bullets.transform(bullet_text)
            char_features = self.char_vectorizer.transform(item_names)

        tfidf_name_svd = self.svd_name.fit_transform(tfidf_name) if is_training else self.svd_name.transform(tfidf_name)
        tfidf_bullets_svd = self.svd_bullets.fit_transform(tfidf_bullets) if is_training else self.svd_bullets.transform(tfidf_bullets)
        char_svd = self.svd_char.fit_transform(char_features) if is_training else self.svd_char.transform(char_features)

        tfidf_name_df = pd.DataFrame(tfidf_name_svd, columns=[f'name_tfidf_{i}' for i in range(tfidf_name_svd.shape[1])])
        tfidf_bullets_df = pd.DataFrame(tfidf_bullets_svd, columns=[f'bullet_tfidf_{i}' for i in range(tfidf_bullets_svd.shape[1])])
        char_df = pd.DataFrame(char_svd, columns=[f'char_ngram_{i}' for i in range(char_svd.shape[1])])

        if is_training:
            numerical_df['category_encoded'] = self.le_category.fit_transform(numerical_df['category'])
        else:
            # One dict lookup per row instead of one LabelEncoder.transform call
            # per row; categories not seen during training are encoded as -1.
            code_of = {label: code for code, label in enumerate(self.le_category.classes_)}
            numerical_df['category_encoded'] = (
                numerical_df['category'].map(code_of).fillna(-1).astype(int)
            )

        numerical_df = numerical_df.drop(['category'], axis=1)

        # Interaction features
        numerical_df['size_premium_int'] = numerical_df['size_oz'] * numerical_df['weighted_premium_score']
        numerical_df['size_tier_int'] = numerical_df['size_oz'] * numerical_df['category_price_tier']
        numerical_df['brand_cat_int'] = numerical_df['brand_target_enc'] * numerical_df['category_target_enc']
        numerical_df['pack_premium_int'] = numerical_df['pack_count'] * numerical_df['weighted_premium_score']
        numerical_df['brand_size_int'] = numerical_df['brand_target_enc'] * numerical_df['log_size']

        image_features_list = []
        if image_features_df is not None:
            print(f"\nâœ“ Loading pre-extracted image features...")
            print(f"  Image features shape: {image_features_df.shape}")

            img_feat = image_features_df.copy()
            if 'sample_id' in img_feat.columns:
                img_feat = img_feat.drop('sample_id', axis=1)

            if len(img_feat) != len(df):
                raise ValueError(
                    f"Image features have {len(img_feat)} rows but the catalog has {len(df)}"
                )

            n_components = self.n_image_components

            if is_training:
                print(f"  Applying PCA: {img_feat.shape[1]} â†’ {n_components} components")
                self.image_pca = PCA(n_components=n_components, random_state=42)
                img_reduced = self.image_pca.fit_transform(img_feat)
                explained_var = self.image_pca.explained_variance_ratio_.sum()
                print(f"  Explained variance: {explained_var:.2%}")
            else:
                if self.image_pca is None:
                    raise RuntimeError("Image PCA not fitted. Train model first.")
                img_reduced = self.image_pca.transform(img_feat)

            # Name columns from the actual output width rather than the requested one.
            img_df = pd.DataFrame(img_reduced, columns=[f'img_pca_{i}' for i in range(img_reduced.shape[1])])

            # Enhanced image-text interactions (first six principal components)
            numerical_df['img_size_int'] = img_reduced[:, 0] * numerical_df['log_size']
            numerical_df['img_premium_int'] = img_reduced[:, 1] * numerical_df['weighted_premium_score']
            numerical_df['img_tier_int'] = img_reduced[:, 2] * numerical_df['category_price_tier']
            numerical_df['img_brand_int'] = img_reduced[:, 3] * numerical_df['brand_target_enc']
            numerical_df['img_pack_int'] = img_reduced[:, 4] * numerical_df['log_pack']
            numerical_df['img_quality_int'] = img_reduced[:, 5] * numerical_df['quality_count']

            image_features_list = [img_df]
            print(f"âœ“ Added {n_components} PCA image features + 6 interaction features")

        final_features = pd.concat([
            numerical_df.reset_index(drop=True),
            tfidf_name_df.reset_index(drop=True),
            tfidf_bullets_df.reset_index(drop=True),
            char_df.reset_index(drop=True)
        ] + image_features_list, axis=1)

        final_features = final_features.replace([np.inf, -np.inf], np.nan).fillna(0)
        print(f"âœ“ Total features: {final_features.shape[1]}\n")
        return final_features


# ==================== ENHANCED NEURAL NETWORK ====================

class EnhancedPricePredictor(nn.Module):
    """Feed-forward regressor: input BatchNorm, five Linear-BatchNorm-ReLU-Dropout stages, linear head.

    Layer widths are ``input_dim -> 384 -> 256 -> 128 -> 64 -> 32 -> 1``.
    Dropout decreases with depth (``dropout``, ``dropout``, ``0.75x``,
    ``0.5x``, ``0.3x``). There are no skip/residual connections.

    Args:
        input_dim: Number of input features per sample.
        dropout: Base dropout probability for the hidden stages.
    """

    def __init__(self, input_dim, dropout=0.50):
        super().__init__()

        self.input_bn = nn.BatchNorm1d(input_dim)
        self.input_dropout = nn.Dropout(0.20)

        # Plain stack of progressively narrower fully-connected stages
        self.fc1 = nn.Linear(input_dim, 384)
        self.bn1 = nn.BatchNorm1d(384)
        self.dropout1 = nn.Dropout(dropout)

        self.fc2 = nn.Linear(384, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.dropout2 = nn.Dropout(dropout)

        self.fc3 = nn.Linear(256, 128)
        self.bn3 = nn.BatchNorm1d(128)
        self.dropout3 = nn.Dropout(dropout * 0.75)

        self.fc4 = nn.Linear(128, 64)
        self.bn4 = nn.BatchNorm1d(64)
        self.dropout4 = nn.Dropout(dropout * 0.5)

        self.fc5 = nn.Linear(64, 32)
        self.bn5 = nn.BatchNorm1d(32)
        self.dropout5 = nn.Dropout(dropout * 0.3)

        self.output = nn.Linear(32, 1)

        self._initialize_weights()

    def _initialize_weights(self):
        """Kaiming-normal init for every Linear layer; BatchNorm starts as identity (weight 1, bias 0)."""
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm1d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        """Map a ``(batch, input_dim)`` tensor to a ``(batch,)`` tensor of predictions."""
        x = self.input_bn(x)
        x = self.input_dropout(x)

        x = F.relu(self.bn1(self.fc1(x)))
        x = self.dropout1(x)

        x = F.relu(self.bn2(self.fc2(x)))
        x = self.dropout2(x)

        x = F.relu(self.bn3(self.fc3(x)))
        x = self.dropout3(x)

        x = F.relu(self.bn4(self.fc4(x)))
        x = self.dropout4(x)

        x = F.relu(self.bn5(self.fc5(x)))
        x = self.dropout5(x)

        output = self.output(x)
        # Squeeze only the feature axis: a bare squeeze() would also drop the
        # batch axis for a single-sample batch and return a 0-d tensor.
        return output.squeeze(-1)


class ImprovedLoss(nn.Module):
    """Composite loss on log1p-scale predictions.

    ``0.20 * price-weighted MSE + 0.20 * SmoothL1 + 0.60 * SMAPE``, where the
    MSE and SmoothL1 terms are computed on the log1p scale and the SMAPE term
    (range 0-200) on the ``expm1``-restored scale.

    Args:
        epsilon: Constant added to the SMAPE denominator.
        max_log_value: Upper clamp applied to predictions before ``expm1`` in
            the SMAPE term, so an extreme prediction cannot overflow to ``inf``
            and turn the loss into NaN.
    """

    def __init__(self, epsilon=0.1, max_log_value=20.0):
        super().__init__()
        self.huber = nn.SmoothL1Loss()
        self.epsilon = epsilon
        self.max_log_value = max_log_value

    def forward(self, pred, target):
        """Return the scalar loss for log-scale ``pred`` and ``target`` tensors of equal shape."""
        # Clamp only affects the SMAPE term; the MSE/SmoothL1 terms below still
        # see the raw prediction and keep pulling a saturated output back.
        pred_orig = torch.expm1(torch.clamp(pred, max=self.max_log_value))
        target_orig = torch.expm1(target)

        huber_loss = self.huber(pred, target)

        smape_loss = 200 * torch.mean(
            torch.abs(pred_orig - target_orig)
            / (torch.abs(pred_orig) + torch.abs(target_orig) + self.epsilon)
        )

        # Heavier weight on cheap items: <10 -> 1.8, <20 -> 1.3, <50 -> 1.1, else 1.0.
        weights = torch.where(target_orig < 10, 1.8,
                   torch.where(target_orig < 20, 1.3,
                   torch.where(target_orig < 50, 1.1, 1.0)))
        weighted_mse = torch.mean(weights * (pred - target) ** 2)

        total_loss = 0.20 * weighted_mse + 0.20 * huber_loss + 0.60 * smape_loss
        return total_loss


def calculate_metrics(y_true, y_pred):
    """Compute regression metrics between true and predicted values.

    Args:
        y_true: Ground-truth values (torch tensor, numpy array or sequence).
        y_pred: Predicted values, same length as ``y_true``.

    Returns:
        dict with keys ``SMAPE`` and ``MAPE`` (percent), ``MSE``, ``RMSE``,
        ``MAE`` and ``R2``.
    """
    def _to_numpy(values):
        if torch.is_tensor(values):
            return values.detach().cpu().numpy()
        return np.asarray(values, dtype=float)

    y_true_np = _to_numpy(y_true)
    y_pred_np = _to_numpy(y_pred)

    abs_err = np.abs(y_pred_np - y_true_np)
    # The epsilon belongs in the denominator so a (0, 0) pair contributes 0
    # instead of producing 0/0 = NaN.
    smape = 100 * np.mean(abs_err / ((np.abs(y_pred_np) + np.abs(y_true_np)) / 2 + 1e-8))
    mse = np.mean((y_pred_np - y_true_np) ** 2)
    rmse = np.sqrt(mse)
    mae = np.mean(abs_err)
    mape = 100 * np.mean(np.abs((y_true_np - y_pred_np) / (y_true_np + 1e-8)))
    ss_res = np.sum((y_true_np - y_pred_np) ** 2)
    ss_tot = np.sum((y_true_np - np.mean(y_true_np)) ** 2)
    r2 = 1 - (ss_res / (ss_tot + 1e-8))

    return {'SMAPE': smape, 'MSE': mse, 'RMSE': rmse, 'MAE': mae, 'MAPE': mape, 'R2': r2}


# ==================== TRAINING WITH K-FOLD ====================

def train_with_kfold(df, train_image_df, n_folds=5):
    """Train one ``EnhancedPricePredictor`` per fold and report out-of-fold metrics.

    Features are extracted once for the whole frame, then each fold fits its
    own ``RobustScaler`` and network on log1p(price). The checkpoint with the
    lowest validation SMAPE is kept per fold (early stopping, patience 30).

    Args:
        df: Training frame with ``catalog_content`` and ``price`` columns.
        train_image_df: Image features handed to the extractor as
            ``image_features_df`` (expected to be row-aligned with ``df``).
        n_folds: Number of cross-validation folds.

    Returns:
        dict with keys ``fold_models`` (list of dicts holding ``model``,
        ``scaler``, ``best_smape``), ``extractor``, ``device``,
        ``cv_metrics`` and ``fold_smapes``.
    """
    print("="*70)
    print(" " * 15 + "K-FOLD CROSS-VALIDATION TRAINING")
    print(" " * 22 + "(WITH IMAGE FEATURES)")
    print("="*70)

    extractor = OptimizedFeatureExtractor()
    y = df['price'].values
    y_log = np.log1p(y)

    # NOTE(review): features (including the price-based target encodings) are
    # fitted on the full frame before the folds are split, so validation rows
    # influence their own features and the CV scores are likely optimistic.
    X = extractor.process_catalog_content(df, is_training=True, prices=y, image_features_df=train_image_df)

    print(f"Dataset: {len(df)} samples, {X.shape[1]} features")
    print(f"Price range: ${y.min():.2f} - ${y.max():.2f}\n")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"âœ“ Using device: {device}\n")

    kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)

    fold_models = []
    fold_smapes = []
    fold_predictions = np.zeros(len(df))

    for fold, (train_idx, val_idx) in enumerate(kfold.split(X)):
        print(f"\n{'='*70}")
        print(f"FOLD {fold + 1}/{n_folds}")
        print(f"{'='*70}")

        X_train_fold = X.iloc[train_idx]
        X_val_fold = X.iloc[val_idx]
        y_train_fold = y_log[train_idx]
        y_val_orig = y[val_idx]

        scaler = RobustScaler()
        X_train_scaled = scaler.fit_transform(X_train_fold)
        X_val_scaled = scaler.transform(X_val_fold)

        X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
        X_val_tensor = torch.FloatTensor(X_val_scaled).to(device)
        y_train_tensor = torch.FloatTensor(y_train_fold).to(device)

        batch_size = 384
        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        # BatchNorm1d raises on a single-sample batch in training mode, so the
        # trailing batch is dropped only when it would contain exactly one row.
        drop_last = len(train_dataset) % batch_size == 1
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=drop_last)

        model = EnhancedPricePredictor(
            input_dim=X_train_scaled.shape[1],
            dropout=0.50
        ).to(device)

        criterion = ImprovedLoss()

        optimizer = torch.optim.AdamW(
            model.parameters(),
            lr=0.0008,
            betas=(0.9, 0.999),
            weight_decay=0.0075
        )

        num_epochs = 200
        scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
            optimizer, T_0=40, T_mult=2, eta_min=3e-7
        )

        best_val_smape = float('inf')
        best_model_state = None
        patience = 30
        patience_counter = 0

        for epoch in range(num_epochs):
            model.train()
            train_loss = 0.0
            samples_seen = 0

            for batch_X, batch_y in train_loader:
                optimizer.zero_grad()
                predictions = model(batch_X)
                loss = criterion(predictions, batch_y)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.2)
                optimizer.step()
                train_loss += loss.item() * batch_X.size(0)
                samples_seen += batch_X.size(0)

            # Average over the samples actually used (differs from the dataset
            # size only when the trailing batch was dropped).
            train_loss /= max(samples_seen, 1)

            # Validation
            model.eval()
            with torch.no_grad():
                val_pred_log = model(X_val_tensor).cpu().numpy()
                val_pred = np.expm1(val_pred_log)
                val_pred = np.clip(val_pred, 0.01, 10000)

                val_smape = 100 * np.mean(
                    np.abs(val_pred - y_val_orig) / (((np.abs(val_pred) + np.abs(y_val_orig))/2 + 1e-8))
                )

            scheduler.step()

            if val_smape < best_val_smape:
                best_val_smape = val_smape
                # state_dict() returns references to the live parameters, which
                # later optimizer steps overwrite in place; clone to snapshot.
                best_model_state = {
                    name: tensor.detach().clone()
                    for name, tensor in model.state_dict().items()
                }
                patience_counter = 0
            else:
                patience_counter += 1

            if (epoch + 1) % 25 == 0 or epoch < 3:
                print(f"Epoch {epoch+1:3d}/{num_epochs} | Loss: {train_loss:.4f} | Val SMAPE: {val_smape:.2f}")

            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch+1}")
                break

        # Load best model (stays None only if validation SMAPE was never finite)
        if best_model_state is not None:
            model.load_state_dict(best_model_state)
        model.eval()

        with torch.no_grad():
            val_pred_log = model(X_val_tensor).cpu().numpy()
            val_pred = np.expm1(val_pred_log)
            val_pred = np.clip(val_pred, 0.01, 10000)

        fold_predictions[val_idx] = val_pred

        print(f"\nFold {fold + 1} Best SMAPE: {best_val_smape:.2f}")
        fold_smapes.append(best_val_smape)
        fold_models.append({
            'model': model,
            'scaler': scaler,
            'best_smape': best_val_smape
        })

    # Calculate overall metrics
    print(f"\n{'='*70}")
    print(" " * 20 + "CROSS-VALIDATION RESULTS")
    print(f"{'='*70}")

    cv_metrics = calculate_metrics(y, fold_predictions)

    print(f"\nOverall CV Metrics:")
    for metric, value in cv_metrics.items():
        print(f"  {metric:<10} {value:>10.4f}")

    print(f"\nFold SMAPE scores:")
    for i, smape in enumerate(fold_smapes):
        print(f"  Fold {i+1}: {smape:.2f}")
    print(f"  Mean: {np.mean(fold_smapes):.2f} Â± {np.std(fold_smapes):.2f}")

    # Error analysis by price range
    print(f"\n{'='*70}")
    print(" " * 23 + "SMAPE BY PRICE RANGE")
    print(f"{'='*70}")

    price_ranges = [
        (0, 10, 'Low ($0-$10)'),
        (10, 20, 'Medium ($10-$20)'),
        (20, 50, 'High ($20-$50)'),
        (50, float('inf'), 'Very High (>$50)')
    ]

    for low, high, label in price_ranges:
        mask = (y >= low) & (y < high)
        if mask.sum() > 0:
            range_smape = 200 * np.mean(
                np.abs(fold_predictions[mask] - y[mask]) /
                (np.abs(fold_predictions[mask]) + np.abs(y[mask]) + 1e-8)
            )
            range_mae = np.mean(np.abs(fold_predictions[mask] - y[mask]))
            print(f"{label:<25} SMAPE: {range_smape:>6.2f}  MAE: ${range_mae:>6.2f}  (n={mask.sum()})")

    print(f"\n{'='*70}")
    print(" " * 20 + "ðŸŽ¯ PERFORMANCE SUMMARY")
    print(f"{'='*70}")

    print(f"\nâœ“ CV SMAPE:  {cv_metrics['SMAPE']:.2f}")
    print(f"âœ“ CV RMSE:   ${cv_metrics['RMSE']:.2f}")
    print(f"âœ“ CV MAE:    ${cv_metrics['MAE']:.2f}")
    print(f"âœ“ RÂ² Score:  {cv_metrics['R2']:.4f}")

    if cv_metrics['SMAPE'] < 45:
        print(f"\nðŸŽ‰ TARGET ACHIEVED! SMAPE < 45 âœ“")
    else:
        print(f"\nâš  Gap to target (45): {cv_metrics['SMAPE'] - 45:.2f} points")

    print(f"\nImprovement from baseline (54.3): {54.3 - cv_metrics['SMAPE']:.2f} points")
    print("="*70)

    return {
        'fold_models': fold_models,
        'extractor': extractor,
        'device': device,
        'cv_metrics': cv_metrics,
        'fold_smapes': fold_smapes
    }


def predict_test_data(test_df, test_image_df, trained_results):
    """Predict test prices with every fold model and blend them.

    Each fold model's log-scale output is mapped back with ``expm1`` and
    clipped to ``[0.01, 10000]``. The fold predictions are then averaged with
    weights proportional to ``1 / best_smape`` of the corresponding fold.

    Args:
        test_df: Test frame passed to the fitted extractor.
        test_image_df: Image features passed as ``image_features_df``.
        trained_results: Mapping with ``extractor``, ``fold_models`` and ``device``.

    Returns:
        Numpy array with one blended price per test row.
    """
    rule = "=" * 70
    print("\n" + rule)
    print(" " * 20 + "TEST DATA PREDICTION")
    print(rule)

    extractor = trained_results['extractor']
    fold_models = trained_results['fold_models']
    device = trained_results['device']

    print(f"\nProcessing {len(test_df)} test samples...")

    X_test = extractor.process_catalog_content(test_df, is_training=False, image_features_df=test_image_df)

    all_predictions = []
    for fold_no, entry in enumerate(fold_models, start=1):
        net = entry['model']
        scaled = entry['scaler'].transform(X_test)
        inputs = torch.FloatTensor(scaled).to(device)

        net.eval()
        with torch.no_grad():
            log_prices = net(inputs).cpu().numpy()

        # Back to price scale, bounded to a plausible range
        fold_pred = np.clip(np.expm1(log_prices), 0.01, 10000)

        all_predictions.append(fold_pred)
        print(f"  Fold {fold_no} predictions: ${fold_pred.mean():.2f} mean")

    # Weighted ensemble: better (lower-SMAPE) folds get proportionally more say
    inverse_smape = np.array([1.0 / entry['best_smape'] for entry in fold_models])
    weights = inverse_smape / np.sum(inverse_smape)
    weighted_predictions = np.average(all_predictions, axis=0, weights=weights)

    print("\nâœ“ Predictions complete!")
    print("\nEnsemble (weighted by performance):")
    print(f"  Price range: ${weighted_predictions.min():.2f} - ${weighted_predictions.max():.2f}")
    print(f"  Mean price:  ${weighted_predictions.mean():.2f}")
    print(f"  Median:      ${np.median(weighted_predictions):.2f}")

    return weighted_predictions


def create_submission(test_df, predictions, output_file='submission.csv'):
    """Write ``sample_id``/``price`` pairs to a CSV file and return them as a DataFrame.

    Args:
        test_df: Frame providing the ``sample_id`` column.
        predictions: Array of predicted prices, one per row of ``test_df``.
        output_file: Destination path of the CSV (written without the index).

    Returns:
        The submission DataFrame that was written.
    """
    rule = "=" * 70
    print("\n" + rule)
    print(" " * 20 + "CREATING SUBMISSION")
    print(rule)

    submission = pd.DataFrame({'sample_id': test_df['sample_id'], 'price': predictions})
    submission.to_csv(output_file, index=False)

    print(f"\nâœ“ File saved: {output_file}")
    print(f"âœ“ Total predictions: {len(submission)}")
    print("\nFirst 10 rows:")
    print(submission.head(10).to_string(index=False))

    # Summary statistics of the predicted prices
    print("\nStats:")
    summary = (
        ("Min:", predictions.min()),
        ("Max:", predictions.max()),
        ("Mean:", predictions.mean()),
        ("Median:", np.median(predictions)),
    )
    for label, amount in summary:
        print(f"  {label:<7} ${amount:.2f}")
    print(rule)

    return submission


def _align_image_features(df, image_df, label=""):
    """Return *image_df* with one row per row of *df*, in ``df['sample_id']`` order.

    When *image_df* has a ``sample_id`` column that differs from the one in
    *df*, rows are re-ordered by id (the id column is dropped in the process).

    Raises:
        KeyError: If some ``sample_id`` of *df* has no image-feature row.
        ValueError: If the resulting row count differs from ``len(df)``.
    """
    if 'sample_id' in image_df.columns and not df['sample_id'].equals(image_df['sample_id']):
        print(f"âš  Aligning {label}image features by sample_id...")
        indexed = image_df.set_index('sample_id')
        missing = df.loc[~df['sample_id'].isin(indexed.index), 'sample_id']
        if len(missing) > 0:
            raise KeyError(
                f"{len(missing)} sample_id value(s) have no {label}image features, "
                f"e.g. {missing.iloc[0]!r}"
            )
        image_df = indexed.loc[df['sample_id']].reset_index(drop=True)

    if len(image_df) != len(df):
        raise ValueError(
            f"{label}image features have {len(image_df)} rows but the data has {len(df)}"
        )
    return image_df


def full_pipeline(train_csv='train.csv', test_csv='test.csv',
                  train_image_csv='image_reduced_256.csv',
                  test_image_csv='test_image_features_256.csv',
                  output_csv='submission.csv', n_folds=5):
    """Run training, test prediction and submission writing end to end.

    Args:
        train_csv: Path of the training CSV.
        test_csv: Path of the test CSV.
        train_image_csv: Path of the training image-feature CSV.
        test_image_csv: Path of the test image-feature CSV.
        output_csv: Path the submission CSV is written to.
        n_folds: Number of cross-validation folds.

    Returns:
        dict with keys ``trained_results``, ``predictions`` and ``submission``.
    """

    print("\n" + "="*70)
    print(" " * 15 + "ðŸš€ OPTIMIZED PIPELINE START")
    print(" " * 18 + "(TARGET: SMAPE ~45)")
    print("="*70)

    # Step 1: Load training data
    print("\n[1/5] Loading training data...")
    train_df = pd.read_csv(train_csv)
    train_image_df = pd.read_csv(train_image_csv)
    print(f"âœ“ Loaded {len(train_df)} training samples")
    print(f"âœ“ Loaded {train_image_df.shape} image features")

    train_image_df = _align_image_features(train_df, train_image_df)

    # Step 2: Train model
    print(f"\n[2/5] Training with {n_folds}-Fold Cross-Validation...")
    trained_results = train_with_kfold(train_df, train_image_df, n_folds=n_folds)

    # Step 3: Load test data
    print("\n[3/5] Loading test data...")
    test_df = pd.read_csv(test_csv)
    test_image_df = pd.read_csv(test_image_csv)
    print(f"âœ“ Loaded {len(test_df)} test samples")
    print(f"âœ“ Loaded {test_image_df.shape} test image features")

    test_image_df = _align_image_features(test_df, test_image_df, label="test ")

    # Step 4: Make predictions
    print("\n[4/5] Making predictions...")
    predictions = predict_test_data(test_df, test_image_df, trained_results)

    # Step 5: Create submission
    print("\n[5/5] Creating submission file...")
    submission = create_submission(test_df, predictions, output_csv)

    cv_smape = trained_results['cv_metrics']['SMAPE']

    print("\n" + "="*70)
    print(" " * 15 + "âœ… PIPELINE COMPLETE!")
    print("="*70)
    print(f"\nðŸ“Š Cross-Validation SMAPE: {cv_smape:.2f}")
    print(f"ðŸ“„ Submission file: {output_csv}")

    if cv_smape < 45:
        print(f"\nðŸŽ¯ SUCCESS! Target SMAPE < 45 achieved!")
    else:
        print(f"\nðŸ“ˆ Current SMAPE: {cv_smape:.2f}")
        print(f"   Gap to target: {cv_smape - 45:.2f} points")

    print("="*70 + "\n")

    return {
        'trained_results': trained_results,
        'predictions': predictions,
        'submission': submission
    }


if __name__ == "__main__":
    # Script entry point: run the whole pipeline with the paths below, then
    # print the cross-validation metrics and the size of the prediction set.
    pipeline_args = {
        'train_csv': '/content/train.csv',
        'test_csv': '/content/test.csv',
        'train_image_csv': 'image_reduced_256.csv',
        'test_image_csv': 'test_image_features_256.csv',
        'output_csv': 'submission_new.csv',
        'n_folds': 5,
    }
    results = full_pipeline(**pipeline_args)

    rule = "=" * 70
    print("\n" + rule)
    print("FINAL RESULTS SUMMARY")
    print(rule)
    print("\nCross-Validation Metrics:")
    for name, score in results['trained_results']['cv_metrics'].items():
        print(f"  {name}: {score:.4f}")
    print(f"\nPredictions saved to: {pipeline_args['output_csv']}")
    print(f"Total test predictions: {len(results['predictions'])}")
    print(rule)