In [3]:
# Run this once to install packages
!pip install sentence-transformers scikit-learn lightgbm pandas numpy scipy

# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sentence_transformers import SentenceTransformer
import lightgbm as lgb
from scipy.optimize import minimize
import re
import warnings
!pip install -U jupyter ipywidgets tqdm
!jupyter nbextension enable --py widgetsnbextension

warnings.filterwarnings('ignore')

np.random.seed(42)
print("✓ All packages imported successfully!")

Collecting jupyter
  Downloading jupyter-1.1.1-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting ipywidgets
  Downloading ipywidgets-8.1.7-py3-none-any.whl.metadata (2.4 kB)
Collecting jupyter-console (from jupyter)
  Downloading jupyter_console-6.6.3-py3-none-any.whl.metadata (5.8 kB)
Collecting widgetsnbextension~=4.0.14 (from ipywidgets)
  Downloading widgetsnbextension-4.0.14-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab_widgets~=3.0.15 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.15-py3-none-any.whl.metadata (20 kB)
Downloading jupyter-1.1.1-py2.py3-none-any.whl (2.7 kB)
Downloading ipywidgets-8.1.7-py3-none-any.whl (139 kB)
Downloading jupyterlab_widgets-3.0.15-py3-none-any.whl (216 kB)
Downloading widgetsnbextension-4.0.14-py3-none-any.whl (2.2 MB)
   ---------------------------------------- 0.0/2.2 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.2 MB ? eta -:--:--
   ---- ----------------------------------- 0.3/2.2 MB ? eta -:--:--
   ---

usage: jupyter [-h] [--version] [--config-dir] [--data-dir] [--runtime-dir]
               [--paths] [--json] [--debug]
               [subcommand]

Jupyter: Interactive Computing

positional arguments:
  subcommand     the subcommand to launch

options:
  -h, --help     show this help message and exit
  --version      show the versions of core jupyter packages and exit
  --config-dir   show Jupyter config dir
  --data-dir     show Jupyter data dir
  --runtime-dir  show Jupyter runtime dir
  --paths        show all Jupyter paths. Add --json for machine-readable
                 format.
  --json         output paths as machine-readable json
  --debug        output debug information about paths

Available subcommands: console dejavu events execute kernel kernelspec lab
labextension labhub migrate nbconvert notebook run server troubleshoot trust

Jupyter command `jupyter-nbextension` not found.


In [46]:
import pandas as pd

# Load your datasets
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')

# ✅ Assuming you have a submission DataFrame (rename 'df' to your actual variable)
# Example: If your predictions are in a DataFrame named 'submission'
submission = pd.DataFrame({
    'sample_id': test['sample_id'].astype(int),
    'price': final_ensemble_test.astype(float).round(2)
})

# ✅ Save properly
submission.to_csv('submission.csv', index=False)

# ✅ Display some info
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"\nColumns: {train.columns.tolist()}")
print(f"\nPrice statistics:")
print(train['price'].describe())


Train shape: (75000, 4)
Test shape: (75000, 3)

Columns: ['sample_id', 'catalog_content', 'image_link', 'price']

Price statistics:
count    75000.000000
mean        23.647654
std         33.376932
min          0.130000
25%          6.795000
50%         14.000000
75%         28.625000
max       2796.000000
Name: price, dtype: float64


In [6]:
def clean_text(text):
    """Clean and normalize text"""
    if pd.isna(text):
        return ""
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)  # Remove special chars
    text = re.sub(r'\s+', ' ', text).strip()      # Remove extra spaces
    return text

# Apply cleaning
train['catalog_clean'] = train['catalog_content'].apply(clean_text)
test['catalog_clean'] = test['catalog_content'].apply(clean_text)

print("✓ Text cleaning done!")
print(f"Sample: {train['catalog_clean'].iloc[0][:150]}...")

✓ Text cleaning done!
Sample: item name la victoria green taco sauce mild 12 ounce pack of 6 value 72 0 unit fl oz...


In [7]:
def extract_text_features(df):
    """Extract statistical features from text"""
    features = pd.DataFrame()
    
    # Length features
    features['text_length'] = df['catalog_clean'].str.len()
    features['word_count'] = df['catalog_clean'].str.split().str.len()
    features['avg_word_length'] = features['text_length'] / (features['word_count'] + 1)
    
    # Numeric features
    features['num_digits'] = df['catalog_clean'].str.count(r'\d')
    features['has_numbers'] = (features['num_digits'] > 0).astype(int)
    
    # Extract quantity/IPQ
    def extract_quantity(text):
        matches = re.findall(r'(\d+)\s*(?:pack|pcs|pieces|count|qty)', str(text))
        return int(matches[0]) if matches else 1
    
    features['ipq_extracted'] = df['catalog_content'].apply(extract_quantity)
    
    # Premium/budget indicators
    features['has_premium'] = df['catalog_clean'].str.contains(
        'premium|luxury|deluxe|professional|pro', na=False
    ).astype(int)
    
    features['has_budget'] = df['catalog_clean'].str.contains(
        'budget|economy|basic|value', na=False
    ).astype(int)
    
    return features

# Extract features
train_text_features = extract_text_features(train)
test_text_features = extract_text_features(test)

print(f"✓ Text features extracted: {train_text_features.shape}")
print(train_text_features.head())

✓ Text features extracted: (75000, 8)
   text_length  word_count  avg_word_length  num_digits  has_numbers  \
0           84          19         4.200000           6            1   
1          491          81         5.987805          17            1   
2          315          61         5.080645          12            1   
3         1272         213         5.943925          17            1   
4          142          29         4.733333          13            1   

   ipq_extracted  has_premium  has_budget  
0              1            0           1  
1              1            0           1  
2              1            0           1  
3              1            1           1  
4              1            0           1  


In [8]:
import time

def load_model_with_retry(model_name, max_retries=3):
    """Load model with retry logic"""
    for attempt in range(max_retries):
        try:
            print(f"Attempt {attempt + 1}/{max_retries}: Loading {model_name}...")
            model = SentenceTransformer(model_name)
            print(f"✓ Model loaded successfully!")
            return model
        except Exception as e:
            print(f"⚠ Attempt {attempt + 1} failed: {str(e)[:100]}")
            if attempt < max_retries - 1:
                wait_time = 5 * (attempt + 1)
                print(f"Waiting {wait_time} seconds before retry...")
                time.sleep(wait_time)
            else:
                raise e

# Load model with retry
embedding_model = load_model_with_retry('all-MiniLM-L6-v2', max_retries=3)

def get_embeddings(texts, batch_size=256):
    """Generate embeddings in batches"""
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size].tolist()
        batch_embeddings = embedding_model.encode(batch, show_progress_bar=False)
        embeddings.append(batch_embeddings)
        if (i // batch_size) % 10 == 0:
            print(f"Processed {i}/{len(texts)} texts")
    return np.vstack(embeddings)

# Generate embeddings
print("\n🔄 Generating TRAIN embeddings...")
train_embeddings = get_embeddings(train['catalog_clean'])
print(f"✓ Train embeddings: {train_embeddings.shape}")

print("\n🔄 Generating TEST embeddings...")
test_embeddings = get_embeddings(test['catalog_clean'])
print(f"✓ Test embeddings: {test_embeddings.shape}")

Attempt 1/3: Loading all-MiniLM-L6-v2...
✓ Model loaded successfully!

🔄 Generating TRAIN embeddings...
Processed 0/75000 texts
Processed 2560/75000 texts
Processed 5120/75000 texts
Processed 7680/75000 texts
Processed 10240/75000 texts
Processed 12800/75000 texts
Processed 15360/75000 texts
Processed 17920/75000 texts
Processed 20480/75000 texts
Processed 23040/75000 texts
Processed 25600/75000 texts
Processed 28160/75000 texts
Processed 30720/75000 texts
Processed 33280/75000 texts
Processed 35840/75000 texts
Processed 38400/75000 texts
Processed 40960/75000 texts
Processed 43520/75000 texts
Processed 46080/75000 texts
Processed 48640/75000 texts
Processed 51200/75000 texts
Processed 53760/75000 texts
Processed 56320/75000 texts
Processed 58880/75000 texts
Processed 61440/75000 texts
Processed 64000/75000 texts
Processed 66560/75000 texts
Processed 69120/75000 texts
Processed 71680/75000 texts
Processed 74240/75000 texts
✓ Train embeddings: (75000, 384)

🔄 Generating TEST embeddings.

In [9]:
# Reduce from 384 to 128 dimensions for faster training
print("Applying SVD to reduce dimensions...")
svd = TruncatedSVD(n_components=128, random_state=42)

train_embeddings_reduced = svd.fit_transform(train_embeddings)
test_embeddings_reduced = svd.transform(test_embeddings)

print(f"✓ Reduced embeddings shape: {train_embeddings_reduced.shape}")
print(f"✓ Explained variance: {svd.explained_variance_ratio_.sum():.4f}")

# Update embeddings
train_embeddings = train_embeddings_reduced
test_embeddings = test_embeddings_reduced

Applying SVD to reduce dimensions...
✓ Reduced embeddings shape: (75000, 128)
✓ Explained variance: 0.8679


In [10]:
print("Creating TF-IDF features...")
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.8,
    sublinear_tf=True
)

train_tfidf = tfidf.fit_transform(train['catalog_clean'])
test_tfidf = tfidf.transform(test['catalog_clean'])

# Apply SVD to reduce TF-IDF dimensions
tfidf_svd = TruncatedSVD(n_components=100, random_state=42)
train_tfidf_dense = tfidf_svd.fit_transform(train_tfidf)
test_tfidf_dense = tfidf_svd.transform(test_tfidf)

print(f"✓ TF-IDF shape: {train_tfidf_dense.shape}")
print(f"✓ TF-IDF variance explained: {tfidf_svd.explained_variance_ratio_.sum():.4f}")

Creating TF-IDF features...
✓ TF-IDF shape: (75000, 100)
✓ TF-IDF variance explained: 0.2712


In [11]:
# Stack all features together
X_train = np.hstack([
    train_embeddings,           # 128 dims - semantic meaning
    train_tfidf_dense,          # 100 dims - keyword features
    train_text_features.values  # 8 dims - statistical features
])

X_test = np.hstack([
    test_embeddings,
    test_tfidf_dense,
    test_text_features.values
])

print(f"✓ Combined feature matrix:")
print(f"  Train: {X_train.shape}")
print(f"  Test: {X_test.shape}")

# Scale features (important for Ridge/Lasso)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Prepare target variable
y_train = train['price'].values
y_train_log = np.log1p(y_train)  # Log transformation

print(f"✓ Features scaled and ready!")

✓ Combined feature matrix:
  Train: (75000, 236)
  Test: (75000, 236)
✓ Features scaled and ready!


In [12]:
def smape(y_true, y_pred):
    """Calculate Symmetric Mean Absolute Percentage Error"""
    denominator = (np.abs(y_true) + np.abs(y_pred))
    diff = np.abs(y_true - y_pred)
    # Avoid division by zero
    denominator = np.where(denominator == 0, 1e-10, denominator)
    smape_val = 200 * np.mean(diff / denominator)
    return smape_val

# Test the function
print("Testing SMAPE function:")
test_true = np.array([100, 200, 300])
test_pred = np.array([110, 190, 320])
print(f"SMAPE: {smape(test_true, test_pred):.2f}%")
print("✓ SMAPE function ready!")

Testing SMAPE function:
SMAPE: 7.03%
✓ SMAPE function ready!


In [13]:
# Setup cross-validation
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Storage for predictions
oof_predictions = {}  # Out-of-fold predictions
test_predictions = {}  # Test predictions

print(f"✓ {n_splits}-Fold Cross-Validation setup complete!")
print(f"Training on {len(X_train_scaled)} samples")

✓ 5-Fold Cross-Validation setup complete!
Training on 75000 samples


In [14]:
print("\n" + "="*60)
print("TRAINING RIDGE REGRESSION")
print("="*60)

ridge_oof = np.zeros(len(X_train_scaled))
ridge_test = np.zeros(len(X_test_scaled))

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_scaled)):
    print(f"\n--- Fold {fold+1}/{n_splits} ---")
    
    X_tr, X_val = X_train_scaled[train_idx], X_train_scaled[val_idx]
    y_tr, y_val = y_train_log[train_idx], y_train_log[val_idx]
    
    # Train Ridge
    ridge = Ridge(alpha=10.0, random_state=42)
    ridge.fit(X_tr, y_tr)
    
    # Predict
    ridge_oof[val_idx] = ridge.predict(X_val)
    ridge_test += ridge.predict(X_test_scaled) / n_splits
    
    # Calculate SMAPE for this fold
    val_smape = smape(np.expm1(y_val), np.expm1(ridge_oof[val_idx]))
    print(f"Fold {fold+1} SMAPE: {val_smape:.2f}%")

# Store predictions
oof_predictions['ridge'] = np.expm1(ridge_oof)
test_predictions['ridge'] = np.expm1(ridge_test)

overall_smape = smape(y_train, oof_predictions['ridge'])
print(f"\n{'='*60}")
print(f"✓ RIDGE Overall SMAPE: {overall_smape:.2f}%")
print(f"{'='*60}")


TRAINING RIDGE REGRESSION

--- Fold 1/5 ---
Fold 1 SMAPE: 61.93%

--- Fold 2/5 ---
Fold 2 SMAPE: 61.24%

--- Fold 3/5 ---
Fold 3 SMAPE: 61.35%

--- Fold 4/5 ---
Fold 4 SMAPE: 60.31%

--- Fold 5/5 ---
Fold 5 SMAPE: 61.27%

✓ RIDGE Overall SMAPE: 61.22%


In [15]:
print("\n" + "="*60)
print("TRAINING LASSO REGRESSION")
print("="*60)

lasso_oof = np.zeros(len(X_train_scaled))
lasso_test = np.zeros(len(X_test_scaled))

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_scaled)):
    print(f"\n--- Fold {fold+1}/{n_splits} ---")
    
    X_tr, X_val = X_train_scaled[train_idx], X_train_scaled[val_idx]
    y_tr, y_val = y_train_log[train_idx], y_train_log[val_idx]
    
    # Train Lasso
    lasso = Lasso(alpha=0.1, random_state=42, max_iter=2000)
    lasso.fit(X_tr, y_tr)
    
    # Predict
    lasso_oof[val_idx] = lasso.predict(X_val)
    lasso_test += lasso.predict(X_test_scaled) / n_splits
    
    val_smape = smape(np.expm1(y_val), np.expm1(lasso_oof[val_idx]))
    print(f"Fold {fold+1} SMAPE: {val_smape:.2f}%")

# Store predictions
oof_predictions['lasso'] = np.expm1(lasso_oof)
test_predictions['lasso'] = np.expm1(lasso_test)

overall_smape = smape(y_train, oof_predictions['lasso'])
print(f"\n{'='*60}")
print(f"✓ LASSO Overall SMAPE: {overall_smape:.2f}%")
print(f"{'='*60}")


TRAINING LASSO REGRESSION

--- Fold 1/5 ---
Fold 1 SMAPE: 69.21%

--- Fold 2/5 ---
Fold 2 SMAPE: 68.74%

--- Fold 3/5 ---
Fold 3 SMAPE: 68.67%

--- Fold 4/5 ---
Fold 4 SMAPE: 67.91%

--- Fold 5/5 ---
Fold 5 SMAPE: 68.88%

✓ LASSO Overall SMAPE: 68.68%


In [16]:
print("\n" + "="*60)
print("TRAINING ELASTICNET")
print("="*60)

elastic_oof = np.zeros(len(X_train_scaled))
elastic_test = np.zeros(len(X_test_scaled))

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_scaled)):
    print(f"\n--- Fold {fold+1}/{n_splits} ---")
    
    X_tr, X_val = X_train_scaled[train_idx], X_train_scaled[val_idx]
    y_tr, y_val = y_train_log[train_idx], y_train_log[val_idx]
    
    # Train ElasticNet
    elastic = ElasticNet(alpha=0.5, l1_ratio=0.5, random_state=42, max_iter=2000)
    elastic.fit(X_tr, y_tr)
    
    # Predict
    elastic_oof[val_idx] = elastic.predict(X_val)
    elastic_test += elastic.predict(X_test_scaled) / n_splits
    
    val_smape = smape(np.expm1(y_val), np.expm1(elastic_oof[val_idx]))
    print(f"Fold {fold+1} SMAPE: {val_smape:.2f}%")

oof_predictions['elastic'] = np.expm1(elastic_oof)
test_predictions['elastic'] = np.expm1(elastic_test)

overall_smape = smape(y_train, oof_predictions['elastic'])
print(f"\n{'='*60}")
print(f"✓ ELASTICNET Overall SMAPE: {overall_smape:.2f}%")
print(f"{'='*60}")


TRAINING ELASTICNET

--- Fold 1/5 ---
Fold 1 SMAPE: 73.29%

--- Fold 2/5 ---
Fold 2 SMAPE: 72.67%

--- Fold 3/5 ---
Fold 3 SMAPE: 72.66%

--- Fold 4/5 ---
Fold 4 SMAPE: 72.03%

--- Fold 5/5 ---
Fold 5 SMAPE: 72.82%

✓ ELASTICNET Overall SMAPE: 72.69%


In [17]:
print("\n" + "="*60)
print("TRAINING LIGHTGBM")
print("="*60)

lgb_oof = np.zeros(len(X_train_scaled))
lgb_test = np.zeros(len(X_test_scaled))

lgb_params = {
    'objective': 'regression',
    'metric': 'mae',
    'boosting_type': 'gbdt',
    'num_leaves': 64,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'min_child_samples': 20,
    'lambda_l1': 0.5,
    'lambda_l2': 0.5,
    'verbosity': -1,
    'random_state': 42
}

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_scaled)):
    print(f"\n--- Fold {fold+1}/{n_splits} ---")
    
    X_tr, X_val = X_train_scaled[train_idx], X_train_scaled[val_idx]
    y_tr, y_val = y_train_log[train_idx], y_train_log[val_idx]
    
    train_data = lgb.Dataset(X_tr, label=y_tr)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
    
    model = lgb.train(
        lgb_params,
        train_data,
        num_boost_round=1000,
        valid_sets=[val_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(200)
        ]
    )
    
    lgb_oof[val_idx] = model.predict(X_val, num_iteration=model.best_iteration)
    lgb_test += model.predict(X_test_scaled, num_iteration=model.best_iteration) / n_splits
    
    val_smape = smape(np.expm1(y_val), np.expm1(lgb_oof[val_idx]))
    print(f"Fold {fold+1} SMAPE: {val_smape:.2f}%")

oof_predictions['lgbm'] = np.expm1(lgb_oof)
test_predictions['lgbm'] = np.expm1(lgb_test)

overall_smape = smape(y_train, oof_predictions['lgbm'])
print(f"\n{'='*60}")
print(f"✓ LIGHTGBM Overall SMAPE: {overall_smape:.2f}%")
print(f"{'='*60}")


TRAINING LIGHTGBM

--- Fold 1/5 ---
Training until validation scores don't improve for 100 rounds
[200]	valid_0's l1: 0.585838
[400]	valid_0's l1: 0.572277
[600]	valid_0's l1: 0.566795
[800]	valid_0's l1: 0.562834
[1000]	valid_0's l1: 0.559344
Did not meet early stopping. Best iteration is:
[999]	valid_0's l1: 0.559328
Fold 1 SMAPE: 55.89%

--- Fold 2/5 ---
Training until validation scores don't improve for 100 rounds
[200]	valid_0's l1: 0.576915
[400]	valid_0's l1: 0.564464
[600]	valid_0's l1: 0.558589
[800]	valid_0's l1: 0.55542
[1000]	valid_0's l1: 0.552087
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l1: 0.552087
Fold 2 SMAPE: 55.29%

--- Fold 3/5 ---
Training until validation scores don't improve for 100 rounds
[200]	valid_0's l1: 0.580389
[400]	valid_0's l1: 0.566644
[600]	valid_0's l1: 0.560535
[800]	valid_0's l1: 0.556155
[1000]	valid_0's l1: 0.55338
Did not meet early stopping. Best iteration is:
[999]	valid_0's l1: 0.553355
Fold 3 SMAPE: 55.53%

--- Fold 

In [18]:
print("\n" + "="*70)
print("MODEL PERFORMANCE SUMMARY")
print("="*70)

for model_name, predictions in oof_predictions.items():
    score = smape(y_train, predictions)
    print(f"{model_name.upper():20s} SMAPE: {score:.2f}%")

print("="*70)


MODEL PERFORMANCE SUMMARY
RIDGE                SMAPE: 61.22%
LASSO                SMAPE: 68.68%
ELASTIC              SMAPE: 72.69%
LGBM                 SMAPE: 55.45%


In [19]:
print("\n" + "="*70)
print("OPTIMIZING ENSEMBLE WEIGHTS")
print("="*70)

def ensemble_smape_objective(weights, predictions_dict, y_true):
    """Objective function for ensemble optimization"""
    weights = np.abs(weights)  # Ensure positive
    weights /= weights.sum()    # Normalize to sum=1
    
    # Weighted ensemble prediction
    ensemble_pred = np.zeros(len(y_true))
    for i, (name, pred) in enumerate(predictions_dict.items()):
        ensemble_pred += weights[i] * pred
    
    return smape(y_true, ensemble_pred)

# Get model names
model_names = list(oof_predictions.keys())
print(f"Models to ensemble: {model_names}")

# Initial weights (equal)
initial_weights = np.ones(len(model_names)) / len(model_names)

# Optimize weights using Nelder-Mead
result = minimize(
    ensemble_smape_objective,
    initial_weights,
    args=(oof_predictions, y_train),
    method='Nelder-Mead',
    options={'maxiter': 1000, 'disp': True}
)

# Get optimal weights
optimal_weights = np.abs(result.x)
optimal_weights /= optimal_weights.sum()

print(f"\n{'='*70}")
print("OPTIMAL ENSEMBLE WEIGHTS:")
print("="*70)
for name, weight in zip(model_names, optimal_weights):
    print(f"{name.upper():20s} weight: {weight:.4f} ({weight*100:.1f}%)")
print("="*70)


OPTIMIZING ENSEMBLE WEIGHTS
Models to ensemble: ['ridge', 'lasso', 'elastic', 'lgbm']
Optimization terminated successfully.
         Current function value: 55.448095
         Iterations: 109
         Function evaluations: 191

OPTIMAL ENSEMBLE WEIGHTS:
RIDGE                weight: 0.0000 (0.0%)
LASSO                weight: 0.0000 (0.0%)
ELASTIC              weight: 0.0000 (0.0%)
LGBM                 weight: 1.0000 (100.0%)


In [20]:
# Create weighted ensemble predictions
ensemble_oof = np.zeros(len(y_train))
ensemble_test = np.zeros(len(X_test_scaled))

for i, (name, pred) in enumerate(oof_predictions.items()):
    ensemble_oof += optimal_weights[i] * pred
    ensemble_test += optimal_weights[i] * test_predictions[name]

ensemble_smape_score = smape(y_train, ensemble_oof)

print("\n" + "="*70)
print("ENSEMBLE RESULTS")
print("="*70)
print(f"✓ Ensemble OOF SMAPE: {ensemble_smape_score:.2f}%")
print("="*70)

# Compare with individual models
print("\nCOMPARISON:")
for name, pred in oof_predictions.items():
    score = smape(y_train, pred)
    print(f"  {name.upper():20s}: {score:.2f}%")
print(f"  {'ENSEMBLE':20s}: {ensemble_smape_score:.2f}%")
print("="*70)


ENSEMBLE RESULTS
✓ Ensemble OOF SMAPE: 55.45%

COMPARISON:
  RIDGE               : 61.22%
  LASSO               : 68.68%
  ELASTIC             : 72.69%
  LGBM                : 55.45%
  ENSEMBLE            : 55.45%


In [21]:
# Apply smart post-processing
def postprocess_predictions(predictions, train_prices):
    """Clip extreme predictions to realistic ranges"""
    
    # Get percentile bounds from training data
    lower_bound = train_prices.quantile(0.001)
    upper_bound = train_prices.quantile(0.999)
    
    # Clip predictions
    predictions_clipped = np.clip(predictions, lower_bound, upper_bound)
    
    # Ensure all positive
    predictions_clipped = np.maximum(predictions_clipped, 0.01)
    
    return predictions_clipped

# Apply post-processing
ensemble_test_final = postprocess_predictions(ensemble_test, train['price'])

print("Post-processing applied:")
print(f"  Original range: [{ensemble_test.min():.2f}, {ensemble_test.max():.2f}]")
print(f"  Clipped range:  [{ensemble_test_final.min():.2f}, {ensemble_test_final.max():.2f}]")
print(f"  Training range: [{train['price'].min():.2f}, {train['price'].max():.2f}]")

Post-processing applied:
  Original range: [1.19, 229.07]
  Clipped range:  [1.19, 229.07]
  Training range: [0.13, 2796.00]


In [24]:
print("\n" + "="*70)
print("GENERATING SENTENCE EMBEDDINGS WITH DOWNLOADED MODEL")
print("="*70)

def get_embeddings_batch(texts, model, batch_size=256):
    """Generate embeddings in batches"""
    embeddings = []
    total = len(texts)
    
    for i in range(0, total, batch_size):
        batch = texts[i:i+batch_size].tolist()
        batch_embeddings = model.encode(
            batch, 
            show_progress_bar=False,
            batch_size=batch_size
        )
        embeddings.append(batch_embeddings)
        
        if (i // batch_size) % 10 == 0:
            print(f"  Processed {i}/{total} texts ({i/total*100:.1f}%)")
    
    return np.vstack(embeddings)

print("\n🔄 Generating TRAIN embeddings...")
train_embeddings_real = get_embeddings_batch(train['catalog_clean'], embedding_model)
print(f"✓ Train embeddings: {train_embeddings_real.shape}")

print("\n🔄 Generating TEST embeddings...")
test_embeddings_real = get_embeddings_batch(test['catalog_clean'], embedding_model)
print(f"✓ Test embeddings: {test_embeddings_real.shape}")

# Reduce dimensions with SVD
print("\nApplying SVD dimensionality reduction...")
svd_embeddings = TruncatedSVD(n_components=200, random_state=42)
train_embeddings_reduced = svd_embeddings.fit_transform(train_embeddings_real)
test_embeddings_reduced = svd_embeddings.transform(test_embeddings_real)

print(f"✓ Reduced embeddings: {train_embeddings_reduced.shape}")
print(f"✓ Variance explained: {svd_embeddings.explained_variance_ratio_.sum():.4f}")


GENERATING SENTENCE EMBEDDINGS WITH DOWNLOADED MODEL

🔄 Generating TRAIN embeddings...
  Processed 0/75000 texts (0.0%)
  Processed 2560/75000 texts (3.4%)
  Processed 5120/75000 texts (6.8%)
  Processed 7680/75000 texts (10.2%)
  Processed 10240/75000 texts (13.7%)
  Processed 12800/75000 texts (17.1%)
  Processed 15360/75000 texts (20.5%)
  Processed 17920/75000 texts (23.9%)
  Processed 20480/75000 texts (27.3%)
  Processed 23040/75000 texts (30.7%)
  Processed 25600/75000 texts (34.1%)
  Processed 28160/75000 texts (37.5%)
  Processed 30720/75000 texts (41.0%)
  Processed 33280/75000 texts (44.4%)
  Processed 35840/75000 texts (47.8%)
  Processed 38400/75000 texts (51.2%)
  Processed 40960/75000 texts (54.6%)
  Processed 43520/75000 texts (58.0%)
  Processed 46080/75000 texts (61.4%)
  Processed 48640/75000 texts (64.9%)
  Processed 51200/75000 texts (68.3%)
  Processed 53760/75000 texts (71.7%)
  Processed 56320/75000 texts (75.1%)
  Processed 58880/75000 texts (78.5%)
  Processe

In [30]:
print("\n" + "="*70)
print("CREATING ENHANCED TF-IDF FEATURES")
print("="*70)

# Multiple TF-IDF with different n-gram ranges
tfidf_word = TfidfVectorizer(
    max_features=8000,
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.85,
    sublinear_tf=True
)

tfidf_char = TfidfVectorizer(
    max_features=5000,
    ngram_range=(3, 5),
    min_df=3,
    analyzer='char',
    sublinear_tf=True
)

print("Fitting word TF-IDF...")
train_tfidf_w = tfidf_word.fit_transform(train['catalog_clean'])
test_tfidf_w = tfidf_word.transform(test['catalog_clean'])

print("Fitting char TF-IDF...")
train_tfidf_c = tfidf_char.fit_transform(train['catalog_clean'])
test_tfidf_c = tfidf_char.transform(test['catalog_clean'])

# SVD reduction
svd_word = TruncatedSVD(n_components=150, random_state=42)
train_word_svd = svd_word.fit_transform(train_tfidf_w)
test_word_svd = svd_word.transform(test_tfidf_w)

svd_char = TruncatedSVD(n_components=100, random_state=42)
train_char_svd = svd_char.fit_transform(train_tfidf_c)
test_char_svd = svd_char.transform(test_tfidf_c)

print(f"✓ Word TF-IDF: {train_word_svd.shape}")
print(f"✓ Char TF-IDF: {train_char_svd.shape}")


CREATING ENHANCED TF-IDF FEATURES
Fitting word TF-IDF...
Fitting char TF-IDF...
✓ Word TF-IDF: (75000, 150)
✓ Char TF-IDF: (75000, 100)


In [31]:
def extract_advanced_features(df):
    """Extract comprehensive features"""
    features = pd.DataFrame()
    
    # Text statistics
    features['text_length'] = df['catalog_clean'].str.len()
    features['word_count'] = df['catalog_clean'].str.split().str.len()
    features['avg_word_length'] = features['text_length'] / (features['word_count'] + 1)
    features['unique_word_ratio'] = df['catalog_clean'].apply(
        lambda x: len(set(str(x).split())) / (len(str(x).split()) + 1)
    )
    
    # Numeric features
    features['num_count'] = df['catalog_clean'].str.count(r'\d')
    features['has_numbers'] = (features['num_count'] > 0).astype(int)
    features['number_density'] = features['num_count'] / (features['text_length'] + 1)
    
    # Extract IPQ
    def extract_quantity(text):
        matches = re.findall(r'(\d+)\s*(?:pack|pcs|pieces|count|qty|units?)', str(text).lower())
        return int(matches[0]) if matches else 1
    
    features['ipq'] = df['catalog_content'].apply(extract_quantity)
    features['ipq_log'] = np.log1p(features['ipq'])
    
    # Premium/Budget indicators
    features['has_premium'] = df['catalog_clean'].str.contains(
        'premium|luxury|deluxe|professional|pro|elite', na=False
    ).astype(int)
    
    features['has_budget'] = df['catalog_clean'].str.contains(
        'budget|economy|basic|value|cheap', na=False
    ).astype(int)
    
    # Material/Size/Brand
    features['has_material'] = df['catalog_clean'].str.contains(
        'cotton|plastic|metal|wood|steel|leather', na=False
    ).astype(int)
    
    features['has_size'] = df['catalog_clean'].str.contains(
        'small|medium|large|xl|size|cm|inch|kg|liter', na=False
    ).astype(int)
    
    features['has_brand'] = df['catalog_clean'].str.contains(
        'brand|original|authentic|genuine', na=False
    ).astype(int)
    
    return features

print("Extracting advanced features...")
train_adv = extract_advanced_features(train)
test_adv = extract_advanced_features(test)
print(f"✓ Advanced features: {train_adv.shape}")

Extracting advanced features...
✓ Advanced features: (75000, 14)


In [32]:
print("\n" + "="*70)
print("COMBINING ALL FEATURES - MAXIMUM FEATURE SET")
print("="*70)

X_train_ultimate = np.hstack([
    train_embeddings_reduced,  # 200 - semantic embeddings
    train_word_svd,            # 150 - word n-grams
    train_char_svd,            # 100 - char n-grams
    train_adv.values           # 14 - advanced features
])

X_test_ultimate = np.hstack([
    test_embeddings_reduced,
    test_word_svd,
    test_char_svd,
    test_adv.values
])

print(f"Combined features: {X_train_ultimate.shape}")

# Scale
scaler_ultimate = StandardScaler()
X_train_ultimate_scaled = scaler_ultimate.fit_transform(X_train_ultimate)
X_test_ultimate_scaled = scaler_ultimate.transform(X_test_ultimate)

print(f"✓ Final feature matrix: {X_train_ultimate_scaled.shape}")
print("="*70)


COMBINING ALL FEATURES - MAXIMUM FEATURE SET
Combined features: (75000, 464)
✓ Final feature matrix: (75000, 464)


In [33]:
print("\n" + "="*70)
print("TRAINING RIDGE WITH ULTIMATE FEATURES")
print("="*70)

ridge_oof_final = np.zeros(len(X_train_ultimate_scaled))
ridge_test_final = np.zeros(len(X_test_ultimate_scaled))

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_ultimate_scaled)):
    print(f"\n--- Fold {fold+1}/{n_splits} ---")
    
    X_tr, X_val = X_train_ultimate_scaled[train_idx], X_train_ultimate_scaled[val_idx]
    y_tr, y_val = y_train_log[train_idx], y_train_log[val_idx]
    
    ridge = Ridge(alpha=10.0, random_state=42)
    ridge.fit(X_tr, y_tr)
    
    ridge_oof_final[val_idx] = ridge.predict(X_val)
    ridge_test_final += ridge.predict(X_test_ultimate_scaled) / n_splits
    
    val_smape = smape(np.expm1(y_val), np.expm1(ridge_oof_final[val_idx]))
    print(f"Fold {fold+1} SMAPE: {val_smape:.2f}%")

ridge_final_pred = np.expm1(ridge_oof_final)
ridge_final_test = np.expm1(ridge_test_final)

overall_ridge = smape(y_train, ridge_final_pred)
print(f"\n{'='*70}")
print(f"✓ RIDGE FINAL SMAPE: {overall_ridge:.2f}%")
print(f"{'='*70}")


TRAINING RIDGE WITH ULTIMATE FEATURES

--- Fold 1/5 ---
Fold 1 SMAPE: 60.41%

--- Fold 2/5 ---
Fold 2 SMAPE: 59.43%

--- Fold 3/5 ---
Fold 3 SMAPE: 59.70%

--- Fold 4/5 ---
Fold 4 SMAPE: 58.72%

--- Fold 5/5 ---
Fold 5 SMAPE: 59.51%

✓ RIDGE FINAL SMAPE: 59.55%


In [34]:
print("\n" + "="*70)
print("TRAINING LASSO WITH ULTIMATE FEATURES")
print("="*70)

lasso_oof_final = np.zeros(len(X_train_ultimate_scaled))
lasso_test_final = np.zeros(len(X_test_ultimate_scaled))

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_ultimate_scaled)):
    print(f"\n--- Fold {fold+1}/{n_splits} ---")
    
    X_tr, X_val = X_train_ultimate_scaled[train_idx], X_train_ultimate_scaled[val_idx]
    y_tr, y_val = y_train_log[train_idx], y_train_log[val_idx]
    
    lasso = Lasso(alpha=0.05, random_state=42, max_iter=3000)
    lasso.fit(X_tr, y_tr)
    
    lasso_oof_final[val_idx] = lasso.predict(X_val)
    lasso_test_final += lasso.predict(X_test_ultimate_scaled) / n_splits
    
    val_smape = smape(np.expm1(y_val), np.expm1(lasso_oof_final[val_idx]))
    print(f"Fold {fold+1} SMAPE: {val_smape:.2f}%")

lasso_final_pred = np.expm1(lasso_oof_final)
lasso_final_test = np.expm1(lasso_test_final)

overall_lasso = smape(y_train, lasso_final_pred)
print(f"\n{'='*70}")
print(f"✓ LASSO FINAL SMAPE: {overall_lasso:.2f}%")
print(f"{'='*70}")


TRAINING LASSO WITH ULTIMATE FEATURES

--- Fold 1/5 ---
Fold 1 SMAPE: 66.95%

--- Fold 2/5 ---
Fold 2 SMAPE: 66.45%

--- Fold 3/5 ---
Fold 3 SMAPE: 66.40%

--- Fold 4/5 ---
Fold 4 SMAPE: 65.59%

--- Fold 5/5 ---
Fold 5 SMAPE: 66.71%

✓ LASSO FINAL SMAPE: 66.42%


In [35]:
print("\n" + "="*70)
print("TRAINING LIGHTGBM WITH ULTIMATE FEATURES")
print("="*70)

lgb_oof_final = np.zeros(len(X_train_ultimate_scaled))
lgb_test_final = np.zeros(len(X_test_ultimate_scaled))

lgb_params_final = {
    'objective': 'regression',
    'metric': 'mae',
    'boosting_type': 'gbdt',
    'num_leaves': 80,
    'learning_rate': 0.03,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'min_child_samples': 20,
    'lambda_l1': 1.0,
    'lambda_l2': 1.0,
    'verbosity': -1,
    'random_state': 42
}

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_ultimate_scaled)):
    print(f"\n--- Fold {fold+1}/{n_splits} ---")
    
    X_tr, X_val = X_train_ultimate_scaled[train_idx], X_train_ultimate_scaled[val_idx]
    y_tr, y_val = y_train_log[train_idx], y_train_log[val_idx]
    
    train_data = lgb.Dataset(X_tr, label=y_tr)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
    
    model = lgb.train(
        lgb_params_final,
        train_data,
        num_boost_round=2000,
        valid_sets=[val_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=150),
            lgb.log_evaluation(300)
        ]
    )
    
    lgb_oof_final[val_idx] = model.predict(X_val, num_iteration=model.best_iteration)
    lgb_test_final += model.predict(X_test_ultimate_scaled, num_iteration=model.best_iteration) / n_splits
    
    val_smape = smape(np.expm1(y_val), np.expm1(lgb_oof_final[val_idx]))
    print(f"Fold {fold+1} SMAPE: {val_smape:.2f}%")

lgb_final_pred = np.expm1(lgb_oof_final)
lgb_final_test = np.expm1(lgb_test_final)

overall_lgb = smape(y_train, lgb_final_pred)
print(f"\n{'='*70}")
print(f"✓ LIGHTGBM FINAL SMAPE: {overall_lgb:.2f}%")
print(f"{'='*70}")


TRAINING LIGHTGBM WITH ULTIMATE FEATURES

--- Fold 1/5 ---
Training until validation scores don't improve for 150 rounds
[300]	valid_0's l1: 0.579076
[600]	valid_0's l1: 0.564351
[900]	valid_0's l1: 0.557212
[1200]	valid_0's l1: 0.552434
[1500]	valid_0's l1: 0.54972
[1800]	valid_0's l1: 0.547357
Did not meet early stopping. Best iteration is:
[1996]	valid_0's l1: 0.545995
Fold 1 SMAPE: 54.70%

--- Fold 2/5 ---
Training until validation scores don't improve for 150 rounds
[300]	valid_0's l1: 0.570868
[600]	valid_0's l1: 0.555782
[900]	valid_0's l1: 0.549249
[1200]	valid_0's l1: 0.544366
[1500]	valid_0's l1: 0.541279
[1800]	valid_0's l1: 0.538929
Did not meet early stopping. Best iteration is:
[2000]	valid_0's l1: 0.538043
Fold 2 SMAPE: 53.96%

--- Fold 3/5 ---
Training until validation scores don't improve for 150 rounds
[300]	valid_0's l1: 0.570121
[600]	valid_0's l1: 0.555698
[900]	valid_0's l1: 0.548804
[1200]	valid_0's l1: 0.544692
[1500]	valid_0's l1: 0.541812
[1800]	valid_0's l1:

In [36]:
print("\n" + "="*70)
print("FINAL MODEL PERFORMANCE SUMMARY")
print("="*70)

final_results = {
    'Ridge': overall_ridge,
    'Lasso': overall_lasso,
    'LightGBM': overall_lgb
}

for name, score in final_results.items():
    print(f"{name:20s} SMAPE: {score:.2f}%")

best_model = min(final_results, key=final_results.get)
best_score = final_results[best_model]

print("="*70)
print(f"🏆 BEST MODEL: {best_model} with SMAPE: {best_score:.2f}%")
print("="*70)


FINAL MODEL PERFORMANCE SUMMARY
Ridge                SMAPE: 59.55%
Lasso                SMAPE: 66.42%
LightGBM             SMAPE: 54.14%
🏆 BEST MODEL: LightGBM with SMAPE: 54.14%


In [37]:
print("\n" + "="*70)
print("TRAINING MULTIPLE LIGHTGBM VARIANTS")
print("="*70)

lgb_variants = {}
lgb_test_variants = {}

# Variant 1: Deep trees, low learning rate
params_v1 = {
    'objective': 'regression',
    'metric': 'mae',
    'num_leaves': 128,
    'learning_rate': 0.02,
    'feature_fraction': 0.75,
    'bagging_fraction': 0.75,
    'bagging_freq': 5,
    'min_child_samples': 15,
    'lambda_l1': 2.0,
    'lambda_l2': 2.0,
    'verbosity': -1,
    'random_state': 42
}

# Variant 2: Shallow trees, higher learning rate
params_v2 = {
    'objective': 'regression',
    'metric': 'mae',
    'num_leaves': 48,
    'learning_rate': 0.05,
    'feature_fraction': 0.85,
    'bagging_fraction': 0.85,
    'bagging_freq': 3,
    'min_child_samples': 30,
    'lambda_l1': 0.5,
    'lambda_l2': 0.5,
    'verbosity': -1,
    'random_state': 123
}

# Variant 3: Balanced
params_v3 = {
    'objective': 'regression',
    'metric': 'mae',
    'num_leaves': 96,
    'learning_rate': 0.03,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 4,
    'min_child_samples': 20,
    'lambda_l1': 1.0,
    'lambda_l2': 1.0,
    'verbosity': -1,
    'random_state': 999
}

variants = [
    ('lgb_deep', params_v1),
    ('lgb_shallow', params_v2),
    ('lgb_balanced', params_v3)
]

for variant_name, params in variants:
    print(f"\n{'='*70}")
    print(f"Training {variant_name}...")
    print(f"{'='*70}")
    
    oof = np.zeros(len(X_train_ultimate_scaled))
    test_pred = np.zeros(len(X_test_ultimate_scaled))
    
    for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_ultimate_scaled)):
        X_tr, X_val = X_train_ultimate_scaled[train_idx], X_train_ultimate_scaled[val_idx]
        y_tr, y_val = y_train_log[train_idx], y_train_log[val_idx]
        
        train_data = lgb.Dataset(X_tr, label=y_tr)
        val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
        
        model = lgb.train(
            params,
            train_data,
            num_boost_round=2500,
            valid_sets=[val_data],
            callbacks=[lgb.early_stopping(stopping_rounds=200), lgb.log_evaluation(500)]
        )
        
        oof[val_idx] = model.predict(X_val, num_iteration=model.best_iteration)
        test_pred += model.predict(X_test_ultimate_scaled, num_iteration=model.best_iteration) / n_splits
        
        val_smape = smape(np.expm1(y_val), np.expm1(oof[val_idx]))
        print(f"Fold {fold+1} SMAPE: {val_smape:.2f}%")
    
    lgb_variants[variant_name] = np.expm1(oof)
    lgb_test_variants[variant_name] = np.expm1(test_pred)
    
    score = smape(y_train, lgb_variants[variant_name])
    print(f"✓ {variant_name} Overall SMAPE: {score:.2f}%")


TRAINING MULTIPLE LIGHTGBM VARIANTS

Training lgb_deep...
Training until validation scores don't improve for 200 rounds
[500]	valid_0's l1: 0.568431
[1000]	valid_0's l1: 0.553175
[1500]	valid_0's l1: 0.547037
[2000]	valid_0's l1: 0.543623
[2500]	valid_0's l1: 0.541191
Did not meet early stopping. Best iteration is:
[2500]	valid_0's l1: 0.541191
Fold 1 SMAPE: 54.26%
Training until validation scores don't improve for 200 rounds
[500]	valid_0's l1: 0.559367
[1000]	valid_0's l1: 0.545257
[1500]	valid_0's l1: 0.538974
[2000]	valid_0's l1: 0.535707
[2500]	valid_0's l1: 0.533498
Did not meet early stopping. Best iteration is:
[2499]	valid_0's l1: 0.533497
Fold 2 SMAPE: 53.56%
Training until validation scores don't improve for 200 rounds
[500]	valid_0's l1: 0.560234
[1000]	valid_0's l1: 0.546384
[1500]	valid_0's l1: 0.539944
[2000]	valid_0's l1: 0.536521
[2500]	valid_0's l1: 0.534487
Did not meet early stopping. Best iteration is:
[2500]	valid_0's l1: 0.534487
Fold 3 SMAPE: 53.85%
Training un

In [39]:
print("\n" + "="*70)
print("CREATING SUPER ENSEMBLE")
print("="*70)

# Collect all OOF predictions
all_oof = {
    'ridge': ridge_final_pred,
    'lasso': lasso_final_pred,
    'lgbm': lgb_final_pred,
    **lgb_variants
}

# Collect all test predictions
all_test = {
    'ridge': ridge_final_test,
    'lasso': lasso_final_test,
    'lgbm': lgb_final_test,
    **lgb_test_variants
}

# Show individual scores
print("\nIndividual Model Scores:")
for name, pred in all_oof.items():
    score = smape(y_train, pred)
    print(f"  {name:20s}: {score:.2f}%")

# Optimize ensemble weights
def ensemble_objective(weights, preds_dict, y_true):
    weights = np.abs(weights)
    weights /= weights.sum()
    
    ensemble = np.zeros(len(y_true))
    for i, pred in enumerate(preds_dict.values()):
        ensemble += weights[i] * pred
    
    return smape(y_true, ensemble)

model_names = list(all_oof.keys())
initial_weights = np.ones(len(model_names)) / len(model_names)

print(f"\nOptimizing weights for {len(model_names)} models...")
result = minimize(
    ensemble_objective,
    initial_weights,
    args=(all_oof, y_train),
    method='Nelder-Mead',
    options={'maxiter': 2000}
)

optimal_weights = np.abs(result.x)
optimal_weights /= optimal_weights.sum()

print("\n" + "="*70)
print("OPTIMAL ENSEMBLE WEIGHTS:")
for name, weight in zip(model_names, optimal_weights):
    print(f"  {name:20s}: {weight:.4f} ({weight*100:.1f}%)")

# Create final ensemble
final_ensemble_oof = np.zeros(len(y_train))
final_ensemble_test = np.zeros(len(X_test_ultimate_scaled))

for i, (name, pred) in enumerate(all_oof.items()):
    final_ensemble_oof += optimal_weights[i] * pred
    final_ensemble_test += optimal_weights[i] * all_test[name]

final_smape = smape(y_train, final_ensemble_oof)

print("\n" + "="*70)
print(f"🎯 FINAL ENSEMBLE SMAPE: {final_smape:.2f}%")
print("="*70)


CREATING SUPER ENSEMBLE

Individual Model Scores:
  ridge               : 59.55%
  lasso               : 66.42%
  lgbm                : 54.14%
  lgb_deep            : 53.72%
  lgb_shallow         : 54.22%
  lgb_balanced        : 53.86%

Optimizing weights for 6 models...

OPTIMAL ENSEMBLE WEIGHTS:
  ridge               : 0.0000 (0.0%)
  lasso               : 0.0000 (0.0%)
  lgbm                : 0.2508 (25.1%)
  lgb_deep            : 0.0953 (9.5%)
  lgb_shallow         : 0.2809 (28.1%)
  lgb_balanced        : 0.3730 (37.3%)

🎯 FINAL ENSEMBLE SMAPE: 53.65%


In [40]:
# Post-process predictions
final_ensemble_test_processed = postprocess_predictions(final_ensemble_test, train['price'])

# Create submission
submission_final = pd.DataFrame({
    'sample_id': test['sample_id'],
    'price': final_ensemble_test_processed.round(2)
})

submission_final.to_csv('submission_final.csv', index=False)

print("\n" + "="*70)
print("FINAL SUBMISSION CREATED!")
print("="*70)
print(f"File: submission_final.csv")
print(f"Shape: {submission_final.shape}")
print(f"\nPrediction Stats:")
print(f"  Min:    {submission_final['price'].min():.2f}")
print(f"  Max:    {submission_final['price'].max():.2f}")
print(f"  Mean:   {submission_final['price'].mean():.2f}")
print(f"  Median: {submission_final['price'].median():.2f}")

print("\n" + "="*70)
print(f"🏆 FINAL CROSS-VALIDATION SMAPE: {final_smape:.2f}%")
print("="*70)


FINAL SUBMISSION CREATED!
File: submission_final.csv
Shape: (75000, 2)

Prediction Stats:
  Min:    1.01
  Max:    222.71
  Mean:   17.88
  Median: 13.94

🏆 FINAL CROSS-VALIDATION SMAPE: 53.65%


In [48]:
import pandas as pd

# ✅ Create submission DataFrame
submission = pd.DataFrame({
    'sample_id': test['sample_id'].astype(int),
    'price': final_ensemble_test_processed.astype(float).round(2)
})

# ✅ Sort and save
submission = submission.sort_values(by='sample_id')
submission.to_csv('submission.csv', index=False)

# ✅ Confirmation
print("\n" + "="*70)
print("🏆 FINAL SUBMISSION FILE CREATED SUCCESSFULLY!")
print("="*70)
print("File saved as: submission.csv")
print(f"Total rows: {len(submission)}")
print("="*70)

# (Optional) Quick preview
print("\nSample preview:")
print(submission.head())



🏆 FINAL SUBMISSION FILE CREATED SUCCESSFULLY!
File saved as: submission.csv
Total rows: 75000

Sample preview:
       sample_id  price
4020           1  22.07
39491          3  26.01
6852           9  15.30
64597         19   6.23
10141         20  21.33
