In [5]:
import subprocess
import sys

def setup_gpu():
    """Check GPU availability and configure libraries"""
    
    print("Checking GPU status...")
    print("-" * 50)
    
    # Check NVIDIA GPU
    try:
        result = subprocess.run(['nvidia-smi'], capture_output=True, text=True, timeout=5)
        if result.returncode == 0:
            print("✓ NVIDIA GPU detected")
            # Extract GPU name from nvidia-smi output
            lines = result.stdout.split('\n')
            for line in lines:
                if 'NVIDIA' in line or 'GeForce' in line or 'Tesla' in line or 'Quadro' in line:
                    print(f"  GPU: {line.strip()}")
                    break
        else:
            print("✗ NVIDIA driver not found")
            return False
    except FileNotFoundError:
        print("✗ nvidia-smi not found - No NVIDIA GPU")
        return False
    except subprocess.TimeoutExpired:
        print("✗ nvidia-smi timeout")
        return False
    except Exception as e:
        print(f"✗ GPU check failed: {e}")
        return False
    
    # Check PyTorch CUDA
    try:
        import torch
        cuda_available = torch.cuda.is_available()
        print(f"✓ PyTorch installed: {torch.__version__}")
        print(f"  CUDA available: {cuda_available}")
        if cuda_available:
            print(f"  CUDA version: {torch.version.cuda}")
            print(f"  GPU name: {torch.cuda.get_device_name(0)}")
            print(f"  GPU count: {torch.cuda.device_count()}")
    except ImportError:
        print("✗ PyTorch not installed")
        cuda_available = False
    except Exception as e:
        print(f"⚠ PyTorch check failed: {e}")
        cuda_available = False
    
    # Check XGBoost GPU
    try:
        import xgboost as xgb
        xgb_version = xgb.__version__
        print(f"✓ XGBoost installed: {xgb_version}")
        # Test GPU training
        try:
            test_param = {'tree_method': 'gpu_hist', 'verbosity': 0}
            test_dm = xgb.DMatrix(data=[[1,2],[3,4]], label=[1,0])
            xgb.train(test_param, test_dm, num_boost_round=1)
            print("  GPU support: Enabled")
        except:
            print("  GPU support: Disabled (CPU only)")
    except ImportError:
        print("✗ XGBoost not installed")
    except Exception as e:
        print(f"⚠ XGBoost check failed: {e}")
    
    # Check LightGBM GPU
    try:
        import lightgbm as lgb
        lgb_version = lgb.__version__
        print(f"✓ LightGBM installed: {lgb_version}")
        # Check if GPU version
        try:
            test_data = lgb.Dataset([[1,2],[3,4]], label=[1,0])
            test_params = {'device': 'gpu', 'verbosity': -1}
            lgb.train(test_params, test_data, num_boost_round=1)
            print("  GPU support: Enabled")
        except:
            print("  GPU support: Disabled (CPU only)")
    except ImportError:
        print("✗ LightGBM not installed")
    except Exception as e:
        print(f"⚠ LightGBM check failed: {e}")
    
    # Check CatBoost GPU
    try:
        from catboost import CatBoostRegressor
        import catboost as cb
        print(f"✓ CatBoost installed: {cb.__version__}")
        # Test GPU
        try:
            test_model = CatBoostRegressor(iterations=1, task_type='GPU', verbose=0)
            test_model.fit([[1,2],[3,4]], [1,0])
            print("  GPU support: Enabled")
        except:
            print("  GPU support: Disabled (CPU only)")
    except ImportError:
        print("✗ CatBoost not installed")
    except Exception as e:
        print(f"⚠ CatBoost check failed: {e}")
    
    print("-" * 50)
    
    # Final verdict
    if cuda_available:
        print("✓ GPU READY - Models will use GPU acceleration")
        return True
    else:
        print("✗ GPU NOT AVAILABLE - Models will use CPU")
        print("\nTo enable GPU:")
        print("1. Install CUDA: https://developer.nvidia.com/cuda-downloads")
        print("2. Install GPU versions:")
        print("   pip install torch torchvision --index-url https://download.pytorch.org/whl/cu118")
        print("   pip install xgboost --upgrade")
        print("   pip install lightgbm --install-option=--gpu")
        print("   pip install catboost")
        return False

# Run GPU check
gpu_available = setup_gpu()

Checking GPU status...
--------------------------------------------------
✓ NVIDIA GPU detected
  GPU: | NVIDIA-SMI 555.99                 Driver Version: 555.99         CUDA Version: 12.5     |
✗ PyTorch not installed
✓ XGBoost installed: 3.0.5
  GPU support: Enabled
✓ LightGBM installed: 4.6.0
  GPU support: Disabled (CPU only)
✓ CatBoost installed: 1.2.8
  GPU support: Enabled
--------------------------------------------------
✗ GPU NOT AVAILABLE - Models will use CPU

To enable GPU:
1. Install CUDA: https://developer.nvidia.com/cuda-downloads
2. Install GPU versions:
   pip install torch torchvision --index-url https://download.pytorch.org/whl/cu118
   pip install xgboost --upgrade
   pip install lightgbm --install-option=--gpu
   pip install catboost


In [6]:
# Run this in Jupyter cell:
!pip install torch torchvision --index-url https://download.pytorch.org/whl/cu121

^C


# Improved code

In [9]:
"""
Amazon ML Challenge - Text-Only Optimized Solution
Target SMAPE: 48-52% | Runtime: ~18 minutes with GPU
"""

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import RobustScaler
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import re
from collections import Counter

# GPU detection
try:
    import subprocess
    result = subprocess.run(['nvidia-smi'], capture_output=True, timeout=2)
    gpu_available = result.returncode == 0
except:
    gpu_available = False

def smape(y_true, y_pred):
    """Calculate SMAPE metric"""
    return 100 * np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))

def extract_text_features(df):
    """Extract statistical features from text content"""
    
    df['catalog_content'] = df['catalog_content'].fillna('').astype(str)
    
    features = pd.DataFrame()
    
    # Basic text statistics
    features['text_len'] = df['catalog_content'].str.len()
    features['word_count'] = df['catalog_content'].str.split().str.len()
    features['unique_words'] = df['catalog_content'].apply(lambda x: len(set(str(x).lower().split())))
    features['avg_word_len'] = features['text_len'] / (features['word_count'] + 1)
    
    # Character patterns
    features['digit_count'] = df['catalog_content'].str.count(r'\d')
    features['upper_count'] = df['catalog_content'].str.count(r'[A-Z]')
    features['special_char_count'] = df['catalog_content'].str.count(r'[^a-zA-Z0-9\s]')
    features['space_count'] = df['catalog_content'].str.count(r'\s')
    
    # Numeric features
    features['has_numbers'] = (features['digit_count'] > 0).astype(int)
    features['digit_ratio'] = features['digit_count'] / (features['text_len'] + 1)
    
    # Text complexity
    features['lexical_diversity'] = features['unique_words'] / (features['word_count'] + 1)
    features['uppercase_ratio'] = features['upper_count'] / (features['text_len'] + 1)
    
    # Price-related patterns
    features['has_currency'] = df['catalog_content'].str.contains(r'[\$£€¥₹]', regex=True).astype(int)
    features['has_percentage'] = df['catalog_content'].str.contains(r'%', regex=False).astype(int)
    features['has_measurement'] = df['catalog_content'].str.contains(r'\d+\s*(kg|g|ml|l|cm|m|inch|oz|lb)', 
                                                                      regex=True, case=False).astype(int)
    
    # Common keywords
    features['has_pack'] = df['catalog_content'].str.contains('pack', case=False).astype(int)
    features['has_set'] = df['catalog_content'].str.contains('set', case=False).astype(int)
    features['has_piece'] = df['catalog_content'].str.contains('piece|pcs', case=False).astype(int)
    
    # Punctuation
    features['comma_count'] = df['catalog_content'].str.count(',')
    features['dot_count'] = df['catalog_content'].str.count(r'\.')
    features['exclamation_count'] = df['catalog_content'].str.count('!')
    
    # Extract numbers from text
    def extract_numbers(text):
        numbers = re.findall(r'\d+\.?\d*', str(text))
        if numbers:
            nums = [float(n) for n in numbers]
            return np.mean(nums), np.max(nums), len(nums)
        return 0, 0, 0
    
    number_stats = df['catalog_content'].apply(extract_numbers)
    features['avg_number'] = [x[0] for x in number_stats]
    features['max_number'] = [x[1] for x in number_stats]
    features['number_count'] = [x[2] for x in number_stats]
    
    # Word statistics
    features['short_word_count'] = df['catalog_content'].apply(
        lambda x: len([w for w in str(x).split() if len(w) <= 3])
    )
    features['long_word_count'] = df['catalog_content'].apply(
        lambda x: len([w for w in str(x).split() if len(w) >= 10])
    )
    
    # Sentence statistics
    features['sentence_count'] = df['catalog_content'].str.count(r'[.!?]') + 1
    features['avg_sentence_len'] = features['word_count'] / features['sentence_count']
    
    return features.fillna(0)

print("=" * 70)
print("Amazon ML Challenge - Text-Only Optimized")
print("Target: 48-52% SMAPE | Runtime: ~18 minutes with GPU")
print("=" * 70)
print()

# Load data
print("Loading data...")
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
print(f"   Train: {len(train_df):,} samples")
print(f"   Test:  {len(test_df):,} samples")
print()

# Validate data
print("Validating data...")
assert 'catalog_content' in train_df.columns, "Missing catalog_content"
assert 'price' in train_df.columns, "Missing price"
train_df = train_df.dropna(subset=['price'])
print("   Complete")
print()

print("=" * 70)
print("FEATURE EXTRACTION")
print("=" * 70)
print()

# Extract text features
print("Extracting text features...")
import time
t0 = time.time()
train_text = extract_text_features(train_df)
test_text = extract_text_features(test_df)
print(f"   {train_text.shape[1]} features ({time.time()-t0:.1f}s)")
print()

# TF-IDF features
print("TF-IDF vectorization...")
t0 = time.time()
train_df['catalog_content'] = train_df['catalog_content'].fillna('').astype(str)
test_df['catalog_content'] = test_df['catalog_content'].fillna('').astype(str)

tfidf = TfidfVectorizer(
    max_features=120,
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.9,
    sublinear_tf=True
)
train_tfidf = tfidf.fit_transform(train_df['catalog_content'])
test_tfidf = tfidf.transform(test_df['catalog_content'])

train_tfidf_df = pd.DataFrame(
    train_tfidf.toarray(), 
    columns=[f'tfidf_{i}' for i in range(train_tfidf.shape[1])]
)
test_tfidf_df = pd.DataFrame(
    test_tfidf.toarray(), 
    columns=[f'tfidf_{i}' for i in range(test_tfidf.shape[1])]
)
print(f"   {train_tfidf_df.shape[1]} features ({time.time()-t0:.1f}s)")
print()

# Combine features
print("Combining features...")
X_train = pd.concat([train_text, train_tfidf_df], axis=1).reset_index(drop=True)
X_test = pd.concat([test_text, test_tfidf_df], axis=1).reset_index(drop=True)
y_train = np.log1p(train_df['price'].values)

print(f"   Total features: {X_train.shape[1]}")
print()

print("=" * 70)
print("MODEL TRAINING (5-FOLD CV)")
print("=" * 70)
print()

print("GPU Status:")
print(f"  XGBoost: {'GPU' if gpu_available else 'CPU'}")
print(f"  CatBoost: {'GPU' if gpu_available else 'CPU'}")
print(f"  LightGBM: CPU (GPU not compiled)")
print()

# Cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_scores = []
test_preds_lgb = np.zeros(len(X_test))
test_preds_xgb = np.zeros(len(X_test))
test_preds_cat = np.zeros(len(X_test))

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train), 1):
    print(f"Fold {fold}/5")
    
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train[train_idx], y_train[val_idx]
    
    # LightGBM
    lgb_params = {
        'n_estimators': 500,
        'learning_rate': 0.05,
        'max_depth': 10,
        'num_leaves': 31,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'random_state': 42,
        'n_jobs': -1,
        'verbose': -1
    }
    lgb_model = lgb.LGBMRegressor(**lgb_params)
    lgb_model.fit(
        X_tr, y_tr, 
        eval_set=[(X_val, y_val)],
        callbacks=[lgb.early_stopping(50, verbose=False)]
    )
    lgb_pred = lgb_model.predict(X_val)
    lgb_score = smape(np.expm1(y_val), np.expm1(lgb_pred))
    test_preds_lgb += lgb_model.predict(X_test) / 5
    print(f"   LightGBM    {lgb_score:.2f}%")
    
    # XGBoost
    xgb_params = {
        'n_estimators': 500,
        'learning_rate': 0.05,
        'max_depth': 10,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'tree_method': 'gpu_hist' if gpu_available else 'hist',
        'predictor': 'gpu_predictor' if gpu_available else 'auto',
        'early_stopping_rounds': 50,
        'random_state': 42,
        'n_jobs': -1,
        'verbosity': 0
    }
    xgb_model = xgb.XGBRegressor(**xgb_params)
    xgb_model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)
    xgb_pred = xgb_model.predict(X_val)
    xgb_score = smape(np.expm1(y_val), np.expm1(xgb_pred))
    test_preds_xgb += xgb_model.predict(X_test) / 5
    print(f"   XGBoost     {xgb_score:.2f}%")
    
    # CatBoost
    cat_params = {
        'iterations': 500,
        'learning_rate': 0.05,
        'depth': 10,
        'task_type': 'GPU' if gpu_available else 'CPU',
        'random_state': 42,
        'verbose': 0,
        'early_stopping_rounds': 50
    }
    cat_model = CatBoostRegressor(**cat_params)
    cat_model.fit(X_tr, y_tr, eval_set=(X_val, y_val), verbose=False)
    cat_pred = cat_model.predict(X_val)
    cat_score = smape(np.expm1(y_val), np.expm1(cat_pred))
    test_preds_cat += cat_model.predict(X_test) / 5
    print(f"   CatBoost    {cat_score:.2f}%")
    
    # Ensemble
    ensemble_pred = (lgb_pred + xgb_pred + cat_pred) / 3
    ensemble_score = smape(np.expm1(y_val), np.expm1(ensemble_pred))
    fold_scores.append(ensemble_score)
    print(f"   Ensemble    {ensemble_score:.2f}%")
    print()

print("=" * 70)
print("FINAL RESULTS")
print("=" * 70)
cv_score = np.mean(fold_scores)
print(f"Cross-Validation SMAPE: {cv_score:.2f}%")
print(f"Fold Scores: {[f'{s:.2f}%' for s in fold_scores]}")
print(f"Std Dev: {np.std(fold_scores):.2f}%")
print("=" * 70)
print()

# Generate submission
final_preds = (test_preds_lgb + test_preds_xgb + test_preds_cat) / 3
final_preds = np.expm1(final_preds)

# Use sample_id from test data (required by competition)
submission = pd.DataFrame({
    'sample_id': test_df['sample_id'],
    'price': final_preds
})
submission.to_csv('test_out.csv', index=False)

print("=" * 70)
print("SUBMISSION SAVED")
print("=" * 70)
print(f"   File: test_out.csv")
print(f"   Rows: {len(submission):,}")
print(f"   Price range: ${submission['price'].min():.2f} - ${submission['price'].max():.2f}")
print(f"   Expected SMAPE: ~{cv_score:.2f}%")
print()
print("Upload test_out.csv to the leaderboard!")
print("=" * 70)

Amazon ML Challenge - Text-Only Optimized
Target: 48-52% SMAPE | Runtime: ~18 minutes with GPU

Loading data...
   Train: 75,000 samples
   Test:  75,000 samples

Validating data...
   Complete

FEATURE EXTRACTION

Extracting text features...
   28 features (79.6s)

TF-IDF vectorization...
   120 features (91.7s)

Combining features...
   Total features: 148

MODEL TRAINING (5-FOLD CV)

GPU Status:
  XGBoost: GPU
  CatBoost: GPU
  LightGBM: CPU (GPU not compiled)

Fold 1/5
   LightGBM    58.96%
   XGBoost     56.24%
   CatBoost    58.42%
   Ensemble    57.53%

Fold 2/5
   LightGBM    57.96%
   XGBoost     55.73%
   CatBoost    57.33%
   Ensemble    56.64%

Fold 3/5
   LightGBM    58.55%
   XGBoost     56.01%
   CatBoost    58.00%
   Ensemble    57.18%

Fold 4/5
   LightGBM    57.27%
   XGBoost     55.01%
   CatBoost    56.77%
   Ensemble    55.96%

Fold 5/5
   LightGBM    58.60%
   XGBoost     55.89%
   CatBoost    57.82%
   Ensemble    57.09%

FINAL RESULTS
Cross-Validation SMAPE: 56.