## 1. Setup and Imports

In [None]:
# Core imports
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, Image
import warnings
# Silence library warnings so the rendered notebook stays readable.
# NOTE(review): this hides *all* warnings, including deprecations; narrow the
# filter if a warning ever needs investigating.
warnings.filterwarnings('ignore')

# Set style
# 'seaborn-v0_8-whitegrid' only exists in matplotlib >= 3.6; older releases
# ship the same sheet as 'seaborn-whitegrid'. Use whichever one is installed
# instead of raising OSError on the other version. If neither is available the
# default matplotlib style is kept.
for _style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
sns.set_palette('husl')

# Add src to path
# Guarded so that re-running this cell does not stack duplicate entries.
# The path is relative, so it depends on the notebook's working directory.
_src_dir = '../src'
if _src_dir not in sys.path:
    sys.path.insert(0, _src_dir)

print("✅ Imports complete!")

In [None]:
# Import custom modules
# These are project-local modules, presumably resolved through the '../src'
# entry that the setup cell puts on sys.path.
# NOTE(review): the star import hides which names come from `config` and can
# silently shadow notebook variables; prefer explicit names once the required
# settings are confirmed.
from config import *
from data_preprocessing import TextPreprocessor, preprocess_dataframe
from feature_extraction import FeatureEngineer
from model import PricePredictor, ModelSelector, calculate_smape

print("✅ Custom modules loaded!")

## 2. Load and Explore Data

In [None]:
# Load sample data
# Both CSVs are read from a folder relative to the notebook's working directory.
DATASET_FOLDER = '../dataset/'

sample_test = pd.read_csv(os.path.join(DATASET_FOLDER, 'sample_test.csv'))
sample_test_out = pd.read_csv(os.path.join(DATASET_FOLDER, 'sample_test_out.csv'))

# Quick summary: dimensions of each frame and the input column names.
print(f"📊 Sample Test Shape: {sample_test.shape}")
print(f"📊 Sample Output Shape: {sample_test_out.shape}")
print(f"\n📋 Columns: {sample_test.columns.tolist()}")

In [None]:
# Display first few rows
# display() gives the rich HTML table rather than the plain-text repr.
print("🔍 Sample Data Preview:")
display(sample_test.head(3))

In [None]:
# Examine a single catalog content
# Only the first 2000 characters of the first row are printed, framed by
# 80-character rules, to keep the cell output bounded.
print("📝 Sample Catalog Content:")
print("=" * 80)
print(sample_test['catalog_content'].iloc[0][:2000])
print("=" * 80)

## 3. Data Preprocessing

In [None]:
# Initialize preprocessor
preprocessor = TextPreprocessor()

# Test on single example: run the first catalog entry through the preprocessor
sample_content = sample_test['catalog_content'].iloc[0]
processed = preprocessor.process_catalog_content(sample_content)

# List every extracted field except the two skipped below
# (presumably long free-text values; confirm in TextPreprocessor).
print("🔧 Processed Features:")
for key, value in processed.items():
    if key not in ('clean_text', 'bullet_points'):
        print(f"   {key}: {value}")

In [None]:
# Preprocess entire dataframe
# A copy is passed so that `sample_test` keeps its original columns; the
# comparison below relies on that to list what preprocessing added.
sample_processed = preprocess_dataframe(sample_test.copy())

print(f"\n📊 Processed DataFrame Shape: {sample_processed.shape}")
print(f"📋 New Columns: {[col for col in sample_processed.columns if col not in sample_test.columns]}")

In [None]:
# Explore extracted features
# Summary statistics for the numeric quantity column.
print("📊 Quantity Value Distribution:")
print(sample_processed['quantity_value'].describe())

# Frequency of each extracted unit value.
print("\n📊 Quantity Units:")
print(sample_processed['quantity_unit'].value_counts())

In [None]:
# Visualize feature distributions on a 2x2 grid:
# three histograms plus one bar chart of the boolean-style flags.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# One entry per histogram panel:
# (axis, column, bin count, extra hist kwargs, title, x-axis label).
# The first panel passes no colour, so it uses the active palette.
histogram_specs = [
    (axes[0, 0], 'quantity_value', 30, {},
     'Quantity Value Distribution', 'Quantity Value'),
    (axes[0, 1], 'text_length', 30, {'color': 'orange'},
     'Text Length Distribution', 'Text Length (characters)'),
    (axes[1, 0], 'pack_size', 20, {'color': 'green'},
     'Pack Size Distribution', 'Pack Size'),
]
for panel, column, n_bins, extra_style, panel_title, panel_xlabel in histogram_specs:
    panel.hist(sample_processed[column], bins=n_bins, edgecolor='black', alpha=0.7, **extra_style)
    panel.set_title(panel_title)
    panel.set_xlabel(panel_xlabel)

# Bottom-right panel: column sums of the flag features
cat_features = ['has_organic', 'has_gluten_free', 'is_gift', 'is_bulk']
cat_counts = sample_processed[cat_features].sum()
flag_panel = axes[1, 1]
flag_panel.bar(cat_counts.index, cat_counts.values, color='purple', alpha=0.7)
flag_panel.set_title('Categorical Features Count')
flag_panel.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 4. Feature Engineering

In [None]:
# Initialize feature engineer
feature_engineer = FeatureEngineer(max_tfidf_features=1000)

# Create synthetic prices for demonstration (only for sample data)
# The sample file carries no target, so uniform random prices in [5, 200) are
# used as a stand-in. Every metric computed from them is therefore meaningless
# beyond checking that the pipeline runs. The seed keeps the values repeatable.
np.random.seed(42)
sample_processed['price'] = np.random.uniform(5, 200, len(sample_processed))

# Fit and transform features
# NOTE(review): `sample_processed` now contains the 'price' column; confirm
# that FeatureEngineer does not pick it up as an input feature (target leakage).
X = feature_engineer.fit_transform(sample_processed)
y = sample_processed['price'].values

print(f"📊 Feature Matrix Shape: {X.shape}")
print(f"📊 Target Shape: {y.shape}")

In [None]:
# Feature statistics
# Count the non-zero entries once and reuse the value for both the absolute
# figure and the density percentage (share of all rows x columns cells).
nonzero_count = np.count_nonzero(X)
total_cells = X.shape[0] * X.shape[1]

print("📊 Feature Matrix Statistics:")
print(f"   Min: {X.min():.4f}")
print(f"   Max: {X.max():.4f}")
print(f"   Mean: {X.mean():.4f}")
print(f"   Non-zero elements: {nonzero_count} ({nonzero_count/total_cells*100:.2f}%)")

## 5. Model Training

In [None]:
# Split data
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"📊 Training set: {X_train.shape}")
print(f"📊 Validation set: {X_val.shape}")

In [None]:
# Train Random Forest model
# 'rf' is the model_type key used for the Random Forest variant throughout
# this notebook. The model is fitted on the training split only.
model_rf = PricePredictor(model_type='rf')
model_rf.fit(X_train, y_train)

# Evaluate
# Scores the fitted model on the held-out validation split and keeps the
# returned metrics for later inspection.
# NOTE(review): with default arguments evaluate() presumably prints its own
# report (the comparison cell passes verbose=False to silence it); confirm.
metrics_rf = model_rf.evaluate(X_val, y_val)

In [None]:
# Try different models
# Each model type is trained on the same split and scored on the same
# validation rows; the metrics are collected per model name.
models_to_test = ['rf', 'ridge']
results = {}

for model_name in models_to_test:
    print(f"\n🔄 Training {model_name.upper()}...")
    model = PricePredictor(model_type=model_name)
    # verbose=False keeps the per-model output down to the SMAPE line below
    model.fit(X_train, y_train, verbose=False)
    metrics = model.evaluate(X_val, y_val, verbose=False)
    results[model_name] = metrics
    print(f"   SMAPE: {metrics['smape']:.4f}%")

# Display comparison
# Transposed so that each row is a model and each column a metric.
results_df = pd.DataFrame(results).T
print("\n📊 Model Comparison:")
display(results_df)

In [None]:
# Visualize predictions vs actual for the Random Forest on the validation split
predictions = model_rf.predict(X_val)
residuals = y_val - predictions

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
scatter_panel, residual_panel = axes

# Left panel: predicted against actual, with a dashed red y = x reference line
# spanning the observed range of actual prices.
actual_range = [y_val.min(), y_val.max()]
scatter_panel.scatter(y_val, predictions, alpha=0.5, edgecolors='black', linewidth=0.5)
scatter_panel.plot(actual_range, actual_range, 'r--', lw=2)
scatter_panel.set(
    xlabel='Actual Price ($)',
    ylabel='Predicted Price ($)',
    title='Predicted vs Actual Prices',
)

# Right panel: histogram of residuals (actual minus predicted), with a dashed
# red marker at zero error.
residual_panel.hist(residuals, bins=30, edgecolor='black', alpha=0.7)
residual_panel.axvline(x=0, color='r', linestyle='--', lw=2)
residual_panel.set(
    xlabel='Residual ($)',
    ylabel='Frequency',
    title='Residuals Distribution',
)

plt.tight_layout()
plt.show()

## 6. Generate Predictions

In [None]:
# Train final model on all data
# No rows are held out here: the model is refitted on the complete feature
# matrix and target.
final_model = PricePredictor(model_type='rf')
final_model.fit(X, y)

# Save models
# Both objects are persisted with their default arguments; the prediction cell
# reloads them through the matching load() calls.
feature_engineer.save()
final_model.save()

print("✅ Models saved!")

In [None]:
# Generate predictions for sample test
# Load fresh test data
test_df = pd.read_csv(os.path.join(DATASET_FOLDER, 'sample_test.csv'))

# Preprocess
# A copy is passed, as in the earlier preprocessing cell, so that `test_df`
# stays the untouched raw frame that supplies `sample_id` below.
test_processed = preprocess_dataframe(test_df.copy())

# Load saved models
# Reloading from disk (rather than reusing the in-memory objects) exercises
# the save/load round trip.
fe_loaded = FeatureEngineer.load()
model_loaded = PricePredictor.load()

# Transform and predict
# transform(), not fit_transform(): the features must come from the already
# fitted feature engineer.
X_test = fe_loaded.transform(test_processed)
predictions = model_loaded.predict(X_test)

# Create output
# NOTE(review): this assumes preprocessing keeps one row per input row, in
# the same order; otherwise ids and prices would not line up.
output_df = pd.DataFrame({
    'sample_id': test_df['sample_id'],
    'price': predictions
})

print(f"📊 Generated {len(output_df)} predictions")
display(output_df.head(10))

In [None]:
# Save predictions
output_path = '../outputs/sample_test_predictions.csv'
# Derive the directory from the target path so the two cannot drift apart
# when the destination is changed.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# index=False: the row index is not written to the CSV.
output_df.to_csv(output_path, index=False)

print(f"✅ Predictions saved to {output_path}")

## 7. Model Analysis

In [None]:
# Prediction distribution
# `predictions` here holds the sample-test predictions produced by the
# prediction cell. Mean and median are computed once and reused for both the
# plot annotations and the printed summary.
pred_mean = predictions.mean()
pred_median = np.median(predictions)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Price distribution
# Histogram with dashed markers for the mean (red) and median (green).
axes[0].hist(predictions, bins=30, edgecolor='black', alpha=0.7)
axes[0].axvline(x=pred_mean, color='r', linestyle='--', label=f'Mean: ${pred_mean:.2f}')
axes[0].axvline(x=pred_median, color='g', linestyle='--', label=f'Median: ${pred_median:.2f}')
axes[0].set_xlabel('Predicted Price ($)')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Predicted Price Distribution')
axes[0].legend()

# Box plot
axes[1].boxplot(predictions)
axes[1].set_ylabel('Predicted Price ($)')
axes[1].set_title('Price Distribution Box Plot')

plt.tight_layout()
plt.show()

print("\n📊 Prediction Statistics:")
print(f"   Min:    ${predictions.min():.2f}")
print(f"   Max:    ${predictions.max():.2f}")
print(f"   Mean:   ${pred_mean:.2f}")
print(f"   Median: ${pred_median:.2f}")
print(f"   Std:    ${predictions.std():.2f}")

## 8. Download Images (Optional)

In [None]:
# Download sample images
from utils import download_images

# Download only first 5 images as example
# NOTE(review): links are passed through as-is; confirm that download_images
# tolerates missing values and creates the target folder itself.
sample_images = sample_test['image_link'].head(5).tolist()
download_folder = '../images/sample'

print(f"📥 Downloading {len(sample_images)} sample images...")
download_images(sample_images, download_folder)
print(f"✅ Images downloaded to {download_folder}")

## 9. Summary

### Key Steps Completed:

1. **Data Loading**: Loaded and explored sample test data
2. **Preprocessing**: Extracted item names, descriptions, quantities, and categorical features
3. **Feature Engineering**: Created TF-IDF features from text + numeric features
4. **Model Training**: Trained and compared multiple models
5. **Prediction**: Generated price predictions for test data
6. **Analysis**: Visualized prediction distributions

### Next Steps:

- Train on full `train.csv` dataset (75K samples)
- Add image features using CNN
- Hyperparameter tuning
- Ensemble multiple models
- Generate final predictions for `test.csv`

In [None]:
# Final marker cell: reaching it means every cell above ran without raising.
print("🎉 Notebook Complete!")