# Baseline Geometric Method

This notebook implements a pure geometric mean baseline:
- Geometric mean with zero guard
- No seasonality adjustments
- Validation with competition metric
- Performance comparison with seasonality method

In [None]:
import sys
from pathlib import Path
# Make the project-local `src` package importable by appending the parent of the
# current working directory to sys.path. This relies on the kernel being started
# from the notebook's own folder.
sys.path.append(str(Path('..').resolve()))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Project-local helpers (resolved through the sys.path entry added above).
from src.data import DatasetPaths, load_all_training_tables
from src.models import competition_score
from src.utils import build_amount_wide, geometric_mean_with_zero_guard

# Default size for figures that do not pass their own `figsize`.
plt.rcParams['figure.figsize'] = (12, 6)

In [None]:
# Load the training tables, rooted at the parent of the working directory
repo_root = Path('..').resolve()
paths = DatasetPaths(root_dir=str(repo_root))
train = load_all_training_tables(paths)

# Wide transaction-amount table used by every later cell.
# NOTE(review): presumably one row per month and one column per sector — confirm
# against src.utils.build_amount_wide.
transactions = train['new_house_transactions']
amount = build_amount_wide(transactions)
print(f"Training data shape: {amount.shape}")

## Hyperparameter Tuning: Lookback & Zero Guard Windows

In [None]:
# Grid search over lookback and zero_guard_window
lookback_windows = [3, 6, 9, 12, 18]
zero_guard_windows = [3, 6, 9, 12]
results = []

# Hold out the most recent 12 rows for validation; fit on everything before them.
train_amount = amount.iloc[:-12]
val_amount = amount.iloc[-12:]

# The targets do not depend on the hyperparameters, so flatten them once.
val_true = val_amount.values.flatten()

for lookback in lookback_windows:
    for zero_guard in zero_guard_windows:
        base_geo = geometric_mean_with_zero_guard(
            train_amount, lookback_months=lookback, zero_guard_window=zero_guard
        )

        # Predict the same per-column value for every validation row. The tile
        # count follows the hold-out length so both arrays always line up.
        val_predictions = np.tile(base_geo.values, (len(val_amount), 1)).flatten()

        score_dict = competition_score(val_true, val_predictions)
        results.append({
            'lookback': lookback,
            'zero_guard': zero_guard,
            'score': score_dict['score'],
            'good_rate': score_dict['good_rate']
        })

results_df = pd.DataFrame(results)

# Select the configuration with the HIGHEST score. The previous `idxmin()` picked
# the worst configuration if the score is higher-is-better.
# NOTE(review): this assumes src.models.competition_score follows the competition
# metric (1 - scaled MAPE, gated by the good rate), where larger is better — confirm.
best_result = results_df.loc[results_df['score'].idxmax()]
print("Best parameters:")
print(f"  Lookback: {best_result['lookback']:.0f} months")
print(f"  Zero Guard: {best_result['zero_guard']:.0f} months")
print(f"  Score: {best_result['score']:.4f}")
print(f"  Good Rate: {best_result['good_rate']:.4f}")

In [None]:
# Score grid: one row per zero-guard window, one column per lookback window
pivot = results_df.pivot(index='zero_guard', columns='lookback', values='score')

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(pivot, annot=True, fmt='.4f', cmap='YlOrRd', ax=ax)
# Set the labels after sns.heatmap so they replace the ones derived from the pivot.
ax.set(
    title='Competition Score by Lookback and Zero Guard Window',
    xlabel='Lookback Window (months)',
    ylabel='Zero Guard Window (months)',
)
fig.tight_layout()
plt.show()

In [None]:
# Competition score and good rate as a function of the lookback window,
# restricted to the zero-guard window of the selected configuration.
best_zg = best_result['zero_guard']
subset = results_df[results_df['zero_guard'] == best_zg]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (results column, axis/title label, extra line styling) for each panel
panel_specs = [
    ('score', 'Competition Score', {}),
    ('good_rate', 'Good Rate', {'color': 'orange'}),
]
for panel_ax, (column, label, line_style) in zip(axes, panel_specs):
    panel_ax.plot(subset['lookback'], subset[column], marker='o', linewidth=2, **line_style)
    panel_ax.set_title(f'{label} vs Lookback (zero_guard={best_zg:.0f})')
    panel_ax.set_xlabel('Lookback Window (months)')
    panel_ax.set_ylabel(label)
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Final Model Validation

In [None]:
# Hyperparameters selected by the grid search
best_lookback = int(best_result['lookback'])
best_zero_guard = int(best_result['zero_guard'])

# Validation: fit ONLY on the rows before the 12-row hold-out, then score on the
# hold-out. Fitting on all of `amount` and scoring on its own last 12 rows (as this
# cell previously did) evaluates the model on rows it was fit on, so the reported
# score was in-sample rather than a validation score.
holdout_train = amount.iloc[:-12]
holdout_val = amount.iloc[-12:]
holdout_geo = geometric_mean_with_zero_guard(
    holdout_train, lookback_months=best_lookback, zero_guard_window=best_zero_guard
)

# Same per-column prediction repeated for every hold-out row
val_predictions = np.tile(holdout_geo.values, (len(holdout_val), 1)).flatten()
val_true = holdout_val.values.flatten()

# Note: the hyperparameters were chosen on this same hold-out, so the score is
# still optimistic with respect to truly unseen data.
final_score = competition_score(val_true, val_predictions)
print(f"\nFinal Validation Results:")
print(f"  Competition Score: {final_score['score']:.4f}")
print(f"  Good Rate: {final_score['good_rate']:.4f}")
print(f"  Zero Predictions: {(val_predictions == 0).sum()}")

# Final model: refit with the best params on all data (for forecasting, not scoring)
base_geo = geometric_mean_with_zero_guard(
    amount, lookback_months=best_lookback, zero_guard_window=best_zero_guard
)

In [None]:
# Absolute percentage error per validation entry. The denominator is floored at
# 1e-12 so entries with a zero target do not divide by zero.
safe_truth = np.maximum(val_true, 1e-12)
relative_error = (val_true - val_predictions) / safe_truth
ape = np.abs(relative_error)

# Only errors below 200% are drawn, so the long tail does not flatten the histogram
visible_ape = ape[ape < 2]

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(visible_ape, bins=50, edgecolor='black', alpha=0.7)
ax.axvline(1.0, color='red', linestyle='--', label='APE = 100%')
ax.set(
    title='APE Distribution',
    xlabel='Absolute Percentage Error',
    ylabel='Frequency',
)
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

# Share of ALL entries (not just the plotted ones) with an error below 100%
share_below_100 = (ape < 1.0).mean()
print(f"\nAPE < 100%: {share_below_100:.2%}")

## Summary

**Method**: Pure geometric mean with zero guard

**Advantages**:
- Simple and robust
- Handles skewed distributions well
- Zero guard prevents metric explosions

**Disadvantages**:
- No seasonality modeling
- Slightly worse than the seasonality-aware method