In [1]:
import json

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Load the feature table and parse `date` so that sorting and the
# quantile-based split below operate on real timestamps, not strings.
df = pd.read_csv('../data/avocado_features.csv')
df['date'] = pd.to_datetime(df['date'])

# Order rows chronologically; the per-region 'first' lookup further down
# depends on row order within each region.
df = df.sort_values('date')

# Chronological train/test split: rows dated on or after the 80th-percentile
# date form the test set (roughly the last 20% of rows).
split_date = df['date'].quantile(0.8)
train = df[df['date'] < split_date]
# Take an explicit copy: a new column is added to `test` below, and assigning
# into a boolean-mask slice of `df` raises pandas' SettingWithCopyWarning.
test = df[df['date'] >= split_date].copy()

# Persistence baseline: every test row of a region is predicted with the first
# non-null `price_lag_1` value that appears for that region in the test window.
# (Presumably `price_lag_1` is the previous period's price - confirm against
# the feature-engineering step.)
# NOTE(review): this holds ONE value per region constant across the whole test
# window. If a one-step-ahead persistence baseline is intended instead, use
# `test['price_lag_1']` directly - confirm which definition is wanted.
test['baseline_pred'] = test.groupby('region')['price_lag_1'].transform('first')

# Score the baseline against the observed prices.
mae = mean_absolute_error(test['AveragePrice'], test['baseline_pred'])
rmse = np.sqrt(mean_squared_error(test['AveragePrice'], test['baseline_pred']))

print(f"Baseline MAE: {mae:.4f}")
print(f"Baseline RMSE: {rmse:.4f}")

# Persist the baseline metrics so later models can be compared against them.
baseline_results = {
    'model': 'baseline',
    'mae': mae,
    'rmse': rmse
}

with open('../models/baseline_metrics.json', 'w') as f:
    json.dump(baseline_results, f)

Baseline MAE: 0.3482
Baseline RMSE: 0.4533


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['baseline_pred'] = test.groupby('region')['price_lag_1'].transform('first')
