In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error

# Synthetic inputs: 1000 consecutive calendar days, each tagged with one
# randomly chosen product and a Poisson-distributed sales count.
# The draw order (product ids first, then sales) determines the seeded values,
# so it must stay as is for reproducibility.
np.random.seed(42)
dates = pd.date_range(start='2021-01-01', periods=1000, freq='D')
product_ids = np.random.choice(['A', 'B', 'C', 'D'], size=1000)
sales = np.random.poisson(lam=20, size=1000)

# Build the modelling table in one chain:
#   1. normalise the date column to datetime,
#   2. derive calendar features from it,
#   3. add each product's previous one and two sales observations as lags,
#   4. drop the rows whose lags are undefined (the first two observations of
#      every product).
data = (
    pd.DataFrame({'date': dates, 'product_id': product_ids, 'sales': sales})
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .assign(
        month=lambda frame: frame['date'].dt.month,
        day_of_week=lambda frame: frame['date'].dt.dayofweek,
    )
    .assign(
        lag_1=lambda frame: frame.groupby('product_id')['sales'].shift(1),
        lag_2=lambda frame: frame.groupby('product_id')['sales'].shift(2),
    )
    .dropna()
)

# Column names used as model inputs and as the prediction target
features = ['month', 'day_of_week', 'lag_1', 'lag_2']
target = 'sales'

# Split data
# NOTE(review): train_test_split shuffles rows by default, so test rows are
# interleaved in time with training rows, and GridSearchCV below uses plain
# K-fold. The sales here are independent Poisson draws, so this should not
# distort the scores, but for real sales history prefer a chronological split
# (shuffle=False) and time-ordered CV folds (e.g. sklearn's TimeSeriesSplit).
X = data[features]
y = data[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training: baseline random forest with default hyperparameters
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Predictions on the held-out 20%
y_pred = model.predict(X_test)

# Evaluation
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`: that
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6, where it
# raises TypeError. The sqrt form gives the same value on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'RMSE: {rmse}')

# Hyperparameter tuning (optional)
# Exhaustive search over 2 x 2 x 2 = 8 combinations, each scored with 3-fold
# cross-validation on the training set only.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5]
}
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)
# best_estimator_ is refit on the full training set with the winning parameters
best_model = grid_search.best_estimator_

# Use best model for predictions
y_pred_best = best_model.predict(X_test)
rmse_best = np.sqrt(mean_squared_error(y_test, y_pred_best))
print(f'Optimized RMSE: {rmse_best}')

RMSE: 4.88219105111479
Optimized RMSE: 4.647760010573248
