In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import optuna
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Seed NumPy's global random number generator so NumPy-based randomness is repeatable.
np.random.seed(42)

In [3]:
def load_data(file_path):
    """Read a CSV file and return it with its ``date`` column parsed.

    Args:
        file_path: Path (or any source accepted by ``pd.read_csv``) of the CSV.
            The file must contain a ``date`` column.

    Returns:
        pandas.DataFrame: the file contents, with ``date`` converted by
        ``pd.to_datetime``.
    """
    print(f"Loading data from {file_path}...")

    raw = pd.read_csv(file_path)
    # Replace the textual dates with real datetimes; column position is kept.
    parsed = raw.assign(date=pd.to_datetime(raw['date']))

    print(f"Data loaded successfully. Shape: {parsed.shape}")
    return parsed

In [4]:
def create_time_features(df):
    """Create time-based features from the ``date`` datetime column.

    Args:
        df: DataFrame with a datetime64 ``date`` column. It is not modified.

    Returns:
        pandas.DataFrame: a copy of ``df`` with these extra columns:
        ``year``, ``month``, ``day``, ``dayofweek``, ``quarter``,
        ``dayofyear``, ``is_weekend``, ``is_month_start``, ``is_month_end``,
        ``week`` (ISO week number), ``month_sin``/``month_cos`` and
        ``day_sin``/``day_cos`` (cyclical encodings).
    """
    print("Creating time-based features...")
    df_features = df.copy()
    dates = df_features['date'].dt

    # Extract datetime components
    df_features['year'] = dates.year
    df_features['month'] = dates.month
    df_features['day'] = dates.day
    df_features['dayofweek'] = dates.dayofweek
    df_features['quarter'] = dates.quarter
    df_features['dayofyear'] = dates.dayofyear
    # dayofweek is 0=Monday ... 6=Sunday, so 5 and 6 are Saturday and Sunday.
    df_features['is_weekend'] = df_features['dayofweek'].isin([5, 6]).astype(int)
    df_features['is_month_start'] = dates.is_month_start.astype(int)
    df_features['is_month_end'] = dates.is_month_end.astype(int)

    # isocalendar() returns the nullable UInt32 extension dtype, which some
    # LightGBM/XGBoost versions reject as a feature column. Convert it to a
    # plain NumPy dtype: int64 normally, float64 (NaN) when dates are missing,
    # which mirrors how the other .dt components behave.
    iso_week = dates.isocalendar().week
    df_features['week'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')

    # Monthly cyclical encoding using sine and cosine
    df_features['month_sin'] = np.sin(2 * np.pi * df_features['month'] / 12)
    df_features['month_cos'] = np.cos(2 * np.pi * df_features['month'] / 12)

    # Weekly cyclical encoding
    df_features['day_sin'] = np.sin(2 * np.pi * df_features['dayofweek'] / 7)
    df_features['day_cos'] = np.cos(2 * np.pi * df_features['dayofweek'] / 7)

    return df_features

In [5]:
def create_lag_features(df, lag_days=[1, 7, 14, 28, 365]):
    """Add lagged ``sales`` columns computed within each store-item series.

    Args:
        df: DataFrame with ``store``, ``item``, ``date`` and ``sales`` columns.
            It is not modified.
        lag_days: Row offsets to lag by; each one produces a
            ``sales_lag_<offset>`` column.

    Returns:
        pandas.DataFrame: a new frame sorted by store, item and date, carrying
        one extra column per requested lag. The first ``offset`` rows of every
        series are NaN in the matching column.
    """
    print("Creating lag features...")

    # Lags are positional, so each series has to be in chronological order.
    ordered = df.sort_values(['store', 'item', 'date'])
    sales_per_series = ordered.groupby(['store', 'item'])['sales']

    for offset in lag_days:
        ordered[f'sales_lag_{offset}'] = sales_per_series.shift(offset)

    return ordered

In [6]:
def create_rolling_features(df, windows=[7, 14, 30, 90]):
    """Create rolling window statistics for each store-item combination.

    Every statistic is computed over the *previous* rows of the series: the
    sales are shifted by one row before the window is applied. Without that
    shift the window would contain the current row's ``sales`` value, i.e. the
    very target the model is asked to predict (target leakage).

    Args:
        df: DataFrame with ``store``, ``item``, ``date`` and ``sales`` columns.
            It is not modified.
        windows: Window lengths in rows. Each one produces
            ``sales_rolling_mean_<w>``, ``sales_rolling_std_<w>``,
            ``sales_rolling_min_<w>`` and ``sales_rolling_max_<w>``.

    Returns:
        pandas.DataFrame: a new frame sorted by store, item and date with the
        rolling columns added. The first row of every series has no history
        and is therefore NaN in all rolling columns.
    """
    print("Creating rolling window features...")

    # Sort by date for proper rolling calculations
    df_rolling = df.sort_values(['store', 'item', 'date'])
    sales_by_series = df_rolling.groupby(['store', 'item'])['sales']

    for window in windows:
        for stat in ('mean', 'std', 'min', 'max'):
            # shift(1) keeps the current day's sales out of its own window;
            # min_periods=1 still yields a value as soon as one past day exists.
            df_rolling[f'sales_rolling_{stat}_{window}'] = sales_by_series.transform(
                lambda s, w=window, fn=stat: getattr(s.shift(1).rolling(w, min_periods=1), fn)()
            )

    return df_rolling

In [7]:
def create_target_encoding(df):
    """Create target encodings (mean ``sales``) for categorical variables.

    Args:
        df: DataFrame with ``store``, ``item``, ``month``, ``dayofweek`` and
            ``sales`` columns. It is not modified.

    Returns:
        pandas.DataFrame: a copy of ``df`` with the columns
        ``store_mean_sales``, ``item_mean_sales``, ``store_item_mean_sales``,
        ``month_mean_sales``, ``dayofweek_mean_sales``,
        ``month_store_mean_sales`` and ``month_item_mean_sales``. Each holds
        the mean of ``sales`` over all rows of ``df`` sharing that key.

    Note:
        The means are taken over every row passed in, so the encodings of a
        row include its own ``sales`` value and those of later dates.
    """
    print("Creating target encodings...")
    df_target = df.copy()

    def _interaction_encoding(keys):
        """Return, per row, the mean sales of the row's key combination (0 if unknown)."""
        lookup = df_target.groupby(keys)['sales'].mean().to_dict()
        key_columns = [df_target[key] for key in keys]
        # Iterating the key columns directly avoids building one Series per
        # row, which is what made DataFrame.apply(axis=1) slow here.
        return [lookup.get(combo, 0) for combo in zip(*key_columns)]

    # Target encoding for store
    store_means = df_target.groupby('store')['sales'].mean().to_dict()
    df_target['store_mean_sales'] = df_target['store'].map(store_means)

    # Target encoding for item
    item_means = df_target.groupby('item')['sales'].mean().to_dict()
    df_target['item_mean_sales'] = df_target['item'].map(item_means)

    # Target encoding for store-item interaction
    df_target['store_item_mean_sales'] = _interaction_encoding(['store', 'item'])

    # Target encoding by time components
    month_means = df_target.groupby('month')['sales'].mean().to_dict()
    df_target['month_mean_sales'] = df_target['month'].map(month_means)

    dayofweek_means = df_target.groupby('dayofweek')['sales'].mean().to_dict()
    df_target['dayofweek_mean_sales'] = df_target['dayofweek'].map(dayofweek_means)

    # Month-store interaction
    df_target['month_store_mean_sales'] = _interaction_encoding(['month', 'store'])

    # Month-item interaction
    df_target['month_item_mean_sales'] = _interaction_encoding(['month', 'item'])

    return df_target

In [8]:
def prepare_features(df):
    """Run the whole feature-engineering pipeline and drop incomplete rows.

    The steps run in a fixed order: time features, lag features, rolling
    features, then target encodings (which need the ``month`` and
    ``dayofweek`` columns produced by the first step).

    Args:
        df: DataFrame with ``date``, ``store``, ``item`` and ``sales`` columns.

    Returns:
        pandas.DataFrame: the engineered frame restricted to rows that have no
        NaN in any column.
    """
    print("Preparing all features...")

    pipeline = (
        create_time_features,
        create_lag_features,
        create_rolling_features,
        create_target_encoding,
    )
    engineered = df
    for step in pipeline:
        engineered = step(engineered)

    # Lag and rolling columns are undefined at the start of each series.
    complete_rows = engineered.dropna()
    print(f"Final features prepared. Shape after dropping NaN: {complete_rows.shape}")

    return complete_rows


In [9]:
# Model training and tuning functions
def objective(trial, X_train, y_train, X_val, y_val, model_type="lightgbm"):
    """Optuna objective: fit one candidate model and return its validation RMSE.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X_train, y_train: Data the candidate model is fitted on.
        X_val, y_val: Data the candidate model is scored on.
        model_type: ``"lightgbm"`` or ``"xgboost"``.

    Returns:
        The root mean squared error of the candidate on the validation data.

    Raises:
        ValueError: if ``model_type`` is not one of the two supported values.
    """
    if model_type not in ("lightgbm", "xgboost"):
        raise ValueError(f"Unknown model type: {model_type}")

    if model_type == "lightgbm":
        # LightGBM search space plus the settings held constant across trials.
        search_space = dict(
            objective='regression',
            metric='rmse',
            boosting_type='gbdt',
            n_estimators=trial.suggest_int('n_estimators', 100, 1000),
            learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
            num_leaves=trial.suggest_int('num_leaves', 20, 150),
            max_depth=trial.suggest_int('max_depth', 3, 15),
            min_data_in_leaf=trial.suggest_int('min_data_in_leaf', 5, 100),
            lambda_l1=trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
            lambda_l2=trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
            feature_fraction=trial.suggest_float('feature_fraction', 0.4, 1.0),
            bagging_fraction=trial.suggest_float('bagging_fraction', 0.4, 1.0),
            bagging_freq=trial.suggest_int('bagging_freq', 1, 10),
            verbose=-1,
            seed=42,
        )
        estimator = lgb.LGBMRegressor(**search_space)
    else:
        # XGBoost search space plus the settings held constant across trials.
        search_space = dict(
            objective='reg:squarederror',
            eval_metric='rmse',
            n_estimators=trial.suggest_int('n_estimators', 100, 1000),
            learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
            max_depth=trial.suggest_int('max_depth', 3, 15),
            min_child_weight=trial.suggest_int('min_child_weight', 1, 20),
            subsample=trial.suggest_float('subsample', 0.5, 1.0),
            colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1.0),
            gamma=trial.suggest_float('gamma', 1e-8, 1.0, log=True),
            reg_alpha=trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
            reg_lambda=trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
            verbosity=0,
            seed=42,
        )
        estimator = xgb.XGBRegressor(**search_space)

    estimator.fit(X_train, y_train)

    # Score the candidate on the held-out validation rows.
    validation_mse = mean_squared_error(y_val, estimator.predict(X_val))
    return np.sqrt(validation_mse)

In [10]:
def train_and_tune_model(df_features, target_col='sales', model_type="lightgbm", n_trials=50):
    """Train and tune a model with time series cross-validation.

    ``TimeSeriesSplit`` builds its folds purely from row position, so the rows
    are first ordered by ``date`` (when that column is present). Otherwise a
    frame sorted by store/item would be split by store instead of by time.
    Each held-out slice is halved: the first half is the validation set scored
    by Optuna, the second half is a test set used only for the printed report.

    Args:
        df_features: Engineered feature frame containing ``target_col`` and,
            normally, a ``date`` column. It is not modified.
        target_col: Name of the column to predict.
        model_type: ``"lightgbm"`` or ``"xgboost"``.
        n_trials: Number of Optuna trials per fold.

    Returns:
        tuple: ``(model, best_params, features)`` where ``model`` is the
        estimator fitted on the last fold's training rows, ``best_params`` the
        tuned hyperparameters of that fold, and ``features`` the list of
        feature column names (every column except ``date`` and ``target_col``).

    Raises:
        ValueError: if ``model_type`` is not one of the two supported values.
    """
    print(f"Training and tuning {model_type} model...")

    # Settings that objective() holds constant while tuning. study.best_params
    # only contains the *sampled* values, so these must be re-applied to the
    # final model or it would be refitted unseeded and with default logging.
    fixed_params = {
        "lightgbm": {
            'objective': 'regression',
            'metric': 'rmse',
            'boosting_type': 'gbdt',
            'verbose': -1,
            'seed': 42,
        },
        "xgboost": {
            'objective': 'reg:squarederror',
            'eval_metric': 'rmse',
            'verbosity': 0,
            'seed': 42,
        },
    }
    if model_type not in fixed_params:
        raise ValueError(f"Unknown model type: {model_type}")

    # Put the rows in chronological order so every fold trains on the past and
    # evaluates on the future. A stable sort keeps the existing order of rows
    # that share a date.
    if 'date' in df_features.columns:
        df_features = df_features.sort_values('date', kind='stable')

    # Prepare data for modeling
    features = [col for col in df_features.columns if col not in ['date', target_col]]
    X = df_features[features]
    y = df_features[target_col]

    # Time series cross-validation
    tscv = TimeSeriesSplit(n_splits=3)

    best_params = None
    model = None

    for train_idx, test_idx in tscv.split(X):
        X_train, X_holdout = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_holdout = y.iloc[train_idx], y.iloc[test_idx]

        # Further split the held-out slice into validation and test halves
        split_point = len(X_holdout) // 2
        X_val, X_test = X_holdout.iloc[:split_point], X_holdout.iloc[split_point:]
        y_val, y_test = y_holdout.iloc[:split_point], y_holdout.iloc[split_point:]

        print(f"Split sizes - Train: {len(X_train)}, Validation: {len(X_val)}, Test: {len(X_test)}")

        # Hyperparameter tuning with Optuna; the sampler is seeded so repeated
        # runs explore the same sequence of candidates.
        study = optuna.create_study(
            direction='minimize',
            sampler=optuna.samplers.TPESampler(seed=42),
        )
        study.optimize(
            lambda trial: objective(trial, X_train, y_train, X_val, y_val, model_type),
            n_trials=n_trials,
        )

        best_params = study.best_params
        print(f"Best parameters: {best_params}")

        # Train model with the fixed settings plus the best sampled parameters
        final_params = {**fixed_params[model_type], **best_params}
        if model_type == "lightgbm":
            model = lgb.LGBMRegressor(**final_params)
        else:  # xgboost
            model = xgb.XGBRegressor(**final_params)

        model.fit(X_train, y_train)

        # Evaluate on test set
        y_pred = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)

        print(f"Test RMSE: {rmse:.4f}, MAE: {mae:.4f}")

    # Return the model from the last fold and best parameters
    return model, best_params, features

In [11]:
def generate_future_dates(last_date, months=3):
    """List every day after ``last_date`` up to ``months`` calendar months later.

    Args:
        last_date: Last known date; the first generated day is the day after.
        months: Length of the horizon in calendar months.

    Returns:
        list: consecutive daily dates from ``last_date + 1 day`` through
        ``last_date + months`` (inclusive). Empty when the horizon ends before
        the first day.
    """
    print(f"Generating future dates for {months} months from {last_date}...")

    one_day = timedelta(days=1)
    cursor = last_date + one_day
    # DateOffset moves by calendar months, so the horizon length in days varies.
    horizon_end = last_date + pd.DateOffset(months=months)

    upcoming = []
    while cursor <= horizon_end:
        upcoming.append(cursor)
        cursor = cursor + one_day

    return upcoming

In [12]:
def prepare_future_features(df, last_date, months=3, store_ids=None, item_ids=None):
    """Prepare features for future prediction.

    Builds one row per (future date, store, item), appends those rows to the
    historical sales with ``sales`` set to NaN, and runs the same feature
    pipeline as training over the combined frame: time features, lag
    features, rolling features, then target encodings.

    Args:
        df: Historical data with ``date``, ``store``, ``item`` and ``sales``.
        last_date: Last historical date; only rows after it are returned.
        months: Forecast horizon in calendar months.
        store_ids: Stores to forecast; defaults to every store in ``df``.
        item_ids: Items to forecast; defaults to every item in ``df``.

    Returns:
        pandas.DataFrame: the future rows with all engineered columns.
        ``sales`` is NaN on these rows, and lag/rolling columns are NaN
        wherever their look-back falls on other future rows.
    """
    print("Preparing features for future prediction...")

    # If store_ids and item_ids are not provided, use all from the dataframe
    if store_ids is None:
        store_ids = df['store'].unique()
    if item_ids is None:
        item_ids = df['item'].unique()

    # Generate future dates
    future_dates = generate_future_dates(last_date, months)

    # Create combinations of store, item, and future dates
    future_rows = [
        {'date': date, 'store': store, 'item': item}
        for date in future_dates
        for store in store_ids
        for item in item_ids
    ]
    future_df = pd.DataFrame(future_rows, columns=['date', 'store', 'item'])
    # Keeps the column datetime-typed even when there are no future rows.
    future_df['date'] = pd.to_datetime(future_df['date'])

    # For each store-item combination, we need historical data to create lag features
    print("Creating historical context for future predictions...")

    # Combine historical data with future dataframe
    combined_df = pd.concat([
        df[['date', 'store', 'item', 'sales']],
        future_df.assign(sales=np.nan)
    ]).sort_values(['store', 'item', 'date']).reset_index(drop=True)

    # Time features must be built on the combined frame: the target encodings
    # below group by 'month' and 'dayofweek', and the model expects every
    # time column it was trained on to be present in the returned rows.
    combined_df = create_time_features(combined_df)

    # Create lag features
    combined_df = create_lag_features(combined_df)

    # Create rolling features
    combined_df = create_rolling_features(combined_df)

    # Create target encodings (means skip the NaN sales of the future rows)
    combined_df = create_target_encoding(combined_df)

    # Extract only the future rows with features
    future_with_features = combined_df[combined_df['date'] > last_date].copy()

    print(f"Future features prepared. Shape: {future_with_features.shape}")
    return future_with_features

In [13]:
def predict_future(model, df_features, future_df, features):
    """Generate predictions for future dates.

    Args:
        model: Fitted estimator exposing ``predict(X)``.
        df_features: Training feature frame; its column means are used to
            fill missing values in the future features.
        future_df: Future rows containing at least the ``features`` columns.
            It is not modified.
        features: Names of the feature columns, in the order the model expects.

    Returns:
        pandas.DataFrame: a copy of ``future_df`` with a ``predicted_sales``
        column added, clipped so that no prediction is negative.
    """
    print("Generating predictions for future dates...")

    # Work on an explicit copy: assigning into a selection of future_df would
    # be chained assignment and could leak changes back into the caller's frame.
    X_future = future_df[features].copy()

    # Fill missing values with the training mean, touching only the columns
    # that actually contain gaps.
    missing_cols = X_future.columns[X_future.isna().any()]
    if len(missing_cols) > 0:
        training_means = df_features[missing_cols].mean()
        X_future[missing_cols] = X_future[missing_cols].fillna(training_means)

    # Generate predictions
    future_predictions = np.asarray(model.predict(X_future))

    # Ensure no negative sales predictions
    return future_df.assign(predicted_sales=np.clip(future_predictions, 0, None))


In [14]:
def visualize_predictions(df, future_df, store_id=1, item_id=1):
    """Visualize historical sales and future predictions for a specific store and item.

    The chart is written to ``store_<store_id>_item_<item_id>_prediction.png``
    in the current working directory and the figure is closed afterwards.

    Args:
        df: Historical data with ``date``, ``store``, ``item`` and ``sales``.
        future_df: Forecast rows with ``date``, ``store``, ``item`` and
            ``predicted_sales``.
        store_id: Store to plot.
        item_id: Item to plot.
    """
    print(f"Visualizing predictions for Store {store_id}, Item {item_id}...")

    # Filter historical data for the specific store and item
    historical = df[(df['store'] == store_id) & (df['item'] == item_id)].copy()
    historical = historical.sort_values('date')

    # Filter future predictions for the specific store and item
    future = future_df[(future_df['store'] == store_id) & (future_df['item'] == item_id)].copy()
    future = future.sort_values('date')

    fig = plt.figure(figsize=(15, 7))
    try:
        plt.plot(historical['date'], historical['sales'], label='Historical Sales', color='blue')
        plt.plot(future['date'], future['predicted_sales'], label='Predicted Sales', color='red', linestyle='--')

        # Add vertical line separating historical and future data. With no
        # history for this store/item the maximum date is NaT, which cannot
        # be formatted or drawn, so the marker is skipped.
        if not historical.empty:
            last_historical_date = historical['date'].max()
            plt.axvline(x=last_historical_date, color='green', linestyle='-', alpha=0.7,
                        label=f'Last Historical Date: {last_historical_date.strftime("%Y-%m-%d")}')

        plt.title(f'Sales Prediction for Store {store_id}, Item {item_id}')
        plt.xlabel('Date')
        plt.ylabel('Sales')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(f'store_{store_id}_item_{item_id}_prediction.png')
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)

In [17]:
def evaluate_feature_importance(model, features, model_type="lightgbm"):
    """Evaluate and visualize feature importance.

    Prints the ten most important features and draws a horizontal bar chart
    of the highest-ranked ones.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        features: Feature names, in the order the model was trained on.
        model_type: Label used in the chart title (``"lightgbm"`` or
            ``"xgboost"``).

    Returns:
        pandas.DataFrame: columns ``feature`` and ``importance``, sorted from
        most to least important.
    """
    print("Analyzing feature importance...")

    # NOTE(review): assumes both the LightGBM and XGBoost scikit-learn wrappers
    # expose `feature_importances_` - confirm for the installed versions.
    importance = pd.DataFrame({
        'feature': list(features),
        'importance': np.asarray(model.feature_importances_),
    }).sort_values('importance', ascending=False).reset_index(drop=True)

    print(importance.head(10).to_string(index=False))

    max_bars = 20  # keep the chart readable when there are many features
    # Reverse so the most important feature ends up at the top of the chart.
    shown = importance.head(max_bars).iloc[::-1]

    fig = plt.figure(figsize=(12, 10))
    try:
        plt.barh(shown['feature'], shown['importance'])
        plt.title(f'Feature Importance ({model_type})')
        plt.xlabel('Importance')
        plt.ylabel('Feature')
        plt.tight_layout()
        plt.show()
    finally:
        # Always release the figure, even if drawing failed.
        plt.close(fig)

    return importance

In [21]:
def main():
    """Run the whole forecasting pipeline end to end.

    Loads ``train.csv``, engineers features, holds out the last three months,
    tunes and trains the model, reports hold-out metrics, forecasts the next
    three months into ``sales_predictions.csv``, saves a few store/item charts
    and finally analyses feature importance.
    """
    # Settings
    input_path, output_path = 'train.csv', 'sales_predictions.csv'
    model_type = "lightgbm"  # Change to "xgboost" if preferred
    n_trials = 50  # Number of hyperparameter optimization trials

    # 1. Load data and show summary statistics
    sales = load_data(input_path)
    print("\nBasic Statistics:")
    print(sales.describe())

    # 2. Prepare features
    engineered = prepare_features(sales)

    # 3. Hold out the final three months; everything earlier is training data
    last_date = sales['date'].max()
    train_end_date = last_date - pd.DateOffset(months=3)
    print(f"\nTraining on data up to: {train_end_date}")

    train_part = engineered[engineered['date'] <= train_end_date].copy()
    holdout_part = engineered[engineered['date'] > train_end_date].copy()
    print(f"Training data shape: {train_part.shape}")
    print(f"Validation data shape: {holdout_part.shape}")

    # 4. Train and tune the model
    model, best_params, features = train_and_tune_model(
        train_part, target_col='sales', model_type=model_type, n_trials=n_trials
    )

    # 5. Score the model on the held-out months
    holdout_truth = holdout_part['sales']
    holdout_pred = model.predict(holdout_part[features])
    val_rmse = np.sqrt(mean_squared_error(holdout_truth, holdout_pred))
    val_mae = mean_absolute_error(holdout_truth, holdout_pred)
    print(f"\nValidation metrics - RMSE: {val_rmse:.4f}, MAE: {val_mae:.4f}")

    # 6-7. Build the future feature rows and forecast them
    future_rows = prepare_future_features(sales, last_date, months=3)
    forecast = predict_future(model, engineered, future_rows, features)

    # 8. Save predictions
    export_columns = ['date', 'store', 'item', 'predicted_sales']
    forecast[export_columns].to_csv(output_path, index=False)
    print(f"\nFuture predictions saved to {output_path}")

    # 9. Visualize predictions for a few store-item combinations
    sample_pairs = [(store_id, item_id) for store_id in [1, 2, 3] for item_id in [1, 15, 25]]
    for store_id, item_id in sample_pairs:
        visualize_predictions(sales, forecast, store_id, item_id)

    # 10. Analyze feature importance
    print("\nTop 10 most important features:")
    evaluate_feature_importance(model, features, model_type)

In [26]:
# Run the full pipeline: load data, build features, tune/train, forecast and plot.
main()