# Sales Predictor - XGBoost Model

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

In [None]:
# Read the training set (the 'date' column is parsed to datetimes on load) and
# order the rows chronologically within each (store, item) series, so that the
# positional shift/rolling operations applied to it afterwards follow time order.
df_train = (
    pd.read_csv('/mnt/data/train.csv', parse_dates=['date'])
    .sort_values(by=['store', 'item', 'date'])
)


In [None]:
# Feature engineering on the training series.
#
# Every feature of a row is built only from sales on *earlier* rows of the same
# (store, item) series, so no feature contains the row's own target value.
# This relies on df_train being sorted by (store, item, date).

# Lag Features: sales 1, 7 and 30 rows back within the series
df_train['sales_lag_1'] = df_train.groupby(['store', 'item'])['sales'].shift(1)
df_train['sales_lag_7'] = df_train.groupby(['store', 'item'])['sales'].shift(7)
df_train['sales_lag_30'] = df_train.groupby(['store', 'item'])['sales'].shift(30)

# Difference Features: change of the most recent known sales value (lag 1)
# versus 1, 7 and 30 rows before it.
# They are deliberately NOT computed as `sales - sales_lag_k`: that expression
# contains the target itself (sales == diff_k + sales_lag_k), which leaks the
# label into the features and cannot be evaluated when sales is unknown.
# NOTE(review): whatever builds features at prediction time must use the same
# definitions for diff_1 / diff_7 / diff_30.
df_train['diff_1'] = df_train['sales_lag_1'] - df_train.groupby(['store', 'item'])['sales'].shift(2)
df_train['diff_7'] = df_train['sales_lag_1'] - df_train.groupby(['store', 'item'])['sales'].shift(8)
df_train['diff_30'] = df_train['sales_lag_1'] - df_train.groupby(['store', 'item'])['sales'].shift(31)

# Rolling Statistics: mean / std over the previous 7 and 30 rows; the shift(1)
# keeps the current row's sales out of its own window.
df_train['rolling_mean_7'] = df_train.groupby(['store', 'item'])['sales'].transform(lambda x: x.shift(1).rolling(window=7).mean())
df_train['rolling_mean_30'] = df_train.groupby(['store', 'item'])['sales'].transform(lambda x: x.shift(1).rolling(window=30).mean())
df_train['rolling_std_7'] = df_train.groupby(['store', 'item'])['sales'].transform(lambda x: x.shift(1).rolling(window=7).std())
df_train['rolling_std_30'] = df_train.groupby(['store', 'item'])['sales'].transform(lambda x: x.shift(1).rolling(window=30).std())

# Drop rows with NaN (from feature engineering): the first 31 rows of every
# series lack enough history for the longest lookback (shift(31)).
df_train.dropna(inplace=True)


In [None]:
# Assemble the XGBoost training matrix: 'sales' is the regression target and
# every remaining column except the raw 'date' is passed as a feature.
y_train = df_train['sales']
X_train = df_train.drop(columns=['date', 'sales'])
dtrain = xgb.DMatrix(data=X_train, label=y_train)


In [None]:
# Booster configuration: squared-error regression objective, RMSE as the
# evaluation metric, and the random seed pinned to 42.
params = dict(
    objective='reg:squarederror',
    eval_metric='rmse',
    seed=42,
)

# Fit the booster on the training matrix for 100 boosting rounds.
model = xgb.train(params=params, dtrain=dtrain, num_boost_round=100)


In [None]:
# Read the test set ('date' parsed to datetimes) and order it the same way as
# the training data: by store, then item, then date.
df_test = (
    pd.read_csv('/mnt/data/test.csv', parse_dates=['date'])
    .sort_values(by=['store', 'item', 'date'])
)

# Stack the raw training columns on top of the test rows and re-sort, so each
# (store, item) series runs from its training history straight into its test
# rows. Columns present in only one of the two frames are filled with NaN.
full_df = pd.concat(
    [df_train.loc[:, ['date', 'store', 'item', 'sales']], df_test],
    sort=False,
).sort_values(by=['store', 'item', 'date'])


In [None]:
# Feature engineering on combined data (training history followed by test rows).
# Every feature is derived from sales on earlier rows of the same (store, item)
# series; this relies on full_df being sorted by (store, item, date).

# Lag features: sales 1, 7 and 30 rows back within the series
full_df['sales_lag_1'] = full_df.groupby(['store', 'item'])['sales'].shift(1)
full_df['sales_lag_7'] = full_df.groupby(['store', 'item'])['sales'].shift(7)
full_df['sales_lag_30'] = full_df.groupby(['store', 'item'])['sales'].shift(30)

# Difference features: change of the most recent known sales value (lag 1)
# versus 1, 7 and 30 rows before it.
# They are deliberately NOT computed as `sales - sales_lag_k`: the rows to be
# predicted have no sales value, so that expression is NaN for every one of them
# (and on rows with known sales it would contain the target itself).
# NOTE(review): the features the model is trained on must define
# diff_1 / diff_7 / diff_30 the same way.
full_df['diff_1'] = full_df['sales_lag_1'] - full_df.groupby(['store', 'item'])['sales'].shift(2)
full_df['diff_7'] = full_df['sales_lag_1'] - full_df.groupby(['store', 'item'])['sales'].shift(8)
full_df['diff_30'] = full_df['sales_lag_1'] - full_df.groupby(['store', 'item'])['sales'].shift(31)

# Rolling statistics: mean / std over the previous 7 and 30 rows; the shift(1)
# keeps the current row out of its own window.
full_df['rolling_mean_7'] = full_df.groupby(['store', 'item'])['sales'].transform(lambda x: x.shift(1).rolling(window=7).mean())
full_df['rolling_mean_30'] = full_df.groupby(['store', 'item'])['sales'].transform(lambda x: x.shift(1).rolling(window=30).mean())
full_df['rolling_std_7'] = full_df.groupby(['store', 'item'])['sales'].transform(lambda x: x.shift(1).rolling(window=7).std())
full_df['rolling_std_30'] = full_df.groupby(['store', 'item'])['sales'].transform(lambda x: x.shift(1).rolling(window=30).std())


In [None]:
# Keep only the rows to be predicted (identified by a missing 'sales' value),
# remove the non-feature columns 'date' and 'sales', and replace every remaining
# NaN with 0.
# NOTE(review): rows without sales contribute NaN to the lag/rolling windows of
# the rows that follow them, so features that would draw on other test rows are
# NaN and become 0 here — confirm this is the intended treatment.
df_test_feat = (
    full_df.loc[full_df['sales'].isna()]
    .drop(columns=['date', 'sales'])
    .fillna(0)
)

# 'id' stays in df_test_feat but is not a model input, so it is left out of
# the prediction matrix.
dtest = xgb.DMatrix(df_test_feat.drop(columns='id'))

# Predict: one value per row of df_test_feat, in the same row order
preds = model.predict(dtest)


In [None]:
# Create submission: one row per test id with its predicted sales.
# `preds` is positional, so it must be in the same row order as df_test_feat.
submission = pd.DataFrame({
    'id': df_test_feat['id'],
    'sales': preds
})

# 'id' reaches this point as float64 when the test rows were concatenated with
# rows that have no 'id' (pandas upcasts the column to hold NaN). Cast it back
# to integer so the CSV contains "0, 1, 2, ..." instead of "0.0, 1.0, 2.0, ...".
if pd.api.types.is_float_dtype(submission['id']):
    submission['id'] = submission['id'].astype('int64')

# Save to CSV (without the DataFrame index)
submission.to_csv('/mnt/data/xgb_submission.csv', index=False)

print("Submission file created: xgb_submission.csv")
