In [None]:
from azure.storage.blob import BlobServiceClient
import pandas as pd
import io

import numpy as np
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
import gc


In [None]:
# Azure Blob Storage connection settings.
# The real values were scrubbed from this file; fill them in before running.
conn_str = " "
container = " "

# Names of the CSV blobs this notebook works with
files = [
    "calendar.csv",
    "sell_prices.csv",
    "sales_train_validation.csv",
    "sales_train_evaluation.csv",
    "sample_submission.csv",
]

# Client handles: the service-level client first, then the container the blobs are read from
blob_service = BlobServiceClient.from_connection_string(conn_str)
container_client = blob_service.get_container_client(container)

def load_csv_from_blob(filename):
    """Download one blob from the configured container and parse it as CSV.

    Parameters
    ----------
    filename : str
        Name of the blob, looked up through the module-level ``container_client``.

    Returns
    -------
    pandas.DataFrame
        The blob's bytes parsed by ``pd.read_csv`` with default options.
    """
    downloader = container_client.get_blob_client(filename).download_blob()
    # The whole payload is read into memory and wrapped so pandas can treat it as a file
    raw_bytes = downloader.readall()
    buffer = io.BytesIO(raw_bytes)
    return pd.read_csv(buffer)

# Fetch every input table; the unpacking order matches the blob-name order below
calendar, sell_prices, sales_train, sales_eval, sample = [
    load_csv_from_blob(blob_name)
    for blob_name in (
        "calendar.csv",
        "sell_prices.csv",
        "sales_train_validation.csv",
        "sales_train_evaluation.csv",
        "sample_submission.csv",
    )
]

print("All files loaded")

All files loaded


In [None]:

# Downcasting memory
def reduce_mem(df):
    """Downcast 64-bit numeric columns of ``df`` to their 32-bit counterparts.

    float64 columns become float32 and int64 columns become int32; every other
    column is left as it is. The frame is modified in place and also returned.
    """
    narrowing = (('float64', 'float32'), ('int64', 'int32'))
    for name in df.columns:
        current = df[name].dtype
        for wide, narrow in narrowing:
            if current == wide:
                df[name] = df[name].astype(narrow)
                break
    return df
# Downcast every frame that feeds the feature pipeline.
calendar = reduce_mem(calendar)
sell_prices = reduce_mem(sell_prices)
# `sales` was never bound before this point (only sales_train / sales_eval are loaded),
# so a fresh kernel raised NameError here. The evaluation file is used because later
# cells select d_1914..d_1941 from the melted sales frame.
# NOTE(review): assumes sales_train_evaluation.csv carries those day columns — confirm.
sales = reduce_mem(sales_eval)


In [None]:
# Reshape wide -> long: the identifier columns are kept, every other column becomes
# one row with its column name in 'd' and its value in 'sales'
sales_long = sales.melt(
    id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
    var_name='d',
    value_name='sales',
)

In [None]:
# Merged with calendar and prices.
# The date strings are parsed on the calendar table before the join, so parsing runs
# once per calendar row rather than once per row of the merged frame; df['date'] ends
# up as datetime64 either way.
df = sales_long.merge(
    calendar.assign(date=pd.to_datetime(calendar['date'])),
    on='d',
    how='left',
)
df = df.merge(sell_prices, on=['store_id', 'item_id', 'wm_yr_wk'], how='left')

# Encoding categoricals: each listed column is replaced by its integer category code
# (pandas assigns code -1 to missing values)
cat_features = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'event_name_1', 'event_type_1']
for col in cat_features:
    df[col] = df[col].astype('category').cat.codes


In [None]:

# Feature Engineering
def create_features(df):
    """Add lag, rolling, calendar and price-change features to the long sales frame.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'id', 'sales', 'date' (datetime64) and 'sell_price', and a
        unique index. The rows of each 'id' must already be in chronological order,
        because the lags are positional shifts within each group.

    Returns
    -------
    pandas.DataFrame
        The same object with 'lag_28', 'lag_7', 'rolling_mean_7', 'rolling_std_7',
        'dayofweek', 'month' and 'price_change_t1' added, and NaN in 'sell_price'
        replaced by 0.
    """
    sales_by_id = df.groupby('id')['sales']
    lag_28 = sales_by_id.shift(28)
    df['lag_28'] = lag_28
    df['lag_7'] = sales_by_id.shift(7)

    # groupby(...).shift() returns an ungrouped Series, so the window must be grouped
    # by id again; otherwise it slides over neighbouring rows that belong to other ids.
    lagged_windows = lag_28.groupby(df['id']).rolling(7)
    # The grouped rolling result is indexed by (id, original index); dropping the id
    # level lets the assignment align on the original row index.
    df['rolling_mean_7'] = lagged_windows.mean().reset_index(level=0, drop=True)
    df['rolling_std_7'] = lagged_windows.std().reset_index(level=0, drop=True)

    df['dayofweek'] = df['date'].dt.dayofweek
    df['month'] = df['date'].dt.month

    df['sell_price'] = df['sell_price'].fillna(0)
    # A step from the 0 placeholder to a real price is a division by zero (+/-inf);
    # map it to NaN so it is handled like any other missing value downstream.
    price_change = df.groupby(['id'])['sell_price'].pct_change()
    df['price_change_t1'] = price_change.replace([np.inf, -np.inf], np.nan)
    return df

# Apply the feature builder; create_features writes the new columns onto df and returns it
df = create_features(df)


In [None]:
# Filtered for training: keep days d_1000..d_1941, then drop rows whose engineered
# features are undefined (e.g. not enough history for the 28-day lag).
# A bare dropna() discards a row when *any* column is NaN, including calendar columns
# that are not label-encoded earlier and are unrelated to the engineered features.
# NOTE(review): if the calendar has sparsely populated columns (a secondary event
# column, for instance), a bare dropna() would remove nearly every row — confirm.
# Chaining on the filtered result also avoids an in-place dropna on a slice of df.
modelling_days = [f'd_{i}' for i in range(1000, 1942)]
required_features = ['lag_28', 'lag_7', 'rolling_mean_7', 'rolling_std_7', 'price_change_t1']
df = df[df['d'].isin(modelling_days)].dropna(subset=required_features)

In [None]:
# Chronological hold-out: days d_1000..d_1913 are used for fitting,
# days d_1914..d_1941 for testing
train = df.loc[df['d'].isin([f'd_{day}' for day in range(1000, 1914)])]
test = df.loc[df['d'].isin([f'd_{day}' for day in range(1914, 1942)])]

# Keys, the raw date and the target are not model inputs
y_train = train['sales']
X_train = train.drop(columns=['id', 'd', 'date', 'sales'])
X_test = test.drop(columns=['id', 'd', 'date', 'sales'])

In [None]:
# Convert object columns in X_train and X_test to category codes.
# The category list is built from both frames together so a given label maps to the
# same integer in train and test; encoding each frame on its own can assign different
# codes whenever the two frames do not contain the same set of labels.
for col in X_train.select_dtypes(include='object').columns:
    shared_dtype = pd.CategoricalDtype(
        pd.concat([X_train[col], X_test[col]]).astype('category').cat.categories
    )
    # Missing values get code -1 in both frames
    X_train[col] = X_train[col].astype(shared_dtype).cat.codes
    X_test[col] = X_test[col].astype(shared_dtype).cat.codes


In [None]:
# Convert object columns to categorical and then to numeric codes, but leave the row
# keys alone: later cells filter df['d'] against 'd_<n>' labels and join/export on the
# string 'id', and both stop matching once those columns are replaced by integer codes.
for col in df.select_dtypes(include='object').columns.difference(['id', 'd']):
    df[col] = df[col].astype('category').cat.codes

In [None]:
# Sanity check: list the columns of df that still have object dtype
print(df.select_dtypes(include='object').columns)

Index([], dtype='object')


In [None]:
# Training XGBoost: hyper-parameters are gathered in one mapping, then the
# regressor is built from it and fitted on the training split
xgb_params = {
    'n_estimators': 300,
    'max_depth': 9,
    'learning_rate': 0.2,
    'subsample': 0.9,
    'colsample_bytree': 0.8,
    'tree_method': 'hist',
    'n_jobs': -1,
    'random_state': 42,
}
model = XGBRegressor(**xgb_params)
model.fit(X_train, y_train)

In [None]:
# Forecasting next 28 days (d_1914 to d_1941)
forecast_df = df[df['d'].isin([f'd_{i}' for i in range(1914, 1942)])].copy()

# Model inputs are every column except the keys, the raw date, the target and any
# prediction column left over from an earlier run of this cell.
non_feature_cols = ['id', 'd', 'date', 'sales', 'predicted_sales']
feature_cols = [c for c in forecast_df.columns if c not in non_feature_cols]

# Rows with a missing feature are dropped; a missing earlier prediction does not
# exclude a row from being forecast again.
forecast_df = forecast_df.dropna(subset=feature_cols)
X_forecast = forecast_df[feature_cols]

# Predicting using trained model
forecast_df['predicted_sales'] = model.predict(X_forecast)

# Adding predictions back to original df.
# The incoming column is renamed explicitly: merge suffixes are only applied when both
# sides share a column name, so on a first run (df has no 'predicted_sales' yet) the
# '_new' column did not exist and the fillna below raised KeyError.
new_preds = forecast_df[['id', 'd', 'predicted_sales']].rename(
    columns={'predicted_sales': 'predicted_sales_new'}
)
df = df.merge(new_preds, on=['id', 'd'], how='left')
if 'predicted_sales' in df.columns:
    # Existing predictions win; only gaps are filled from the new ones
    df['predicted_sales'] = df['predicted_sales'].fillna(df['predicted_sales_new'])
else:
    df['predicted_sales'] = df['predicted_sales_new']
df = df.drop(columns='predicted_sales_new')

print("✅ Forecasts added for d_1914 to d_1941")


✅ Forecasts added for d_1914 to d_1941


In [None]:
#Forecasting future dates
# `test_df` was never defined in this notebook (NameError on a fresh kernel). It is
# built from `test`, whose rows line up one-to-one with X_test because X_test is
# `test` with the non-feature columns dropped.
test_df = test[['id', 'd']].copy()
test_df['predicted_sales'] = model.predict(X_test)

# Merge predictions back into main df.
# The incoming column is renamed explicitly because merge suffixes are only applied
# when both frames already share the column name.
new_preds = test_df.rename(columns={'predicted_sales': 'predicted_sales_new'})
df = df.merge(new_preds, on=['id', 'd'], how='left')
if 'predicted_sales' in df.columns:
    # Existing predictions win; only gaps are filled from the new ones
    df['predicted_sales'] = df['predicted_sales'].fillna(df['predicted_sales_new'])
else:
    df['predicted_sales'] = df['predicted_sales_new']
df = df.drop(columns='predicted_sales_new')


In [None]:
# Loading sample_submission.csv; only its 'id' column is used, to fix the set of output rows
sample = load_csv_from_blob("sample_submission.csv")
valid_ids = sample[['id']].copy()

# Extracting forecast rows with predictions
forecast = df.loc[
    df['d'].isin([f'd_{i}' for i in range(1914, 1942)]),
    ['id', 'd', 'predicted_sales'],
].copy()

# Converting d_1914 → F1, ..., d_1941 → F28.
# The previous pattern r'd_19(\\d{2})' is a raw string, so '\\d' meant a literal
# backslash followed by 'd' and never matched a day label; the extract produced NaN
# and the integer cast failed. The full day number is parsed instead, offset by 1913.
forecast['F'] = forecast['d'].astype(str).str.extract(r'd_(\d+)', expand=False).astype(int) - 1913
forecast['F'] = 'F' + forecast['F'].astype(str)

# Converting ids to _evaluation (plain substring replacement, not a regex)
forecast['id'] = forecast['id'].astype(str).str.replace('_validation', '_evaluation', regex=False)

# Dropping any accidental duplicates (very important): pivot requires unique (id, F) pairs
forecast = forecast.drop_duplicates(subset=['id', 'F'])

# Pivot: id | F1 | F2 | ... | F28
pivot = forecast.pivot(index='id', columns='F', values='predicted_sales').reset_index()

# Merging with sample submission so the output has exactly the sample's ids, in its order
submission = valid_ids.merge(pivot, on='id', how='left')

# Ensuring all columns exist and fill missing with 0.
# NOTE(review): any sample id without a matching forecast row is written as all zeros —
# confirm that is intended.
for f in [f'F{i}' for i in range(1, 29)]:
    if f not in submission.columns:
        submission[f] = 0.0
submission = submission.fillna(0)

# Save to disk
submission = submission[['id'] + [f'F{i}' for i in range(1, 29)]]
submission.to_csv("submission.csv", index=False)
print("✅ submission.csv created and ready for upload")


✅ submission.csv created and ready for upload
