In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_percentage_error

In [None]:
# Folder that holds the three raw CSV inputs.
# NOTE: this is an absolute, machine-specific location.
DATA_DIR = "/Users/pranjalsharma/Downloads/sales-forecasting/archive"

train_path = f"{DATA_DIR}/train.csv"
test_path  = f"{DATA_DIR}/test.csv"
store_path = f"{DATA_DIR}/store.csv"


In [None]:
# Read the three raw tables from train_path / test_path / store_path.
train = pd.read_csv(train_path)
test  = pd.read_csv(test_path)
store = pd.read_csv(store_path)
print(train.shape, test.shape, store.shape)

# Convert the Date columns to datetime so the .dt accessor works on them.
train['Date'] = pd.to_datetime(train['Date'])
test['Date']  = pd.to_datetime(test['Date'])

# Preview goes last: a notebook cell only renders its final expression, so a
# bare `train.head()` in the middle of the cell displayed nothing.
train.head()


In [None]:
# Derive calendar features from the Date column on both frames (in place).
# The loop variable is not named `df`, so it cannot be confused with the
# merged frame that is built after the loop.
for frame in (train, test):
    frame['Year']  = frame['Date'].dt.year
    frame['Month'] = frame['Date'].dt.month
    frame['Day']   = frame['Date'].dt.day
    frame['DayOfWeek'] = frame['Date'].dt.dayofweek  # .dt.dayofweek: Monday=0 ... Sunday=6

# Attach the store table to the training rows. This now runs once, after the
# loop; previously it was indented into the loop body, so it rebound the loop
# variable and repeated the full merge on every iteration.
df = train.merge(store, on='Store', how='left')
print(df.shape)
df.head()

In [None]:
# Number of missing values per column, largest count first.
df.isna().sum().sort_values(ascending=False)


In [None]:
# Missing-value handling for the store-level columns.
# Every result is assigned back to its column rather than calling
# `df[col].fillna(..., inplace=True)`: that chained in-place form raises a
# FutureWarning on pandas 2.x and does not update `df` under Copy-on-Write.

# CompetitionDistance: replace NaN with the column median
df['CompetitionDistance'] = df['CompetitionDistance'].fillna(df['CompetitionDistance'].median())

# CompetitionOpenSince*: fall back to the row's own Year / Month
df['CompetitionOpenSinceYear'] = df['CompetitionOpenSinceYear'].fillna(df['Year'])
df['CompetitionOpenSinceMonth'] = df['CompetitionOpenSinceMonth'].fillna(df['Month'])

# Promo2-related: 0 and the string 'None' are used as fill values
df['Promo2SinceYear'] = df['Promo2SinceYear'].fillna(0)
df['Promo2SinceWeek'] = df['Promo2SinceWeek'].fillna(0)
df['PromoInterval'] = df['PromoInterval'].fillna('None')


In [None]:
# Columns that get expanded into indicator variables
cat_cols = ['StoreType', 'Assortment', 'PromoInterval']

# Cast all of them to category dtype with a single astype mapping
df = df.astype({name: 'category' for name in cat_cols})

# One-hot encode; drop_first=True leaves out the first level of each column
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)


In [None]:
# Weekend / weekday flag: DayOfWeek values 5 and 6 count as weekend
df['IsWeekend'] = df['DayOfWeek'].isin([5, 6]).astype(int)

# Months between the CompetitionOpenSince year/month and the row's year/month
months_since_competition = (
    12 * (df['Year'] - df['CompetitionOpenSinceYear'])
    + (df['Month'] - df['CompetitionOpenSinceMonth'])
)

# Competition active flag: 1 when the month difference is strictly positive,
# otherwise 0. A vectorised comparison replaces the per-row Python lambda;
# NaN compares as False, so it still maps to 0.
df['CompetitionOpen'] = (months_since_competition > 0).astype(int)

# Promo ongoing flag (integer copy of the Promo column)
df['IsPromo'] = df['Promo'].astype(int)


In [None]:
# Descriptive statistics
print(df.describe())

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() added a stray "None" line.
df.info()

# Total unique stores
print("Number of stores:", df['Store'].nunique())


In [None]:
# Total Sales per Date, summed over every row of df
daily_sales = df.groupby('Date')['Sales'].sum()

# Draw on an explicit Axes instead of the implicit pyplot state
fig, ax = plt.subplots(figsize=(12, 6))
daily_sales.plot(ax=ax)
ax.set_title('Total Daily Sales Trend (All Stores)')
ax.set_xlabel('Date')
ax.set_ylabel('Sales')
plt.show()


In [None]:
# Total Sales for each (Year, Month) pair, flattened back to columns
monthly_sales = (
    df
    .groupby(['Year', 'Month'])['Sales']
    .sum()
    .reset_index()
)

# Build a first-of-month timestamp from Year / Month plus a constant DAY=1
date_parts = monthly_sales[['Year', 'Month']].assign(DAY=1)
monthly_sales['YearMonth'] = pd.to_datetime(date_parts)

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(monthly_sales['YearMonth'], monthly_sales['Sales'])
ax.set_title('Monthly Sales Trend')
ax.set_xlabel('Month')
ax.set_ylabel('Sales')
plt.show()


In [None]:
# Mean Sales for each value of the Promo column
promo_effect = df.groupby('Promo')['Sales'].mean()

ax = sns.barplot(x=promo_effect.index, y=promo_effect.values)
ax.set_title('Average Sales with and without Promo')
# Label both axes explicitly so the chart can be read on its own; the y values
# are passed as a plain array, which carries no name.
ax.set_xlabel('Promo')
ax.set_ylabel('Average Sales')
plt.show()


In [None]:
# StoreType was one-hot encoded out of `df`, so the raw train and store frames
# are joined again here to get the original column back.
avg_sales_type = (
    train
    .merge(store, on='Store', how='left')
    .groupby('StoreType')['Sales']
    .mean()
)

ax = sns.barplot(x=avg_sales_type.index, y=avg_sales_type.values)
ax.set_title('Average Sales by Store Type')
ax.set_xlabel('Store Type')
ax.set_ylabel('Average Sales')
plt.show()
