In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [None]:
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.graphics.tsaplots import plot_pacf,plot_acf 
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose

In [None]:
# Read the raw sales extract into `data`, decoding the file as ISO-8859-1.
# NOTE(review): the path is absolute and machine-specific — consider a configurable data directory.
data_path = 'C:/Users/Woodpecker/Downloads/sales_data_sample.csv'
data = pd.read_csv(data_path, encoding='ISO-8859-1')

In [None]:
# Preview the first rows of the raw frame
data.head()

In [None]:
# List the column labels of the raw frame
data.columns

In [None]:
# Show the dtype pandas assigned to each column on load
data.dtypes

In [None]:
# Total number of elements (rows x columns) — note this is not the row count
data.size

In [None]:

# Parse 'ORDERDATE' into datetimes and put the rows in chronological order.
# Written as one chain that rebinds `data` instead of mutating it in place.
data = (
    data
    .assign(ORDERDATE=lambda frame: pd.to_datetime(frame['ORDERDATE']))
    .sort_values('ORDERDATE')
)


In [None]:
# Summary statistics of the frame after date parsing and sorting
data.describe()

In [None]:
# Use the order date as the row index from here on
data = data.set_index('ORDERDATE')

In [None]:
# Display the frame, which is indexed by ORDERDATE after the previous cell
data

In [None]:
# Line plot of ordered quantity over the ORDERDATE index, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(30, 15))
sns.lineplot(data=data, x='ORDERDATE', y='QUANTITYORDERED', ax=ax)
ax.set_xlabel('Date')
ax.set_ylabel('QUANTITYORDERED')
ax.set_title('series plot')
plt.show()

In [None]:
# Keep the numeric columns only and compute their pairwise correlations
numeric_data = data.select_dtypes(include=['number'])
correlation_matrix = numeric_data.corr()

# Annotated heatmap of the correlation matrix, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Correlation Matrix of Numeric Variables")
plt.show()

In [None]:
def testing_stationarity(df):
    result= adfuller(df)
    print(f"ADF STAT:{result[0]}")
    print(f"P-value :{result[1]}")
    if result[1] < 0.05:
        print("Data is stationary")
    else:
        print("Data is not stationary")

# ADF check on the ordered-quantity column of the ORDERDATE-indexed frame.
# NOTE(review): this runs on `data` before any aggregation, while a later cell sums rows per ORDERDATE —
# confirm the un-aggregated series is the one that should be tested.
testing_stationarity(data['QUANTITYORDERED'])

In [None]:


# Sum quantity and sales per order date, then derive calendar features from the date.
# ORDERDATE is turned back into a regular column so the `.dt` accessor can be used.
daily_demand = (
    data
    .groupby('ORDERDATE')[['QUANTITYORDERED', 'SALES']]
    .sum()
    .reset_index()
    .assign(
        year=lambda frame: frame['ORDERDATE'].dt.year,
        month=lambda frame: frame['ORDERDATE'].dt.month,
        day=lambda frame: frame['ORDERDATE'].dt.day,
        dayofweek=lambda frame: frame['ORDERDATE'].dt.dayofweek,
    )
)


In [None]:
# Preview the first rows of the per-date aggregate with its calendar features
daily_demand.head()

In [None]:
# Index the per-date aggregate by order date, then display it
daily_demand = daily_demand.set_index('ORDERDATE')
daily_demand

In [None]:
# Additive decomposition of ordered quantity with a cycle length of 12 observations.
# NOTE(review): the series holds one row per order date, so the period counts rows,
# not calendar months — confirm this is the intended seasonality.
SEASONAL_PERIOD = 12
decomposition = seasonal_decompose(
    daily_demand['QUANTITYORDERED'],
    model='additive',
    period=SEASONAL_PERIOD,
)

# One entry per stacked panel, top to bottom: (series, legend label, title, line colour)
panels = [
    (daily_demand['QUANTITYORDERED'], "Original", "Original Series", 'blue'),
    (decomposition.trend, "Trend", "Trend Component", 'red'),
    (decomposition.seasonal, "Seasonal", "Seasonal Component", 'green'),
    (decomposition.resid, "Residual", "Residual Component", 'purple'),
]

fig, axes = plt.subplots(len(panels), 1, figsize=(12, 10))
for ax, (series, label, title, colour) in zip(axes, panels):
    ax.plot(series, label=label, color=colour)
    ax.set_title(title)
    ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# Build lag_1 … lag_7: ordered quantity shifted down by 1 to 7 rows,
# i.e. the previous observations, for use as exogenous inputs.
lagged_columns = {
    f'lag_{lag}': daily_demand['QUANTITYORDERED'].shift(lag)
    for lag in range(1, 8)
}
daily_demand = daily_demand.assign(**lagged_columns)


In [None]:
# Inspect the frame with the lag_1 … lag_7 columns added (the leading rows hold NaN where no earlier row exists)
daily_demand

In [None]:
# Discard every row that contains a missing value (the leading rows left incomplete by the lags), then display the result
daily_demand = daily_demand.dropna()
daily_demand

In [None]:

# Exogenous inputs: same-day SALES plus the seven lagged quantities; target: ordered quantity.
# NOTE(review): SALES is the unshifted total for the same date as the target — confirm it
# would actually be available at prediction time.
lag_columns = [f'lag_{lag}' for lag in range(1, 8)]
exog_columns = ['SALES'] + lag_columns
X = daily_demand[exog_columns]
y = daily_demand['QUANTITYORDERED']

# Chronological split: the last 20% of rows become the test set (no shuffling)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    shuffle=False,
    test_size=0.2,
)


In [None]:
# Number of observations in the training split
len(y_train)

In [None]:
# Model specification: non-seasonal (p, d, q) and seasonal (P, D, Q, s) orders
order = (5, 1, 5)
seasonal_order = (1, 1, 1, 12)

# Fit SARIMAX on the training split with the exogenous regressors
model = SARIMAX(y_train, exog=X_train, order=order, seasonal_order=seasonal_order)
arimax_result = model.fit(disp=False)

# Predict the positions immediately after the training sample, one per test row
n_train, n_test = len(y_train), len(y_test)
y_pred = arimax_result.predict(
    start=n_train,
    end=n_train + n_test - 1,
    exog=X_test,
)

# Score the predictions; RMSE is the square root of the MSE computed just above
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f'Mean Squared Error (ARIMAX): {mse}')
print(f'Root Mean Squared Error (ARIMAX): {rmse}')

In [None]:
# Overlay the held-out actuals and the model predictions on the test-set dates
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(y_test.index, y_test, label="Actual", color="blue", marker='o')
ax.plot(y_test.index, y_pred, label="Predicted", color="red", linestyle="dashed", marker='x')

ax.set_xlabel("Time")
ax.set_ylabel("Values")
ax.set_title("Actual vs Predicted (ARIMAX)")
ax.legend()
ax.grid()
plt.show()