<a href="https://colab.research.google.com/github/Shreeshambav/DeepLearning_training/blob/main/Statsmodel_SARIMA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Test 7 - SARIMA - Model
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import warnings

# Read the data
file_path_merged = 'C:\\Users\\mraj4\\Documents\\OptimumPython\\DS\\DL\\Sunoida\\Budget_merged_file_Cat.xlsx'
df = pd.read_excel(file_path_merged)

# Names of the twelve monthly value columns: "01" .. "12".
# Built once here and reused below instead of repeating the literal list.
months = ["{:02d}".format(month) for month in range(1, 13)]

# Replace NaN with explicit placeholder categories.
# The result is assigned back to the column: calling fillna(inplace=True) on
# df[col] is a chained in-place assignment, which warns on recent pandas and
# does not modify df at all once Copy-on-Write is enabled.
df['VISION_SBU'] = df['VISION_SBU'].fillna('Missing')
df['ACCOUNT_OFFICER'] = df['ACCOUNT_OFFICER'].fillna('0000')

# Drop rows with missing values in any of the monthly columns
df = df.dropna(subset=months)

# Convert the monthly columns to numeric.
# NOTE(review): errors="coerce" turns unparseable cells into NaN *after* the
# dropna above, so such rows are kept (with NaN) - confirm this is intended.
df[months] = df[months].apply(pd.to_numeric, errors="coerce")

# Calculate the sum of the monthly values for each row
df["Sum"] = df[months].sum(axis=1)

# Keep only rows whose yearly total is greater than 0
filtered_data = df[df["Sum"] > 0]

# Group by "COUNTRY" and "Year" and calculate the mean yearly total per group
grouped_data = filtered_data.groupby(["COUNTRY", "Year"])["Sum"].mean()

print(grouped_data)

# Group the (unfiltered) data based on 'MRL_Category'
grouped_df = df.groupby('MRL_Category')

# Create an empty DataFrame to store the predictions
predictions_df = pd.DataFrame(columns=["Category", "Year", "Month", "Prediction", "Forecast Trend"])

# Lists of per-series statistics rows. Despite its name, ets_stats_list
# receives the SARIMA statistics rows in the forecasting loop below.
arima_stats_list = []
ets_stats_list = []

# Years to process for each category
years = [2019, 2020, 2021, 2022]

# Helper function: returns the earliest year of the most recent run of consecutive years
def find_last_financial_year(years):
    """Return the earliest year of the most recent run of consecutive years.

    The years are sorted from newest to oldest and walked pairwise. As soon
    as two neighbours are not exactly one year apart, the newer one of the
    pair is returned. If no such break exists, the oldest year is returned.

    Raises IndexError when *years* is empty.
    """
    newest_first = sorted(years, reverse=True)
    for newer, older in zip(newest_first, newest_first[1:]):
        if newer - 1 != older:
            return newer
    return newest_first[-1]

def _z_scores(values):
    """Return *values* standardized by their own mean and standard deviation."""
    return (values - np.mean(values)) / np.std(values)


def _error_stats(actual, predicted):
    """Return (MSE, RMSE, MAPE in percent) of *predicted* against *actual*.

    MAPE is computed only over positions where *actual* is non-zero, to
    avoid dividing by zero.
    """
    mse = mean_squared_error(actual.reshape(-1, 1), predicted.reshape(-1, 1))
    rmse = np.sqrt(mse)
    non_zero = actual != 0
    mape = np.mean(np.abs((actual[non_zero] - predicted[non_zero]) / actual[non_zero])) * 100
    return mse, rmse, mape


# Monthly column names ("01".."12"); loop-invariant, so built once.
data_columns = [str(month).zfill(2) for month in range(1, 13)]

# ARIMA orders to try in order of preference; the first one that fits is used.
orders_to_try = [(5, 1, 0), (4, 1, 0), (3, 1, 0)]

# Prediction rows are collected here and appended to predictions_df once,
# after the loop, instead of re-concatenating the DataFrame on every pass.
prediction_rows = []

for category, group_df in grouped_df:
    for year in years:
        year_df = group_df[group_df['Year'] == year]
        missing_columns = [col for col in data_columns if col not in year_df.columns]

        if missing_columns:
            print(f"Missing data columns for Category: {category}, Year: {year}: {missing_columns}")
            continue

        # All rows of this category/year, flattened row by row into one series
        data = year_df[data_columns].values.flatten()

        # Skip combinations that have no non-zero, non-null value at all
        if not np.any((data != 0) & pd.notnull(data)):
            continue

        # Hold out the last 12 values as test data and train on the rest.
        # NOTE(review): the original had a "last financial year" branch here,
        # but both branches performed this identical split.
        train_data = data[:-12]
        test_data = data[-12:]

        # Perform ADF test to check for stationarity. adfuller raises on
        # empty/too-short or constant input; skip such series instead of
        # aborting the whole run.
        try:
            p_value = adfuller(train_data)[1]
        except (ValueError, np.linalg.LinAlgError) as e:
            print(f"ADF test failed for Category: {category}, Year: {year}")
            print(f"Error message: {str(e)}")
            continue

        is_stationary = p_value < 0.05
        if not is_stationary:
            print(f"Category: {category}, Year: {year} - Time series is non-stationary.")
            continue
        print(f"Category: {category}, Year: {year} - Time series is Stationary.")

        # Try the candidate ARIMA orders until one fits
        arima_predictions = None
        for order in orders_to_try:
            try:
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")  # Ignore ConvergenceWarning
                    arima_fit = ARIMA(train_data, order=order).fit()
                    arima_predictions = arima_fit.forecast(steps=12)
                break
            except Exception as e:
                print(f"ARIMA model did not converge for Category: {category}, Year: {year}, Order: {order}")
                print(f"Error message: {str(e)}")

        if arima_predictions is None:
            print(f"ARIMA model did not converge for any combination of orders for Category: {category}, Year: {year}.")
            continue

        arima_predictions = np.asarray(arima_predictions).flatten()

        # Fit SARIMA model and forecast the 12 held-out periods.
        # A failing fit skips this series, mirroring the ARIMA handling above.
        try:
            sarima_model = SARIMAX(train_data, order=(5, 1, 0), seasonal_order=(0, 1, 1, 12))
            sarima_fit = sarima_model.fit(disp=False)
            sarima_predictions = np.asarray(sarima_fit.forecast(steps=12)).flatten()
        except Exception as e:
            print(f"SARIMA model could not be fitted for Category: {category}, Year: {year}")
            print(f"Error message: {str(e)}")
            continue

        # Fit exponential smoothing model and forecast the 12 held-out periods
        try:
            ets_model = ExponentialSmoothing(train_data, seasonal='add', seasonal_periods=12)
            ets_fit = ets_model.fit()
            ets_predictions = np.asarray(ets_fit.forecast(steps=12)).flatten()
        except Exception as e:
            print(f"Exponential Smoothing model could not be fitted for Category: {category}, Year: {year}")
            print(f"Error message: {str(e)}")
            continue

        # Calculate MSE, RMSE, MAPE and z-scores when there is non-zero test data
        if np.any(test_data):
            z_scores_test = _z_scores(test_data)

            mse_arima, rmse_arima, mape_arima = _error_stats(test_data, arima_predictions)
            z_scores_pred_arima = _z_scores(arima_predictions)

            mse_sarima, rmse_sarima, mape_sarima = _error_stats(test_data, sarima_predictions)
            z_scores_pred_sarima = _z_scores(sarima_predictions)

            arima_stats_list.append({"Category": category, "Year": year, "Model": "ARIMA", "MSE": mse_arima, "RMSE": rmse_arima, "MAPE": mape_arima, "z_scores_t": z_scores_test, "z_scores_p": z_scores_pred_arima})
            # SARIMA rows go into ets_stats_list; both lists are concatenated later.
            ets_stats_list.append({"Category": category, "Year": year, "Model": "SARIMA", "MSE": mse_sarima, "RMSE": rmse_sarima, "MAPE": mape_sarima, "z_scores_t": z_scores_test, "z_scores_p": z_scores_pred_sarima})

            print(f"Category: {category}, Year: {year}")
            print("ARIMA - MSE:", mse_arima, "RMSE:", rmse_arima, "MAPE:", mape_arima, "z_scores_t:", z_scores_test, "z_scores_p:", z_scores_pred_arima)
            print("SARIMA - MSE:", mse_sarima, "RMSE:", rmse_sarima, "MAPE:", mape_sarima, "z_scores_t:", z_scores_test, "z_scores_p:", z_scores_pred_sarima)
            print()
        else:
            print(f"No test data available for Category: {category}, Year: {year}")

        # Record the predictions and forecast trend for every fitted series.
        # (Previously this was nested under the "no test data" branch, so
        # series that did have test data were never saved or plotted.)
        forecast_trend = np.append(train_data[-1], arima_predictions.cumsum())
        prediction_rows.extend(
            {"Category": category, "Year": year, "Month": month, "Prediction": arima_pred, "ARIMA_Prediction": arima_pred, "SARIMA_Prediction": sarima_pred, "ETS_Prediction": ets_pred, "Forecast Trend": trend_value}
            for month, arima_pred, sarima_pred, ets_pred, trend_value in zip(months[-12:], arima_predictions, sarima_predictions, ets_predictions, forecast_trend[-12:])
        )

        # Plot the actual data and predictions
        plt.figure(figsize=(12, 8))

        # 1-based time axis covering the train data followed by the forecast horizon
        n_train = len(train_data)
        horizon = len(arima_predictions)
        time_range = np.arange(1, n_train + horizon + 1)

        # Plot actual data
        plt.plot(time_range[:n_train], train_data, label="Train Data", color="gray")
        plt.plot(time_range[-12:], test_data, label="Test Data", color="black")

        # Forecasts share the x positions that follow the train data, so they
        # line up with the test data (previously shifted one step to the left).
        pred_time_range = time_range[n_train:]
        plt.plot(pred_time_range, arima_predictions, label="ARIMA Predictions", linestyle="--", color="blue")
        plt.plot(pred_time_range, sarima_predictions, label="SARIMA Predictions", linestyle="--", color="green")
        plt.plot(pred_time_range, ets_predictions, label="Exponential Smoothing Predictions")

        # Forecast trend starts at the last training point and continues over the horizon
        forecast_time_range = np.arange(n_train, n_train + horizon + 1)
        plt.plot(forecast_time_range, forecast_trend, label="Forecast Trend", linestyle="--", color="orange")

        plt.xlabel("Months")
        plt.ylabel("Values")
        plt.title(f"Category: {category}, Year: {year}")
        plt.legend(loc="upper left")
        plt.grid(True)
        plt.tight_layout()
        plt.show()

# Append all collected prediction rows in a single concatenation
if prediction_rows:
    predictions_df = pd.concat([predictions_df, pd.DataFrame(prediction_rows)], ignore_index=True)

################################################################################
# Turn both lists of statistics rows into DataFrames and stack them into one table
arima_stats_df, ets_stats_df = (
    pd.DataFrame(rows) for rows in (arima_stats_list, ets_stats_list)
)
stats_df = pd.concat([arima_stats_df, ets_stats_df], ignore_index=True)

# Destination workbooks for the predictions and for the statistics
pred_output_file = 'C:\\Users\\mraj4\\Documents\\OptimumPython\\DS\\DL\\Sunoida\\Budget_merged_file_Cat_predicted.xlsx'
stats_output_file = 'C:\\Users\\mraj4\\Documents\\OptimumPython\\DS\\DL\\Sunoida\\Budget_merged_file_Cat_predicted_Stats.xlsx'

# Write the predictions first, then the statistics, both without the index column
predictions_df.to_excel(pred_output_file, index=False)
stats_df.to_excel(stats_output_file, index=False)