In [None]:
import pathlib
import datetime
import matplotlib.pyplot as plt
import numpy as np

import sys
sys.path.append("/Users/kaiknauf/Downloads/MSc IS/AFCS/Project/SalesPredictionProject/src")

from pandas import read_csv, merge, to_datetime

from src.sales_prediction import ROOT_DIR
from src.sales_prediction.classes.arima.arimaMain import ArimaTraining
from src.sales_prediction.constants.arimaConstants import ArimaModels
from src.sales_prediction.script_internal.transformation import (
    create_day_column,
    summarise_sales_train_validation,
)

In [None]:
def get_pd():
    """Build the daily sales table joined with the calendar, indexed by date.

    Reads ``calendar_afcs2023.csv`` and ``sales_train_validation_afcs2023.csv``
    from ``ROOT_DIR/data/raw``, aggregates the sales with
    ``summarise_sales_train_validation``, joins the result to the calendar on
    the ``day`` column and sets ``date`` as the index.

    Returns:
        The merged sales/calendar DataFrame indexed by ``date``.
    """
    raw_dir = pathlib.Path(ROOT_DIR, "data", "raw")
    calendar = read_csv(raw_dir / "calendar_afcs2023.csv")
    item_sales = read_csv(raw_dir / "sales_train_validation_afcs2023.csv")

    # Parse the month/day/year date strings and attach the "day" join key
    # produced by create_day_column.
    calendar["date"] = to_datetime(calendar["date"], format="%m/%d/%Y")
    calendar["day"] = create_day_column(calendar)

    # Collapse the per-item sales into one row per day, then join on "day"
    # and index the result by calendar date.
    daily_sales = summarise_sales_train_validation(item_sales)
    return merge(daily_sales, calendar, on="day").set_index(["date"])

In [None]:
# Load the raw sales table on its own, without the aggregation done in get_pd().
raw_sales_path = pathlib.Path(ROOT_DIR) / "data" / "raw" / "sales_train_validation_afcs2023.csv"
sales_train_validation = read_csv(raw_sales_path)

## Head of dataframe

In [None]:
# Build the date-indexed daily sales table and preview its first rows.
df = get_pd()
df.head()

## Statistics

In [None]:
# Summary statistics of the daily sales. `mean` and `std` are module-level on
# purpose: the event-annotation plot further down reads them.
# ddof=0 gives the population variance / standard deviation.
sales = df['sales']
mean = sales.mean()
std = sales.std(ddof=0)
var = sales.var(ddof=0)

print(
    f"Mean: {mean}",
    f"Variance: {var}",
    f"Standard Deviation: {std}",
    sep="\n",
)

## Plot all train data

In [None]:
# Reload the daily sales table and draw the complete series on explicit axes.
df = get_pd()

fig, ax = plt.subplots()
ax.plot(df['sales'])
plt.show()

## Plot 2015-2016
This plot shows the sales per day for the last 365 days of the data (roughly 04-2015 to 05-2016). Event names are shown for days on which sales deviate from the overall mean by more than one standard deviation.

In [None]:
# Daily sales for the last 365 rows of `df`, annotated with event names.
# Relies on `df` and on the global `mean` / `std` computed in the Statistics
# cell. The CSVs are not re-read here because no earlier cell modifies `df`.
#
# `.assign` returns a new frame, so the helper column is not written into a
# slice of `df` (which would raise pandas' SettingWithCopyWarning). The `idx`
# column duplicates the date index and is kept in case other cells read it.
df_year = df.iloc[-365:].assign(idx=lambda frame: frame.index)

fig, ax = plt.subplots()
ax.plot(df_year['sales'])
ax.set_title("Daily sales, last 365 days")
ax.set_xlabel("Date")
ax.set_ylabel("Sales")

# Annotate only days that have an event name AND whose sales lie more than one
# standard deviation above or below the mean. `notna()` is the explicit
# missing-value test for the event column.
has_event = df_year['event_name_1'].notna()
is_unusual = (df_year['sales'] > mean + std) | (df_year['sales'] < mean - std)

# Labels are anchored at y = -50 in data coordinates and rotated to run
# vertically so neighbouring event names do not overlap horizontally.
for date, event_name in df_year.loc[has_event & is_unusual, 'event_name_1'].items():
    ax.text(date, -50, event_name, rotation=90, size=7.5)

plt.show()

## Check stationarity

In [None]:
from statsmodels.tsa.stattools import adfuller

# Run the Augmented Dickey-Fuller test on the daily sales series.
result = adfuller(df['sales'])

# The first two entries of the result are the test statistic and the p-value.
adf_statistic, p_value = result[:2]
print('ADF Statistic:', adf_statistic)
print('p-value:', p_value)

In [None]:
# Requires the third-party ydata_profiling package; uncomment to install it:
# !pip install ydata_profiling
from ydata_profiling import ProfileReport

# Profile the daily sales frame and write the report to "my_report.html"
# (a relative path, so it is resolved against the current working directory).
report = ProfileReport(df, title='My Data')
report.to_file("my_report.html")