In [1]:
import pandas as pd
import numpy as np
import random
from scipy.stats import norm

# Define the date range
start_date = "2020-01-01"
end_date = "2021-12-31"
dates = pd.date_range(start_date, end_date)

# Store locations
locations = ["Store_A", "Store_B", "Store_C", "Store_D"]

# Holidays (fixed for simplicity)
holidays = pd.to_datetime(["2020-07-04", "2020-12-25", "2021-07-04", "2021-12-25"])

# Generate data for each store for each day
records = []

for date in dates:
    weekday = date.weekday()
    quarter = (date.month - 1) // 3 + 1
    for location in locations:
        is_holiday = date in holidays
        is_day_after_holiday = (date - pd.Timedelta(days=1)) in holidays

        # Simulate temperature
        temperature = np.round(norm.rvs(loc=65, scale=20), 2)

        # Simulate gas price
        price_of_gas = np.round(norm.rvs(loc=2.5, scale=0.11), 2)

        # Simulate CPI
        current_cpi = np.round(norm.rvs(loc=250, scale=8), 2)

        # Boost sales on holidays with more variability
        if is_holiday:
            sales = np.round(norm.rvs(loc=1200, scale=random.randint(350, 450)), 2)
        else:
            sales = np.round(
                norm.rvs(loc=1000, scale=400), 2
            )  # Increased standard deviation

        if weekday in [5, 6]:
            sales *= norm.rvs(loc=1.1, scale=0.4)
            noise = np.random.normal(0, 195)  # Adjust the magnitude of noise as needed
            sales += noise

        elif weekday in [4]:
            sales *= norm.rvs(loc=1.05, scale=0.24)
            noise = np.random.normal(0, 175)

        elif weekday in [3]:
            sales *= norm.rvs(loc=1.05, scale=0.19)
            noise = np.random.normal(0, 175)
        elif weekday in [2]:
            sales *= norm.rvs(loc=1.15, scale=0.13)
            noise = np.random.normal(0, 175)

        elif weekday in [1]:
            sales *= norm.rvs(loc=1, scale=0.12)
            noise = np.random.normal(0, 175)

        if is_day_after_holiday:
            sales *= norm.rvs(loc=1.05, scale=0.15)
            noise = np.random.normal(0, 175)
            sales += noise

        if quarter == 1:
            sales *= norm.rvs(loc=1.05, scale=0.15)
            noise = np.random.normal(0, 111)
            sales += noise

        if quarter == 2:
            sales *= norm.rvs(loc=1.15, scale=0.15)
            noise = np.random.normal(0, 125)
            sales += noise

        if quarter == 3:
            sales *= norm.rvs(loc=1.25, scale=0.35)
            noise = np.random.normal(0, 155)
            sales += noise

        if quarter == 4:
            sales *= norm.rvs(loc=1.55, scale=0.65)
            noise = np.random.normal(0, 195)
            sales += noise

        if location == "Store_A":
            sales *= norm.rvs(loc=1.25, scale=0.15)
            noise = np.random.normal(0, 125)
            sales += noise

        if location == "Store_B":
            sales *= norm.rvs(loc=1.35, scale=0.15)
            noise = np.random.normal(0, 135)
            sales += noise

        if location == "Store_C":
            sales *= norm.rvs(loc=1.45, scale=0.15)
            noise = np.random.normal(0, 145)
            sales += noise

        if location == "Store_D":
            sales *= norm.rvs(loc=1.55, scale=0.15)
            noise = np.random.normal(0, 155)
            sales += noise

        sales = sales * (1 + 0.1 * (temperature - 65) / 65)  # Increase sales on warmer days
        sales += sales * 0.1 * (price_of_gas - 2.5) / 2.5 if price_of_gas > 2.5 else 0
        # Increase sales on days with higher gas prices

        # Increase sales on weekends

        # Introduce random noise to sales
        noise = np.random.normal(0, 150)  # Adjust the magnitude of noise as needed
        sales += noise

        record = {
            "Date": date,
            "Weekday": weekday,
            "quarter": quarter,
            "StoreLocation": location,
            "Sales": sales,
            "IsHoliday": is_holiday,
            "IsDayAfterHoliday": is_day_after_holiday,
            "Temperature": temperature,
            "PriceOfGas": price_of_gas,
            "CurrentCPI": current_cpi,
            
        }
        records.append(record)

df = pd.DataFrame(records)

# Process dataframe for regression (similar to your previous steps)
y = df["Sales"]
y = y.astype(float)
df["year"] = df["Date"].dt.year
df["month"] = df["Date"].dt.month
df["day"] = df["Date"].dt.day
df = df.drop(columns=["Date"])
X = pd.get_dummies(df.drop(columns=["Sales"]), drop_first=True, dtype="int64")
X["IsHoliday"] = X["IsHoliday"].astype(int)
X["IsDayAfterHoliday"] = X["IsDayAfterHoliday"].astype(int)

df.head()

Unnamed: 0,Weekday,quarter,StoreLocation,Sales,IsHoliday,IsDayAfterHoliday,Temperature,PriceOfGas,CurrentCPI,year,month,day
0,2,1,Store_A,1182.204556,False,False,59.77,2.7,243.55,2020,1,1
1,2,1,Store_B,1179.068876,False,False,69.53,2.61,239.53,2020,1,1
2,2,1,Store_C,1350.231195,False,False,45.98,2.31,255.92,2020,1,1
3,2,1,Store_D,3650.312342,False,False,67.76,2.66,255.25,2020,1,1
4,3,1,Store_A,327.106939,False,False,83.78,2.53,257.35,2020,1,2


In [3]:
df.to_csv("../../outputs/sales_data.csv", index=False)

In [None]:
import statsmodels.api as sm

model = sm.OLS(y, X).fit()
model.summary()

0,1,2,3
Dep. Variable:,Sales,R-squared (uncentered):,0.729
Model:,OLS,Adj. R-squared (uncentered):,0.728
Method:,Least Squares,F-statistic:,603.5
Date:,"Mon, 26 Feb 2024",Prob (F-statistic):,0.0
Time:,15:13:48,Log-Likelihood:,-24800.0
No. Observations:,2924,AIC:,49630.0
Df Residuals:,2911,BIC:,49700.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Weekday,23.2428,10.891,2.134,0.033,1.887,44.598
quarter,352.9328,81.711,4.319,0.000,192.716,513.149
IsHoliday,660.8745,295.123,2.239,0.025,82.203,1239.546
IsDayAfterHoliday,720.8462,294.435,2.448,0.014,143.524,1298.168
Temperature,2.8343,1.051,2.696,0.007,0.773,4.896
PriceOfGas,-39.9004,194.825,-0.205,0.838,-421.909,342.108
CurrentCPI,-2.4566,2.715,-0.905,0.366,-7.781,2.867
year,0.7870,0.416,1.890,0.059,-0.029,1.603
month,-44.2523,26.459,-1.672,0.095,-96.133,7.629

0,1,2,3
Omnibus:,717.744,Durbin-Watson:,1.957
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2839.421
Skew:,1.156,Prob(JB):,0.0
Kurtosis:,7.238,Cond. No.,27900.0


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import datetime

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict sales on the testing set
y_pred = model.predict(X_test)

# Convert the ordinal dates back to datetime for plotting
X_test["Date"] = pd.to_datetime(X_test["Date"].apply(datetime.datetime.fromordinal))

# Plotting
plt.figure(figsize=(10, 6))
plt.plot_date(X_test["Date"], y_test, "-", label="Actual Sales")
plt.plot_date(X_test["Date"], y_pred, "-", label="Predicted Sales")
plt.title("Sales Over Time")
plt.xlabel("Date")
plt.ylabel("Sales")
plt.legend()
plt.tight_layout()
plt.show()

DTypePromotionError: The DType <class 'numpy.dtypes.DateTime64DType'> could not be promoted by <class 'numpy.dtypes.Float64DType'>. This means that no common DType exists for the given inputs. For example they cannot be stored in a single array unless the dtype is `object`. The full list of DTypes is: (<class 'numpy.dtypes.DateTime64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int32DType'>, <class 'numpy.dtypes.Int32DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>)