In [None]:
import pandas as pd
import statsmodels.formula.api as smf

# Load datasets
df_sales = pd.read_csv('umsatzdaten_gekuerzt.csv')
df_weather = pd.read_csv('wetter.csv')
df_kiwo = pd.read_csv('kiwo.csv')
df_holidays = pd.read_csv('Feiertage.csv')

# Merge sales and weather data
df_merged = pd.merge(df_sales, df_weather, on='Datum', how='left')

# Merge with Kieler Woche data
df_merged_with_kiwo = pd.merge(df_merged, df_kiwo, on='Datum', how='left')

# Define weather codes that indicate precipitation, snow, or wet conditions
rain_weather_codes = [
    # Rain related codes
    50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
    60, 61, 62, 63, 64, 65, 66, 67,
    80,81,82,91,92,95,97
]

snow_weather_codes = [
    # Snow related codes
    36,37,38,39,
    68,69,70,71,72,73,74,75,76,77,78,79,
    83,84,85,86,87,88,89,90,93,94,95,96,97,99

]


# Create dry_weather column (1 for dry, 0 for wet)
df_merged_with_kiwo['rain_weather'] = df_merged_with_kiwo['Wettercode'].apply(lambda x: 1 if x in rain_weather_codes else 0)
df_merged_with_kiwo['snow_weather'] = df_merged_with_kiwo['Wettercode'].apply(lambda x: 1 if x in snow_weather_codes else 0)

# Fill missing values in KielerWoche column with 0
df_merged_with_kiwo['KielerWoche'] = df_merged_with_kiwo['KielerWoche'].fillna(0)

# Prepare holidays data
df_holidays['Feiertag'] = 1

# Merge with Feiertag data
df_final = pd.merge(df_merged_with_kiwo, df_holidays, on='Datum', how='left')
df_final['Feiertag'] = df_final['Feiertag'].fillna(0)

month_to_season = {
    12: 'Winter', 1: 'Winter', 2: 'Winter',
    3: 'Spring', 4: 'Spring', 5: 'Spring',
    6: 'Summer', 7: 'Summer', 8: 'Summer',
    9: 'Fall', 10: 'Fall', 11: 'Fall'
}

# Stefan Temperatur nach Jahreszeit in ranges einteilen: kalt normal warm
def analyze_seasons(df):
    # Add season column based on month
    df['Season'] = pd.to_datetime(df['Datum']).dt.month.map(month_to_season)

    # Group by season and calculate min/max temperature
    seasonal_stats = df.groupby('Season')['Temperatur'].agg(['min', 'max'])
    
    return seasonal_stats

# Analyze seasons and add season column to df_final
seasonal_stats = analyze_seasons(df_final)
print(seasonal_stats)

# Add season column to df_final
df_final['Season'] = pd.to_datetime(df_final['Datum']).dt.month.map(month_to_season)

# Function to categorize temperature into bins based on season
def categorize_temperature(row, seasonal_stats):
    season = row['Season']
    temp = row['Temperatur']
    min_temp = seasonal_stats.loc[season, 'min']
    max_temp = seasonal_stats.loc[season, 'max']
    bins = pd.cut([min_temp, max_temp], bins=3, retbins=True)[1]
    temp_bins = pd.cut([temp], bins=bins, labels=['low', 'middle', 'high'])
    return f"{season}_{temp_bins[0]}"

# Apply the function to create a new column in df_final
df_final['Season_Temp_Category'] = df_final.apply(lambda row: categorize_temperature(row, seasonal_stats), axis=1)

# Apply one hot encoding for Season_Temp_Category with 0 and 1
df_final = pd.concat([df_final, pd.get_dummies(df_final['Season_Temp_Category'], prefix='Season_Temp', drop_first=False)], axis=1)

# Add a column with Umsatz one week ago based on Datum and Umsatz in the same Warengruppe
df_final['Datum'] = pd.to_datetime(df_final['Datum'])
df_final = df_final.sort_values(by=['Warengruppe', 'Datum'])
df_final['Umsatz_one_week_ago'] = df_final.groupby('Warengruppe')['Umsatz'].shift(7)

print(df_final)



# Print min and max dates
min_date = df_final['Datum'].min()
max_date = df_final['Datum'].max()
print(f"Min Datum: {min_date}")
print(f"Max Datum: {max_date}")


            min      max
Season                  
Fall     0.0000  27.8750
Spring  -4.9625  26.8625
Summer  11.5625  31.4375
Winter  -8.4750  12.5000
          Datum  Warengruppe      Umsatz  Bewoelkung  Temperatur  \
0    2013-07-01            1  148.828353         6.0     17.8375   
1    2013-07-02            1  159.793757         3.0     17.3125   
2    2013-07-03            1  111.885594         7.0     21.0750   
3    2013-07-04            1  168.864941         7.0     18.8500   
4    2013-07-05            1  171.280754         5.0     19.9750   
...         ...          ...         ...         ...         ...   
9329 2017-12-21            6   87.471228         7.0      6.2375   
9330 2017-12-22            6   71.911652         7.0      5.7625   
9331 2017-12-23            6   84.062223         7.0      7.8000   
9332 2017-12-24            6   60.981969         7.0      8.1125   
9333 2017-12-27            6   34.972644         7.0      4.6125   

      Windgeschwindigkeit  Wetter

In [6]:
# Add a column to df_final that gives the day of the week for each date in the Datum column
df_final['Wochentag'] = pd.to_datetime(df_final['Datum']).dt.day_name()

# Create a dummy variable for weekends (1 for Saturday and Sunday, 0 otherwise)
df_final['is_weekend'] = df_final['Wochentag'].apply(lambda x: 1 if x in ['Saturday', 'Sunday'] else 0)

print(df_final[['Datum', 'Wochentag', 'is_weekend']])

          Datum  Wochentag  is_weekend
0    2013-07-01     Monday           0
1    2013-07-02    Tuesday           0
2    2013-07-03  Wednesday           0
3    2013-07-04   Thursday           0
4    2013-07-05     Friday           0
...         ...        ...         ...
9329 2017-12-21   Thursday           0
9330 2017-12-22     Friday           0
9331 2017-12-23   Saturday           1
9332 2017-12-24     Sunday           1
9333 2017-12-27  Wednesday           0

[9334 rows x 3 columns]


In [None]:
# Split dataset into training and validation sets
train_df = df_final[(df_final['Datum'] >= '2013-07-01') & (df_final['Datum'] <= '2017-07-31')]
validation_df = df_final[(df_final['Datum'] >= '2017-08-01') & (df_final['Datum'] <= '2018-07-31')]

print(f"Training set: {train_df.shape[0]} rows")
print(f"Validation set: {validation_df.shape[0]} rows")

In [8]:

model = smf.ols(formula='Umsatz ~ snow_weather + rain_weather + Windgeschwindigkeit + KielerWoche + Feiertag + is_weekend + Season_Temp_Fall_high + Season_Temp_Fall_low + Season_Temp_Spring_high + Season_Temp_Spring_low + Season_Temp_Summer_high + Season_Temp_Summer_low + Season_Temp_Winter_high + Season_Temp_Winter_low + Umsatz_one_week_ago', data=df_final).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                 Umsatz   R-squared:                       0.739
Model:                            OLS   Adj. R-squared:                  0.738
Method:                 Least Squares   F-statistic:                     1747.
Date:                Tue, 03 Dec 2024   Prob (F-statistic):               0.00
Time:                        20:59:56   Log-Likelihood:                -53060.
No. Observations:                9276   AIC:                         1.062e+05
Df Residuals:                    9260   BIC:                         1.063e+05
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
Intercept 

In [9]:
df_final1 = df_final[df_final['Warengruppe'] == 1]
model1 = smf.ols(formula='Umsatz ~ snow_weather + rain_weather + Windgeschwindigkeit + KielerWoche + Feiertag + is_weekend + Season_Temp_Fall_high + Season_Temp_Fall_low + Season_Temp_Spring_high + Season_Temp_Spring_low + Season_Temp_Summer_high + Season_Temp_Summer_low + Season_Temp_Winter_high + Season_Temp_Winter_low + Umsatz_one_week_ago', data=df_final1).fit()

print(model1.summary())

                            OLS Regression Results                            
Dep. Variable:                 Umsatz   R-squared:                       0.214
Model:                            OLS   Adj. R-squared:                  0.208
Method:                 Least Squares   F-statistic:                     32.58
Date:                Tue, 03 Dec 2024   Prob (F-statistic):           5.57e-83
Time:                        21:00:00   Log-Likelihood:                -9035.4
No. Observations:                1809   AIC:                         1.810e+04
Df Residuals:                    1793   BIC:                         1.819e+04
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
Intercept 

In [10]:
df_final2 = df_final[df_final['Warengruppe'] == 2]
model2 = smf.ols(formula='Umsatz ~ snow_weather + rain_weather + Windgeschwindigkeit + KielerWoche + Feiertag + is_weekend + Season_Temp_Fall_high + Season_Temp_Fall_low + Season_Temp_Spring_high + Season_Temp_Spring_low + Season_Temp_Summer_high + Season_Temp_Summer_low + Season_Temp_Winter_high + Season_Temp_Winter_low + Umsatz_one_week_ago', data=df_final2).fit()

print(model2.summary())

                            OLS Regression Results                            
Dep. Variable:                 Umsatz   R-squared:                       0.685
Model:                            OLS   Adj. R-squared:                  0.682
Method:                 Least Squares   F-statistic:                     259.4
Date:                Tue, 03 Dec 2024   Prob (F-statistic):               0.00
Time:                        21:00:07   Log-Likelihood:                -10458.
No. Observations:                1809   AIC:                         2.095e+04
Df Residuals:                    1793   BIC:                         2.104e+04
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
Intercept 

In [12]:
df_final3 = df_final[df_final['Warengruppe'] == 3]
model3 = smf.ols(formula='Umsatz ~ snow_weather + rain_weather + Windgeschwindigkeit + KielerWoche + Feiertag + is_weekend + Season_Temp_Fall_high + Season_Temp_Fall_low + Season_Temp_Spring_high + Season_Temp_Spring_low + Season_Temp_Summer_high + Season_Temp_Summer_low + Season_Temp_Winter_high + Season_Temp_Winter_low + Umsatz_one_week_ago', data=df_final3).fit()

print(model3.summary())

                            OLS Regression Results                            
Dep. Variable:                 Umsatz   R-squared:                       0.688
Model:                            OLS   Adj. R-squared:                  0.685
Method:                 Least Squares   F-statistic:                     263.4
Date:                Tue, 03 Dec 2024   Prob (F-statistic):               0.00
Time:                        21:00:28   Log-Likelihood:                -9338.3
No. Observations:                1809   AIC:                         1.871e+04
Df Residuals:                    1793   BIC:                         1.880e+04
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
Intercept 

In [13]:
df_final4 = df_final[df_final['Warengruppe'] == 4]
model4 = smf.ols(formula='Umsatz ~ snow_weather + rain_weather + Windgeschwindigkeit + KielerWoche + Feiertag + is_weekend + Season_Temp_Fall_high + Season_Temp_Fall_low + Season_Temp_Spring_high + Season_Temp_Spring_low + Season_Temp_Summer_high + Season_Temp_Summer_low + Season_Temp_Winter_high + Season_Temp_Winter_low + Umsatz_one_week_ago', data=df_final4).fit()

print(model4.summary())

                            OLS Regression Results                            
Dep. Variable:                 Umsatz   R-squared:                       0.341
Model:                            OLS   Adj. R-squared:                  0.336
Method:                 Least Squares   F-statistic:                     60.09
Date:                Tue, 03 Dec 2024   Prob (F-statistic):          1.29e-145
Time:                        21:00:40   Log-Likelihood:                -8387.1
No. Observations:                1756   AIC:                         1.681e+04
Df Residuals:                    1740   BIC:                         1.689e+04
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
Intercept 

In [14]:
df_final5 = df_final[df_final['Warengruppe'] == 5]
model5 = smf.ols(formula='Umsatz ~ snow_weather + rain_weather + Windgeschwindigkeit + KielerWoche + Feiertag + is_weekend + Season_Temp_Fall_high + Season_Temp_Fall_low + Season_Temp_Spring_high + Season_Temp_Spring_low + Season_Temp_Summer_high + Season_Temp_Summer_low + Season_Temp_Winter_high + Season_Temp_Winter_low + Umsatz_one_week_ago', data=df_final5).fit()

print(model5.summary())

                            OLS Regression Results                            
Dep. Variable:                 Umsatz   R-squared:                       0.107
Model:                            OLS   Adj. R-squared:                  0.099
Method:                 Least Squares   F-statistic:                     14.25
Date:                Tue, 03 Dec 2024   Prob (F-statistic):           6.13e-35
Time:                        21:00:46   Log-Likelihood:                -10788.
No. Observations:                1809   AIC:                         2.161e+04
Df Residuals:                    1793   BIC:                         2.170e+04
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
Intercept 

In [16]:
df_final6 = df_final[df_final['Warengruppe'] == 6]
model6 = smf.ols(formula='Umsatz ~ snow_weather + rain_weather + Windgeschwindigkeit + KielerWoche + Feiertag + is_weekend + Season_Temp_Fall_high + Season_Temp_Fall_low + Season_Temp_Spring_high + Season_Temp_Spring_low + Season_Temp_Summer_high + Season_Temp_Summer_low + Season_Temp_Winter_high + Season_Temp_Winter_low + Umsatz_one_week_ago', data=df_final6).fit()

print(model6.summary())

                            OLS Regression Results                            
Dep. Variable:                 Umsatz   R-squared:                       0.131
Model:                            OLS   Adj. R-squared:                  0.105
Method:                 Least Squares   F-statistic:                     5.170
Date:                Tue, 03 Dec 2024   Prob (F-statistic):           5.04e-06
Time:                        21:00:55   Log-Likelihood:                -1368.6
No. Observations:                 284   AIC:                             2755.
Df Residuals:                     275   BIC:                             2788.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
Intercept 