In [34]:
import pandas as pd
import statsmodels.formula.api as smf

# Load datasets
df_sales = pd.read_csv('umsatzdaten_gekuerzt.csv')
df_weather = pd.read_csv('wetter.csv')
df_kiwo = pd.read_csv('kiwo.csv')
df_holidays = pd.read_csv('Feiertage.csv')

# Merge sales and weather data
df_merged = pd.merge(df_sales, df_weather, on='Datum', how='left')

# Merge with Kieler Woche data
df_merged_with_kiwo = pd.merge(df_merged, df_kiwo, on='Datum', how='left')

# Define weather codes that indicate precipitation, snow, or wet conditions
wet_weather_codes = [
    # Rain related codes
    50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
    60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
    # Snow related codes
    70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
    # Shower related codes
    80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
    # Thunder related codes with precipitation
    91, 92, 93, 94, 95, 96, 97, 98, 99
]

# Create dry_weather column (1 for dry, 0 for wet)
df_merged_with_kiwo['dry_weather'] = df_merged_with_kiwo['Wettercode'].apply(lambda x: 0 if x in wet_weather_codes else 1)

# Fill missing values in KielerWoche column with 0
df_merged_with_kiwo['KielerWoche'] = df_merged_with_kiwo['KielerWoche'].fillna(0)

# Prepare holidays data
df_holidays['Feiertag'] = 1

#Zusätzliche mögliche Features: Wochentage, Ferien, Warengruppe. Bins für Wettercodes überarbeiten. Temparatuen in Bins einteilen evtl mit Jahreszeiten.

#Lasse: Wochentage und Wettercodes

#Thorsten: Ferien daten raussuchen und mergen



# Merge with Feiertag data
df_final = pd.merge(df_merged_with_kiwo, df_holidays, on='Datum', how='left')
df_final['Feiertag'] = df_final['Feiertag'].fillna(0)

month_to_season = {
    12: 'Winter', 1: 'Winter', 2: 'Winter',
    3: 'Spring', 4: 'Spring', 5: 'Spring',
    6: 'Summer', 7: 'Summer', 8: 'Summer',
    9: 'Fall', 10: 'Fall', 11: 'Fall'
}

# Stefan Temperatur nach Jahreszeit in ranges einteilen: kalt normal warm
def analyze_seasons(df):
    # Add season column based on month
    df['Season'] = pd.to_datetime(df['Datum']).dt.month.map(month_to_season)

    # Group by season and calculate min/max temperature
    seasonal_stats = df.groupby('Season')['Temperatur'].agg(['min', 'max'])
    
    return seasonal_stats

# Analyze seasons and add season column to df_final
seasonal_stats = analyze_seasons(df_final)
print(seasonal_stats)

# Add season column to df_final
df_final['Season'] = pd.to_datetime(df_final['Datum']).dt.month.map(month_to_season)

# Function to categorize temperature into bins based on season
def categorize_temperature(row, seasonal_stats):
    season = row['Season']
    temp = row['Temperatur']
    min_temp = seasonal_stats.loc[season, 'min']
    max_temp = seasonal_stats.loc[season, 'max']
    bins = pd.cut([min_temp, max_temp], bins=3, retbins=True)[1]
    temp_bins = pd.cut([temp], bins=bins, labels=['low', 'middle', 'high'])
    return f"{season}_{temp_bins[0]}"

# Apply the function to create a new column in df_final
df_final['Season_Temp_Category'] = df_final.apply(lambda row: categorize_temperature(row, seasonal_stats), axis=1)

# Apply one hot encoding for Season_Temp_Category with 0 and 1
df_final = pd.concat([df_final, pd.get_dummies(df_final['Season_Temp_Category'], prefix='Season_Temp', drop_first=False)], axis=1)
df_final = df_final.replace({True: 1, False: 0})

# Add a column with Umsatz one week ago based on Datum and Umsatz in the same Warengruppe
df_final['Datum'] = pd.to_datetime(df_final['Datum'])
df_final = df_final.sort_values(by=['Warengruppe', 'Datum'])
df_final['Umsatz_one_week_ago'] = df_final.groupby('Warengruppe')['Umsatz'].shift(7)

print(df_final)

# Print min and max dates
min_date = df_merged_with_kiwo['Datum'].min()
max_date = df_merged_with_kiwo['Datum'].max()
print(f"Min Datum: {min_date}")
print(f"Max Datum: {max_date}")




            min      max
Season                  
Fall     0.0000  27.8750
Spring  -4.9625  26.8625
Summer  11.5625  31.4375
Winter  -8.4750  12.5000
          Datum  Warengruppe      Umsatz  Bewoelkung  Temperatur  \
0    2013-07-01            1  148.828353         6.0     17.8375   
1    2013-07-02            1  159.793757         3.0     17.3125   
2    2013-07-03            1  111.885594         7.0     21.0750   
3    2013-07-04            1  168.864941         7.0     18.8500   
4    2013-07-05            1  171.280754         5.0     19.9750   
...         ...          ...         ...         ...         ...   
9329 2017-12-21            6   87.471228         7.0      6.2375   
9330 2017-12-22            6   71.911652         7.0      5.7625   
9331 2017-12-23            6   84.062223         7.0      7.8000   
9332 2017-12-24            6   60.981969         7.0      8.1125   
9333 2017-12-27            6   34.972644         7.0      4.6125   

      Windgeschwindigkeit  Wetter

  df_final = df_final.replace({True: 1, False: 0})


In [None]:
print(df_final)

model = smf.ols(formula='Umsatz ~ Temperatur + dry_weather + Windgeschwindigkeit + KielerWoche + Feiertag', data=df_final).fit()


print(model.summary())

           Datum  Warengruppe      Umsatz  Bewoelkung  Temperatur  \
0     2013-07-01            1  148.828353         6.0     17.8375   
1     2013-07-02            1  159.793757         3.0     17.3125   
2     2013-07-03            1  111.885594         7.0     21.0750   
3     2013-07-04            1  168.864941         7.0     18.8500   
4     2013-07-05            1  171.280754         5.0     19.9750   
...          ...          ...         ...         ...         ...   
9329  2017-12-21            6   87.471228         7.0      6.2375   
9330  2017-12-22            6   71.911652         7.0      5.7625   
9331  2017-12-23            6   84.062223         7.0      7.8000   
9332  2017-12-24            6   60.981969         7.0      8.1125   
9333  2017-12-27            6   34.972644         7.0      4.6125   

      Windgeschwindigkeit  Wettercode  KielerWoche  dry_weather  Feiertag  \
0                    15.0        20.0          0.0            1       0.0   
1                