In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import Markdown
from sklearn.preprocessing import StandardScaler

from utils.common.merge_df import (merge_forcast_and_train_df,
                                   remove_NaN_rows)
from utils.common.train_model import (train_model,
                                      train_model_cv,
                                      train_model_stacking)
from utils.common.test_model import (test_model)
from utils.common.create_features import (create_seasons,
                                          create_datetime_features,
                                          create_adjusted_values,
                                          check_if_in_daylight,
                                          create_rolling_avg,
                                            astral)

In [2]:
# Load the weather forecasts and the energy test/train splits from parquet.
# The order of the paths must match the order of the names on the left-hand
# side of the unpacking below.
parquet_paths = (
    '../Daten/forecasts.parquet',
    '../Daten/energy_test1.parquet',
    '../Daten/energy_test2.parquet',
    '../Daten/energy_train.parquet',
)
df_forecast, df_test1, df_test2, df_train = map(pd.read_parquet, parquet_paths)

In [3]:
# Remove all NaN values from the data (training and forecast frames).
# The helper returns the two frames in the same order they were passed in.
nan_free_frames = remove_NaN_rows(df_train, df_forecast)
df_train, df_forecast = nan_free_frames


In [4]:

# Merge the forecast frame with each energy frame, in this order:
# training data, test set 1, test set 2.
merged_df, merged_test_df, merged_test2_df = (
    merge_forcast_and_train_df(df_forecast, energy_frame)
    for energy_frame in (df_train, df_test1, df_test2)
)

In [5]:
# Erstelle Jahreszeiten als categorical feature
#merged_df = await create_seasons(merged_df)

# Erstelle Spalten für Stunden und Monate
merged_df = await create_datetime_features(merged_df)

merged_df = await astral(merged_df)


#merged_df = await check_if_in_daylight(merged_df)

# Erstelle Spalten für die adjustierten Werte
# Können wir wieder auskommentieren wenn wir das wieder testen 
#merged_df = await create_adjusted_values(merged_df)
#merged_df = await create_rolling_avg(merged_df)


# create features for test data
#merged_test_df = await create_seasons(merged_test_df)
#merged_test2_df = await create_seasons(merged_test2_df)

merged_test_df = await create_datetime_features(merged_test_df)
merged_test2_df = await create_datetime_features(merged_test2_df)

#merged_test_df = await check_if_in_daylight(merged_test_df)
#merged_test2_df = await check_if_in_daylight(merged_test2_df)

#merged_test_df = await create_adjusted_values(merged_test_df)
#merged_test2_df = await create_adjusted_values(merged_test2_df)

In [6]:
# Scatter plot of every feature against the target variable (currently disabled)
#for feature in merged_df.columns:
#    if feature != 'Solar_MWh':  # do not use the target variable as a feature
#        plt.figure(figsize=(6, 4))
#        sns.scatterplot(x=merged_df[feature], y=merged_df['Solar_MWh'], alpha=0.1)
#        plt.title(f'Relationship between {feature} and Solar_MWh')
#        plt.show()


In [7]:
#"sunrise", 
#                                                       "sunset", 
#                                                       "dawn", 
#                                                       "dusk", 
#                                                       "noon"
#"sun_altitude",
#                                                       "sun_azimuth",

In [8]:
#merged_df

In [9]:
# Columns handed to train_model as model inputs. The order is kept exactly as
# listed here.
FEATURE_COLUMNS = [
    "SolarDownwardRadiation",
    "week",
    "hour",
    "month",
    "rbf_hour_0", "rbf_hour_1", "rbf_hour_2",
    "rbf_month_0", "rbf_month_1", "rbf_month_2",
    "rbf_week_0", "rbf_week_1", "rbf_week_2",
    "sun_altitude",
]

# Column the model is fitted to predict.
TARGET_COLUMN = "Solar_MWh"

# train_model returns four values: the fitted model, a markdown evaluation
# report, the scaler, and the column index that is later passed to test_model.
model, evaluation_md, scaler, test_col = train_model(
    merged_df,
    FEATURE_COLUMNS,
    TARGET_COLUMN,
)

In [10]:
# Render the markdown evaluation report returned by train_model.
evaluation_report = Markdown(evaluation_md)
display(evaluation_report)

### Ridge(alpha=0.5) Evaluation

| Dataset | R² | RMSE | MAE | Rows | Columns |
|---------|--------:|------------:|--------:|-------:|-------:|
| Train   | 0.88769 | 61.67 | 36.35 | 31524 | 100 |
| Test    | 0.88945 | 61.35 | 36.32 | 7881 | 100 |

### Top 10 Coefficients

| Feature Name           |   Coefficient |
|:-----------------------|--------------:|
| SolarDownwardRadiation |     144.163   |
| sun_altitude           |      83.0674  |
| month_12               |      11.6273  |
| month_4                |     -10.7891  |
| month_1                |      10.6804  |
| rbf_hour_0             |      10.4033  |
| rbf_month_0            |      10.4033  |
| month_5                |      -8.75384 |
| hour_0                 |       8.55912 |
| hour_6                 |      -8.42451 |

Number of coefficients that are zero: 0/100


Alpha value: 0


In [11]:
# Inspect the column index returned by train_model; the same index is passed
# to test_model when predicting on the test frames.
test_col

Index(['SolarDownwardRadiation', 'rbf_hour_0', 'rbf_hour_1', 'rbf_hour_2',
       'rbf_month_0', 'rbf_month_1', 'rbf_month_2', 'rbf_week_0', 'rbf_week_1',
       'rbf_week_2', 'sun_altitude', 'week_1', 'week_2', 'week_3', 'week_4',
       'week_5', 'week_6', 'week_7', 'week_8', 'week_9', 'week_10', 'week_11',
       'week_12', 'week_13', 'week_14', 'week_15', 'week_16', 'week_17',
       'week_18', 'week_19', 'week_20', 'week_21', 'week_22', 'week_23',
       'week_24', 'week_25', 'week_26', 'week_27', 'week_28', 'week_29',
       'week_30', 'week_31', 'week_32', 'week_33', 'week_34', 'week_35',
       'week_36', 'week_37', 'week_38', 'week_39', 'week_40', 'week_41',
       'week_42', 'week_43', 'week_44', 'week_45', 'week_46', 'week_47',
       'week_48', 'week_49', 'week_50', 'week_51', 'week_52', 'week_53',
       'hour_0', 'hour_1', 'hour_2', 'hour_3', 'hour_4', 'hour_5', 'hour_6',
       'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_12',
       'hour_13', 'hour_14', 'h

In [12]:
# Run test_model on both test frames, passing the fitted model together with
# the scaler and the column index obtained from training.
merged_test_pred_df, merged_test2_pred_df = (
    test_model(model, scaler, test_frame, test_col)
    for test_frame in (merged_test_df, merged_test2_df)
)

Series([], dtype: int64)
Empty DataFrame
Columns: [dtm, ref_datetime, Solar_capacity_mwp, valid_time, SolarDownwardRadiation, CloudCover, Temperature, valid_datetime, day_of_year, year, hour_sin, hour_cos, month_sin, month_cos, week_sin, week_cos, day_of_year_sin, day_of_year_cos, rbf_hour_0, rbf_hour_1, rbf_hour_2, rbf_month_0, rbf_month_1, rbf_month_2, rbf_week_0, rbf_week_1, rbf_week_2, Weather Model_DWD ICON, Weather Model_NCEP GFS, hour_0, hour_1, hour_2, hour_3, hour_4, hour_5, hour_6, hour_7, hour_8, hour_9, hour_10, hour_11, hour_12, hour_13, hour_14, hour_15, hour_16, hour_17, hour_18, hour_19, hour_20, hour_21, hour_22, hour_23, month_1, month_2, month_3, month_4, month_5, month_6, month_7, month_8, month_9, month_10, week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8, week_9, week_10, week_11, week_12, week_13, week_14, week_15, week_16, week_17, week_18, week_19, week_20, week_21, week_22, week_23, week_24, week_25, week_26, week_27, week_28, week_29, week_30, w

In [13]:
# Persist the results for both test sets as pickle files in the working
# directory.
for predictions, pickle_path in (
    (merged_test_pred_df, 'test1.pkl'),
    (merged_test2_pred_df, 'test2.pkl'),
):
    predictions.to_pickle(pickle_path)