In [30]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack



In [11]:
# Load the merged, cleaned dataset. The first CSV column is parsed as dates
# and used as the DataFrame index.
df = pd.read_csv(
    'merged_cleaned.csv',
    index_col=[0],
    parse_dates=[0],
)

In [19]:
# Inspect the distinct labels present in the 'Holiday' column.
df['Holiday'].unique()

array(['Not a Holiday', 'Christmas Day', 'Second Christmas Day',
       'New year', 'Good Friday', 'Easter Monday', 'Labour Day',
       'Ascension Thursday', 'Whit Monday', 'Day of German Unity'],
      dtype=object)

In [36]:
# Name of the column used as the regression target.
target_column = 'Gesamt (Netzlast) [MWh] Berechnete Auflösungen'

# Target series and feature frame (every column except the target).
y = df[target_column]
X = df.drop(columns=target_column)

# Column groups by dtype: object/category columns are treated as categorical,
# numeric columns (minus the target) as numerical. Columns of any other dtype
# end up in neither group.
categorical_columns = df.select_dtypes(include=['object', 'category']).columns
numerical_columns = df.select_dtypes(include=['number']).columns.drop(target_column)

# Feature sub-frames for the two groups.
X_categorical = X[categorical_columns]
X_numerical = X[numerical_columns]

In [None]:
# One-hot encode the categorical features into a sparse matrix; drop='first'
# removes the first category of each feature from the encoded output.
encoder = OneHotEncoder(drop='first', sparse_output=True).fit(X_categorical)
X_categorical_encoded = encoder.transform(X_categorical)

# Stack the numerical columns first, followed by the encoded categorical
# columns, into one CSR matrix. The feature-name list built further down
# assumes exactly this column order.
X_encoded = hstack(
    [X_numerical, X_categorical_encoded],
    format='csr',
)

In [None]:
# Fit a random forest on the full encoded feature matrix; the fixed
# random_state keeps the fitted model reproducible between runs.
model = RandomForestRegressor(random_state=42).fit(X_encoded, y)
model



AttributeError: 'csr_matrix' object has no attribute 'columns'

In [42]:
# Names of the one-hot encoded columns, as generated by the fitted encoder.
categorical_feature_names = encoder.get_feature_names_out(categorical_columns)

# Full feature-name list in the same order as the columns of X_encoded:
# numerical columns first, then the encoded categorical columns.
feature_names = [*numerical_columns, *categorical_feature_names]

# Importance score of each feature according to the fitted model.
importances = model.feature_importances_

# One row per feature; the default integer row label equals the feature's
# column position in X_encoded.
feature_importances = pd.DataFrame(
    {
        'Feature': feature_names,
        'Importance': importances,
    }
)

In [45]:
# Rank features from most to least important. sort_values keeps the original
# integer row labels, which a later cell uses as column positions in X_encoded.
feature_importances = feature_importances.sort_values('Importance', ascending=False)
print(feature_importances.iloc[:50])  # The 50 highest-ranked features


                                                 Feature  Importance
13                                                  hour    0.406127
28                                         dayofyear_cos    0.086600
14                                             dayofweek    0.086155
23                                         dayofweek_sin    0.066129
29                                            is_workday    0.065101
22                                              hour_cos    0.050592
20                                           date_offset    0.039825
18                                             dayofyear    0.029823
7               Kernenergie [MWh] Berechnete Auflösungen    0.028442
9                Steinkohle [MWh] Berechnete Auflösungen    0.024078
36                                 Holiday_Not a Holiday    0.023036
21                                              hour_sin    0.018790
4              Wind Onshore [MWh] Berechnete Auflösungen    0.010746
6      Sonstige Erneuerbare [MWh] 

In [46]:
# Importance cut-off (example value): only features scoring strictly above
# it are kept.
threshold = 0.01
selected_features = feature_importances.loc[
    feature_importances['Importance'] > threshold,
    'Feature',
]
print(f"Number of selected features: {len(selected_features)}")


Number of selected features: 13


In [47]:
# Row labels of the features above the threshold. These labels double as
# column positions in X_encoded, which only holds while feature_importances
# keeps the integer index it was created with (sorting preserves it,
# resetting the index would not).
selected_feature_indices = feature_importances.index[
    (feature_importances['Importance'] > threshold).to_numpy()
]

# Keep only the selected columns, in the order given by the labels above.
X_selected = X_encoded[:, selected_feature_indices]


In [49]:
# Densify the selected sparse columns and label them with the feature names.
dense_values = X_selected.toarray()
X_selected_dense = pd.DataFrame(dense_values, columns=selected_features)

# Replace the target's index with a fresh 0..n-1 range so it lines up
# row-for-row with the dense feature frame, then put the two side by side.
target_aligned = y.reset_index(drop=True)
filtered_dataset = pd.concat([X_selected_dense, target_aligned], axis=1)

# Write the result to disk without an index column.
output_path = 'selected_features.csv'
filtered_dataset.to_csv(output_path, index=False)
print("Filtered dataset saved as 'selected_features.csv'.")

Filtered dataset saved as 'selected_features.csv'.
