In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import math

In [2]:
# Load the raw solar dataset. Forward slashes are used because backslashes in a
# normal string literal form escape sequences ('\d' and '\s' are invalid ones
# and trigger a warning), and forward slashes resolve on Windows as well as
# on POSIX systems.
df_solar = pd.read_csv('HEFTcom24/data/solar.csv')

In [5]:
# Remove the CSV index column and the columns not used in the solar analysis.
df_solar = df_solar.drop(
    columns=['Unnamed: 0', 'boa_MWh', 'Wind_MW', 'Wind_MWh_credit']
)

In [6]:
# Show the column index currently held by df_solar.
df_solar.columns

Index(['valid_time', 'reference_time', 'CloudCover_Point0_dwd',
       'CloudCover_Point1_dwd', 'CloudCover_Point2_dwd',
       'CloudCover_Point3_dwd', 'CloudCover_Point4_dwd',
       'CloudCover_Point5_dwd', 'CloudCover_Point6_dwd',
       'CloudCover_Point7_dwd',
       ...
       'Temperature_Point5_ncep_pes10', 'Temperature_Point6_ncep_pes10', 'dtm',
       'MIP', 'Solar_MW', 'Solar_capacity_mwp', 'Solar_installedcapacity_mwp',
       'SS_Price', 'DA_Price', 'Solar_MWh_credit'],
      dtype='object', length=104)

In [7]:
# Count missing values per column and keep only the columns that have at least one.
na_columns = df_solar.isna().sum().loc[lambda counts: counts > 0]
print(na_columns)

Series([], dtype: int64)


In [9]:
# Correlation of every remaining column with the target Solar_MWh_credit,
# sorted ascending. The three time columns are dropped before calling corr().
# NOTE(review): Solar_MW shows a correlation of 1.0 with the target in the
# output, so it should presumably not be used as a predictor - confirm.
correlation = df_solar.drop(columns=['dtm','reference_time','valid_time']).corr()['Solar_MWh_credit'].sort_values()
correlation

CloudCover_Point18_ncep              -0.148366
CloudCover_Point1_ncep               -0.145069
CloudCover_Point11_ncep              -0.126816
CloudCover_Point6_ncep               -0.122557
CloudCover_Point9_ncep               -0.122229
                                        ...   
SolarDownwardRadiation_Point8_dwd     0.935096
SolarDownwardRadiation_Point7_dwd     0.935288
SolarDownwardRadiation_Point12_dwd    0.936116
Solar_MW                              1.000000
Solar_MWh_credit                      1.000000
Name: Solar_MWh_credit, Length: 101, dtype: float64

In [11]:
# Full list of column names in df_solar (the Index repr truncates long lists).
df_solar.columns.to_list()

['valid_time',
 'reference_time',
 'CloudCover_Point0_dwd',
 'CloudCover_Point1_dwd',
 'CloudCover_Point2_dwd',
 'CloudCover_Point3_dwd',
 'CloudCover_Point4_dwd',
 'CloudCover_Point5_dwd',
 'CloudCover_Point6_dwd',
 'CloudCover_Point7_dwd',
 'CloudCover_Point8_dwd',
 'CloudCover_Point9_dwd',
 'CloudCover_Point10_dwd',
 'CloudCover_Point11_dwd',
 'CloudCover_Point12_dwd',
 'CloudCover_Point13_dwd',
 'CloudCover_Point14_dwd',
 'CloudCover_Point15_dwd',
 'CloudCover_Point16_dwd',
 'CloudCover_Point17_dwd',
 'CloudCover_Point18_dwd',
 'CloudCover_Point19_dwd',
 'SolarDownwardRadiation_Point0_dwd',
 'SolarDownwardRadiation_Point1_dwd',
 'SolarDownwardRadiation_Point2_dwd',
 'SolarDownwardRadiation_Point3_dwd',
 'SolarDownwardRadiation_Point4_dwd',
 'SolarDownwardRadiation_Point5_dwd',
 'SolarDownwardRadiation_Point6_dwd',
 'SolarDownwardRadiation_Point7_dwd',
 'SolarDownwardRadiation_Point8_dwd',
 'SolarDownwardRadiation_Point9_dwd',
 'SolarDownwardRadiation_Point10_dwd',
 'SolarDownwardRa

In [15]:
# Parse both timestamp columns into pandas datetimes so the .dt accessor and
# datetime arithmetic can be used on them.
df_solar['valid_time'] = pd.to_datetime(df_solar['valid_time'])
df_solar['reference_time'] = pd.to_datetime(df_solar['reference_time'])

In [16]:
# Feature engineering on the pandas DataFrame `df_solar`.
# Every feature below is added to `df_solar` as a new column, in the order listed.

# Column-name groups reused by several features: 20 grid points for cloud cover
# and solar radiation, 7 points for temperature, for each weather source.
_cloud_cols_dwd = [f'CloudCover_Point{i}_dwd' for i in range(20)]
_cloud_cols_ncep = [f'CloudCover_Point{i}_ncep' for i in range(20)]
_radiation_cols_dwd = [f'SolarDownwardRadiation_Point{i}_dwd' for i in range(20)]
_radiation_cols_ncep = [f'SolarDownwardRadiation_Point{i}_ncep' for i in range(20)]
_temperature_cols_dwd = [f'Temperature_Point{i}_dwd_pes10' for i in range(7)]
_temperature_cols_ncep = [f'Temperature_Point{i}_ncep_pes10' for i in range(7)]

# 1. Mean Cloud Cover
df_solar['Mean_CloudCover_dwd'] = df_solar[_cloud_cols_dwd].mean(axis=1)
df_solar['Mean_CloudCover_ncep'] = df_solar[_cloud_cols_ncep].mean(axis=1)

# 2. Mean Solar Radiation
df_solar['Mean_SolarRadiation_dwd'] = df_solar[_radiation_cols_dwd].mean(axis=1)
df_solar['Mean_SolarRadiation_ncep'] = df_solar[_radiation_cols_ncep].mean(axis=1)

# 3. Mean Temperature
df_solar['Mean_Temperature_dwd'] = df_solar[_temperature_cols_dwd].mean(axis=1)
df_solar['Mean_Temperature_ncep'] = df_solar[_temperature_cols_ncep].mean(axis=1)

# 4. Standard Deviation of Cloud Cover (spread across the grid points)
df_solar['Std_CloudCover_dwd'] = df_solar[_cloud_cols_dwd].std(axis=1)
df_solar['Std_CloudCover_ncep'] = df_solar[_cloud_cols_ncep].std(axis=1)

# 5. Max-Min Cloud Cover Gradient (range across the grid points)
df_solar['CloudCover_Gradient_dwd'] = df_solar[_cloud_cols_dwd].max(axis=1) - df_solar[_cloud_cols_dwd].min(axis=1)
df_solar['CloudCover_Gradient_ncep'] = df_solar[_cloud_cols_ncep].max(axis=1) - df_solar[_cloud_cols_ncep].min(axis=1)

# 6. Solar Radiation Change Over Points (Point 19 - Point 0)
df_solar['SolarRadiation_Change_dwd'] = df_solar['SolarDownwardRadiation_Point19_dwd'] - df_solar['SolarDownwardRadiation_Point0_dwd']
df_solar['SolarRadiation_Change_ncep'] = df_solar['SolarDownwardRadiation_Point19_ncep'] - df_solar['SolarDownwardRadiation_Point0_ncep']

# 7. Cloud Cover x Solar Radiation Interaction
df_solar['CloudCover_SolarInteraction_dwd'] = df_solar['Mean_CloudCover_dwd'] * df_solar['Mean_SolarRadiation_dwd']
df_solar['CloudCover_SolarInteraction_ncep'] = df_solar['Mean_CloudCover_ncep'] * df_solar['Mean_SolarRadiation_ncep']

# 8. Temperature x Solar Radiation Interaction
df_solar['Temp_SolarInteraction_dwd'] = df_solar['Mean_Temperature_dwd'] * df_solar['Mean_SolarRadiation_dwd']
df_solar['Temp_SolarInteraction_ncep'] = df_solar['Mean_Temperature_ncep'] * df_solar['Mean_SolarRadiation_ncep']

# 9. Cloud Cover x Temperature Interaction
df_solar['Cloud_TemperatureInteraction_dwd'] = df_solar['Mean_CloudCover_dwd'] * df_solar['Mean_Temperature_dwd']
df_solar['Cloud_TemperatureInteraction_ncep'] = df_solar['Mean_CloudCover_ncep'] * df_solar['Mean_Temperature_ncep']

# 10. Temporal Features
# Hour of the day taken from 'valid_time'
df_solar['hour'] = df_solar['valid_time'].dt.hour

# Sine and cosine encoding so that hour 23 and hour 0 end up close together
df_solar['sin_hour'] = np.sin(2 * np.pi * df_solar['hour'] / 24)
df_solar['cos_hour'] = np.cos(2 * np.pi * df_solar['hour'] / 24)

# Day of the year (seasonality)
df_solar['day_of_year'] = df_solar['valid_time'].dt.dayofyear

# Sine and cosine encoding for day of the year (cyclical seasonality).
# NOTE(review): the period is fixed at 365, so day 366 of a leap year wraps
# slightly past a full cycle - confirm this is acceptable.
df_solar['sin_day'] = np.sin(2 * np.pi * df_solar['day_of_year'] / 365)
df_solar['cos_day'] = np.cos(2 * np.pi * df_solar['day_of_year'] / 365)

# 11. Discrepancy between the DWD and NCEP means
df_solar['CloudCover_Discrepancy'] = df_solar['Mean_CloudCover_dwd'] - df_solar['Mean_CloudCover_ncep']
df_solar['SolarRadiation_Discrepancy'] = df_solar['Mean_SolarRadiation_dwd'] - df_solar['Mean_SolarRadiation_ncep']
df_solar['Temperature_Discrepancy'] = df_solar['Mean_Temperature_dwd'] - df_solar['Mean_Temperature_ncep']

# 12. Time Delta (difference between valid_time and reference_time), in hours
df_solar['Time_Delta'] = (df_solar['valid_time'] - df_solar['reference_time']).dt.total_seconds() / 3600

# Cloud cover difference (Point 19 - Point 0) normalised by the time delta.
# Time_Delta is 0 whenever valid_time equals reference_time; dividing by it
# would produce +/-inf (or NaN for 0/0). Zero lead time is therefore mapped to
# NaN first, so those rows come out as ordinary missing values.
_nonzero_time_delta = df_solar['Time_Delta'].replace(0, np.nan)
df_solar['Normalized_CloudCover_dwd'] = (df_solar['CloudCover_Point19_dwd'] - df_solar['CloudCover_Point0_dwd']) / _nonzero_time_delta
df_solar['Normalized_CloudCover_ncep'] = (df_solar['CloudCover_Point19_ncep'] - df_solar['CloudCover_Point0_ncep']) / _nonzero_time_delta

# 13. Rolling Mean over the current and the two preceding rows
# NOTE(review): the window follows row order, so it is only meaningful if
# df_solar is ordered by time, and it averages across rows that belong to
# different reference_time values where those meet - confirm this is intended.
df_solar['RollingMean_CloudCover_dwd'] = df_solar['Mean_CloudCover_dwd'].rolling(window=3, min_periods=1).mean()
df_solar['RollingMean_SolarRadiation_dwd'] = df_solar['Mean_SolarRadiation_dwd'].rolling(window=3, min_periods=1).mean()


In [31]:
from sklearn.preprocessing import StandardScaler
def handle_missing_values(df):
    """Return a copy of ``df`` with missing values imputed.

    Numeric columns are filled with their column mean; ``object`` and
    ``category`` columns are filled with their most frequent value.

    The input frame is not modified. Previously the frame was filled in place,
    so the caller's original data was silently overwritten; callers must use
    the returned frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns as ``df``.
    """
    # Work on a copy so the caller's frame keeps its original values.
    df = df.copy()

    # For numeric columns: mean imputation
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

    # For categorical columns: fill with the first mode of each column.
    # The mode is taken per column so that an all-NaN column (which has no
    # mode) or a frame without categorical columns is skipped instead of
    # failing on an empty result.
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    for col in categorical_cols:
        modes = df[col].mode(dropna=True)
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])
    return df
def scale_features(df, exclude_cols=None):
    """Impute and standardise the columns of ``df``.

    Steps: replace +/-inf with NaN, impute missing values with
    ``handle_missing_values``, then fit a ``StandardScaler`` on every column
    not listed in ``exclude_cols`` and overwrite those columns with the
    scaled values.

    The caller's frame is not modified by the inf replacement (a new frame is
    produced first).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scale. Every column outside ``exclude_cols`` is passed to the
        scaler, so those columns must be numeric. This includes a target
        column if one is present in ``df``.
    exclude_cols : sequence of str, optional
        Columns left unscaled. Defaults to
        ``('valid_time', 'reference_time', 'dtm')``.

    Returns
    -------
    (pandas.DataFrame, StandardScaler)
        The scaled frame and the fitted scaler.
    """
    # None instead of a list literal avoids a shared mutable default argument.
    if exclude_cols is None:
        exclude_cols = ('valid_time', 'reference_time', 'dtm')

    # replace() without inplace returns a new frame, leaving the input untouched.
    df = df.replace([np.inf, -np.inf], np.nan)
    df = handle_missing_values(df)

    scaler = StandardScaler()
    cols_to_scale = [col for col in df.columns if col not in exclude_cols]
    df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])
    return df, scaler

In [32]:
# scale_features() replaces +/-inf with NaN and imputes missing values itself
# before scaling, so a separate handle_missing_values() pass beforehand is
# redundant (and would compute column means while +/-inf values are still present).
df_solar1, scaler = scale_features(df_solar)

In [25]:
# Display the frame returned by scale_features().
df_solar1

Unnamed: 0,valid_time,reference_time,CloudCover_Point0_dwd,CloudCover_Point1_dwd,CloudCover_Point2_dwd,CloudCover_Point3_dwd,CloudCover_Point4_dwd,CloudCover_Point5_dwd,CloudCover_Point6_dwd,CloudCover_Point7_dwd,...,sin_day,cos_day,CloudCover_Discrepancy,SolarRadiation_Discrepancy,Temperature_Discrepancy,Time_Delta,Normalized_CloudCover_dwd,Normalized_CloudCover_ncep,RollingMean_CloudCover_dwd,RollingMean_SolarRadiation_dwd
0,2020-09-20 00:00:00+00:00,2020-09-20 00:00:00+00:00,-0.982652,-0.285306,-1.274204,-0.254969,-1.204310,-1.236174,-0.577607,-0.968967,...,-1.413758,-0.343050,1.076331,0.363312,-0.356110,-1.714635,,,-1.004507,-0.651924
1,2020-09-20 00:30:00+00:00,2020-09-20 00:00:00+00:00,-0.952231,-0.314613,-1.055732,-0.252858,-1.163102,-0.959317,-0.575180,-0.934148,...,-1.413758,-0.343050,1.111156,0.363312,-0.325474,-1.679991,1.963820,-0.006918,-0.965610,-0.651924
2,2020-09-20 01:00:00+00:00,2020-09-20 00:00:00+00:00,-0.921809,-0.343920,-0.837260,-0.250747,-1.121895,-0.682460,-0.572753,-0.899330,...,-1.413758,-0.343050,1.145980,0.363312,-0.294838,-1.645346,1.802157,-0.006918,-0.926712,-0.651924
3,2020-09-20 01:30:00+00:00,2020-09-20 00:00:00+00:00,-0.792440,-0.278277,-0.678646,-0.252858,-1.113218,-0.017810,-0.420935,-0.889174,...,-1.413758,-0.343050,1.302960,0.363312,-0.250579,-1.610701,0.937752,-0.006918,-0.843696,-0.651924
4,2020-09-20 02:00:00+00:00,2020-09-20 00:00:00+00:00,-0.663071,-0.212634,-0.520032,-0.254969,-1.104542,0.646841,-0.269117,-0.879019,...,-1.413758,-0.343050,1.459940,0.363312,-0.206321,-1.576057,0.505550,-0.006918,-0.755460,-0.651924
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
529625,2024-05-19 21:30:00+00:00,2024-05-19 00:00:00+00:00,-2.323971,-2.319299,-2.333750,-2.226691,-1.859483,-2.242294,-2.333581,-2.321654,...,0.895764,-1.174776,-0.504910,0.363460,1.406248,-0.224912,0.014799,-0.006918,-2.536282,-0.651887
529626,2024-05-19 22:00:00+00:00,2024-05-19 00:00:00+00:00,-2.323971,-2.274052,-2.333750,-2.155119,-1.974761,-2.242294,-2.333581,-2.321654,...,0.895764,-1.174776,-0.492299,0.363486,1.334205,-0.190267,0.014799,-0.006918,-2.523782,-0.651853
529627,2024-05-19 22:30:00+00:00,2024-05-19 00:00:00+00:00,-2.321605,-1.881825,-2.291858,-1.908245,-2.087831,-2.242294,-2.333581,-2.321654,...,0.895764,-1.174776,-0.421224,0.363287,1.280684,-0.155622,0.014095,-0.006918,-2.494824,-0.651877
529628,2024-05-19 23:00:00+00:00,2024-05-19 00:00:00+00:00,-2.319239,-1.489599,-2.249967,-1.661371,-2.200902,-2.242294,-2.333581,-2.321654,...,0.895764,-1.174776,-0.350148,0.363089,1.227163,-0.120978,0.013422,-0.006918,-2.448285,-0.651936


In [33]:
def forward_feature_selection(X, y, max_features=15):
    """Greedy forward feature selection for a linear regression.

    At each step every remaining candidate column is added, in turn, to the
    already selected columns and scored by the mean 5-fold cross-validated R^2
    of a ``LinearRegression``. The best-scoring candidate is kept. After each
    step the model is refit on all rows with the selected columns and the
    in-sample MSE and the correlation between ``y`` and the predictions are
    printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate feature columns; passed to scikit-learn as-is.
    y : pandas.Series or array-like
        Target values aligned with the rows of ``X``.
    max_features : int, default 15
        Maximum number of features to select. Selection stops earlier when no
        candidates are left or no candidate yields a comparable score.

    Returns
    -------
    summary : pandas.DataFrame
        One row per step with columns ``Step``, ``Added_Feature``,
        ``R_Squared`` (mean cross-validated R^2) and ``Correlation``
        (in-sample correlation between ``y`` and the fitted predictions).
    features_selected : list
        Selected column labels in the order they were added.
    """
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import cross_val_score

    # Initialize variables
    features_selected = []
    features_to_select = list(X.columns)
    best_scores = []
    correlations = []

    # Model to use
    model = LinearRegression()

    for i in range(max_features):
        # max_features may exceed the number of available columns.
        if not features_to_select:
            break

        best_score = float('-inf')
        best_feature = None

        # Try each feature not yet selected
        for feature in features_to_select:
            # Combine selected features with current candidate
            current_features = features_selected + [feature]
            X_current = X[current_features]

            # Perform cross-validation
            scores = cross_val_score(model, X_current, y, cv=5, scoring='r2')
            avg_score = np.mean(scores)

            # Update if this is the best score
            if avg_score > best_score:
                best_score = avg_score
                best_feature = feature

        # Compare with None explicitly: a falsy column label such as 0 or ''
        # is a valid feature and must not be discarded by a truthiness test.
        if best_feature is None:
            # No candidate produced a comparable score (e.g. every score was
            # NaN); further iterations would repeat the same search.
            break

        # Add best feature to selected features
        features_selected.append(best_feature)
        features_to_select.remove(best_feature)
        best_scores.append(best_score)

        # Refit on all rows to report in-sample MSE and correlation
        X_current = X[features_selected]
        model.fit(X_current, y)
        y_pred = model.predict(X_current)
        mse = np.mean((y - y_pred) ** 2)
        correlation = np.corrcoef(y, y_pred)[0, 1]
        correlations.append(correlation)

        # Print results
        print(f"Step {i+1}:")
        print(f"Added feature: {best_feature}")
        print(f"R-squared score: {best_score:.4f}")
        print(f"MSE: {mse:.4f}")
        print(f"Correlation: {correlation:.4f}")
        print(f"Current features: {features_selected}")
        print("--------------------")

    # Create summary dataframe
    summary = pd.DataFrame({
        'Step': range(1, len(features_selected) + 1),
        'Added_Feature': features_selected,
        'R_Squared': best_scores,
        'Correlation': correlations
    })

    return summary, features_selected

# Example usage
def run_feature_selection(df, target_column, max_features=20):
    """Run forward feature selection on the numeric columns of ``df``.

    The column ``target_column`` is used as the target; all other numeric
    columns are the candidate features. Non-numeric columns are ignored.

    Returns the ``(summary, selected_features)`` pair produced by
    ``forward_feature_selection``.
    """
    target = df[target_column]

    # Candidate predictors: everything but the target, numeric dtypes only.
    predictors = df.drop(columns=[target_column]).select_dtypes(include=[np.number])

    return forward_feature_selection(predictors, target, max_features)

# Plot results
def plot_feature_selection_results(summary):
    """Plot the R-squared and correlation values of ``summary`` against the step number.

    ``summary`` must provide the columns ``Step``, ``R_Squared`` and
    ``Correlation``. The figure is shown with ``plt.show()``; nothing is returned.
    """
    import matplotlib.pyplot as plt

    _, ax = plt.subplots(figsize=(12, 6))

    # One line per metric, both drawn against the step number.
    for column, label in (('R_Squared', 'R-Squared'), ('Correlation', 'Correlation')):
        ax.plot(summary['Step'], summary[column], label=label)

    ax.set_xlabel('Number of Features')
    ax.set_ylabel('Score')
    ax.set_title('Feature Selection Results')
    ax.legend()
    ax.grid(True)
    plt.show()

# Full example of how to use
"""
# Assuming df is your dataframe and 'target' is your target column
summary, selected_features = run_feature_selection(df, 'target')
plot_feature_selection_results(summary)

# Print final selected features and their coefficients
final_model = LinearRegression()
X_final = df[selected_features]
final_model.fit(X_final, df['target'])

coefficients = pd.DataFrame({
    'Feature': selected_features,
    'Coefficient': final_model.coef_
}).sort_values(by='Coefficient', key=abs, ascending=False)

print("\nFinal Model Coefficients:")
print(coefficients)
"""

'\n# Assuming df is your dataframe and \'target\' is your target column\nsummary, selected_features = run_feature_selection(df, \'target\')\nplot_feature_selection_results(summary)\n\n# Print final selected features and their coefficients\nfinal_model = LinearRegression()\nX_final = df[selected_features]\nfinal_model.fit(X_final, df[\'target\'])\n\ncoefficients = pd.DataFrame({\n    \'Feature\': selected_features,\n    \'Coefficient\': final_model.coef_\n}).sort_values(by=\'Coefficient\', key=abs, ascending=False)\n\nprint("\nFinal Model Coefficients:")\nprint(coefficients)\n'

In [28]:
# Full list of column names in df_solar1.
df_solar1.columns.to_list()

['valid_time',
 'reference_time',
 'CloudCover_Point0_dwd',
 'CloudCover_Point1_dwd',
 'CloudCover_Point2_dwd',
 'CloudCover_Point3_dwd',
 'CloudCover_Point4_dwd',
 'CloudCover_Point5_dwd',
 'CloudCover_Point6_dwd',
 'CloudCover_Point7_dwd',
 'CloudCover_Point8_dwd',
 'CloudCover_Point9_dwd',
 'CloudCover_Point10_dwd',
 'CloudCover_Point11_dwd',
 'CloudCover_Point12_dwd',
 'CloudCover_Point13_dwd',
 'CloudCover_Point14_dwd',
 'CloudCover_Point15_dwd',
 'CloudCover_Point16_dwd',
 'CloudCover_Point17_dwd',
 'CloudCover_Point18_dwd',
 'CloudCover_Point19_dwd',
 'SolarDownwardRadiation_Point0_dwd',
 'SolarDownwardRadiation_Point1_dwd',
 'SolarDownwardRadiation_Point2_dwd',
 'SolarDownwardRadiation_Point3_dwd',
 'SolarDownwardRadiation_Point4_dwd',
 'SolarDownwardRadiation_Point5_dwd',
 'SolarDownwardRadiation_Point6_dwd',
 'SolarDownwardRadiation_Point7_dwd',
 'SolarDownwardRadiation_Point8_dwd',
 'SolarDownwardRadiation_Point9_dwd',
 'SolarDownwardRadiation_Point10_dwd',
 'SolarDownwardRa

In [34]:
# Forward selection with Solar_MWh_credit as the target. Solar_MW and the two
# timestamp columns are removed first; any remaining non-numeric column is
# filtered out inside run_feature_selection().
summary, selected_features = run_feature_selection(
    df=df_solar1.drop(columns=['Solar_MW', 'reference_time', 'valid_time']),
    target_column='Solar_MWh_credit',
)

Step 1:
Added feature: Mean_SolarRadiation_dwd
R-squared score: 0.8788
MSE: 0.1122
Correlation: 0.9422
Current features: ['Mean_SolarRadiation_dwd']
--------------------
Step 2:
Added feature: SolarDownwardRadiation_Point19_ncep
R-squared score: 0.8848
MSE: 0.1058
Correlation: 0.9456
Current features: ['Mean_SolarRadiation_dwd', 'SolarDownwardRadiation_Point19_ncep']
--------------------
Step 3:
Added feature: cos_day
R-squared score: 0.8889
MSE: 0.1018
Correlation: 0.9477
Current features: ['Mean_SolarRadiation_dwd', 'SolarDownwardRadiation_Point19_ncep', 'cos_day']
--------------------
Step 4:
Added feature: RollingMean_SolarRadiation_dwd
R-squared score: 0.8914
MSE: 0.0993
Correlation: 0.9491
Current features: ['Mean_SolarRadiation_dwd', 'SolarDownwardRadiation_Point19_ncep', 'cos_day', 'RollingMean_SolarRadiation_dwd']
--------------------
Step 5:
Added feature: Solar_installedcapacity_mwp
R-squared score: 0.8934
MSE: 0.0981
Correlation: 0.9497
Current features: ['Mean_SolarRadiati

In [38]:
from sklearn.tree import DecisionTreeRegressor
import pandas as pd
import plotly.express as px

# Feature matrix: every column except Solar_MW, the time columns and the target.
X = df_solar1.drop(
    columns=['Solar_MW', 'reference_time', 'valid_time', 'dtm', 'Solar_MWh_credit']
)
y = df_solar1['Solar_MWh_credit']

# All columns of X are used here.
# NOTE(review): this rebinds `selected_features`, replacing the list returned
# by the forward-selection cell - confirm that is intended.
selected_features = list(X.columns)

# Fit an unconstrained decision tree (fixed seed) on the full data.
tree_model = DecisionTreeRegressor(random_state=42).fit(X, y)

# Importances reported by the fitted tree, one per column of X.
feature_importances = tree_model.feature_importances_

# Tabulate the importances, most important feature first.
importance_df = pd.DataFrame(
    {'Feature': selected_features, 'Importance': feature_importances}
)
importance_df = importance_df.sort_values(by='Importance', ascending=False)

print(importance_df)

# Bar chart of the importances with Plotly.
fig = px.bar(
    importance_df,
    x='Feature',
    y='Importance',
    title='Feature Importance from Decision Tree',
    labels={'Feature': 'Features', 'Importance': 'Importance'},
)
fig.show()

                            Feature  Importance
101         Mean_SolarRadiation_dwd    0.866448
119                        cos_hour    0.018812
130  RollingMean_SolarRadiation_dwd    0.016446
102        Mean_SolarRadiation_ncep    0.007231
122                         cos_day    0.007007
..                              ...         ...
8             CloudCover_Point8_dwd    0.000138
7             CloudCover_Point7_dwd    0.000137
63          CloudCover_Point16_ncep    0.000127
0             CloudCover_Point0_dwd    0.000124
62          CloudCover_Point15_ncep    0.000118

[131 rows x 2 columns]
