Import Libraries and Load Data

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

df_gen = pd.read_csv('Plant_1_Generation_Data.csv')
df_weather = pd.read_csv('Plant_1_Weather_Sensor_Data.csv')

Preprocess Data

In [11]:
df_gen['DATE_TIME'] = pd.to_datetime(df_gen['DATE_TIME'], format='%d-%m-%Y %H:%M')
df_weather['DATE_TIME'] = pd.to_datetime(df_weather['DATE_TIME'], format='%Y-%m-%d %H:%M:%S')

Explore Datasets

In [12]:
gen_time_range = (df_gen['DATE_TIME'].min(), df_gen['DATE_TIME'].max())
print(f"Generation Data Time Range: {gen_time_range[0]} to {gen_time_range[1]}")
weather_time_range = (df_weather['DATE_TIME'].min(), df_weather['DATE_TIME'].max())
print(f"Weather Data Time Range: {weather_time_range[0]} to {weather_time_range[1]}")
gen_unique_keys = df_gen['SOURCE_KEY'].nunique()
print(f"\nNumber of unique inverters in generation data: {gen_unique_keys}")
weather_unique_keys = df_weather['SOURCE_KEY'].nunique()
print(f"Number of unique sensors in weather data: {weather_unique_keys}")

Generation Data Time Range: 2020-05-15 00:00:00 to 2020-06-17 23:45:00
Weather Data Time Range: 2020-05-15 00:00:00 to 2020-06-17 23:45:00

Number of unique inverters in generation data: 22
Number of unique sensors in weather data: 1


Merging Datasets & Feature Engineering

In [13]:
df_merged = pd.merge(df_gen, df_weather, on=['DATE_TIME', 'PLANT_ID'], how='inner')
display(df_merged.head())
df_merged['MONTH'] = df_merged['DATE_TIME'].dt.month
df_merged['DAY_OF_WEEK'] = df_merged['DATE_TIME'].dt.dayofweek
df_merged['HOUR'] = df_merged['DATE_TIME'].dt.hour

Unnamed: 0,DATE_TIME,PLANT_ID,SOURCE_KEY_x,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,SOURCE_KEY_y,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION
0,2020-05-15,4135001,1BY6WEcLGh8j5v7,0.0,0.0,0.0,6259559.0,HmiyD2TTLFNqkNe,25.184316,22.857507,0.0
1,2020-05-15,4135001,1IF53ai7Xc0U56Y,0.0,0.0,0.0,6183645.0,HmiyD2TTLFNqkNe,25.184316,22.857507,0.0
2,2020-05-15,4135001,3PZuoBAID5Wc2HD,0.0,0.0,0.0,6987759.0,HmiyD2TTLFNqkNe,25.184316,22.857507,0.0
3,2020-05-15,4135001,7JYdWkrLSPkdwr4,0.0,0.0,0.0,7602960.0,HmiyD2TTLFNqkNe,25.184316,22.857507,0.0
4,2020-05-15,4135001,McdE0feGgRqW7Ca,0.0,0.0,0.0,7158964.0,HmiyD2TTLFNqkNe,25.184316,22.857507,0.0


Prepare Data

In [14]:
features = [
    'MONTH', 'DAY_OF_WEEK', 'HOUR',
    'AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE', 'IRRADIATION',
    'SOURCE_KEY_x'
]

X = pd.get_dummies(df_merged[features], columns=['SOURCE_KEY_x'], drop_first=True)
y = df_merged['AC_POWER']

Model Training And Evaluation

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_model_full = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf_model_full.fit(X_train, y_train)

y_pred = rf_model_full.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"\nModel Performance with Weather Data (RMSE): {rmse:.2f} W")


Model Performance with Weather Data (RMSE): 60.53 W
