In [48]:
import warnings

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [49]:
# Read the four raw CSV files (generation and weather-sensor data for plants
# 1 and 2) from the working directory, in the order the names are unpacked.
plant_1_gen, plant_2_gen, plant_1_weather, plant_2_weather = (
    pd.read_csv(csv_name)
    for csv_name in (
        'Plant_1_Generation_Data.csv',
        'Plant_2_Generation_Data.csv',
        'Plant_1_Weather_Sensor_Data.csv',
        'Plant_2_Weather_Sensor_Data.csv',
    )
)

In [50]:
# Rebuild combined_data from scratch: this cell stacks the per-plant tables,
# aggregates them per plant and timestamp, and merges generation with weather.
# First step: stack both plants row-wise; ignore_index=True renumbers the rows
# so the two source indexes do not collide.
gen_all = pd.concat(objs=(plant_1_gen, plant_2_gen), axis=0, ignore_index=True)
weather_all = pd.concat(objs=(plant_1_weather, plant_2_weather), axis=0, ignore_index=True)

# Robust datetime parsing for mixed formats without warnings
# Try ISO with seconds, ISO without seconds, then day-first format, then a final general day-first parse

def parse_mixed_datetime(series: pd.Series, formats=None) -> pd.Series:
    """Parse timestamp values that may be written in several different layouts.

    Each explicit format is tried in order, but only on the entries that are
    still unparsed, so a value keeps the result of the first format that
    matches it. Whatever is left after the explicit formats goes through one
    final format-free parse with ``dayfirst=True``.

    Parameters
    ----------
    series : pd.Series
        Raw timestamp values (typically strings).
    formats : sequence of str, optional
        ``strftime``-style formats to try, in priority order. Defaults to
        ``'%Y-%m-%d %H:%M:%S'``, ``'%Y-%m-%d %H:%M'`` and ``'%d-%m-%Y %H:%M'``.

    Returns
    -------
    pd.Series
        ``datetime64[ns]`` values on the same index as ``series``. Entries no
        strategy could parse are ``NaT``; this function does not remove them,
        it only prints how many there are so the caller can drop them.
    """
    if formats is None:
        formats = ('%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M', '%d-%m-%Y %H:%M')

    # Start fully unparsed; every pass below only fills the remaining NaT slots.
    parsed = pd.Series(pd.NaT, index=series.index, dtype='datetime64[ns]')

    for fmt in formats:
        pending = parsed.isna()
        if not pending.any():
            break
        # An explicit format already fixes the field order, so no dayfirst here.
        parsed.loc[pending] = pd.to_datetime(series[pending], format=fmt, errors='coerce')

    pending = parsed.isna()
    if pending.any():
        # Last resort: let pandas guess. Recent pandas versions emit a
        # UserWarning when no single format can be inferred for the leftovers;
        # silence it here because unparsed values are reported below anyway.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', UserWarning)
            parsed.loc[pending] = pd.to_datetime(series[pending], errors='coerce', dayfirst=True)

    num_bad = int(parsed.isna().sum())
    if num_bad:
        print(f"Warning: {num_bad} DATE_TIME values could not be parsed and will be dropped.")
    return parsed

# Convert DATE_TIME to real datetimes and discard rows whose timestamp could
# not be parsed, so grouping and merging compare datetimes rather than text.
gen_all = (
    gen_all
    .assign(DATE_TIME=parse_mixed_datetime(gen_all['DATE_TIME']))
    .dropna(subset=['DATE_TIME'])
)
weather_all = (
    weather_all
    .assign(DATE_TIME=parse_mixed_datetime(weather_all['DATE_TIME']))
    .dropna(subset=['DATE_TIME'])
)

# Both tables are reduced to one row per (plant, timestamp) before joining.
merge_keys = ['PLANT_ID', 'DATE_TIME']

# Generation: power is summed over all rows sharing a key (presumably one row
# per inverter - verify against the raw data); yields keep the largest value.
generation_rules = {
    'DC_POWER': 'sum',
    'AC_POWER': 'sum',
    'DAILY_YIELD': 'max',
    'TOTAL_YIELD': 'max',
}
gen_agg = gen_all.groupby(merge_keys, as_index=False).agg(generation_rules)

# Weather: readings sharing a key are averaged.
weather_rules = {
    'AMBIENT_TEMPERATURE': 'mean',
    'MODULE_TEMPERATURE': 'mean',
    'IRRADIATION': 'mean',
}
weather_agg = weather_all.groupby(merge_keys, as_index=False).agg(weather_rules)

# Inner join: keep only plant/timestamp pairs present in both tables.
combined_data = gen_agg.merge(weather_agg, how='inner', on=merge_keys)


In [51]:
# Basic feature engineering: calendar features derived from the timestamp.
# assign() overwrites these columns if they already exist, so re-running the
# cell gives the same result.
combined_data = combined_data.assign(
    hour=combined_data['DATE_TIME'].dt.hour,
    day_of_week=combined_data['DATE_TIME'].dt.dayofweek,
    month=combined_data['DATE_TIME'].dt.month,
)

# PLANT_ID is used as-is when it is already numeric; otherwise it is replaced
# by integer category codes. The check uses pandas' own dtype helper because
# np.issubdtype raises TypeError on pandas extension dtypes (category, string,
# nullable integers), i.e. on the very inputs this branch exists for.
if not pd.api.types.is_numeric_dtype(combined_data['PLANT_ID']):
    combined_data['PLANT_ID'] = combined_data['PLANT_ID'].astype('category').cat.codes


In [52]:
# Train only on rows with positive irradiation (treated here as daylight).
# The copy keeps later edits to train_data from touching combined_data.
is_daylight = combined_data['IRRADIATION'] > 0
train_data = combined_data.loc[is_daylight].copy()

# Model inputs: weather readings, calendar features and the plant identifier.
feature_cols = [
    'AMBIENT_TEMPERATURE',
    'MODULE_TEMPERATURE',
    'IRRADIATION',
    'hour',
    'day_of_week',
    'month',
    'PLANT_ID',
]
# Target: DC power at the same plant and timestamp.
X, y = train_data[feature_cols], train_data['DC_POWER']


In [53]:
# Hold out 20% of the daylight rows at random; the fixed seed makes the split
# repeatable.
# NOTE(review): a random split of time-ordered rows can place test rows between
# training rows from the same day, so the score below may be optimistic for
# true forecasting - confirm this is the intended evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random forest with 200 trees, a fixed seed, and all available cores.
forest_params = dict(n_estimators=200, random_state=42, n_jobs=-1)
model = RandomForestRegressor(**forest_params).fit(X_train, y_train)

# Score the held-out rows.
y_pred = model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print("\nClean Model Performance (daylight only):")
print(f"Mean Squared Error: {mse:.2f}")
print(f"R2 Score: {r2:.4f}")



Clean Model Performance (daylight only):
Mean Squared Error: 33696739.65
R2 Score: 0.9950


In [54]:
# Persist the fitted daylight-only model to the working directory with joblib.
from joblib import dump

model_path = 'dc_power_model_daylight.joblib'
with open(model_path, mode='wb') as model_file:
    dump(model, model_file)
print("Model saved to dc_power_model_daylight.joblib")


Model saved to dc_power_model_daylight.joblib
