In [2]:
# ============================================================
# ☀️ SOLAR POWER PREDICTION PROJECT — ALL WEEKS
# AICTE–Shell Skills4Future Internship | Theme: Energy
# Author: Supratik Mitra
# ============================================================

# -----------------------------
# WEEK 1: Data Loading & Preprocessing
# -----------------------------

import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

# Load datasets: per-plant generation readings and weather-sensor readings
gen = pd.read_csv('../data/Plant_1_Generation_Data.csv')
weather = pd.read_csv('../data/Plant_1_Weather_Sensor_Data.csv')

# Parse the timestamp columns so both frames share a datetime64 join key.
# NOTE(review): the saved output of this cell shows pandas emitting a warning on
# the generation-file parse. Presumably those timestamps are written day-first
# (DD-MM-YYYY ...) - confirm against the CSV. Passing dayfirst=True states the
# day/month order explicitly instead of leaving it to inference, so ambiguous
# dates such as 01-06-2020 cannot be read as January 6th and then fail to line
# up with the weather timestamps in the join below.
gen['DATE_TIME'] = pd.to_datetime(gen['DATE_TIME'], dayfirst=True)
weather['DATE_TIME'] = pd.to_datetime(weather['DATE_TIME'])

# Merge data: the inner join keeps only timestamps present in both files.
# Any non-key column that exists in both frames comes back with _x / _y suffixes.
data = gen.merge(weather, on='DATE_TIME', how='inner')

# Extract useful features: calendar components of the reading timestamp,
# added in one step as Hour, Day and Month columns
data = data.assign(
    Hour=lambda frame: frame['DATE_TIME'].dt.hour,
    Day=lambda frame: frame['DATE_TIME'].dt.day,
    Month=lambda frame: frame['DATE_TIME'].dt.month,
)

# Drop unnecessary columns
drop_cols = ['PLANT_ID', 'SOURCE_KEY', 'DATE_TIME']

# The frame was built by merging on DATE_TIME only. If PLANT_ID / SOURCE_KEY
# exist in both inputs (presumably they do - confirm against the CSV headers),
# pandas renames them to <name>_x / <name>_y. An exact-name drop with
# errors='ignore' would then silently remove nothing but DATE_TIME, and the
# leftover identifier columns would be coerced to NaN by the numeric
# conversion and written to the "cleaned" CSV. Match the suffixed variants too.
merge_suffixes = ('', '_x', '_y')
cols_to_drop = [
    col for col in data.columns
    if any(col == base + suffix for base in drop_cols for suffix in merge_suffixes)
]
# Only names actually present are listed, so no errors='ignore' is needed.
data = data.drop(columns=cols_to_drop)

# Handle missing values
# Step 1: coerce every column to a numeric dtype; entries that cannot be parsed
#         become NaN (errors='coerce').
# Step 2: replace each NaN with the mean of its own column.
data = (
    data
    .apply(pd.to_numeric, errors='coerce')
    .pipe(lambda frame: frame.fillna(frame.mean(numeric_only=True)))
)


# Save cleaned data for reference
# to_csv does not create missing parent folders, so make sure ../results exists
# before writing (a no-op when the directory is already there).
os.makedirs('../results', exist_ok=True)
data.to_csv('../results/cleaned_data.csv', index=False)

# -----------------------------
# WEEK 2: Model Building (Commented for Week 1)
# -----------------------------
# Uncomment for Week 2

# features = ['AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE', 'IRRADIATION', 'Hour', 'Day', 'Month']
# target_dc = 'DC_POWER'
# target_ac = 'AC_POWER'
# X = data[features]
# y_dc = data[target_dc]
# y_ac = data[target_ac]

# X_train, X_test, y_dc_train, y_dc_test = train_test_split(X, y_dc, test_size=0.2, random_state=42)
# X_train2, X_test2, y_ac_train, y_ac_test = train_test_split(X, y_ac, test_size=0.2, random_state=42)

# rf_dc = RandomForestRegressor(n_estimators=120, random_state=42)
# rf_ac = RandomForestRegressor(n_estimators=120, random_state=42)
# rf_dc.fit(X_train, y_dc_train)
# rf_ac.fit(X_train2, y_ac_train)

# joblib.dump(rf_dc, '../models/dc_rf_model.pkl')
# joblib.dump(rf_ac, '../models/ac_rf_model.pkl')

# -----------------------------
# WEEK 3: Evaluation & Visualization (Commented for Week 1–2)
# -----------------------------
# Uncomment for Week 3

# y_dc_pred = rf_dc.predict(X_test)
# y_ac_pred = rf_ac.predict(X_test2)

# print("DC Power R2:", r2_score(y_dc_test, y_dc_pred))
# print("AC Power R2:", r2_score(y_ac_test, y_ac_pred))
# print("DC MAE:", mean_absolute_error(y_dc_test, y_dc_pred))
# print("AC MAE:", mean_absolute_error(y_ac_test, y_ac_pred))

# plt.figure(figsize=(8,6))
# sns.scatterplot(x=y_dc_test, y=y_dc_pred)
# plt.title('DC Power: Actual vs Predicted')
# plt.xlabel('Actual DC Power')
# plt.ylabel('Predicted DC Power')
# plt.savefig('../results/predictions_vs_actual_dc.png')

# plt.figure(figsize=(8,6))
# sns.barplot(x=rf_dc.feature_importances_, y=features)
# plt.title('Feature Importance for DC Power')
# plt.savefig('../results/feature_importances.png')

# -----------------------------
# WEEK 4: Optimization & Export (Commented for Week 1–3)
# -----------------------------
# Uncomment for Week 4

# from sklearn.model_selection import GridSearchCV

# param_grid = {
#     'n_estimators': [100, 150],
#     'max_depth': [None, 10, 20],
# }

# grid_rf = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=3, n_jobs=-1)
# grid_rf.fit(X_train, y_dc_train)
# print("Best Parameters:", grid_rf.best_params_)
# joblib.dump(grid_rf.best_estimator_, '../models/final_dc_model.pkl')

# Final status line for the Week 1 stage (the only stage currently uncommented)
week1_status = "Week 1 complete: Data preprocessed and ready for modeling."
print(week1_status)


  gen['DATE_TIME'] = pd.to_datetime(gen['DATE_TIME'])


Week 1 complete: Data preprocessed and ready for modeling.
