In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read
# through the regular filesystem. Relies on `drive`, imported from
# google.colab in the first cell, so this cell needs a Colab runtime.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Path of the input CSV on the mounted shared drive; it is read into `df`
# by the next cell. The Drive mount at /content/drive must already exist.
file_path = '/content/drive/Shareddrives/DATA245_Project/SolarIrradiance_processed.csv'

In [None]:
import pandas as pd
import numpy as np
# Load the CSV at `file_path` into the working DataFrame `df`.
df = pd.read_csv(file_path)

In [None]:
# Preview the first rows to sanity-check the columns that were loaded.
df.head()

Unnamed: 0,Year,Month,Day,Hour,Minute,Temperature,Clearsky DHI,Clearsky DNI,Clearsky GHI,Cloud Type,...,DHI,DNI,GHI,Relative Humidity,Solar Zenith Angle,Pressure,Precipitable Water,Wind Speed,zipcode,hour_day_part
0,2018,1,1,0,30,10.4,0,0,0,4,...,0,0,0,96.33,165.11,1024,1.7,0.7,95110,late_night_start_of_day
1,2018,1,1,1,30,10.2,0,0,0,0,...,0,0,0,96.98,157.82,1024,1.8,0.7,95110,late_night_start_of_day
2,2018,1,1,2,30,9.9,0,0,0,0,...,0,0,0,97.66,146.99,1024,1.8,0.7,95110,late_night_start_of_day
3,2018,1,1,3,30,9.7,0,0,0,0,...,0,0,0,97.31,135.26,1024,1.8,0.7,95110,late_night_start_of_day
4,2018,1,1,4,30,9.4,0,0,0,0,...,0,0,0,97.68,123.34,1025,1.8,0.8,95110,late_night_start_of_day


In [None]:
# Count the null entries in every column and print the per-column totals.
null_counts = df.isna().sum()
print("Missing values:\n", null_counts)

Missing values:
 Year                  0
Month                 0
Day                   0
Hour                  0
Minute                0
Temperature           0
Clearsky DHI          0
Clearsky DNI          0
Clearsky GHI          0
Cloud Type            0
Dew Point             0
DHI                   0
DNI                   0
GHI                   0
Relative Humidity     0
Solar Zenith Angle    0
Pressure              0
Precipitable Water    0
Wind Speed            0
zipcode               0
hour_day_part         0
dtype: int64


In [None]:
# Per-day-part summary of the GHI column: how many rows each `hour_day_part`
# value has, how many of those rows have GHI == 0, and what percentage that is.
def _zero_count(ghi):
    """Return the number of entries in `ghi` that are exactly zero."""
    return (ghi == 0).sum()


def _zero_percentage(ghi):
    """Return the percentage (0-100) of entries in `ghi` that are exactly zero."""
    return (_zero_count(ghi) / len(ghi)) * 100


ghi_by_day_part = df.groupby('hour_day_part')['GHI']

# Named aggregation: each keyword is the output column name.
# The dict is unpacked because 'Row count' contains a space.
solar_summary_df = ghi_by_day_part.agg(**{
    'Row count': 'count',
    'Count_of_Zeros': _zero_count,
    'Percentage_of_Zeros': _zero_percentage,
}).reset_index()

# Show the summary table
print(solar_summary_df)

             hour_day_part  Row count  Count_of_Zeros  Percentage_of_Zeros
0                afternoon     219120               0             0.000000
1            early_morning     164340           50910            30.978459
2              early_night     109560          109560           100.000000
3                  evening     109560           14341            13.089631
4             late_evening     109560           76457            69.785506
5               late_night     109560          109560           100.000000
6  late_night_start_of_day     273900          273900           100.000000
7                  morning     219120               0             0.000000


In [None]:
# Filtering data - remove every day part in which more than 50% of the GHI
# readings are 0 (the "dark" day parts).
# The list of day parts is derived from the data instead of being typed in by
# hand, so it cannot drift from the rule above if the input file changes.
DARK_ZERO_SHARE_THRESHOLD = 0.5  # fraction of zero-GHI rows above which a day part is dropped

# Fraction of rows with GHI == 0 for each hour_day_part value
zero_ghi_share = df['GHI'].eq(0).groupby(df['hour_day_part']).mean()

# Day-part labels whose zero share exceeds the threshold
darkvalues_to_remove = zero_ghi_share[zero_ghi_share > DARK_ZERO_SHARE_THRESHOLD].index.tolist()

# Keep only the rows whose day part is not in the dark list
solar_ir_df_nondark = df[~df['hour_day_part'].isin(darkvalues_to_remove)]

# Display the filtered DataFrame
solar_ir_df_nondark.head()

Unnamed: 0,Year,Month,Day,Hour,Minute,Temperature,Clearsky DHI,Clearsky DNI,Clearsky GHI,Cloud Type,...,DHI,DNI,GHI,Relative Humidity,Solar Zenith Angle,Pressure,Precipitable Water,Wind Speed,zipcode,hour_day_part
5,2018,1,1,5,30,9.3,0,0,0,0,...,0,0,0,96.84,111.57,1025,1.8,0.7,95110,early_morning
6,2018,1,1,6,30,9.1,0,0,0,0,...,0,0,0,96.7,100.18,1025,1.8,0.7,95110,early_morning
7,2018,1,1,7,30,9.6,10,87,12,0,...,10,87,12,92.3,89.01,1026,1.8,0.7,95110,early_morning
8,2018,1,1,8,30,11.0,47,541,145,7,...,66,113,87,85.28,79.54,1026,1.8,0.9,95110,morning
9,2018,1,1,9,30,13.0,63,730,298,4,...,121,292,215,77.34,71.18,1026,1.8,1.3,95110,morning


In [None]:
# (rows, columns) of the frame that remains after the dark day parts were removed.
solar_ir_df_nondark.shape

(712140, 21)

In [None]:
# Percentage of the loaded rows that the dark-day-part filter removed.
# The row counts are read from the frames themselves rather than hard-coded,
# so the figure stays correct if the input data or the filter changes.
rows_before_filter = len(df)
rows_after_filter = len(solar_ir_df_nondark)
solar_dark_missing_share = ((rows_before_filter - rows_after_filter) * 100) / rows_before_filter
print(f"Share of data that was filtered out = {solar_dark_missing_share}")

Share of data that was filtered out = 45.833333333333336


In [None]:
# Information about the filtered dataset: index range, per-column non-null
# counts and dtypes.
solar_ir_df_nondark.info()

<class 'pandas.core.frame.DataFrame'>
Index: 712140 entries, 5 to 1314713
Data columns (total 21 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   Year                712140 non-null  int64  
 1   Month               712140 non-null  int64  
 2   Day                 712140 non-null  int64  
 3   Hour                712140 non-null  int64  
 4   Minute              712140 non-null  int64  
 5   Temperature         712140 non-null  float64
 6   Clearsky DHI        712140 non-null  int64  
 7   Clearsky DNI        712140 non-null  int64  
 8   Clearsky GHI        712140 non-null  int64  
 9   Cloud Type          712140 non-null  int64  
 10  Dew Point           712140 non-null  float64
 11  DHI                 712140 non-null  int64  
 12  DNI                 712140 non-null  int64  
 13  GHI                 712140 non-null  int64  
 14  Relative Humidity   712140 non-null  float64
 15  Solar Zenith Angle  712140 non-null  f

In [None]:
# One-hot encode several categorical columns in a single get_dummies call.
# Each listed column is replaced by indicator columns prefixed with the
# column's own name; dtype='int' makes the indicators integers (0/1) instead
# of booleans.
columns_to_encode = ['Cloud Type', 'zipcode', 'hour_day_part']

# Both names refer to the same encoded frame.
solar_ir_df_nondark_encoded = encoded_df = pd.get_dummies(
    solar_ir_df_nondark,
    columns=columns_to_encode,
    prefix=columns_to_encode,
    dtype='int',
)

# Preview the first ten encoded rows
solar_ir_df_nondark_encoded.head(10)


Unnamed: 0,Year,Month,Day,Hour,Minute,Temperature,Clearsky DHI,Clearsky DNI,Clearsky GHI,Dew Point,...,zipcode_95136,zipcode_95138,zipcode_95139,zipcode_95141,zipcode_95148,zipcode_95192,hour_day_part_afternoon,hour_day_part_early_morning,hour_day_part_evening,hour_day_part_morning
5,2018,1,1,5,30,9.3,0,0,0,8.8,...,0,0,0,0,0,0,0,1,0,0
6,2018,1,1,6,30,9.1,0,0,0,8.6,...,0,0,0,0,0,0,0,1,0,0
7,2018,1,1,7,30,9.6,10,87,12,8.4,...,0,0,0,0,0,0,0,1,0,0
8,2018,1,1,8,30,11.0,47,541,145,8.6,...,0,0,0,0,0,0,0,0,0,1
9,2018,1,1,9,30,13.0,63,730,298,9.1,...,0,0,0,0,0,0,0,0,0,1
10,2018,1,1,10,30,15.1,68,821,418,9.3,...,0,0,0,0,0,0,0,0,0,1
11,2018,1,1,11,30,16.8,72,859,488,8.9,...,0,0,0,0,0,0,0,0,0,1
12,2018,1,1,12,30,18.1,75,857,498,8.2,...,0,0,0,0,0,0,1,0,0,0
13,2018,1,1,13,30,18.6,74,824,448,7.8,...,0,0,0,0,0,0,1,0,0,0
14,2018,1,1,14,30,18.3,67,764,347,7.6,...,0,0,0,0,0,0,1,0,0,0


In [None]:
# Drop the Minute column: per the original note it is constant across all
# rows (every previewed row shows 30).
solar_ir_df_nondark_encoded = solar_ir_df_nondark_encoded.drop('Minute', axis=1)

In [None]:
# List the dtype of every column to confirm the encoded columns are numeric.
solar_ir_df_nondark_encoded.dtypes

Year                             int64
Month                            int64
Day                              int64
Hour                             int64
Temperature                    float64
Clearsky DHI                     int64
Clearsky DNI                     int64
Clearsky GHI                     int64
Dew Point                      float64
DHI                              int64
DNI                              int64
GHI                              int64
Relative Humidity              float64
Solar Zenith Angle             float64
Pressure                         int64
Precipitable Water             float64
Wind Speed                     float64
Cloud Type_0                     int64
Cloud Type_2                     int64
Cloud Type_3                     int64
Cloud Type_4                     int64
Cloud Type_5                     int64
Cloud Type_6                     int64
Cloud Type_7                     int64
Cloud Type_8                     int64
Cloud Type_9             

In [None]:
# Preview the first five rows of the encoded frame (Minute no longer present).
solar_ir_df_nondark_encoded.head(5)

Unnamed: 0,Year,Month,Day,Hour,Temperature,Clearsky DHI,Clearsky DNI,Clearsky GHI,Dew Point,DHI,...,zipcode_95136,zipcode_95138,zipcode_95139,zipcode_95141,zipcode_95148,zipcode_95192,hour_day_part_afternoon,hour_day_part_early_morning,hour_day_part_evening,hour_day_part_morning
5,2018,1,1,5,9.3,0,0,0,8.8,0,...,0,0,0,0,0,0,0,1,0,0
6,2018,1,1,6,9.1,0,0,0,8.6,0,...,0,0,0,0,0,0,0,1,0,0
7,2018,1,1,7,9.6,10,87,12,8.4,10,...,0,0,0,0,0,0,0,1,0,0
8,2018,1,1,8,11.0,47,541,145,8.6,66,...,0,0,0,0,0,0,0,0,0,1
9,2018,1,1,9,13.0,63,730,298,9.1,121,...,0,0,0,0,0,0,0,0,0,1


In [None]:
# Remove the clear-sky irradiance columns, DHI and the solar zenith angle from
# the frame before it is split into features and target.
# - The frame is reassigned instead of being mutated with inplace=True, so the
#   cell no longer changes an object in place behind earlier cells' backs.
# - errors="ignore" lets the cell be re-run without a KeyError once the columns
#   are already gone.
# NOTE(review): DNI is still kept as a feature while DHI is dropped. Confirm
# this is intended, since DNI is another irradiance reading recorded alongside
# the GHI target and may leak it.
columns_to_drop = ["Clearsky GHI", "Clearsky DHI", "Clearsky DNI", "DHI", "Solar Zenith Angle"]
solar_ir_df_nondark_encoded = solar_ir_df_nondark_encoded.drop(columns=columns_to_drop, errors="ignore")

**Split data into testing and training data**

In [None]:
from sklearn.model_selection import train_test_split

# GHI is the prediction target; every other remaining column is a feature.
y = solar_ir_df_nondark_encoded['GHI']
X = solar_ir_df_nondark_encoded.drop(columns=['GHI'])

# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
# NOTE(review): the rows are shuffled at random although the data has
# Year/Month/Day/Hour columns. Confirm a random (non-chronological) split is
# what is wanted.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

**Feature Scaling**

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, so nothing
# about the test set influences the scaler.
scaler = StandardScaler()
scaler.fit(X_train)

# Apply the same fitted scaling to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

**Training the model**

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Initialize the Random Forest regressor.
# - n_estimators=100 trees; random_state=42 makes the fit reproducible.
# - n_jobs=-1 builds the trees on all available CPU cores. The fitted model is
#   unchanged because it is seeded by random_state; this only shortens training.
# Other hyperparameters (max_depth, etc.) are left at their defaults and can be tuned.
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Train the Random Forest regressor on the scaled training features
rf_regressor.fit(X_train_scaled, y_train)

**Predictions on testing data**

In [None]:
# Make predictions on the test set, using the scaled test features produced by
# the scaler that was fitted on the training data.
y_pred = rf_regressor.predict(X_test_scaled)

**Model Performance**

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the held-out predictions against the true test targets:
# mean squared error and the coefficient of determination.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Print one "label value" line per metric
for metric_label, metric_value in (
    ("Mean Squared Error (MSE):", mse),
    ("R-squared (R^2):", r2),
):
    print(metric_label, metric_value)

Mean Squared Error (MSE): 57.212883404948464
R-squared (R^2): 0.999427081732272
