## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Importing Data

In [2]:
# Load the training set and print its column / dtype / non-null summary
train_data = pd.read_csv("Data/train.csv")
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210240 entries, 0 to 210239
Data columns (total 16 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   Timestamp           210240 non-null  object 
 1   Temperature         210240 non-null  float64
 2   Dew Point           210240 non-null  float64
 3   Surface Albedo      210240 non-null  float64
 4   Pressure            210240 non-null  int64  
 5   Wind Direction      210240 non-null  int64  
 6   Wind Speed          210240 non-null  float64
 7   Clearsky DHI        210240 non-null  int64  
 8   Clearsky DNI        210240 non-null  int64  
 9   Clearsky GHI        210240 non-null  int64  
 10  Fill Flag           210240 non-null  int64  
 11  Ozone               210240 non-null  float64
 12  Cloud Type          210240 non-null  int64  
 13  Solar Zenith Angle  210240 non-null  float64
 14  Precipitable Water  210240 non-null  float64
 15  Relative Humidity   210240 non-nul

# Data Preprocessing

In [3]:
# Timestamp is read as plain strings (object dtype); parse it so the .dt accessor can be used
train_data['Timestamp'] = pd.to_datetime(train_data['Timestamp'])

In [4]:
# Derive calendar features from the parsed Timestamp column.
# Each feature is assigned exactly once; column order matches the original cell.
train_data['hour_of_day'] = train_data['Timestamp'].dt.hour
train_data['day_of_week'] = train_data['Timestamp'].dt.dayofweek  # Monday=0 ... Sunday=6
train_data['month_of_year'] = train_data['Timestamp'].dt.month
train_data['is_weekend'] = train_data['Timestamp'].dt.dayofweek >= 5  # Saturday or Sunday
# Season code: Dec-Feb -> 1, Mar-May -> 2, Jun-Aug -> 3, Sep-Nov -> 4
train_data['season'] = (train_data['Timestamp'].dt.month % 12 + 3) // 3
train_data['minute_of_hour'] = train_data['Timestamp'].dt.minute
train_data['second_of_minute'] = train_data['Timestamp'].dt.second

In [5]:
# The raw Timestamp column is dropped now that the calendar features have been derived from it
train_data = train_data.drop(columns=["Timestamp"])

In [6]:
# Split the training frame: the three Clearsky irradiance columns are the targets,
# every remaining column is used as a feature
y_train = train_data[['Clearsky DHI', 'Clearsky DNI', 'Clearsky GHI']]
X_train = train_data.drop(columns=y_train.columns)

In [None]:
# Baseline model: XGBoost regressor with the squared-error objective, all other settings at defaults
from xgboost import XGBRegressor
xgb_model = XGBRegressor(objective='reg:squarederror')

In [None]:
# Fit the baseline on the full training set; y_train carries three target columns
xgb_model.fit(X_train,y_train)

In [None]:
# In-sample predictions on the training features.
# NOTE(review): y_pred is reassigned in the RandomForest section and this value is not used in between.
y_pred = xgb_model.predict(X_train)

In [9]:
# Load the test set that predictions will be generated for
test_data = pd.read_csv("Data/test.csv")

In [10]:
# Convert the Timestamp column to datetime values, mirroring the training preprocessing
test_data["Timestamp"] = pd.to_datetime(test_data["Timestamp"])

In [11]:
# Derive the same calendar features as for the training data, in the same column order.
# Each feature is assigned exactly once.
test_data['hour_of_day'] = test_data['Timestamp'].dt.hour
test_data['day_of_week'] = test_data['Timestamp'].dt.dayofweek  # Monday=0 ... Sunday=6
test_data['month_of_year'] = test_data['Timestamp'].dt.month
test_data['is_weekend'] = test_data['Timestamp'].dt.dayofweek >= 5  # Saturday or Sunday
# Season code: Dec-Feb -> 1, Mar-May -> 2, Jun-Aug -> 3, Sep-Nov -> 4
test_data['season'] = (test_data['Timestamp'].dt.month % 12 + 3) // 3
test_data['minute_of_hour'] = test_data['Timestamp'].dt.minute
test_data['second_of_minute'] = test_data['Timestamp'].dt.second

In [12]:
# The raw Timestamp column is dropped now that the calendar features have been derived from it
test_data = test_data.drop(columns=["Timestamp"])

In [13]:
# Strip the three Clearsky target columns so only feature columns remain
X_test = test_data.drop(columns=['Clearsky DHI', 'Clearsky DNI', 'Clearsky GHI'])

In [None]:
# Predict the three targets for every test row with the baseline XGBoost model
y_test_pred = xgb_model.predict(X_test)

In [None]:
# Wrap the baseline predictions in a frame with one named column per target
submit_data = pd.DataFrame(
    data=y_test_pred,
    columns=['Clearsky DHI', 'Clearsky DNI', 'Clearsky GHI'],
)

In [None]:
# Write the submission file without the row index
submit_data.to_csv("submission_v9.csv", index=False)

# Hyperparameter Tuning

In [None]:
# Fresh, unfitted XGBRegressor that serves as the base estimator for the hyperparameter search
xgb = XGBRegressor(objective='reg:squarederror')

In [None]:
# Candidate values for each XGBoost hyperparameter explored by the search
param_grid = dict(
    n_estimators=[100, 500, 1000],
    max_depth=[3, 6, 9],
    learning_rate=[0.01, 0.1, 0.5],
)

In [None]:
# Define the hyperparameter search. Despite the variable name this is a RandomizedSearchCV:
# it samples candidates from param_grid (n_iter is left at its default) instead of trying
# every combination. random_state fixes which candidates are sampled so reruns are comparable.
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
grid_search = RandomizedSearchCV(
    xgb,
    param_grid,
    cv=3,
    verbose=3,
    n_jobs=-1,
    random_state=42,
)

In [None]:
# Run the randomized hyperparameter search on the training data
grid_search.fit(X_train, y_train)

In [None]:
# Report the best sampled hyperparameters, then their mean cross-validated score
print(grid_search.best_params_, grid_search.best_score_, sep="\n")

In [None]:
# Rebuild the regressor with fixed hyperparameters.
# NOTE(review): presumably copied from the search results above — confirm.
xgb = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=500,
    max_depth=9,
    learning_rate=0.01,
)

In [None]:
# Train the fixed-hyperparameter XGBoost model on the full training set
xgb.fit(X_train, y_train)

In [None]:
# Test-set predictions from the fixed-hyperparameter XGBoost model
y_pred_v1 = xgb.predict(X_test)

In [None]:
# Wrap the tuned-model predictions in a frame with one named column per target
submit_data = pd.DataFrame(
    data=y_pred_v1,
    columns=['Clearsky DHI', 'Clearsky DNI', 'Clearsky GHI'],
)

In [None]:
# Write the submission file without the row index
submit_data.to_csv("submission_v12.csv", index=False)

# RandomForest

In [7]:
from sklearn.ensemble import RandomForestRegressor

# Random forest model: 100 trees, depth capped at 50, seeded so repeated fits give the same forest
regression = RandomForestRegressor(n_estimators=100, max_depth=50, random_state=42)

In [8]:
# Fit the random forest on the full training set; y_train carries three target columns
regression.fit(X_train,y_train)

RandomForestRegressor(max_depth=50, random_state=42)

In [14]:
# Test-set predictions from the random forest.
# NOTE(review): this reuses the name y_pred, which the XGBoost section bound to training-set predictions.
y_pred = regression.predict(X_test)

In [15]:
# Wrap the random-forest predictions in a frame with one named column per target
submit_data = pd.DataFrame(
    data=y_pred,
    columns=['Clearsky DHI', 'Clearsky DNI', 'Clearsky GHI'],
)

In [16]:
# Write the submission file without the row index
submit_data.to_csv("submission_v15.csv", index=False)

In [17]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Sampling distributions for the RandomForest randomized search.
# scipy's randint(low, high) draws integers from low up to high - 1 (upper bound excluded).
params = {
    'max_depth': randint(1, 50),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3; for a regressor it meant
    # "consider all features", which is spelled 1.0 and is accepted by old and new versions alike.
    'max_features': [1.0, 'sqrt', 'log2']
}

In [None]:
# Randomized search over the RandomForest distributions in `params`.
# random_state pins which of the 100 candidates get sampled, so reruns are reproducible.
# Cost note: n_iter=100 with cv=5 means 500 forest fits plus the final refit on all data.
random_search = RandomizedSearchCV(
    regression,
    param_distributions=params,
    n_iter=100,
    cv=5,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)