In [1]:
import catboost as cb
import numpy as np
import pandas as pd
import seaborn as sns
import shap
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import MinMaxScaler
import tqdm as nootbook_tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df=pd.read_csv("cloudy_cluster.csv")
df.head()

Unnamed: 0,Clearsky DHI,GHI,cluster,DHI,Temperature,Cloud Type,Clearsky GHI,Clearsky DNI,DNI,Relative Humidity,Solar Zenith Angle,Wind Speed
0,0,0,0,0,5.8,7,0,0,0,95.85,116.05,4.9
1,0,0,0,0,6.3,7,0,0,0,95.92,127.36,5.3
2,0,0,0,0,6.7,6,0,0,0,96.37,138.65,4.7
3,0,0,0,0,7.1,6,0,0,0,97.15,149.4,4.5
4,0,0,0,0,7.7,6,0,0,0,97.94,158.37,4.3


In [3]:
df.shape

(6779, 12)

In [4]:
df.isnull().sum()

Clearsky DHI          0
GHI                   0
cluster               0
DHI                   0
Temperature           0
Cloud Type            0
Clearsky GHI          0
Clearsky DNI          0
DNI                   0
Relative Humidity     0
Solar Zenith Angle    0
Wind Speed            0
dtype: int64

In [5]:
df.drop(['cluster'], axis = 1, inplace = True) 

In [6]:
df.describe()

Unnamed: 0,Clearsky DHI,GHI,DHI,Temperature,Cloud Type,Clearsky GHI,Clearsky DNI,DNI,Relative Humidity,Solar Zenith Angle,Wind Speed
count,6779.0,6779.0,6779.0,6779.0,6779.0,6779.0,6779.0,6779.0,6779.0,6779.0,6779.0
mean,27.33766,40.711314,27.674436,9.954949,3.929931,106.243546,195.33825,54.110636,86.057694,102.228298,2.7249
std,46.642371,75.913866,52.953529,9.33241,2.780282,198.798603,297.987786,149.946295,11.116897,29.900231,1.313591
min,0.0,0.0,0.0,-16.0,0.0,0.0,0.0,0.0,36.64,17.32,0.1
25%,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,79.195,79.275,1.8
50%,0.0,0.0,0.0,10.1,4.0,0.0,0.0,0.0,88.29,103.92,2.5
75%,49.0,43.0,34.0,18.2,7.0,135.0,435.0,0.0,95.4,124.01,3.4
max,263.0,316.0,281.0,30.8,9.0,937.0,953.0,849.0,100.0,162.65,8.5


In [7]:
s1= MinMaxScaler(feature_range=(0,1))
X= s1.fit_transform(df)

In [8]:
s2=MinMaxScaler(feature_range=(0,1))
y= s2.fit_transform(df[['GHI']])

In [9]:
X_train,X_test,y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42,shuffle=False)

In [10]:
train_dataset = cb.Pool(X_train, y_train) 
test_dataset = cb.Pool(X_test, y_test)

In [11]:
model = cb.CatBoostRegressor(iterations=400,depth=4,learning_rate=0.1,l2_leaf_reg=0.2)

In [12]:
import time
start=time.time()
model.fit(X_train,y_train)
end=time.time()
continental=end-start

0:	learn: 0.2203185	total: 131ms	remaining: 52.1s
1:	learn: 0.1993532	total: 133ms	remaining: 26.6s
2:	learn: 0.1802313	total: 136ms	remaining: 18s
3:	learn: 0.1639745	total: 140ms	remaining: 13.9s
4:	learn: 0.1487082	total: 145ms	remaining: 11.4s
5:	learn: 0.1347937	total: 148ms	remaining: 9.72s
6:	learn: 0.1218151	total: 151ms	remaining: 8.49s
7:	learn: 0.1104919	total: 154ms	remaining: 7.55s
8:	learn: 0.1002392	total: 161ms	remaining: 6.99s
9:	learn: 0.0910480	total: 164ms	remaining: 6.39s
10:	learn: 0.0823311	total: 167ms	remaining: 5.9s
11:	learn: 0.0746725	total: 170ms	remaining: 5.49s
12:	learn: 0.0681103	total: 175ms	remaining: 5.21s
13:	learn: 0.0619874	total: 178ms	remaining: 4.91s
14:	learn: 0.0564623	total: 181ms	remaining: 4.65s
15:	learn: 0.0514737	total: 184ms	remaining: 4.42s
16:	learn: 0.0469905	total: 191ms	remaining: 4.3s
17:	learn: 0.0427218	total: 194ms	remaining: 4.12s
18:	learn: 0.0390454	total: 197ms	remaining: 3.96s
19:	learn: 0.0355059	total: 200ms	remaining: 

In [13]:
import time
start=time.time()
pred = model.predict(X_test)
end=time.time()
continental=end-start
print("inference time:",continental)


inference time: 0.015992164611816406


In [14]:
pred.shape

(1356,)

In [15]:
prediction = pred.reshape(pred.shape[0], 1)

In [16]:
prediction.shape

(1356, 1)

In [17]:
Pred= s2.inverse_transform(prediction)

In [18]:
Actual= s2.inverse_transform(y_test)

In [19]:
Calculated = pd.DataFrame(Pred, columns = ['Prediction'])

In [20]:
Calculated.to_csv(r'E:\CatBoost_1\literature_comparison\New York\catboost\cloudy hours\prediction.csv', index = False)

In [21]:
Actual = pd.DataFrame(Actual, columns = ['Actual'])

In [22]:
Actual.to_csv(r'E:\CatBoost_1\literature_comparison\New York\catboost\cloudy hours\Actual.csv', index = False)

In [23]:
x= Pred
z= Actual

In [24]:
from sklearn.metrics import mean_squared_error
import math

 
MSE = mean_squared_error(z,x)
 
RMSE = math.sqrt(MSE)
from sklearn.metrics import mean_absolute_error
MAE=mean_absolute_error(z,x)
MAE
print("Root Mean Square Error:",RMSE)
print("Mean Square Error:", MSE)
print("Mean Absolute Error:",MAE)

Root Mean Square Error: 0.8643191082101102
Mean Square Error: 0.7470475208171201
Mean Absolute Error: 0.4080324345945646


In [25]:
max= x.max()
max

313.1929556662358

In [26]:
min=x.min()
min

-0.22549398548906296

In [27]:
NRMSE= (RMSE/(max-min))*100
print("Normalized Root Mean Square Error:",NRMSE)

Normalized Root Mean Square Error: 0.2757716111385766
