In [13]:
import catboost as cb
import numpy as np
import pandas as pd
import seaborn as sns
import shap
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import MinMaxScaler
import tqdm as nootbook_tqdm


In [14]:
df=pd.read_csv("cloudy_cluster.csv")
df.head()

Unnamed: 0,Clearsky DHI,GHI,cluster,Temperature,Clearsky DNI,Relative Humidity,Wind Speed,DNI,Cloud Type,Clearsky GHI,DHI,Solar Zenith Angle
0,0,0,0,5.0,0,63.52,1.4,0,0,0,0,174.62
1,0,0,0,4.7,0,65.31,1.4,0,0,0,0,161.76
2,0,0,0,6.2,0,75.81,1.3,0,0,0,0,148.34
3,0,0,0,5.6,0,79.06,1.3,0,0,0,0,134.93
4,0,0,0,5.0,0,82.37,1.4,0,0,0,0,121.67


In [15]:
df.shape

(5888, 12)

In [16]:
df.isnull().sum()

Clearsky DHI          0
GHI                   0
cluster               0
Temperature           0
Clearsky DNI          0
Relative Humidity     0
Wind Speed            0
DNI                   0
Cloud Type            0
Clearsky GHI          0
DHI                   0
Solar Zenith Angle    0
dtype: int64

In [17]:
df.drop(['cluster'], axis = 1, inplace = True) 

In [18]:
df.describe()

Unnamed: 0,Clearsky DHI,GHI,Temperature,Clearsky DNI,Relative Humidity,Wind Speed,DNI,Cloud Type,Clearsky GHI,DHI,Solar Zenith Angle
count,5888.0,5888.0,5888.0,5888.0,5888.0,5888.0,5888.0,5888.0,5888.0,5888.0,5888.0
mean,22.804008,42.034137,18.194124,82.918648,40.696788,2.417833,71.039402,0.744565,48.393682,21.630605,113.522045
std,49.770999,92.543872,8.220768,180.425605,19.206628,1.116259,164.58538,2.040437,109.771611,46.143014,29.11622
min,0.0,0.0,-0.8,0.0,6.1,0.2,0.0,0.0,0.0,0.0,19.63
25%,0.0,0.0,11.075,0.0,26.63,1.6,0.0,0.0,0.0,0.0,88.8575
50%,0.0,0.0,18.7,0.0,36.66,2.2,0.0,0.0,0.0,0.0,114.565
75%,1.25,1.25,24.8,0.0,51.41,3.1,0.0,0.0,1.25,1.25,134.8425
max,480.0,390.0,39.4,912.0,100.0,7.8,802.0,8.0,901.0,313.0,174.72


In [19]:
s1= MinMaxScaler(feature_range=(0,1))
X= s1.fit_transform(df)

In [20]:
s2=MinMaxScaler(feature_range=(0,1))
y= s2.fit_transform(df[['GHI']])

In [21]:
X_train,X_test,y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42,shuffle=False)

In [22]:
train_dataset = cb.Pool(X_train, y_train) 
test_dataset = cb.Pool(X_test, y_test)

In [23]:
model = cb.CatBoostRegressor(loss_function="RMSE")

In [24]:
grid = {'iterations': [200,300,400],
        'learning_rate': [0.03, 0.1,0.01],
        'depth': [2, 4, 6, 8],
        'l2_leaf_reg': [0.2, 0.5, 1, 3]}
model.grid_search(grid, train_dataset)

0:	learn: 0.2449626	test: 0.2592153	best: 0.2592153 (0)	total: 2.5ms	remaining: 497ms
1:	learn: 0.2385360	test: 0.2527211	best: 0.2527211 (1)	total: 3.84ms	remaining: 381ms
2:	learn: 0.2318643	test: 0.2458160	best: 0.2458160 (2)	total: 4.91ms	remaining: 322ms
3:	learn: 0.2254882	test: 0.2393635	best: 0.2393635 (3)	total: 5.96ms	remaining: 292ms
4:	learn: 0.2191949	test: 0.2328370	best: 0.2328370 (4)	total: 7.02ms	remaining: 274ms
5:	learn: 0.2131931	test: 0.2266061	best: 0.2266061 (5)	total: 8.09ms	remaining: 262ms
6:	learn: 0.2073070	test: 0.2204535	best: 0.2204535 (6)	total: 9.17ms	remaining: 253ms
7:	learn: 0.2018028	test: 0.2148729	best: 0.2148729 (7)	total: 10.2ms	remaining: 246ms
8:	learn: 0.1962553	test: 0.2088792	best: 0.2088792 (8)	total: 11.3ms	remaining: 241ms
9:	learn: 0.1907132	test: 0.2030888	best: 0.2030888 (9)	total: 12.4ms	remaining: 236ms
10:	learn: 0.1857412	test: 0.1979791	best: 0.1979791 (10)	total: 13.6ms	remaining: 234ms
11:	learn: 0.1806543	test: 0.1928186	best:

{'params': {'depth': 4,
  'iterations': 400,
  'learning_rate': 0.1,
  'l2_leaf_reg': 0.5},
 'cv_results': defaultdict(list,
             {'iterations': [0,
               1,
               2,
               3,
               4,
               5,
               6,
               7,
               8,
               9,
               10,
               11,
               12,
               13,
               14,
               15,
               16,
               17,
               18,
               19,
               20,
               21,
               22,
               23,
               24,
               25,
               26,
               27,
               28,
               29,
               30,
               31,
               32,
               33,
               34,
               35,
               36,
               37,
               38,
               39,
               40,
               41,
               42,
               43,
               44,
               4

In [25]:
import time
start=time.time()
model.fit(X_train,y_train)
end=time.time()
continental=end-start

0:	learn: 0.2095794	total: 3.52ms	remaining: 1.4s
1:	learn: 0.1894997	total: 7.75ms	remaining: 1.54s
2:	learn: 0.1713788	total: 13.6ms	remaining: 1.8s
3:	learn: 0.1558423	total: 16.1ms	remaining: 1.59s
4:	learn: 0.1412682	total: 18.5ms	remaining: 1.46s
5:	learn: 0.1279257	total: 21.1ms	remaining: 1.38s
6:	learn: 0.1156767	total: 23.5ms	remaining: 1.32s
7:	learn: 0.1051973	total: 29.1ms	remaining: 1.43s
8:	learn: 0.0954013	total: 31.5ms	remaining: 1.37s
9:	learn: 0.0864354	total: 34ms	remaining: 1.33s
10:	learn: 0.0781545	total: 36.4ms	remaining: 1.29s
11:	learn: 0.0707166	total: 38.8ms	remaining: 1.25s
12:	learn: 0.0644977	total: 43.1ms	remaining: 1.28s
13:	learn: 0.0587227	total: 45.5ms	remaining: 1.25s
14:	learn: 0.0533955	total: 47.9ms	remaining: 1.23s
15:	learn: 0.0483744	total: 50.3ms	remaining: 1.21s
16:	learn: 0.0439643	total: 52.6ms	remaining: 1.19s
17:	learn: 0.0398978	total: 55.2ms	remaining: 1.17s
18:	learn: 0.0362876	total: 59.3ms	remaining: 1.19s
19:	learn: 0.0331123	total

In [26]:
import time
start=time.time()
pred = model.predict(X_test)
end=time.time()
continental=end-start
print("inference time:",continental)

inference time: 0.023965835571289062


In [27]:
pred

array([-6.93385323e-05, -5.73666237e-05, -1.71653913e-05, ...,
        8.48960779e-05, -3.23839034e-04, -3.94607204e-05])

In [28]:
pred.shape

(1178,)

In [29]:
prediction = pred.reshape(pred.shape[0], 1)

In [30]:
prediction.shape

(1178, 1)

In [31]:
Pred= s2.inverse_transform(prediction)

In [32]:
Actual= s2.inverse_transform(y_test)

In [33]:
Calculated = pd.DataFrame(Pred, columns = ['Prediction'])

In [34]:
Calculated.to_csv(r'E:\CatBoost_1\Johannesburg\catboost\cloudy hours\prediction.csv', index = False)

In [35]:
Actual = pd.DataFrame(Actual, columns = ['Actual'])

In [36]:
Actual.to_csv(r'E:\CatBoost_1\Johannesburg\catboost\cloudy hours\Actual.csv', index = False)

In [37]:
x=Pred
z=Actual

In [38]:
from sklearn.metrics import mean_squared_error
import math

 
MSE = mean_squared_error(z,x)
 
RMSE = math.sqrt(MSE)
from sklearn.metrics import mean_absolute_error
MAE=mean_absolute_error(z,x)
MAE
print("Root Mean Square Error:",RMSE)
print("Mean Square Error:", MSE)
print("Mean Absolute Error:",MAE)

Root Mean Square Error: 1.3153264306605903
Mean Square Error: 1.7300836191943285
Mean Absolute Error: 0.5204448345098367


In [39]:
max= x.max()
max

385.96292957984684

In [40]:
min=x.min()
min

-0.18452579076220013

In [41]:
NRMSE= (RMSE/(max-min))*100
print("Normalized Root Mean Square Error:",NRMSE)

Normalized Root Mean Square Error: 0.34062801978021373
