In [1]:
import catboost as cb
import numpy as np
import pandas as pd
import seaborn as sns
import shap
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import MinMaxScaler
import tqdm as nootbook_tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df=pd.read_csv("cloudy_cluster.csv")
df.head()

Unnamed: 0,GHI,Clearsky DHI,cluster,DHI,DNI,Wind Speed,Relative Humidity,Temperature,Clearsky DNI,Clearsky GHI,Cloud Type,Solar Zenith Angle
0,0,0,1,0,0,5.6,50.05,-5.4,0,0,7,163.33
1,0,0,1,0,0,5.7,49.67,-5.3,0,0,7,162.48
2,0,0,1,0,0,5.8,52.92,-5.1,0,0,8,159.58
3,0,0,1,0,0,6.0,52.45,-5.0,0,0,0,155.37
4,0,0,1,0,0,6.1,55.58,-4.9,0,0,0,150.41


In [3]:
df.shape

(12635, 12)

In [4]:
df.isnull().sum()

GHI                   0
Clearsky DHI          0
cluster               0
DHI                   0
DNI                   0
Wind Speed            0
Relative Humidity     0
Temperature           0
Clearsky DNI          0
Clearsky GHI          0
Cloud Type            0
Solar Zenith Angle    0
dtype: int64

In [5]:
df.drop(['cluster'], axis = 1, inplace = True) 

In [6]:
df.describe()

Unnamed: 0,GHI,Clearsky DHI,DHI,DNI,Wind Speed,Relative Humidity,Temperature,Clearsky DNI,Clearsky GHI,Cloud Type,Solar Zenith Angle
count,12635.0,12635.0,12635.0,12635.0,12635.0,12635.0,12635.0,12635.0,12635.0,12635.0,12635.0
mean,45.259201,18.276138,23.818283,90.733043,2.091619,55.478055,4.675797,191.764306,83.672893,2.69355,105.951741
std,85.973604,35.184072,48.906084,203.054985,1.157924,17.844177,9.852023,316.693066,177.238869,3.077329,28.252325
min,0.0,0.0,0.0,0.0,0.2,7.91,-19.5,0.0,0.0,0.0,16.25
25%,0.0,0.0,0.0,0.0,1.3,42.36,-3.4,0.0,0.0,0.0,84.465
50%,0.0,0.0,0.0,0.0,1.8,56.69,3.4,0.0,0.0,0.0,107.28
75%,46.0,29.0,29.0,29.0,2.7,67.66,13.0,359.0,71.0,6.0,126.155
max,332.0,339.0,295.0,949.0,8.8,98.91,32.1,1021.0,1025.0,9.0,163.76


In [7]:
s1= MinMaxScaler(feature_range=(0,1))
X= s1.fit_transform(df)

In [8]:
s2=MinMaxScaler(feature_range=(0,1))
y= s2.fit_transform(df[['GHI']])

In [9]:
X_train,X_test,y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42,shuffle=False)

In [10]:
train_dataset = cb.Pool(X_train, y_train) 
test_dataset = cb.Pool(X_test, y_test)

In [11]:
model = cb.CatBoostRegressor(loss_function="RMSE")

In [12]:
grid = {'iterations': [200,300,400],
        'learning_rate': [0.03, 0.1,0.01],
        'depth': [2, 4, 6, 8],
        'l2_leaf_reg': [0.2, 0.5, 1, 3]}
model.grid_search(grid, train_dataset)

0:	learn: 0.2886542	test: 0.2879951	best: 0.2879951 (0)	total: 119ms	remaining: 23.7s
1:	learn: 0.2805869	test: 0.2799524	best: 0.2799524 (1)	total: 122ms	remaining: 12s
2:	learn: 0.2727144	test: 0.2719779	best: 0.2719779 (2)	total: 124ms	remaining: 8.14s
3:	learn: 0.2650847	test: 0.2642945	best: 0.2642945 (3)	total: 126ms	remaining: 6.16s
4:	learn: 0.2579120	test: 0.2569545	best: 0.2569545 (4)	total: 128ms	remaining: 4.97s
5:	learn: 0.2510782	test: 0.2499155	best: 0.2499155 (5)	total: 129ms	remaining: 4.17s
6:	learn: 0.2441542	test: 0.2428853	best: 0.2428853 (6)	total: 131ms	remaining: 3.6s
7:	learn: 0.2374470	test: 0.2360623	best: 0.2360623 (7)	total: 132ms	remaining: 3.17s
8:	learn: 0.2310403	test: 0.2295289	best: 0.2295289 (8)	total: 134ms	remaining: 2.84s
9:	learn: 0.2248269	test: 0.2231870	best: 0.2231870 (9)	total: 136ms	remaining: 2.58s
10:	learn: 0.2185444	test: 0.2170425	best: 0.2170425 (10)	total: 137ms	remaining: 2.36s
11:	learn: 0.2126905	test: 0.2111821	best: 0.2111821 (1

{'params': {'depth': 4,
  'iterations': 400,
  'learning_rate': 0.1,
  'l2_leaf_reg': 0.2},
 'cv_results': defaultdict(list,
             {'iterations': [0,
               1,
               2,
               3,
               4,
               5,
               6,
               7,
               8,
               9,
               10,
               11,
               12,
               13,
               14,
               15,
               16,
               17,
               18,
               19,
               20,
               21,
               22,
               23,
               24,
               25,
               26,
               27,
               28,
               29,
               30,
               31,
               32,
               33,
               34,
               35,
               36,
               37,
               38,
               39,
               40,
               41,
               42,
               43,
               44,
               4

In [13]:
import time
start=time.time()
model.fit(X_train,y_train)
end=time.time()
continental=end-start

0:	learn: 0.2363498	total: 4.82ms	remaining: 1.92s
1:	learn: 0.2136752	total: 8.54ms	remaining: 1.7s
2:	learn: 0.1933572	total: 11.9ms	remaining: 1.58s
3:	learn: 0.1750169	total: 15.4ms	remaining: 1.53s
4:	learn: 0.1582005	total: 19.9ms	remaining: 1.57s
5:	learn: 0.1432542	total: 23.2ms	remaining: 1.53s
6:	learn: 0.1294389	total: 26.5ms	remaining: 1.49s
7:	learn: 0.1172227	total: 30ms	remaining: 1.47s
8:	learn: 0.1060996	total: 35.1ms	remaining: 1.53s
9:	learn: 0.0962706	total: 38.5ms	remaining: 1.5s
10:	learn: 0.0875062	total: 41.8ms	remaining: 1.48s
11:	learn: 0.0792289	total: 45ms	remaining: 1.46s
12:	learn: 0.0718165	total: 49.8ms	remaining: 1.48s
13:	learn: 0.0649474	total: 53.3ms	remaining: 1.47s
14:	learn: 0.0586862	total: 56.5ms	remaining: 1.45s
15:	learn: 0.0533312	total: 59.9ms	remaining: 1.44s
16:	learn: 0.0482966	total: 64ms	remaining: 1.44s
17:	learn: 0.0438003	total: 67.8ms	remaining: 1.44s
18:	learn: 0.0398281	total: 71ms	remaining: 1.42s
19:	learn: 0.0362036	total: 74.2

In [14]:
import time
start=time.time()
pred = model.predict(X_test)
end=time.time()
continental=end-start
print("inference time:",continental)


inference time: 0.048000335693359375


In [15]:
pred.shape

(2527,)

In [16]:
prediction = pred.reshape(pred.shape[0], 1)

In [17]:
prediction.shape

(2527, 1)

In [18]:
Pred= s2.inverse_transform(prediction)

In [19]:
Actual= s2.inverse_transform(y_test)

In [20]:
Calculated = pd.DataFrame(Pred, columns = ['Prediction'])

In [21]:
Calculated.to_csv(r'E:\CatBoost_1\Golden\catboost\cloudy hours\prediction.csv', index = False)

In [22]:
Actual = pd.DataFrame(Actual, columns = ['Actual'])

In [23]:
Actual.to_csv(r'E:\CatBoost_1\Golden\catboost\cloudy hours\Actual.csv', index = False)

In [31]:
x= Pred
z= Actual

In [32]:
from sklearn.metrics import mean_squared_error
import math

 
MSE = mean_squared_error(z,x)
 
RMSE = math.sqrt(MSE)
from sklearn.metrics import mean_absolute_error
MAE=mean_absolute_error(z,x)
MAE
print("Root Mean Square Error:",RMSE)
print("Mean Square Error:", MSE)
print("Mean Absolute Error:",MAE)

Root Mean Square Error: 0.8161500818389105
Mean Square Error: 0.6661009560856602
Mean Absolute Error: 0.32022778237581595


In [33]:
max= x.max()
max

329.23932653193566

In [34]:
min=x.min()
min

-0.5534506025794635

In [35]:
NRMSE= (RMSE/(max-min))*100
print("Normalized Root Mean Square Error:",NRMSE)

Normalized Root Mean Square Error: 0.2474736071936533
