In [1]:
import catboost as cb
import numpy as np
import pandas as pd
import seaborn as sns
import shap
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import MinMaxScaler
import tqdm as nootbook_tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df=pd.read_csv("sunny_cluster.csv")
df.head()

Unnamed: 0,Clearsky DHI,GHI,cluster,DHI,Temperature,Cloud Type,Clearsky GHI,Clearsky DNI,DNI,Relative Humidity,Solar Zenith Angle,Wind Speed
0,63,429,1,63,9.0,0,429,873,873,72.53,65.23,5.0
1,59,344,1,59,8.4,1,344,818,818,72.73,69.57,4.7
2,66,320,1,152,2.7,7,433,873,399,58.45,65.1,1.9
3,68,341,1,163,3.0,7,462,886,401,54.51,63.6,1.4
4,66,319,1,152,3.1,7,433,871,396,53.72,65.12,0.9


In [3]:
df.shape

(1981, 12)

In [4]:
df.isnull().sum()

Clearsky DHI          0
GHI                   0
cluster               0
DHI                   0
Temperature           0
Cloud Type            0
Clearsky GHI          0
Clearsky DNI          0
DNI                   0
Relative Humidity     0
Solar Zenith Angle    0
Wind Speed            0
dtype: int64

In [5]:
df.drop(['cluster'], axis = 1, inplace = True) 

In [6]:
df.describe()

Unnamed: 0,Clearsky DHI,GHI,DHI,Temperature,Cloud Type,Clearsky GHI,Clearsky DNI,DNI,Relative Humidity,Solar Zenith Angle,Wind Speed
count,1981.0,1981.0,1981.0,1981.0,1981.0,1981.0,1981.0,1981.0,1981.0,1981.0,1981.0
mean,115.208985,586.660777,178.380616,17.835689,2.164563,660.321555,816.114589,627.621908,63.967461,46.649788,2.454114
std,41.891897,184.122116,110.148419,9.771802,2.746931,182.629492,102.172979,274.995234,12.18157,14.395687,1.268989
min,50.0,298.0,50.0,-11.9,0.0,304.0,311.0,16.0,30.98,17.31,0.2
25%,83.0,426.0,91.0,10.7,0.0,511.0,759.0,409.0,56.41,34.58,1.5
50%,108.0,565.0,133.0,20.5,0.0,670.0,831.0,733.0,63.47,48.01,2.3
75%,139.0,735.0,261.0,26.0,4.0,820.0,893.0,860.0,72.17,58.9,3.2
max,354.0,1012.0,476.0,32.3,9.0,1012.0,1025.0,1025.0,96.75,70.94,8.2


In [7]:
s1= MinMaxScaler(feature_range=(0,1))
X= s1.fit_transform(df)

In [8]:
s2=MinMaxScaler(feature_range=(0,1))
y= s2.fit_transform(df[['GHI']])

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42,shuffle=False)

In [10]:
train_dataset = cb.Pool(X_train, y_train) 
test_dataset = cb.Pool(X_test, y_test)

In [11]:
model = cb.CatBoostRegressor(learning_rate=0.1,depth=6,l2_leaf_reg=0.2,iterations=400)

In [12]:
import time
start=time.time()
model.fit(X_train,y_train)
end=time.time()
continental=end-start
print(continental)

0:	learn: 0.2400655	total: 145ms	remaining: 57.8s
1:	learn: 0.2172435	total: 151ms	remaining: 30.1s
2:	learn: 0.1980144	total: 156ms	remaining: 20.7s
3:	learn: 0.1797459	total: 166ms	remaining: 16.5s
4:	learn: 0.1633118	total: 171ms	remaining: 13.5s
5:	learn: 0.1485099	total: 177ms	remaining: 11.6s
6:	learn: 0.1350366	total: 183ms	remaining: 10.3s
7:	learn: 0.1227794	total: 188ms	remaining: 9.21s
8:	learn: 0.1117859	total: 194ms	remaining: 8.44s
9:	learn: 0.1012343	total: 199ms	remaining: 7.77s
10:	learn: 0.0923185	total: 205ms	remaining: 7.27s
11:	learn: 0.0844551	total: 216ms	remaining: 6.98s
12:	learn: 0.0771849	total: 220ms	remaining: 6.54s
13:	learn: 0.0707093	total: 226ms	remaining: 6.24s
14:	learn: 0.0646065	total: 230ms	remaining: 5.91s
15:	learn: 0.0587491	total: 234ms	remaining: 5.62s
16:	learn: 0.0538514	total: 240ms	remaining: 5.41s
17:	learn: 0.0490508	total: 244ms	remaining: 5.19s
18:	learn: 0.0449077	total: 248ms	remaining: 4.98s
19:	learn: 0.0409013	total: 252ms	remaini

In [13]:

import time
start=time.time()
pred = model.predict(X_test)
end=time.time()
continental=end-start
print("Inference time:",continental)

Inference time: 0.00799417495727539


In [14]:
pred

array([0.29164852, 0.04836574, 0.10328704, 0.33250133, 0.18522566,
       0.21911918, 0.17402068, 0.41567301, 0.61136243, 0.71737085,
       0.47917517, 0.32798566, 0.32940354, 0.0477442 , 0.11706644,
       0.02206115, 0.02213357, 0.02892272, 0.11523567, 0.25039508,
       0.41544106, 0.672898  , 0.70239137, 0.66457289, 0.52970727,
       0.32372002, 0.0574447 , 0.31061131, 0.28512684, 0.03847288,
       0.53519452, 0.15194596, 0.41027364, 0.5889557 , 0.70823569,
       0.73775359, 0.67636822, 0.53628727, 0.33184296, 0.06712711,
       0.14265445, 0.3815053 , 0.5746815 , 0.68792507, 0.41346009,
       0.66371213, 0.52052473, 0.32023267, 0.05376593, 0.16683773,
       0.41204617, 0.60104743, 0.72220735, 0.7428835 , 0.68833166,
       0.54880469, 0.32944816, 0.06274552, 0.14765589, 0.39014049,
       0.41724179, 0.49220756, 0.70599796, 0.47083102, 0.3680475 ,
       0.28545436, 0.03193948, 0.11475847, 0.36414031, 0.41720453,
       0.64593983, 0.66385001, 0.62631502, 0.48128607, 0.27914

In [15]:
prediction = pred.reshape(pred.shape[0], 1)

In [16]:
prediction.shape

(397, 1)

In [17]:
Pred= s2.inverse_transform(prediction)

In [18]:
Actual= s2.inverse_transform(y_test)

In [19]:
x= Pred
z= Actual

In [20]:
from sklearn.metrics import mean_squared_error
import math

 
MSE = mean_squared_error(z,x)
 
RMSE = math.sqrt(MSE)
from sklearn.metrics import mean_absolute_error
MAE=mean_absolute_error(z,x)
MAE
print("Root Mean Square Error:",RMSE)
print("Mean Square Error:", MSE)
print("Mean Absolute Error:",MAE)


Root Mean Square Error: 3.451460523724207
Mean Square Error: 11.912579746826578
Mean Absolute Error: 2.3073132556874034


In [21]:
max= x.max()
min=x.min()
NRMSE= (RMSE/(max-min))*100
print("max:",max)
print("min:",min)
print("Normalized Root Mean Square Error:",NRMSE)

max: 828.4188198600633
min: 313.7516603810139
Normalized Root Mean Square Error: 0.6706199259377276


In [22]:
Calculated = pd.DataFrame(Pred, columns = ['Prediction'])

In [23]:
Calculated.to_csv(r'E:\CatBoost_1\literature_comparison\New York\catboost\sunny hours\preddiction.csv', index = False)

In [24]:
Actual = pd.DataFrame(Actual, columns = ['Actual'])

In [25]:
Actual.to_csv(r'E:\CatBoost_1\literature_comparison\New York\catboost\sunny hours\Actual.csv', index = False)