In [1]:
import catboost as cb
import numpy as np
import pandas as pd
import seaborn as sns
import shap
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import MinMaxScaler
import tqdm as nootbook_tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df=pd.read_csv("cloudy_cluster.csv")
df.head()

Unnamed: 0,Clearsky DHI,GHI,cluster,Temperature,Clearsky DNI,Clearsky GHI,Cloud Type,DHI,DNI,Solar Zenith Angle,Relative Humidity,Wind Speed
0,0,0,0,15.0,0,0,1,0,0,159.92,66.92,7.6
1,0,0,0,14.9,0,0,0,0,0,147.83,66.95,7.5
2,0,0,0,14.7,0,0,3,0,0,135.28,68.04,7.5
3,0,0,0,14.6,0,0,4,0,0,122.73,68.98,7.4
4,0,0,0,14.5,0,0,4,0,0,110.4,69.59,7.2


In [3]:
df.shape

(6288, 12)

In [4]:
df.isnull().sum()

Clearsky DHI          0
GHI                   0
cluster               0
Temperature           0
Clearsky DNI          0
Clearsky GHI          0
Cloud Type            0
DHI                   0
DNI                   0
Solar Zenith Angle    0
Relative Humidity     0
Wind Speed            0
dtype: int64

In [5]:
df.drop(['cluster'], axis = 1, inplace = True) 

In [6]:
df.describe()

Unnamed: 0,Clearsky DHI,GHI,Temperature,Clearsky DNI,Clearsky GHI,Cloud Type,DHI,DNI,Solar Zenith Angle,Relative Humidity,Wind Speed
count,6288.0,6288.0,6288.0,6288.0,6288.0,6288.0,6288.0,6288.0,6288.0,6288.0,6288.0
mean,28.285146,40.410305,19.479183,119.157602,72.788486,1.729644,25.330312,56.171597,108.201081,70.975967,4.498012
std,55.863768,82.722314,5.528927,223.7707,152.820705,2.604531,52.199988,141.571441,29.7379,11.312134,2.165923
min,0.0,0.0,9.1,0.0,0.0,0.0,0.0,0.0,12.68,24.87,0.3
25%,0.0,0.0,14.6,0.0,0.0,0.0,0.0,0.0,84.565,63.82,2.9
50%,0.0,0.0,18.3,0.0,0.0,0.0,0.0,0.0,109.035,71.96,4.2
75%,36.0,28.0,24.9,113.0,50.0,3.0,25.0,0.0,130.165,78.76,5.8
max,463.0,354.0,34.0,922.0,919.0,9.0,297.0,772.0,169.3,100.0,12.6


In [7]:
s1= MinMaxScaler(feature_range=(0,1))
X= s1.fit_transform(df)

In [8]:
s2=MinMaxScaler(feature_range=(0,1))
y= s2.fit_transform(df[['GHI']])

In [9]:
X_train,X_test,y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42,shuffle=False)

In [10]:
train_dataset = cb.Pool(X_train, y_train) 
test_dataset = cb.Pool(X_test, y_test)

In [11]:
model = cb.CatBoostRegressor(iterations=400,depth=4,learning_rate=0.1,l2_leaf_reg=0.2)

In [12]:
import time
start=time.time()
model.fit(X_train,y_train)
end=time.time()
continental=end-start

0:	learn: 0.2102741	total: 151ms	remaining: 1m
1:	learn: 0.1902687	total: 154ms	remaining: 30.7s
2:	learn: 0.1728810	total: 157ms	remaining: 20.8s
3:	learn: 0.1568396	total: 160ms	remaining: 15.8s
4:	learn: 0.1428154	total: 165ms	remaining: 13s
5:	learn: 0.1292792	total: 168ms	remaining: 11s
6:	learn: 0.1169045	total: 170ms	remaining: 9.56s
7:	learn: 0.1063447	total: 173ms	remaining: 8.47s
8:	learn: 0.0963979	total: 177ms	remaining: 7.69s
9:	learn: 0.0872937	total: 181ms	remaining: 7.07s
10:	learn: 0.0790468	total: 184ms	remaining: 6.51s
11:	learn: 0.0717365	total: 187ms	remaining: 6.03s
12:	learn: 0.0656118	total: 189ms	remaining: 5.63s
13:	learn: 0.0595590	total: 195ms	remaining: 5.37s
14:	learn: 0.0543771	total: 198ms	remaining: 5.08s
15:	learn: 0.0494439	total: 201ms	remaining: 4.82s
16:	learn: 0.0451035	total: 203ms	remaining: 4.58s
17:	learn: 0.0410554	total: 207ms	remaining: 4.39s
18:	learn: 0.0377000	total: 216ms	remaining: 4.33s
19:	learn: 0.0344161	total: 219ms	remaining: 4.1

In [13]:
import time
start=time.time()
pred = model.predict(X_test)
end=time.time()
continental=end-start
print("inference time:",continental)


inference time: 0.015988826751708984


In [14]:
pred.shape

(1258,)

In [15]:
prediction = pred.reshape(pred.shape[0], 1)

In [16]:
prediction.shape

(1258, 1)

In [17]:
Pred= s2.inverse_transform(prediction)

In [18]:
Actual= s2.inverse_transform(y_test)

In [19]:
Calculated = pd.DataFrame(Pred, columns = ['Prediction'])

In [20]:
Calculated.to_csv(r'E:\CatBoost_1\literature_comparison\Libya\catboost\cloudy hours\prediction.csv', index = False)

In [21]:
Actual = pd.DataFrame(Actual, columns = ['Actual'])

In [22]:
Actual.to_csv(r'E:\CatBoost_1\literature_comparison\Libya\catboost\cloudy hours\Actual.csv', index = False)

In [23]:
x= Pred
z= Actual

In [24]:
from sklearn.metrics import mean_squared_error
import math

 
MSE = mean_squared_error(z,x)
 
RMSE = math.sqrt(MSE)
from sklearn.metrics import mean_absolute_error
MAE=mean_absolute_error(z,x)
MAE
print("Root Mean Square Error:",RMSE)
print("Mean Square Error:", MSE)
print("Mean Absolute Error:",MAE)

Root Mean Square Error: 1.0227842336658457
Mean Square Error: 1.0460875886354313
Mean Absolute Error: 0.4778726739536978


In [25]:
max= x.max()
max

344.0011133908716

In [26]:
min=x.min()
min

-0.13384847535958533

In [27]:
NRMSE= (RMSE/(max-min))*100
print("Normalized Root Mean Square Error:",NRMSE)

Normalized Root Mean Square Error: 0.29720439565899515
