In [1]:
import catboost as cb
import numpy as np
import pandas as pd
import seaborn as sns
import shap
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import MinMaxScaler
import tqdm as nootbook_tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df=pd.read_csv("sunny_cluster.csv")
df.head()

Unnamed: 0,Clearsky DHI,GHI,cluster,Temperature,Clearsky DNI,Relative Humidity,Wind Speed,DNI,Cloud Type,Clearsky GHI,DHI,Solar Zenith Angle
0,108,403,1,14.5,889,40.65,3.4,241,7,618,265,54.95
1,114,702,1,15.7,920,36.14,3.7,920,0,702,114,50.2
2,115,715,1,16.4,923,33.37,3.9,923,0,715,115,49.41
3,111,657,1,16.7,901,31.61,4.0,901,1,657,111,52.76
4,100,533,1,16.6,854,30.95,4.0,854,0,533,100,59.55


In [3]:
df.shape

(2872, 12)

In [4]:
df.isnull().sum()

Clearsky DHI          0
GHI                   0
cluster               0
Temperature           0
Clearsky DNI          0
Relative Humidity     0
Wind Speed            0
DNI                   0
Cloud Type            0
Clearsky GHI          0
DHI                   0
Solar Zenith Angle    0
dtype: int64

In [5]:
df.drop(['cluster'], axis = 1, inplace = True) 

In [6]:
df.describe()

Unnamed: 0,Clearsky DHI,GHI,Temperature,Clearsky DNI,Relative Humidity,Wind Speed,DNI,Cloud Type,Clearsky GHI,DHI,Solar Zenith Angle
count,2872.0,2872.0,2872.0,2872.0,2872.0,2872.0,2872.0,2872.0,2872.0,2872.0,2872.0
mean,196.06929,732.918872,28.169394,744.552228,23.098088,4.163022,722.517409,0.430362,744.193593,201.089833,40.970286
std,70.114563,193.575924,8.048195,134.986357,11.223323,1.540044,164.395314,1.591959,189.422882,73.647434,15.695106
min,79.0,320.0,7.5,162.0,4.46,0.3,17.0,0.0,358.0,79.0,5.21
25%,148.0,579.0,21.2,666.0,15.2975,3.1,644.0,0.0,600.0,149.0,30.105
50%,181.0,734.0,29.5,770.0,20.28,4.2,760.0,0.0,746.0,184.0,44.48
75%,225.0,899.25,35.2,844.0,29.01,5.2,839.0,0.0,908.0,233.0,53.1425
max,520.0,1086.0,44.0,1008.0,79.39,8.7,1008.0,8.0,1086.0,520.0,66.84


In [7]:
s1= MinMaxScaler(feature_range=(0,1))
X= s1.fit_transform(df)

In [8]:
s2=MinMaxScaler(feature_range=(0,1))
y= s2.fit_transform(df[['GHI']])

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42,shuffle=False)

In [10]:
train_dataset = cb.Pool(X_train, y_train) 
test_dataset = cb.Pool(X_test, y_test)

In [11]:
model = cb.CatBoostRegressor(loss_function="RMSE")

In [12]:
grid = {'iterations': [200,300,400],
        'learning_rate': [0.03, 0.1,0.01],
        'depth': [2, 4, 6, 8],
        'l2_leaf_reg': [0.2, 0.5, 1, 3]}
model.grid_search(grid, train_dataset)

0:	learn: 0.6046382	test: 0.6250659	best: 0.6250659 (0)	total: 128ms	remaining: 25.5s
1:	learn: 0.5874107	test: 0.6076356	best: 0.6076356 (1)	total: 129ms	remaining: 12.8s
2:	learn: 0.5709241	test: 0.5910203	best: 0.5910203 (2)	total: 131ms	remaining: 8.62s
3:	learn: 0.5546474	test: 0.5745620	best: 0.5745620 (3)	total: 132ms	remaining: 6.48s
4:	learn: 0.5387157	test: 0.5584554	best: 0.5584554 (4)	total: 133ms	remaining: 5.19s
5:	learn: 0.5233661	test: 0.5428260	best: 0.5428260 (5)	total: 134ms	remaining: 4.33s
6:	learn: 0.5084395	test: 0.5272827	best: 0.5272827 (6)	total: 135ms	remaining: 3.71s
7:	learn: 0.4941275	test: 0.5127879	best: 0.5127879 (7)	total: 136ms	remaining: 3.25s
8:	learn: 0.4797493	test: 0.4978339	best: 0.4978339 (8)	total: 136ms	remaining: 2.89s
9:	learn: 0.4662591	test: 0.4837936	best: 0.4837936 (9)	total: 137ms	remaining: 2.6s
10:	learn: 0.4530085	test: 0.4701857	best: 0.4701857 (10)	total: 138ms	remaining: 2.37s
11:	learn: 0.4402788	test: 0.4572648	best: 0.4572648 

{'params': {'depth': 6,
  'iterations': 400,
  'learning_rate': 0.1,
  'l2_leaf_reg': 0.2},
 'cv_results': defaultdict(list,
             {'iterations': [0,
               1,
               2,
               3,
               4,
               5,
               6,
               7,
               8,
               9,
               10,
               11,
               12,
               13,
               14,
               15,
               16,
               17,
               18,
               19,
               20,
               21,
               22,
               23,
               24,
               25,
               26,
               27,
               28,
               29,
               30,
               31,
               32,
               33,
               34,
               35,
               36,
               37,
               38,
               39,
               40,
               41,
               42,
               43,
               44,
               4

In [13]:
import time
start=time.time()
model.fit(X_train,y_train)
end=time.time()
continental=end-start

0:	learn: 0.2363291	total: 8.21ms	remaining: 3.27s
1:	learn: 0.2138586	total: 13ms	remaining: 2.59s
2:	learn: 0.1951961	total: 19.4ms	remaining: 2.57s
3:	learn: 0.1770214	total: 25.2ms	remaining: 2.49s
4:	learn: 0.1603269	total: 29.8ms	remaining: 2.35s
5:	learn: 0.1456958	total: 36.7ms	remaining: 2.41s
6:	learn: 0.1321823	total: 40.9ms	remaining: 2.3s
7:	learn: 0.1194624	total: 45.2ms	remaining: 2.21s
8:	learn: 0.1084782	total: 51.2ms	remaining: 2.23s
9:	learn: 0.0987483	total: 55.3ms	remaining: 2.16s
10:	learn: 0.0900186	total: 59.5ms	remaining: 2.1s
11:	learn: 0.0817782	total: 63.9ms	remaining: 2.07s
12:	learn: 0.0744076	total: 70ms	remaining: 2.08s
13:	learn: 0.0676682	total: 74.2ms	remaining: 2.04s
14:	learn: 0.0616364	total: 78.8ms	remaining: 2.02s
15:	learn: 0.0562732	total: 85.1ms	remaining: 2.04s
16:	learn: 0.0512388	total: 89.5ms	remaining: 2.02s
17:	learn: 0.0466891	total: 94.5ms	remaining: 2s
18:	learn: 0.0425991	total: 102ms	remaining: 2.04s
19:	learn: 0.0390313	total: 106m

In [14]:
import time 
start=time.time()
pred = model.predict(X_test)
end=time.time()
continental=end-start
print("inference time:",continental)


inference time: 0.011999368667602539


In [15]:
pred

array([0.69277719, 0.78453221, 0.77869462, 0.6755697 , 0.4813399 ,
       0.20622499, 0.2384351 , 0.5004269 , 0.67507447, 0.76006558,
       0.7579662 , 0.65358057, 0.45548423, 0.18294455, 0.220655  ,
       0.48063442, 0.66707264, 0.75819886, 0.75540281, 0.64701043,
       0.44258544, 0.18381175, 0.24846886, 0.50176159, 0.67676685,
       0.77689048, 0.75872361, 0.65444897, 0.45718032, 0.18454991,
       0.22408505, 0.48280172, 0.66772173, 0.75881924, 0.75328264,
       0.64005344, 0.44145207, 0.17715626, 0.21460348, 0.47461341,
       0.65775639, 0.7410088 , 0.7310972 , 0.62718679, 0.42165973,
       0.15672213, 0.20546417, 0.47511523, 0.64772347, 0.74367149,
       0.73095105, 0.62285068, 0.42308285, 0.15443256, 0.1987223 ,
       0.46371248, 0.63971742, 0.72663488, 0.71506231, 0.61213289,
       0.40533077, 0.13832644, 0.19500376, 0.44292384, 0.62569104,
       0.7135474 , 0.69312697, 0.5886344 , 0.38726958, 0.12523284,
       0.18382826, 0.43701135, 0.61645702, 0.70331987, 0.69239

In [16]:
prediction = pred.reshape(pred.shape[0], 1)

In [17]:
prediction.shape

(575, 1)

In [18]:
Pred= s2.inverse_transform(prediction)

In [19]:
Actual= s2.inverse_transform(y_test)

In [20]:
Calculated = pd.DataFrame(Pred, columns = ['Prediction'])

In [21]:
Calculated.to_csv(r'E:\CatBoost_1\Johannesburg\catboost\sunny hours\prediction.csv', index = False)

In [22]:
Actual = pd.DataFrame(Actual, columns = ['Actual'])

In [23]:
Actual.to_csv(r'E:\CatBoost_1\Johannesburg\catboost\sunny hours\Actual.csv', index = False)

In [24]:
x=Pred
z=Actual

In [25]:
from sklearn.metrics import mean_squared_error
import math

 
MSE = mean_squared_error(z,x)
 
RMSE = math.sqrt(MSE)
from sklearn.metrics import mean_absolute_error
MAE=mean_absolute_error(z,x)
MAE
from sklearn.metrics import mean_absolute_percentage_error
MAPE=mean_absolute_percentage_error(z,x)
MAPE

print("Root Mean Square Error:",RMSE)
print("Mean Square Error:", MSE)
print("Mean Absolute Error:",MAE)
print("Mean Absolute Percentage Error:",MAPE)

Root Mean Square Error: 3.4905461329714162
Mean Square Error: 12.183912306401709
Mean Absolute Error: 2.4084364957325426
Mean Absolute Percentage Error: 0.004034312440543788


In [26]:
max= x.max()
max

920.9516717748666

In [27]:
min=x.min()
min

380.0680836701633

In [28]:
NRMSE= (RMSE/(max-min))*100
print("Normalized Root Mean Square Error:",NRMSE)

Normalized Root Mean Square Error: 0.6453414763798903
