In [1]:
import catboost as cb
import numpy as np
import pandas as pd
import seaborn as sns
import shap
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import MinMaxScaler
import tqdm as nootbook_tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df=pd.read_csv("sunny_cluster.csv")
df.head()

Unnamed: 0,GHI,Clearsky DHI,cluster,DHI,DNI,Wind Speed,Relative Humidity,Temperature,Clearsky DNI,Clearsky GHI,Cloud Type,Solar Zenith Angle
0,427,50,0,60,894,4.7,54.23,3.0,959,444,0,65.75
1,394,48,0,48,933,4.3,55.89,2.3,938,396,0,68.2
2,370,48,0,54,882,2.9,50.8,-0.9,947,387,0,69.01
3,438,53,0,55,956,2.9,48.99,-0.4,969,442,0,66.36
4,484,60,0,60,980,2.9,47.64,0.1,980,484,0,64.37


In [3]:
df.shape

(4885, 12)

In [4]:
df.isnull().sum()

GHI                   0
Clearsky DHI          0
cluster               0
DHI                   0
DNI                   0
Wind Speed            0
Relative Humidity     0
Temperature           0
Clearsky DNI          0
Clearsky GHI          0
Cloud Type            0
Solar Zenith Angle    0
dtype: int64

In [5]:
df.drop(['cluster'], axis = 1, inplace = True) 

In [6]:
df.describe()

Unnamed: 0,GHI,Clearsky DHI,DHI,DNI,Wind Speed,Relative Humidity,Temperature,Clearsky DNI,Clearsky GHI,Cloud Type,Solar Zenith Angle
count,4885.0,4885.0,4885.0,4885.0,4885.0,4885.0,4885.0,4885.0,4885.0,4885.0,4885.0
mean,616.202456,91.901331,156.200614,726.331832,2.717994,27.690878,16.832487,919.222723,693.633982,1.891709,47.676669
std,192.619786,40.769207,110.079159,284.966113,1.321444,13.683449,10.539028,102.613445,198.179709,2.796603,14.64044
min,310.0,37.0,37.0,23.0,0.2,5.98,-12.6,286.0,321.0,0.0,16.25
25%,455.0,68.0,74.0,522.0,1.8,16.59,8.3,879.0,522.0,0.0,36.12
50%,588.0,84.0,105.0,837.0,2.6,25.57,18.4,945.0,688.0,0.0,49.73
75%,763.0,102.0,221.0,960.0,3.4,36.5,26.0,987.0,864.0,4.0,60.36
max,1060.0,377.0,512.0,1092.0,8.7,77.97,34.1,1092.0,1065.0,9.0,72.26


In [7]:
s1= MinMaxScaler(feature_range=(0,1))
X= s1.fit_transform(df)

In [8]:
s2=MinMaxScaler(feature_range=(0,1))
y= s2.fit_transform(df[['GHI']])

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42,shuffle=False)

In [10]:
train_dataset = cb.Pool(X_train, y_train) 
test_dataset = cb.Pool(X_test, y_test)

In [11]:
model = cb.CatBoostRegressor(loss_function="RMSE")

In [12]:
grid = {'iterations': [200,300,400],
        'learning_rate': [0.03, 0.1,0.01],
        'depth': [2, 4, 6, 8],
        'l2_leaf_reg': [0.2, 0.5, 1, 3]}
model.grid_search(grid, train_dataset)

0:	learn: 0.5022699	test: 0.4972209	best: 0.4972209 (0)	total: 144ms	remaining: 28.6s
1:	learn: 0.4893146	test: 0.4843859	best: 0.4843859 (1)	total: 146ms	remaining: 14.5s
2:	learn: 0.4756210	test: 0.4708736	best: 0.4708736 (2)	total: 153ms	remaining: 10s
3:	learn: 0.4621281	test: 0.4575190	best: 0.4575190 (3)	total: 189ms	remaining: 9.28s
4:	learn: 0.4487424	test: 0.4441706	best: 0.4441706 (4)	total: 193ms	remaining: 7.54s
5:	learn: 0.4362391	test: 0.4318615	best: 0.4318615 (5)	total: 198ms	remaining: 6.4s
6:	learn: 0.4237568	test: 0.4194626	best: 0.4194626 (6)	total: 205ms	remaining: 5.64s
7:	learn: 0.4121140	test: 0.4078917	best: 0.4078917 (7)	total: 206ms	remaining: 4.94s
8:	learn: 0.4005955	test: 0.3964577	best: 0.3964577 (8)	total: 207ms	remaining: 4.39s
9:	learn: 0.3895647	test: 0.3857912	best: 0.3857912 (9)	total: 208ms	remaining: 3.96s
10:	learn: 0.3789690	test: 0.3752473	best: 0.3752473 (10)	total: 209ms	remaining: 3.6s
11:	learn: 0.3687276	test: 0.3651728	best: 0.3651728 (11

{'params': {'depth': 6,
  'iterations': 400,
  'learning_rate': 0.1,
  'l2_leaf_reg': 0.2},
 'cv_results': defaultdict(list,
             {'iterations': [0,
               1,
               2,
               3,
               4,
               5,
               6,
               7,
               8,
               9,
               10,
               11,
               12,
               13,
               14,
               15,
               16,
               17,
               18,
               19,
               20,
               21,
               22,
               23,
               24,
               25,
               26,
               27,
               28,
               29,
               30,
               31,
               32,
               33,
               34,
               35,
               36,
               37,
               38,
               39,
               40,
               41,
               42,
               43,
               44,
               4

In [13]:
import time
start=time.time()
model.fit(X_train,y_train)
end=time.time()
continental=end-start
print(continental)

0:	learn: 0.2408168	total: 13.2ms	remaining: 5.26s
1:	learn: 0.2188475	total: 24.7ms	remaining: 4.91s
2:	learn: 0.1986812	total: 32.9ms	remaining: 4.35s
3:	learn: 0.1808056	total: 42.7ms	remaining: 4.23s
4:	learn: 0.1639926	total: 48.1ms	remaining: 3.8s
5:	learn: 0.1492650	total: 55.7ms	remaining: 3.66s
6:	learn: 0.1353379	total: 60.8ms	remaining: 3.41s
7:	learn: 0.1231914	total: 65.6ms	remaining: 3.21s
8:	learn: 0.1120270	total: 73.7ms	remaining: 3.2s
9:	learn: 0.1020606	total: 78.9ms	remaining: 3.08s
10:	learn: 0.0929378	total: 83.8ms	remaining: 2.96s
11:	learn: 0.0848451	total: 92.6ms	remaining: 2.99s
12:	learn: 0.0769877	total: 97.4ms	remaining: 2.9s
13:	learn: 0.0698454	total: 105ms	remaining: 2.88s
14:	learn: 0.0638503	total: 110ms	remaining: 2.83s
15:	learn: 0.0584321	total: 115ms	remaining: 2.76s
16:	learn: 0.0533430	total: 122ms	remaining: 2.76s
17:	learn: 0.0486020	total: 128ms	remaining: 2.71s
18:	learn: 0.0443474	total: 134ms	remaining: 2.68s
19:	learn: 0.0404119	total: 139

In [14]:

import time
start=time.time()
pred = model.predict(X_test)
end=time.time()
continental=end-start
print("Inference time:",continental)

Inference time: 0.015987396240234375


In [15]:
pred

array([0.49326768, 0.45665752, 0.47399406, 0.5036565 , 0.50441751,
       0.49755686, 0.40326977, 0.29412296, 0.16955628, 0.03949724,
       0.09156456, 0.16473735, 0.25812327, 0.35667595, 0.41065906,
       0.44363875, 0.44031851, 0.41526013, 0.445581  , 0.44157739,
       0.48521395, 0.47168683, 0.36196244, 0.24718702, 0.15666702,
       0.03021517, 0.06762721, 0.11831656, 0.21183331, 0.25895824,
       0.36141078, 0.42293976, 0.40604666, 0.24355331, 0.15432891,
       0.21474959, 0.1658537 , 0.14688087, 0.15161528, 0.07172069,
       0.10253152, 0.1671079 , 0.19586441, 0.05490077, 0.10060902,
       0.22749003, 0.35331136, 0.45682959, 0.5518635 , 0.61830367,
       0.67010644, 0.69957006, 0.70120822, 0.68145566, 0.63552255,
       0.57447201, 0.49794842, 0.40346304, 0.2818525 , 0.15383553,
       0.09218108, 0.21764065, 0.33802194, 0.43882094, 0.52335148,
       0.58678442, 0.63696405, 0.66500068, 0.67214086, 0.65000945,
       0.61959055, 0.55462898, 0.47062273, 0.37019064, 0.25885

In [16]:
prediction = pred.reshape(pred.shape[0], 1)

In [17]:
prediction.shape

(977, 1)

In [18]:
Pred= s2.inverse_transform(prediction)

In [19]:
Actual= s2.inverse_transform(y_test)

In [20]:
x= Pred
z= Actual

In [21]:
from sklearn.metrics import mean_squared_error
import math

 
MSE = mean_squared_error(z,x)
 
RMSE = math.sqrt(MSE)
from sklearn.metrics import mean_absolute_error
MAE=mean_absolute_error(z,x)
MAE
print("Root Mean Square Error:",RMSE)
print("Mean Square Error:", MSE)
print("Mean Absolute Error:",MAE)


Root Mean Square Error: 3.7057986821247493
Mean Square Error: 13.732943872437529
Mean Absolute Error: 2.3655956913520986


In [22]:
max= x.max()
min=x.min()
NRMSE= (RMSE/(max-min))*100
print("max:",max)
print("min:",min)
print("Normalized Root Mean Square Error:",NRMSE)

max: 835.9061622740368
min: 326.84078748034597
Normalized Root Mean Square Error: 0.7279612532332611


In [23]:
Calculated = pd.DataFrame(Pred, columns = ['Prediction'])

In [24]:
Calculated.to_csv(r'E:\CatBoost_1\Golden\catboost\sunny hours\preddiction.csv', index = False)

In [25]:
Actual = pd.DataFrame(Actual, columns = ['Actual'])

In [26]:
Actual.to_csv(r'E:\CatBoost_1\Golden\catboost\sunny hours\Actual.csv', index = False)