In [24]:
import catboost as cb
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [25]:
df=pd.read_csv("San Diego_featured_data.csv")
df.head()

Unnamed: 0,All sky irradiance,Clear sky irradiance,ALLSKY ICI,ALLSKY_SFC_PAR_TOT,ALLSKY_SFC_UV_INDEX,CLRSKY_SFC_PAR_TOT
0,3.58,3.68,0.7,63.57,0.55,66.2
1,3.2,3.6,0.62,57.87,0.53,65.5
2,2.52,3.46,0.49,47.44,0.5,64.25
3,2.93,3.48,0.57,55.34,0.49,65.82
4,0.78,3.42,0.15,16.23,0.21,64.96


In [26]:
df.shape

(1917, 6)

In [27]:
df.isnull().sum()

All sky irradiance      0
Clear sky irradiance    0
ALLSKY ICI              0
ALLSKY_SFC_PAR_TOT      0
ALLSKY_SFC_UV_INDEX     0
CLRSKY_SFC_PAR_TOT      0
dtype: int64

In [28]:
y= df[['All sky irradiance']]
scaler_X = StandardScaler()
X_scaled = scaler_X.fit_transform(df)

scaler_y = StandardScaler()
y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1)).flatten() 

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size = 0.2,shuffle=False)

In [30]:
train_dataset = cb.Pool(X_train, y_train) 
test_dataset = cb.Pool(X_test, y_test)

In [31]:
model = cb.CatBoostRegressor(learning_rate=0.1,depth=2,l2_leaf_reg=1,iterations=1500)

In [32]:
import time
start=time.time()
model.fit(X_train,y_train)
end=time.time()
continental=end-start
print("convergence time:",continental)

0:	learn: 0.9134141	total: 894us	remaining: 1.34s
1:	learn: 0.8416578	total: 1.25ms	remaining: 938ms
2:	learn: 0.7745282	total: 1.54ms	remaining: 769ms
3:	learn: 0.7165301	total: 1.86ms	remaining: 697ms
4:	learn: 0.6605078	total: 2.06ms	remaining: 617ms
5:	learn: 0.6102301	total: 2.31ms	remaining: 575ms
6:	learn: 0.5664314	total: 2.73ms	remaining: 582ms
7:	learn: 0.5243119	total: 3.03ms	remaining: 565ms
8:	learn: 0.4865797	total: 3.21ms	remaining: 531ms
9:	learn: 0.4549733	total: 3.34ms	remaining: 498ms
10:	learn: 0.4214188	total: 3.48ms	remaining: 472ms
11:	learn: 0.3929913	total: 3.62ms	remaining: 449ms
12:	learn: 0.3647427	total: 3.76ms	remaining: 431ms
13:	learn: 0.3406752	total: 3.9ms	remaining: 414ms
14:	learn: 0.3191715	total: 4.04ms	remaining: 400ms
15:	learn: 0.2986798	total: 4.19ms	remaining: 389ms
16:	learn: 0.2798691	total: 4.41ms	remaining: 384ms
17:	learn: 0.2624175	total: 4.59ms	remaining: 378ms
18:	learn: 0.2482287	total: 4.75ms	remaining: 370ms
19:	learn: 0.2353383	tot

In [33]:

import time
start=time.time()
pred = model.predict(X_test)
end=time.time()
continental=end-start
print("inference time:",continental)

inference time: 0.0018880367279052734


In [34]:
prediction = pred.reshape(pred.shape[0], 1)

In [35]:
prediction.shape

(384, 1)

In [36]:
predictions = scaler_y.inverse_transform(prediction.reshape(-1, 1))

In [37]:
Actual = scaler_y.inverse_transform(y_test.reshape(-1, 1))

In [38]:
Calculated =pd.DataFrame(predictions, columns = ['Prediction'])
Actual=pd.DataFrame(Actual, columns=["Actual"])

In [39]:
import os
import pandas as pd

# Concatenate Calculated and Actual DataFrames along the columns
combined = pd.concat([Calculated, Actual], axis=1)

# Define the path to the existing folder
save_dir = "/Users/ubaidahmed/Desktop/Daily_SI_forecasting /San Diego/Models/CatBoost"

# Define the filename within the specified folder
filename = os.path.join(save_dir, "CatBoost_output.csv")

# Save the combined DataFrame to CSV
combined.to_csv(filename, index=False)

print(f"File saved to {filename}")


File saved to /Users/ubaidahmed/Desktop/Daily_SI_forecasting /San Diego/Models/CatBoost/CatBoost_output.csv


# Error Calculation

In [2]:
import pandas as pd
import numpy as np

In [3]:
df=pd.read_csv("CatBoost_output.csv")
features= list(df)
features

['Prediction', 'Actual']

In [4]:
df=df*1000

In [5]:
z= df["Actual"]
x= df["Prediction"]

In [6]:
import math
from sklearn.metrics import mean_squared_error
MSE=mean_squared_error(z,x)
RMSE= math.sqrt(MSE)
from sklearn.metrics import mean_absolute_error
MAE= mean_absolute_error(z,x)
from sklearn.metrics import mean_absolute_percentage_error
MAPE= (mean_absolute_percentage_error(z,x))*100
print("Mean Square Error is:",MSE)
print("Root Mean Square Error is:",RMSE)
print("Mean Absolute Error is:",MAE)
print("Mean Absolute Percentage Error is:",MAPE)


Mean Square Error is: 1496.5252401751648
Root Mean Square Error is: 38.68494849647812
Mean Absolute Error is: 29.688379709470137
Mean Absolute Percentage Error is: 0.6263558479190551


In [7]:
max=x.max()
min=x.min()
mean=x.mean()

print("Max",max)
print("Min",min)
print('Mean',mean)

Max 8674.71062660431
Min 1165.7011093140688
Mean 5225.574061933184


In [8]:
NRMSE= (RMSE/(max-min))*100
print("Normalized Root Mean Square Error:",NRMSE)

Normalized Root Mean Square Error: 0.515180443005195


In [9]:
NRMSE= (RMSE/(mean))*100
print("Normalized Root Mean Square Error:",NRMSE)

Normalized Root Mean Square Error: 0.7403004538446204


In [11]:
relative_errors = (z - x) / z
squared_relative_errors = relative_errors ** 2
abs_relative_errors = np.abs(relative_errors)

In [12]:
msre = np.mean(squared_relative_errors)
rmsre = np.sqrt(msre)
mare = np.mean(abs_relative_errors)
rmspe = np.sqrt(np.mean((abs_relative_errors * 100) ** 2))

In [13]:
print("Mean Square Relative Error is:",msre)
print("Root Mean Square Relative Error is:",rmsre)
print("Mean Absolute Relative Error is:",mare)
print("Root Mean Square Percentage Error is:",rmspe)

Mean Square Relative Error is: 7.448569961730494e-05
Root Mean Square Relative Error is: 0.008630509812131896
Mean Absolute Relative Error is: 0.006263558479190551
Root Mean Square Percentage Error is: 0.8630509812131896
