In [62]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

In [63]:
df = pd.read_csv('sensor1monthly.csv')
test = pd.read_csv('sensor2monthly.csv')

test['Timestamp'] = pd.to_datetime(test['Timestamp'], unit= 'ms')
test.set_index('Timestamp', inplace=True)
new_test= test.resample('H').mean()
new_test.reset_index(inplace=True)

df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit= 'ms')
df.set_index('Timestamp', inplace=True)
new_df= df.resample('H').mean()
new_df.reset_index(inplace=True)

In [64]:
df_500= new_df.iloc[:-1].copy()
#print(df.iloc[4000])
df_500['Hour'] = df_500['Timestamp'].dt.hour
df_500['Day'] = df_500['Timestamp'].dt.day
df_500['Month'] = df_500['Timestamp'].dt.month

#column_means = df_500.median()
df_filled = df_500.apply(lambda col: col.fillna(col.median()), axis=0)


test_500= new_test.iloc[:-1].copy()
#print(df.iloc[4000])
test_500['Hour'] = test_500['Timestamp'].dt.hour
test_500['Day'] = test_500['Timestamp'].dt.day
test_500['Month'] = test_500['Timestamp'].dt.month

#column_means = df_500.median()
test_filled = test_500.apply(lambda col: col.fillna(col.median()), axis=0)

#
x= df_filled[['Hour','Day','Month','Wind direction minimum Degrees', 'Wind direction average Degrees','Wind direction maximum Degrees','Wind speed minimum m/s','Wind speed average m/s','Wind speed maximum m/s','Solar radiation W/m^2','Barometric Pressure Hecto Pascals']]
y = df_filled['Air temperature Celsius']
tss = TimeSeriesSplit(n_splits = 3)

for train_index, test_index in tss.split(x):
    x_train, x_test = x.iloc[train_index, :], x.iloc[test_index,:]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    split_index = int(len(x_train) * 0.8)
    x_train_split, x_val = x_train.iloc[:split_index, :], x_train.iloc[split_index:, :]
    y_train_split, y_val = y_train.iloc[:split_index], y_train.iloc[split_index:]

In [65]:
rfr = RandomForestRegressor(random_state=42)
rfr.fit(x_train,y_train)

predictions_train = rfr.predict(x_train)

mae_train = mean_absolute_error(y_train,predictions_train)
mse_train = mean_squared_error(y_train, predictions_train)
r2_train = r2_score(y_train, predictions_train)

print("Training")
print("-----------------------------")
print("Mean Absolute Error:", mae_train)
print("Mean Squared Error:", mse_train)
print("R2:", r2_train)


predictions_val = rfr.predict(x_val)

mae_val = mean_absolute_error(y_val, predictions_val)
mse_val = mean_squared_error(y_val, predictions_val)
r2_val = r2_score(y_val, predictions_val)

print("Validation")
print("-----------------------------")
print("Mean Absolute Error:", mae_val)
print("Mean Squared Error:", mse_val)
print("R2:", r2_val)

predictions = rfr.predict(x_test)

mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)  

print("Testing")
print("-----------------------------")
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("R2:", r2)


Training
-----------------------------
Mean Absolute Error: 0.17225081369248163
Mean Squared Error: 0.09137368871756298
R2: 0.976017668954532
Validation
-----------------------------
Mean Absolute Error: 0.1371501823793509
Mean Squared Error: 0.04951168377110805
R2: 0.9682322771939523
Testing
-----------------------------
Mean Absolute Error: 0.7885489673268992
Mean Squared Error: 0.8226012804758457
R2: -0.29043692172588353


In [66]:
test_filled

Unnamed: 0,Timestamp,Barometric Pressure Hecto Pascals,Wind direction minimum Degrees,Wind direction average Degrees,Wind direction maximum Degrees,Wind speed minimum m/s,Wind speed average m/s,Wind speed maximum m/s,Air temperature Celsius,Internal temperature Celsius,...,Hail peak intensity,Water level Meters,Solar radiation W/m^2,Heating temperature Celsius,Heating voltage Volts,Supply voltage Volts,3.5V ref. voltage Volts,Hour,Day,Month
0,2023-04-10 18:00:00,1016.430000,53.900000,62.700000,106.200000,3.680000,5.110000,6.310000,22.320000,22.320000,...,0.0,0.024439,1.780774,0.0,0.0,13.340000,3.623100,18,10,4
1,2023-04-10 19:00:00,1015.975000,41.666667,48.750000,115.833333,4.333333,5.466667,6.758333,23.158333,23.158333,...,0.0,-0.156596,3.659938,0.0,0.0,13.475000,3.623500,19,10,4
2,2023-04-10 20:00:00,1015.769231,20.153846,26.230769,66.384615,5.084615,7.015385,8.523077,23.976923,23.976923,...,0.0,-0.317993,3.946727,0.0,0.0,13.438462,3.623000,20,10,4
3,2023-04-10 21:00:00,1015.891667,20.916667,34.250000,48.333333,6.383333,7.925000,9.416667,23.975000,23.975000,...,0.0,-0.376191,1.776912,0.0,0.0,13.366667,3.623250,21,10,4
4,2023-04-10 22:00:00,1016.000000,28.833333,39.083333,105.333333,4.850000,7.125000,9.258333,24.025000,24.025000,...,0.0,-0.374061,0.597754,0.0,0.0,13.225000,3.623333,22,10,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
480,2023-04-30 18:00:00,1008.083333,221.833333,256.500000,290.583333,3.025000,4.200000,5.125000,27.208333,27.208333,...,1.0,-0.146602,12.590903,0.0,0.0,13.833333,3.622000,18,30,4
481,2023-04-30 19:00:00,1007.400000,230.166667,252.666667,272.666667,2.025000,3.016667,3.808333,28.433333,28.433333,...,1.0,-0.025417,12.582822,0.0,0.0,13.758333,3.622000,19,30,4
482,2023-04-30 20:00:00,1006.916667,228.500000,241.583333,282.750000,2.783333,3.608333,4.408333,29.008333,29.008333,...,1.0,0.079485,10.735898,0.0,0.0,13.983333,3.621833,20,30,4
483,2023-04-30 21:00:00,1007.041667,222.500000,274.750000,264.916667,1.308333,2.225000,3.141667,28.325000,28.325000,...,1.0,0.102460,7.528838,0.0,0.0,14.116667,3.621667,21,30,4


In [67]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [10,20,30,40,50,60,70,80,90,100],
    'max_depth': [None,2,5,10,20,30,40,50,60],
    'min_samples_split': [60,80,90,100,120,130,140,150],
    # Add more hyperparameters as needed
}

# Instantiate the grid search cross-validation object
grid_search = GridSearchCV(estimator=rfr, param_grid=param_grid, cv=tss, scoring='r2')

# Perform grid search
grid_search.fit(x_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Use the best model for prediction
best_model = grid_search.best_estimator_

# Evaluate the best model
best_model.fit(x_train, y_train)
predictions_test = best_model.predict(x_test)

# Calculate evaluation metrics
mae_test = mean_absolute_error(y_test, predictions_test)
mse_test = mean_squared_error(y_test, predictions_test)
r2_test = r2_score(y_test, predictions_test)

print("Best Hyperparameters:", best_params)
print("Testing Performance:")
print("Mean Absolute Error:", mae_test)
print("Mean Squared Error:", mse_test)
print("R2 Score:", r2_test)

Best Hyperparameters: {'max_depth': None, 'min_samples_split': 90, 'n_estimators': 20}
Testing Performance:
Mean Absolute Error: 0.545708866493652
Mean Squared Error: 0.5490663278880589
R2 Score: 0.13866477137825672


In [68]:
drop = [
    "Timestamp",   "Air temperature Celsius",
    "Internal temperature Celsius",  "Rain accumulation  Millimeters",
    "Rain duration Hours", "Rain intensity mm/h", "Hail accumulation Count", "Hail duration ",
    "Hail intensity  -", "Rain peak intensity mm/h", "Hail peak intensity ", "Water level Meters",
    "Heating temperature Celsius", "Heating voltage Volts",
    "Supply voltage Volts", "3.5V ref. voltage Volts"
]

desired_order = [
    "Hour", "Day", "Month",'Wind direction minimum Degrees', 'Wind direction average Degrees','Wind direction maximum Degrees','Wind speed minimum m/s','Wind speed average m/s','Wind speed maximum m/s','Solar radiation W/m^2','Barometric Pressure Hecto Pascals'
]
test_no_temp = test_filled.drop(drop, axis=1)

test_no_temp = test_no_temp[desired_order]
test_predictions = grid_search.predict(test_no_temp)
test_no_temp

Unnamed: 0,Hour,Day,Month,Wind direction minimum Degrees,Wind direction average Degrees,Wind direction maximum Degrees,Wind speed minimum m/s,Wind speed average m/s,Wind speed maximum m/s,Solar radiation W/m^2,Barometric Pressure Hecto Pascals
0,18,10,4,53.900000,62.700000,106.200000,3.680000,5.110000,6.310000,1.780774,1016.430000
1,19,10,4,41.666667,48.750000,115.833333,4.333333,5.466667,6.758333,3.659938,1015.975000
2,20,10,4,20.153846,26.230769,66.384615,5.084615,7.015385,8.523077,3.946727,1015.769231
3,21,10,4,20.916667,34.250000,48.333333,6.383333,7.925000,9.416667,1.776912,1015.891667
4,22,10,4,28.833333,39.083333,105.333333,4.850000,7.125000,9.258333,0.597754,1016.000000
...,...,...,...,...,...,...,...,...,...,...,...
480,18,30,4,221.833333,256.500000,290.583333,3.025000,4.200000,5.125000,12.590903,1008.083333
481,19,30,4,230.166667,252.666667,272.666667,2.025000,3.016667,3.808333,12.582822,1007.400000
482,20,30,4,228.500000,241.583333,282.750000,2.783333,3.608333,4.408333,10.735898,1006.916667
483,21,30,4,222.500000,274.750000,264.916667,1.308333,2.225000,3.141667,7.528838,1007.041667


In [69]:
actual_temp = test_filled['Air temperature Celsius']


# Compute evaluation metrics
mae = mean_absolute_error(actual_temp, test_predictions)
mse = mean_squared_error(actual_temp, test_predictions)
r2 = r2_score(actual_temp, test_predictions)

# Print the evaluation metrics
print("Testing Set")
print('-----------------------------------')
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("R2:", r2)


Testing Set
-----------------------------------
Mean Absolute Error: 1.198757755883081
Mean Squared Error: 2.416007427253314
R2: 0.0644569132097238


In [70]:
rfr.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [71]:
from sklearn.tree import DecisionTreeRegressor

dtr = DecisionTreeRegressor(random_state=42)

dtr.fit(x_train, y_train)

predictions_train = dtr.predict(x_train)

mae_train = mean_absolute_error(y_train,predictions_train)
mse_train = mean_squared_error(y_train, predictions_train)
r2_train = r2_score(y_train, predictions_train)

print("Training")
print("-----------------------------")
print("Mean Absolute Error:", mae_train)
print("Mean Squared Error:", mse_train)
print("R2:", r2_train)


predictions_val = dtr.predict(x_val)

mae_val = mean_absolute_error(y_val, predictions_val)
mse_val = mean_squared_error(y_val, predictions_val)
r2_val = r2_score(y_val, predictions_val)

print("Validation")
print("-----------------------------")
print("Mean Absolute Error:", mae_val)
print("Mean Squared Error:", mse_val)
print("R2:", r2_val)

predictions = dtr.predict(x_test)

mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)  

print("Testing")
print("-----------------------------")
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("R2:", r2)

Training
-----------------------------
Mean Absolute Error: 6.579099405186113e-18
Mean Squared Error: 2.3373656450992943e-32
R2: 1.0
Validation
-----------------------------
Mean Absolute Error: 3.289549702593056e-17
Mean Squared Error: 1.168682822549647e-31
R2: 1.0
Testing
-----------------------------
Mean Absolute Error: 0.8967767056035211
Mean Squared Error: 1.078788529274461
R2: -0.6923248017616059
