In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

In [2]:
df = pd.read_csv('wahoo_sensor1_2024.csv')
test = pd.read_csv('wahoo_sensor1_2023.csv')

test['Timestamp'] = pd.to_datetime(test['Timestamp'], unit= 'ms')
test.set_index('Timestamp', inplace=True)
new_test= test.resample('H').mean()
new_test.reset_index(inplace=True)

df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit= 'ms')
df.set_index('Timestamp', inplace=True)
new_df= df.resample('H').mean()
new_df.reset_index(inplace=True)

In [3]:
df_500= new_df.iloc[:-1].copy()
#print(df.iloc[4000])
df_500['Hour'] = df_500['Timestamp'].dt.hour
df_500['Day'] = df_500['Timestamp'].dt.day
df_500['Month'] = df_500['Timestamp'].dt.month

#column_means = df_500.median()
df_filled = df_500.apply(lambda col: col.fillna(col.median()), axis=0)


test_500= new_test.iloc[:-1].copy()
#print(df.iloc[4000])
test_500['Hour'] = test_500['Timestamp'].dt.hour
test_500['Day'] = test_500['Timestamp'].dt.day
test_500['Month'] = test_500['Timestamp'].dt.month

#column_means = df_500.median()
test_filled = test_500.apply(lambda col: col.fillna(col.median()), axis=0)

x= df_filled[['Hour','Day','Month']]
y = df_filled['Air temperature Celsius']
tss = TimeSeriesSplit(n_splits = 3)

for train_index, test_index in tss.split(x):
    x_train, x_test = x.iloc[train_index, :], x.iloc[test_index,:]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    split_index = int(len(x_train) * 0.8)
    x_train_split, x_val = x_train.iloc[:split_index, :], x_train.iloc[split_index:, :]
    y_train_split, y_val = y_train.iloc[:split_index], y_train.iloc[split_index:]





In [4]:
rfr = RandomForestRegressor(random_state=42)
rfr.fit(x_train,y_train)

predictions_train = rfr.predict(x_train)

mae_train = mean_absolute_error(y_train,predictions_train)
mse_train = mean_squared_error(y_train, predictions_train)
r2_train = r2_score(y_train, predictions_train)

print("Training")
print("-----------------------------")
print("Mean Absolute Error:", mae_train)
print("Mean Squared Error:", mse_train)
print("R2:", r2_train)


predictions_val = rfr.predict(x_val)

mae_val = mean_absolute_error(y_val, predictions_val)
mse_val = mean_squared_error(y_val, predictions_val)
r2_val = r2_score(y_val, predictions_val)

print("Validation")
print("-----------------------------")
print("Mean Absolute Error:", mae_val)
print("Mean Squared Error:", mse_val)
print("R2:", r2_val)

predictions = rfr.predict(x_test)

mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)  

print("Testing")
print("-----------------------------")
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("R2:", r2)




Training
-----------------------------
Mean Absolute Error: 0.07586003787879009
Mean Squared Error: 0.01407179108321262
R2: 0.989366180732057
Validation
-----------------------------
Mean Absolute Error: 0.02728510971787338
Mean Squared Error: 0.0015799582426939405
R2: 0.9862371657924555
Testing
-----------------------------
Mean Absolute Error: 0.2615195357833628
Mean Squared Error: 0.11008253369331363
R2: 0.40868214767070576


In [5]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10,20,30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf':[1,2,4]
    # Add more hyperparameters as needed
}

# Instantiate the grid search cross-validation object
grid_search = GridSearchCV(estimator=rfr, param_grid=param_grid, cv=tss, scoring='r2')

# Perform grid search
grid_search.fit(x_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Use the best model for prediction
best_model = grid_search.best_estimator_

# Evaluate the best model
best_model.fit(x_train, y_train)
predictions_test = best_model.predict(x_test)

# Calculate evaluation metrics
mae_test = mean_absolute_error(y_test, predictions_test)
mse_test = mean_squared_error(y_test, predictions_test)
r2_test = r2_score(y_test, predictions_test)

print("Best Hyperparameters:", best_params)
print("Testing Performance:")
print("Mean Absolute Error:", mae_test)
print("Mean Squared Error:", mse_test)
print("R2 Score:", r2_test)


Best Hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
Testing Performance:
Mean Absolute Error: 0.2631288766252307
Mean Squared Error: 0.10132109217236743
R2 Score: 0.45574498870149216


In [6]:
drop = [
    "Timestamp",  "Wind direction minimum Degrees",
    "Wind direction average Degrees", "Wind direction maximum Degrees", "Wind speed minimum m/s",
    "Wind speed average m/s", "Wind speed maximum m/s", "Air temperature Celsius",
    "Internal temperature Celsius",  "Rain accumulation  Millimeters",
    "Rain duration Hours", "Rain intensity mm/h", "Hail accumulation Count", "Hail duration ",
    "Hail intensity  -", "Rain peak intensity mm/h", "Hail peak intensity ", "Water level Meters",
    "Solar radiation W/m^2", "Heating temperature Celsius", "Heating voltage Volts",
    "Supply voltage Volts", "3.5V ref. voltage Volts"
]

desired_order = [
    "Hour", "Day", "Month"
]
test_no_temp = test_filled.drop(drop, axis=1)

test_no_temp = test_no_temp[desired_order]
test_predictions = grid_search.predict(test_no_temp)
test_no_temp

Unnamed: 0,Hour,Day,Month
0,4,21,4
1,5,21,4
2,6,21,4
3,7,21,4
4,8,21,4
...,...,...,...
186,22,28,4
187,23,28,4
188,0,29,4
189,1,29,4


In [7]:
actual_temp = test_filled['Air temperature Celsius']


# Compute evaluation metrics
mae = mean_absolute_error(actual_temp, test_predictions)
mse = mean_squared_error(actual_temp, test_predictions)
r2 = r2_score(actual_temp, test_predictions)

# Print the evaluation metrics
print("Testing Set")
print('-----------------------------------')
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("R2:", r2)


Testing Set
-----------------------------------
Mean Absolute Error: 1.5931071159972485
Mean Squared Error: 3.508997748607787
R2: -1.3091098809614454


In [8]:
# Define the number of hours in a day
hours_in_day = 24

# Define the day and month for tomorrow's date
day_tomorrow = 29
month_tomorrow = 4

tomorrow_data = pd.DataFrame({
    'Hour': range(hours_in_day),
    'Day': [day_tomorrow] * hours_in_day,
    'Month': [month_tomorrow] * hours_in_day
})


tomorrow_data

tomorrow_predicted_temp = grid_search.predict(tomorrow_data)

#tomorrow_predicted_temp
df_tommorow_predicted_temp = pd.DataFrame({'Predicted Temp for 4.29.2024':tomorrow_predicted_temp,
                                           })


index_del = 23
df_tommorow_predicted_temp =df_tommorow_predicted_temp.drop(index_del)

df_tommorow_predicted_temp

Unnamed: 0,Predicted Temp for 4.29.2024
0,24.144839
1,24.128699
2,24.11846
3,24.105692
4,24.105692
5,24.105816
6,24.105816
7,24.105816
8,24.105816
9,24.099629


In [9]:
tmrw = pd.read_csv('4-29-24 data.csv')

tmrw['Timestamp'] = pd.to_datetime(tmrw['Timestamp'], unit= 'ms')
tmrw.set_index('Timestamp', inplace=True)
new_tmrw= tmrw.resample('H').mean()
new_tmrw.reset_index(inplace=True)

In [10]:
tmrw_500= new_tmrw.iloc[:-1].copy()
#print(df.iloc[4000])
tmrw_500['Hour'] = tmrw_500['Timestamp'].dt.hour
tmrw_500['Day'] = tmrw_500['Timestamp'].dt.day
tmrw_500['Month'] = tmrw_500['Timestamp'].dt.month

#column_means = df_500.median()
tmrw_filled = tmrw_500.apply(lambda col: col.fillna(col.median()), axis=0)

tmrw_temp = tmrw_filled['Air temperature Celsius']

df_tommorow_acutal_temp= pd.DataFrame({'Acutal Temp for 4.29.2024': tmrw_temp,})
df_tommorow_acutal_temp

Unnamed: 0,Acutal Temp for 4.29.2024
0,23.991667
1,24.008333
2,23.925
3,23.9
4,23.908333
5,23.925
6,23.958333
7,23.858333
8,23.625
9,23.783333


In [11]:
# Compute evaluation metrics
mae = mean_absolute_error(df_tommorow_acutal_temp, df_tommorow_predicted_temp)
mse = mean_squared_error(df_tommorow_acutal_temp, df_tommorow_predicted_temp)
r2 = r2_score(df_tommorow_acutal_temp, df_tommorow_predicted_temp) 


# Print the evaluation metrics
print("4/29/24")
print('-----------------------------------')
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("R2:", r2)

4/29/24
-----------------------------------
Mean Absolute Error: 0.1744182929655738
Mean Squared Error: 0.04313942361975199
R2: 0.7825298457110588
