In [140]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

In [141]:
df = pd.read_csv('sensor1monthly.csv')
test = pd.read_csv('sensor2monthly.csv')

test['Timestamp'] = pd.to_datetime(test['Timestamp'], unit= 'ms')
test.set_index('Timestamp', inplace=True)
new_test= test.resample('H').mean()
new_test.reset_index(inplace=True)

df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit= 'ms')
df.set_index('Timestamp', inplace=True)
new_df= df.resample('H').mean()
new_df.reset_index(inplace=True)

In [142]:
df_500= new_df.iloc[:-1].copy()
#print(df.iloc[4000])
df_500['Hour'] = df_500['Timestamp'].dt.hour
df_500['Day'] = df_500['Timestamp'].dt.day
df_500['Month'] = df_500['Timestamp'].dt.month

#column_means = df_500.median()
df_filled = df_500.apply(lambda col: col.fillna(col.median()), axis=0)


test_500= new_test.iloc[:-1].copy()
#print(df.iloc[4000])
test_500['Hour'] = test_500['Timestamp'].dt.hour
test_500['Day'] = test_500['Timestamp'].dt.day
test_500['Month'] = test_500['Timestamp'].dt.month

#column_means = df_500.median()
test_filled = test_500.apply(lambda col: col.fillna(col.median()), axis=0)

#
x= df_filled[['Hour','Day','Month','Wind direction minimum Degrees', 'Wind direction average Degrees','Wind direction maximum Degrees','Wind speed minimum m/s','Wind speed average m/s','Wind speed maximum m/s','Solar radiation W/m^2','Barometric Pressure Hecto Pascals']]
y = df_filled['Air temperature Celsius']
tss = TimeSeriesSplit(n_splits = 3)

for train_index, test_index in tss.split(x):
    x_train, x_test = x.iloc[train_index, :], x.iloc[test_index,:]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    split_index = int(len(x_train) * 0.8)
    x_train_split, x_val = x_train.iloc[:split_index, :], x_train.iloc[split_index:, :]
    y_train_split, y_val = y_train.iloc[:split_index], y_train.iloc[split_index:]

In [143]:
rfr = RandomForestRegressor(random_state=42)
rfr.fit(x_train,y_train)

predictions_train = rfr.predict(x_train)

mae_train = mean_absolute_error(y_train,predictions_train)
mse_train = mean_squared_error(y_train, predictions_train)
r2_train = r2_score(y_train, predictions_train)

print("Training")
print("-----------------------------")
print("Mean Absolute Error:", mae_train)
print("Mean Squared Error:", mse_train)
print("R2:", r2_train)


predictions_val = rfr.predict(x_val)

mae_val = mean_absolute_error(y_val, predictions_val)
mse_val = mean_squared_error(y_val, predictions_val)
r2_val = r2_score(y_val, predictions_val)

print("Validation")
print("-----------------------------")
print("Mean Absolute Error:", mae_val)
print("Mean Squared Error:", mse_val)
print("R2:", r2_val)

predictions = rfr.predict(x_test)

mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)  

print("Testing")
print("-----------------------------")
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("R2:", r2)


Training
-----------------------------
Mean Absolute Error: 0.17225081369248163
Mean Squared Error: 0.09137368871756298
R2: 0.976017668954532
Validation
-----------------------------
Mean Absolute Error: 0.1371501823793509
Mean Squared Error: 0.04951168377110805
R2: 0.9682322771939523
Testing
-----------------------------
Mean Absolute Error: 0.7885489673268992
Mean Squared Error: 0.8226012804758457
R2: -0.29043692172588353


In [144]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [10,20,50,100],
    'max_depth': [2,5,10,20,30,40,50,60],
    'min_samples_split': [60,80,90,100,120,130,140,150],
    'min_samples_leaf':[1,2,4],
    'max_features': ['sqrt', 'log2']
    # Add more hyperparameters as needed
}

# Instantiate the grid search cross-validation object
grid_search = GridSearchCV(estimator=rfr, param_grid=param_grid, cv=tss, scoring='r2')

# Perform grid search
grid_search.fit(x_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Use the best model for prediction
best_model = grid_search.best_estimator_

# Evaluate the best model
best_model.fit(x_train, y_train)
predictions_test = best_model.predict(x_test)

# Calculate evaluation metrics
mae_test = mean_absolute_error(y_test, predictions_test)
mse_test = mean_squared_error(y_test, predictions_test)
r2_test = r2_score(y_test, predictions_test)

print("Best Hyperparameters:", best_params)
print("Testing Performance:")
print("Mean Absolute Error:", mae_test)
print("Mean Squared Error:", mse_test)
print("R2 Score:", r2_test)

Best Hyperparameters: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 90, 'n_estimators': 10}
Testing Performance:
Mean Absolute Error: 0.5766648157477597
Mean Squared Error: 0.5432078929735904
R2 Score: 0.14785505699607748


In [145]:
rfr.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}