In [5]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, root_mean_squared_error, mean_absolute_error

# Read the cleaned dataset and use the 'measurementDate' column as the index
df = pd.read_parquet('../data/cleanedData/allData.parquet').set_index('measurementDate')

# Target is 'oxygenValue'; every remaining column is used as a feature
y = df['oxygenValue']
X = df.drop(columns=['oxygenValue'])

# Hold out 20% of the rows for testing. shuffle=False keeps the rows in their
# stored order, so the test set is the tail of the frame.
# NOTE(review): nothing here sorts by 'measurementDate'; this presumably relies
# on the parquet file already being in chronological order -- confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)

# Time-series cross-validation splitter (5 splits) for use on the training set
tscv = TimeSeriesSplit(n_splits=5)

# Hyperparameters handed to RandomForestRegressor, spelled out in full so the
# exact configuration of the run is visible in one place.
params = dict(
    bootstrap=True,
    ccp_alpha=0.0,
    criterion="squared_error",
    max_depth=None,
    max_features="sqrt",
    max_leaf_nodes=None,
    max_samples=None,
    min_impurity_decrease=0.0,
    min_samples_leaf=1,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    n_estimators=100,
    n_jobs=-1,
    oob_score=False,
    # No seed is set, so the fitted forest (and its metrics) can vary per run.
    random_state=None,
    verbose=0,
    warm_start=False,
)

# Build the random forest from the hyperparameter dictionary
model = RandomForestRegressor(**params)

# Score the model on each time-series fold of the training data. The scorer
# is 'neg_mean_squared_error', i.e. the reported values are negated MSEs.
cv_scores = cross_val_score(
    model,
    X_train,
    y_train,
    scoring='neg_mean_squared_error',
    cv=tscv,
)

# Summarise the folds: the spread is unaffected by the sign of the scores,
# while the mean has to be negated to read as a positive MSE.
std_mse = cv_scores.std()
mean_mse = -cv_scores.mean()

print(
    f'Mean MSE on training set: {mean_mse}',
    f'Standard Deviation of MSE on training set: {std_mse}',
    sep='\n',
)

# Fit the model on the entire training set before scoring the hold-out data
model.fit(X_train, y_train)

# Predict the held-out test rows and compute the error metrics once each
y_pred = model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
test_rmse = root_mean_squared_error(y_test, y_pred)
test_mae = mean_absolute_error(y_test, y_pred)

# Every metric is printed with a label so the output is self-describing
# (the RMSE was previously emitted as a bare number).
print(f'Test MSE: {test_mse}')
print(f'Test RMSE: {test_rmse}')
print(f'Test MAE: {test_mae}')



KeyboardInterrupt: 