In [5]:
import pandas as pd
import numpy as np
from functools import reduce
from sklearn.metrics import mean_squared_error, mean_absolute_error
import plotly.express as px

In [None]:
# Load the baseline predictions and keep only the ground truth plus the
# baseline model's prediction column; the chained selection drops everything else.
base_prediction = pd.read_parquet(
    "/home/thomas/repos/simplify_deployment/data/data_science/predictions.parquet"
)[["y_true", "y_pred_s1_50_gen_parallel"]]
base_prediction

In [7]:
# All hour-encoding experiment outputs live in one directory; keep that location
# in a single constant so it only has to be changed in one place.
HOUR_ENCODING_DIR = "/home/thomas/repos/simplify_deployment/data/data_science/50_gen_s1"

# One parquet file per experiment variant, named after the file it is read from.
spline = pd.read_parquet(f"{HOUR_ENCODING_DIR}/hour_spline_encoded.parquet")
sin_cos = pd.read_parquet(f"{HOUR_ENCODING_DIR}/hour_sin_cos_encoded.parquet")
onehot = pd.read_parquet(f"{HOUR_ENCODING_DIR}/hour_onehot_encoded.parquet")
ordinal = pd.read_parquet(f"{HOUR_ENCODING_DIR}/hour_ordinal_encoded.parquet")
one_per_hour = pd.read_parquet(f"{HOUR_ENCODING_DIR}/one_model_per_hour.parquet")

In [None]:
def _inner_join_on_index(left, right):
    """Inner-join two frames on their indexes, keeping only shared index labels."""
    return left.merge(right, left_index=True, right_index=True, how="inner")


# Order matters: the baseline frame goes first so that "y_true" ends up as the
# first column of the combined table.
prediction_frames = [
    base_prediction,
    spline,
    sin_cos,
    onehot,
    ordinal,
    one_per_hour,
]

# Fold the join over all frames, left to right, into one wide table.
predictions = reduce(_inner_join_on_index, prediction_frames)
predictions

In [None]:
# Report RMSE and then MAE of every column after the first one, measured
# against "y_true". Note the slice starts at position 1, so it relies on
# "y_true" being the first column of `predictions`.
y_true = predictions["y_true"]
prediction_columns = predictions.columns[1:]

# (label used in the printed line, function computing the metric)
error_metrics = (
    ("Rmse", lambda actual, predicted: np.sqrt(mean_squared_error(actual, predicted))),
    ("Mae", mean_absolute_error),
)

for position, (label, compute_metric) in enumerate(error_metrics):
    if position > 0:
        # Visual gap between the two groups of metric lines.
        print("\n")
    for name in prediction_columns:
        metric_value = compute_metric(y_true, predictions[name])
        print(f"{label} {name} is {metric_value}.")

In [None]:
# Residual = y_true - prediction, computed for every column after the first in
# one vectorised step. `rsub` with axis=0 subtracts each column FROM the
# y_true series row by row; the suffix marks the columns as residuals.
residuals = (
    predictions.iloc[:, 1:]
    .rsub(predictions["y_true"], axis=0)
    .add_suffix("_residuals")
)
residuals

In [None]:
# What is the p-value that one technique actually does better than another?
# Bootstrap the RMSE: draw rows of the residual table with replacement, compute
# the RMSE of every technique on that draw, and repeat this n times. The share
# of draws in which one technique has the lower RMSE then gives the p-value.
#
# Every draw uses the SAME row positions for all techniques (paired bootstrap).
# The residual columns share one index, so comparing techniques within a draw
# is only meaningful when they are evaluated on the same resampled rows.
# Resampling each column independently would leave each technique's own RMSE
# distribution unchanged but would break that row-wise comparison.
n = 10000
BOOTSTRAP_SEED = 42  # fixed seed so a fresh Restart & Run All reproduces the draws

rng = np.random.default_rng(BOOTSTRAP_SEED)
residual_values = residuals.to_numpy()
n_rows = residual_values.shape[0]

# One entry per draw: a 1-D array holding the RMSE of each residual column.
# The loop is kept (instead of one giant index array) so that only a single
# resampled copy of the residual table is in memory at a time.
sample_rmse_list = []
for _ in range(n):
    row_positions = rng.integers(0, n_rows, size=n_rows)
    resampled = residual_values[row_positions]
    sample_rmse_list.append(np.sqrt(np.mean(resampled**2, axis=0)))

# Build the result frame once: one row per draw, one "<name>_rmse" column per
# "<name>_residuals" column of `residuals`.
sample_rmse_df = pd.DataFrame(
    sample_rmse_list,
    columns=[col.replace("_residuals", "_rmse") for col in residuals.columns],
)
sample_rmse_df

In [None]:
# Reshape to long format: the former column names go into "source" and the
# bootstrapped values into "rmse", which is the layout the histogram call
# below reads via its `x` and `color` arguments.
sample_rmse_df_molten = sample_rmse_df.melt(value_name="rmse", var_name="source")

# Overlay one histogram per source so the bootstrapped RMSE distributions can
# be compared directly.
fig = px.histogram(
    sample_rmse_df_molten,
    x="rmse",
    color="source",
    title="25 minutes before real time si prediction",
    barmode="overlay",
)
fig.show()