In [16]:
import pandas as pd
import numpy as np
from functools import reduce
from sklearn.metrics import mean_squared_error, mean_absolute_error
import plotly.express as px

In [17]:
base_prediction = pd.read_parquet("/home/thomas/repos/simplify_deployment/data/data_science/predictions.parquet")
base_prediction = base_prediction[["y_true", "y_pred_s1_50_gen_parallel"]]
base_prediction

Unnamed: 0,y_true,y_pred_s1_50_gen_parallel
2022-12-01 22:44:00+00:00,-70.484,-70.529247
2022-12-01 22:59:00+00:00,-125.662,11.984345
2022-12-01 23:14:00+00:00,71.946,-159.113161
2022-12-01 23:29:00+00:00,-101.859,24.945586
2022-12-01 23:44:00+00:00,-45.071,-14.599998
...,...,...
2023-06-30 20:29:00+00:00,-61.305,-25.595774
2023-06-30 20:44:00+00:00,-13.549,-6.777618
2023-06-30 20:59:00+00:00,-29.614,9.842615
2023-06-30 21:14:00+00:00,-40.824,-43.734448


In [18]:
spline = pd.read_parquet("/home/thomas/repos/simplify_deployment/data/data_science/50_gen_s1/hour_spline_encoded.parquet")
sin_cos = pd.read_parquet("/home/thomas/repos/simplify_deployment/data/data_science/50_gen_s1/hour_sin_cos_encoded.parquet")
onehot = pd.read_parquet("/home/thomas/repos/simplify_deployment/data/data_science/50_gen_s1/hour_onehot_encoded.parquet")
ordinal = pd.read_parquet("/home/thomas/repos/simplify_deployment/data/data_science/50_gen_s1/hour_ordinal_encoded.parquet")

In [19]:
predictions = reduce(
    lambda a,b: pd.merge(
        a,
        b,
        left_index = True,
        right_index = True,
        how = "inner",
    ),
    [
        base_prediction,
        spline,
        sin_cos,
        onehot,
        ordinal,
    ]
)
predictions

Unnamed: 0,y_true,y_pred_s1_50_gen_parallel,y_pred_hour_spline_encoded,y_pred_sin_cos_hour,y_pred_hour_onehot_encoded,y_pred_hour_ordinal_encoded
2022-12-01 22:44:00+00:00,-70.484,-70.529247,-71.261553,-73.543193,-65.526396,-69.879897
2022-12-01 22:59:00+00:00,-125.662,11.984345,8.360678,9.075209,13.816888,12.464365
2022-12-01 23:14:00+00:00,71.946,-159.113161,-171.042594,-162.106608,-172.749610,-158.498549
2022-12-01 23:29:00+00:00,-101.859,24.945586,13.564839,22.225628,11.591980,25.458006
2022-12-01 23:44:00+00:00,-45.071,-14.599998,-21.890113,-16.961000,-22.415011,-14.001658
...,...,...,...,...,...,...
2023-06-30 20:29:00+00:00,-61.305,-25.595774,-21.829994,-38.096373,-32.093051,-29.670416
2023-06-30 20:44:00+00:00,-13.549,-6.777618,-7.368008,-21.471066,-15.840348,-11.784345
2023-06-30 20:59:00+00:00,-29.614,9.842615,11.296429,-4.078789,1.538293,5.810698
2023-06-30 21:14:00+00:00,-40.824,-43.734448,-48.349384,-54.037225,-48.506612,-49.228769


In [20]:
for column in predictions.columns[1:]:
    rmse_value = np.sqrt(mean_squared_error(predictions["y_true"],predictions[column]))
    print(f"Rmse {column} is {rmse_value}.")
print("\n")
for column in predictions.columns[1:]:
    mae_value = mean_absolute_error(predictions["y_true"],predictions[column])
    print(f"Mae {column} is {mae_value}.")

Rmse y_pred_s1_50_gen_parallel is 122.39726350938152.
Rmse y_pred_hour_spline_encoded is 122.41528582121715.
Rmse y_pred_sin_cos_hour is 122.4144274089754.
Rmse y_pred_hour_onehot_encoded is 122.55554340264624.
Rmse y_pred_hour_ordinal_encoded is 122.42014363979719.


Mae y_pred_s1_50_gen_parallel is 92.01673948095548.
Mae y_pred_hour_spline_encoded is 91.99203092918479.
Mae y_pred_sin_cos_hour is 92.0096114049063.
Mae y_pred_hour_onehot_encoded is 92.14242813089962.
Mae y_pred_hour_ordinal_encoded is 92.03764882179911.


In [21]:
residuals = pd.DataFrame(index = predictions.index)
for column in predictions.columns[1:]:
    residuals[f"{column}_residuals"] = predictions["y_true"] - predictions[column]
residuals 

Unnamed: 0,y_pred_s1_50_gen_parallel_residuals,y_pred_hour_spline_encoded_residuals,y_pred_sin_cos_hour_residuals,y_pred_hour_onehot_encoded_residuals,y_pred_hour_ordinal_encoded_residuals
2022-12-01 22:44:00+00:00,0.045247,0.777553,3.059193,-4.957604,-0.604103
2022-12-01 22:59:00+00:00,-137.646345,-134.022678,-134.737209,-139.478888,-138.126365
2022-12-01 23:14:00+00:00,231.059161,242.988594,234.052608,244.695610,230.444549
2022-12-01 23:29:00+00:00,-126.804586,-115.423839,-124.084628,-113.450980,-127.317006
2022-12-01 23:44:00+00:00,-30.471002,-23.180887,-28.110000,-22.655989,-31.069342
...,...,...,...,...,...
2023-06-30 20:29:00+00:00,-35.709226,-39.475006,-23.208627,-29.211949,-31.634584
2023-06-30 20:44:00+00:00,-6.771382,-6.180992,7.922066,2.291348,-1.764655
2023-06-30 20:59:00+00:00,-39.456615,-40.910429,-25.535211,-31.152293,-35.424698
2023-06-30 21:14:00+00:00,2.910448,7.525384,13.213225,7.682612,8.404769


In [22]:
# What is the p-value that we actually do better?
# To get this p-value, we will sample residuals with replacement from all the techniques.
# For then check for each sample of rmse is better or not
# if it is better give a hit. This gives a p value if we do this 10 000 times.
n = 10000
sample_rmse_list = []
for _ in range(n):
    sample = residuals.apply(lambda x: np.random.choice(x, size=len(x), replace=True), axis = 0)
    sample.columns = [col.replace("_residuals", "_rmse") for col in sample.columns]
    sample_rmse = sample.apply(lambda x: np.sqrt(np.mean(x**2)),axis = 0)
    sample_rmse_list.append(sample_rmse.to_frame().T)

sample_rmse_df = pd.concat(
    sample_rmse_list, 
    axis = 0,
)
sample_rmse_df

Unnamed: 0,y_pred_s1_50_gen_parallel_rmse,y_pred_hour_spline_encoded_rmse,y_pred_sin_cos_hour_rmse,y_pred_hour_onehot_encoded_rmse,y_pred_hour_ordinal_encoded_rmse
0,120.895317,123.077282,122.844084,123.480209,122.761756
0,121.157172,120.239376,122.158905,120.528486,122.841305
0,122.313919,122.111346,121.994604,121.808662,122.627214
0,122.359359,120.755652,122.350337,122.974996,123.607116
0,122.598260,121.911607,121.723352,123.350658,122.477346
...,...,...,...,...,...
0,123.305854,122.669727,123.036943,124.351671,122.552244
0,124.532827,122.157047,122.276502,122.465998,122.186500
0,121.934150,123.290424,121.247869,123.857980,121.876516
0,121.544756,121.771683,121.610509,122.756998,122.708703


In [None]:
sample_rmse_df_molten = sample_rmse_df.melt(value_name="rmse", var_name="source")
sample_rmse_df_molten
fig = px.histogram(
    sample_rmse_df_molten, 
    x = "rmse", 
    color = 
    "source", 
    title=f"25 minutes before real time si prediction", 
    barmode="overlay",
)
fig.show()