In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
import plotly.express as px
from functools import reduce
from scipy.stats import ecdf
from datetime import timedelta

In [None]:
# Load the baseline predictions and every experiment run, then align them on
# their shared index so that all models are compared on exactly the same rows.
REPO_ROOT = "/home/thomas/repos/simplify_deployment"


def load_experiment_predictions(parquet_path, prediction_column):
    """Read one experiment's predictions and prepare them for merging.

    Parameters
    ----------
    parquet_path : str
        Location of a parquet file containing a ``y_pred`` and a ``y_true``
        column.
    prediction_column : str
        Unique name given to the ``y_pred`` column so that several experiments
        can sit side by side in one frame.

    Returns
    -------
    pandas.DataFrame
        The file's content with ``y_pred`` renamed to ``prediction_column`` and
        ``y_true`` removed (the ground truth is taken from the baseline frame
        only, which avoids duplicated columns in the merge).
    """
    return (
        pd.read_parquet(parquet_path)
        .rename(columns={"y_pred": prediction_column})
        .drop(columns=["y_true"])
    )


# Baseline frame: loaded as-is, it is the only one that keeps `y_true`.
s1 = pd.read_parquet(f"{REPO_ROOT}/data/simplify_1_0/predictions/s1_predictions.parquet")

s1_50_gen = load_experiment_predictions(
    f"{REPO_ROOT}/data/simplify_1_0/predictions/s1_50_gen_parallel.parquet",
    "y_pred_s1_50_gen_parallel",
)

s1_100_gen = load_experiment_predictions(
    f"{REPO_ROOT}/data/simplify_1_0/predictions/s1_100_gen_parallel.parquet",
    "y_pred_s1_100_gen_parallel",
)

lots_of_vars_150_gen = load_experiment_predictions(
    f"{REPO_ROOT}/data/lots_of_vars/predictions/lots_of_vars_150_gen_parallel.parquet",
    "y_pred_lots_of_vars_150_gen_parallel",
)

predictions_list = [
    s1,
    s1_50_gen,
    s1_100_gen,
    lots_of_vars_150_gen,
]

# Inner join on the index: only rows present in every frame survive.
predictions = reduce(
    lambda a, b: pd.merge(
        a,
        b,
        left_index=True,
        right_index=True,
        how="inner",
    ),
    predictions_list,
)
# Persist the aligned predictions next to the data science sources.
predictions.to_parquet(f"{REPO_ROOT}/src/simplify_deployment/data_science/predictions.parquet")
predictions

In [None]:
# Score every prediction column against the ground truth.
# The prediction columns are selected by name rather than by position, so the
# result does not depend on `y_true` happening to be the first column.
prediction_columns = predictions.columns.drop("y_true")

for column in prediction_columns:
    rmse_value = np.sqrt(mean_squared_error(predictions["y_true"], predictions[column]))
    print(f"Rmse {column} is {rmse_value}.")
print("\n")
for column in prediction_columns:
    mae_value = mean_absolute_error(predictions["y_true"], predictions[column])
    print(f"Mae {column} is {mae_value}.")

In [None]:
# Residual (y_true - prediction) per model, one `<column>_residuals` column each.
# Prediction columns are picked by name so the ground truth can never end up
# being subtracted from itself if the column order changes.
residuals = pd.DataFrame(
    {
        f"{column}_residuals": predictions["y_true"] - predictions[column]
        for column in predictions.columns.drop("y_true")
    },
    index=predictions.index,
)
residuals

In [None]:
# How confident are we that one model really does better than another?
# We bootstrap the residuals: draw rows with replacement, recompute the RMSE of
# every model on that resample, and repeat `n` times. The fraction of
# replicates in which one model's RMSE is lower than another's then serves as
# the p-value-like score computed from `sample_rmse_df`.
#
# The SAME resampled rows are used for all models within a replicate (paired
# bootstrap). All models are evaluated on the same observations, so their
# residuals are correlated; resampling each column independently would throw
# that pairing away while the replicates are still compared row by row.
n = 10000
BOOTSTRAP_SEED = 42  # fixed seed so Restart & Run All reproduces the numbers
rng = np.random.default_rng(BOOTSTRAP_SEED)

# Square once up front; every replicate only needs a mean and a square root.
squared_residuals = residuals.to_numpy(dtype=float) ** 2
n_rows = squared_residuals.shape[0]

sample_rmse_list = []
for _ in range(n):
    row_idx = rng.integers(0, n_rows, size=n_rows)
    sample_rmse_list.append(np.sqrt(squared_residuals[row_idx].mean(axis=0)))

# One row per bootstrap replicate, one `<model>_rmse` column per model.
sample_rmse_df = pd.DataFrame(
    sample_rmse_list,
    columns=[col.replace("_residuals", "_rmse") for col in residuals.columns],
)
sample_rmse_df

In [None]:
# Share of bootstrap replicates in which the 50-generation run has an RMSE that
# is at most the baseline's RMSE.
(
    sample_rmse_df["y_pred_s1_50_gen_parallel_rmse"]
    <= sample_rmse_df["y_pred_s1_rmse"]
).mean()


In [None]:
# Long format for plotting: one row per (model, bootstrap RMSE) pair.
sample_rmse_df_molten = pd.melt(
    sample_rmse_df,
    var_name="source",
    value_name="rmse",
)
sample_rmse_df_molten

In [None]:
# Overlaid histograms of the bootstrap RMSE distribution, one colour per model.
fig = px.histogram(
    data_frame=sample_rmse_df_molten,
    x="rmse",
    color="source",
    barmode="overlay",
    title="25 minutes before real time si prediction",
)
fig.show()

In [None]:
# Ground truth and every model's prediction plotted against the shared index.
fig2 = px.line(
    data_frame=predictions,
    x=predictions.index,
    y=predictions.columns,
)
fig2.show()