In [None]:
import pandas as pd
import os
import numpy as np

from settings import TIME_RESULTS_PATH, TIME_LIMIT, DATASET_TABLE_PATH, PAPER_RESULTS_FOLDER

In [None]:
# Load the dataset metadata table.
dataset_table = pd.read_csv(DATASET_TABLE_PATH)

# Duplicate the "nutrimouse" row so the dataset appears once per task.
# The two-element assignment below expects exactly two matching rows afterwards,
# i.e. a single "nutrimouse" row in the CSV.
nutrimouse_rows = dataset_table.index[dataset_table["dataset"] == "nutrimouse"]
expanded_index = dataset_table.index.append(nutrimouse_rows)
dataset_table = dataset_table.reindex(expanded_index)
dataset_table = dataset_table.sort_index().reset_index(drop=True)
dataset_table.loc[dataset_table["dataset"] == "nutrimouse", "dataset"] = [
    "nutrimouse_genotype",
    "nutrimouse_diet",
]

# Index by dataset name so later cells can look rows up by label.
dataset_table = dataset_table.set_index("dataset")
print("dataset_table", dataset_table.shape)
dataset_table

In [None]:
# Load the timing results and report their shape and the summed "time" column.
results = pd.read_csv(TIME_RESULTS_PATH)
print("results", results.shape)

# The sum is divided by 60 twice before rounding (presumably seconds -> hours; confirm units).
total_time = results["time"].sum() / 60 / 60
print("Total time:", round(total_time, 1))
results.head()

In [None]:
# Export the timing results exactly as loaded.
results.to_csv(os.path.join(PAPER_RESULTS_FOLDER, "time_matrix.csv"), index=False)

# Export a second copy in which each missing time is replaced by the largest
# time recorded for the same dataset.
mat = results.copy()
slowest_per_dataset = mat.groupby("dataset")["time"].transform("max")
mat["time"] = mat["time"].fillna(slowest_per_dataset)
mat.to_csv(os.path.join(PAPER_RESULTS_FOLDER, "im_time_matrix.csv"), index=False)

In [None]:
# Reshape the long results into a matrix: one row per algorithm, one column per dataset.
pivot_table = results.pivot(index="algorithm", columns="dataset", values="time")

In [None]:
# Normalise every run time by the size (n_cells) of the dataset it was measured on.
# The divisor is looked up by dataset *name* instead of by row position, so the
# result no longer depends on `results` listing the datasets of every algorithm
# in exactly the same order as `dataset_table`.
# NOTE: this rewrites results["time"] in place; re-running the cell without
# reloading `results` divides the times a second time.
unknown_datasets = results.loc[~results["dataset"].isin(dataset_table.index), "dataset"].unique()
if len(unknown_datasets) > 0:
    raise KeyError(f"Datasets missing from dataset_table: {list(unknown_datasets)}")
results["time"] = results["time"] / results["dataset"].map(dataset_table["n_cells"])

# Mean normalised time per algorithm (missing runs are skipped by mean()).
results.groupby("algorithm", sort=False)["time"].mean().sort_values().to_csv(
    os.path.join(PAPER_RESULTS_FOLDER, "time_barplot.csv")
)

# Same summary after replacing each missing run by the largest normalised time
# recorded for the same dataset.
results["time"] = results["time"].fillna(results.groupby("dataset")["time"].transform("max"))
results.groupby("algorithm", sort=False)["time"].mean().sort_values().to_csv(
    os.path.join(PAPER_RESULTS_FOLDER, "im_time_barplot.csv")
)

In [None]:
# Scale every measured time by a fixed factor and clamp negative entries to zero.
# NOTE(review): the meaning of the individual factors is not documented here; confirm.
time_table = pivot_table * 10 * 4 * 1.5 * 50 / 60 / 24 / 6 / 3
time_table = time_table.mask(time_table < 0, 0)

# Grand total, then totals per dataset (columns) and per algorithm (rows).
print("Total", time_table.sum().sum(), end="\n\n")
print(time_table.sum(), end="\n\n")
print(time_table.sum(axis=1))

In [None]:
# Same estimate as for the full table, but without four algorithms and with
# every entry above 1800 (or below 0) zeroed before scaling.
# NOTE(review): the meaning of the individual scaling factors is not documented here; confirm.
excluded_algorithms = ["SUMO", "MONET", "IntNMF", "MSNE"]
time_table = pivot_table.drop(index=excluded_algorithms)
time_table = time_table.mask((time_table > 1800) | (time_table < 0), 0)
time_table = time_table * 10 * 4 * 1.5 * 50 / 60 / 24 / 6 / 3

# Grand total, then totals per dataset (columns) and per algorithm (rows).
print("Total", time_table.sum().sum(), end="\n\n")
print(time_table.sum(), end="\n\n")
print(time_table.sum(axis=1))

In [None]:
# Per-algorithm mean of the times divided by each dataset's n_cells, sorted ascending.
# mean() skips missing runs, so algorithms may be averaged over different datasets.
time = pivot_table.div(dataset_table["n_cells"]).mean(axis=1).sort_values()

# Bar annotations: each algorithm's mean relative to the smallest one, e.g. "2.5x".
labels = time.div(time.min()).round(1).astype(str) + "x"

ax = time.mul(1000).plot.bar(
    title="Computing time",
    xlabel="Algorithm",
    ylabel="Miliseconds x 10$^-$$^3$",
    figsize=(15, 6),
)
_ = ax.bar_label(ax.containers[0], labels=labels)

In [None]:
# Same bar plot as the raw version, but missing runs are first replaced by the
# largest time recorded for the corresponding dataset column.
imputed_times = pivot_table.fillna(pivot_table.max())
time = imputed_times.div(dataset_table["n_cells"]).mean(axis=1).sort_values()

# Bar annotations: each algorithm's mean relative to the smallest one, e.g. "2.5x".
labels = time.div(time.min()).round(1).astype(str) + "x"

ax = time.mul(1000).plot.bar(
    title="Computing time",
    xlabel="Algorithm",
    ylabel="Miliseconds x 10$^-$$^3$",
    figsize=(15, 6),
)
_ = ax.bar_label(ax.containers[0], labels=labels)

In [None]:
# Colour-coded view of the algorithm x dataset times divided by 60,
# shown without decimals and with a single colour scale over the whole table.
(
    pivot_table.div(60)
    .style.format(precision=0)
    .set_properties(**{"text-align": "center"})
    .background_gradient(cmap="RdYlGn_r", axis=None)
)

In [None]:
# Colour-coded view of the algorithm x dataset times divided by 60, with missing
# runs replaced by the largest time recorded for the same dataset.
# The fill values come from pivot_table itself (as in the other imputed views of
# this notebook) rather than from time_table, which is rescaled and filtered in
# another cell and therefore is not in the same units as pivot_table.
(
    pivot_table.fillna(pivot_table.max())
    .div(60)
    .style.format(precision=0)
    .set_properties(**{"text-align": "center"})
    .background_gradient(cmap="RdYlGn_r", axis=None)
)

In [None]:
# Two-panel bar plot of the mean size-normalised time per row of `results`.
# NOTE(review): this cell takes a row-wise mean and divides by n_cells per column,
# so it expects `results` with one column per dataset. That does not match the
# long frame (dataset/algorithm/time columns) pivoted earlier in the notebook;
# presumably it relies on the wide table read with index_col=0 further down.
# Confirm the intended execution order.
# Missing entries are filled with the column maximum before normalising.
time = (results.fillna(results.max())/dataset_table["n_cells"]).mean(1)
time = time.sort_values()
# Bar annotations: each value relative to the smallest one, e.g. "2.5x".
labels = time / time.min(0)
labels = labels.round(1).astype(str) + "x"
# First column: values multiplied by 1000. Second column: negated log10 of the
# unscaled values.
# NOTE(review): the second panel's y-label is built as "log(<label>)" although the
# plotted quantity is -log10 of the unscaled value. Confirm this is intended.
time = pd.concat([time.mul(1000), 0- time.apply(np.log10)], axis=1)
axs = time.plot.bar(title= "Computing time", xlabel= "Algorithm", ylabel= "Miliseconds x 10$^-$$^3$", subplots= True, layout= (1,2), figsize= (20,6))
for idx, ax in enumerate(axs.flatten()):
    # Only the second panel gets the "log(...)" prefix on its y-label.
    if idx > 0:
        ax.set(ylabel= f"log({ax.get_ylabel()})")
    # Both panels are annotated with the same relative-factor labels.
    _ = ax.bar_label(ax.containers[0], labels= labels, rotation= 45)

In [None]:
# Overwrite selected (algorithm, dataset) entries with the mean of that dataset's column.
# Each mean is computed right before its assignment, from the column's current values.
# NOTE(review): the reason these particular runs are replaced is not recorded here; confirm.
mean_overrides = {
    "simulated_gm": ["AJIVE", "DeepMF", "DFMF"],
    "statlog": ["AJIVE", "PIMVC"],
    "simulated_InterSIM": ["SUMO"],
    "bbcsport": ["SUMO"],
    "digits": ["SUMO", "PIMVC"],
}
for dataset_name, algorithm_names in mean_overrides.items():
    results.loc[algorithm_names, dataset_name] = results[dataset_name].mean()

In [None]:
# Colour-coded view of `results` divided by 60, with missing entries replaced by
# the column maximum, shown without decimals on a single table-wide colour scale.
(
    results.fillna(results.max())
    .div(60)
    .style.format(precision=0)
    .set_properties(**{"text-align": "center"})
    .background_gradient(cmap="RdYlGn_r", axis=None)
)

In [None]:
# Colour-coded view of `results` divided by 60 (missing entries left as-is),
# shown without decimals on a single table-wide colour scale.
(
    results.div(60)
    .style.format(precision=0)
    .set_properties(**{"text-align": "center"})
    .background_gradient(cmap="RdYlGn_r", axis=None)
)

In [None]:
# Zero out, in place, every entry of `results` greater than 180.
# NOTE(review): the filtered time_table cell earlier in this notebook uses 1800 as
# its cut-off. Confirm that 180 is intended here.
results[results > 180] = 0

In [None]:
# Grand total over all entries of `results` after scaling by a fixed factor.
# NOTE(review): the meaning of the individual factors is not documented here
# (the trailing / 60 / 60 / 24 presumably converts seconds to days); confirm.
(results * 10 * 4 * 1.5 * 50 / 60 / 60 / 24 / 5).sum().sum()

In [None]:
# Grand total after dropping the rows labelled "MSNE", "SUMO" and "MONET":
# column sums of the scaled table are divided by 3600, 24 and 3, then added up.
# NOTE(review): the multipliers differ from the neighbouring total
# (10 * 5 ... / 3 here versus 10 * 4 ... / 5); confirm which set is intended.
((results.drop(["MSNE", "SUMO", "MONET"]) * 10 * 5 * 1.5 * 50).sum() / 3600 / 24 / 3).sum()

In [None]:
# Point TIME_RESULTS_PATH at the test timing file and reload `results` from it,
# using the file's first column as the row index.
# NOTE: this rebinds both the TIME_RESULTS_PATH name imported from `settings`
# and the `results` frame used by the cells above.
TIME_RESULTS_PATH = os.path.join("test", "time_evaluation.csv")
results = pd.read_csv(TIME_RESULTS_PATH, index_col=0)
print(f"results {results.shape}")
results.head()

In [None]:
# Per-run time budget and its equivalent after multiplying by 3600
# (presumably hours -> seconds; confirm units).
# NOTE: this rebinds the TIME_LIMIT name imported from `settings`.
TIME_LIMIT = 0.4
time_limit_seconds = TIME_LIMIT * 3600

# Multipliers applied to every single-run time to extrapolate the full experiment.
N_MISSING_PERCENTAGE = 10
N_MISSING_PATTERNS = 5
N_IMPUTING_PATTERNS = 2
N_PERMUTATIONS = 50

# Keep only the runs strictly below the limit; the others become NaN and are
# therefore skipped by the sums below.
estimated_time = results[results < time_limit_seconds]

print("Iteration", round(results.div(3600).div(24).sum().sum(), 2), "days")
print("Limited iteration", round(estimated_time.div(3600).div(24).sum().sum(), 2), "days")

# Column-wise totals of the extrapolated times, with and without the time limit.
time = pd.DataFrame([(results * N_MISSING_PERCENTAGE * N_PERMUTATIONS * N_IMPUTING_PATTERNS * N_MISSING_PATTERNS).sum(),
                     (estimated_time * N_MISSING_PERCENTAGE * N_PERMUTATIONS * N_IMPUTING_PATTERNS * N_MISSING_PATTERNS).sum()],
                    index= ["Total time", "Estimated time"]).T
# Divide by 3600 and by 24, i.e. express the totals in days (matching the prints above).
time = time.div(3600).div(24).sort_values("Total time")
print(time.sum(0))

# Bar annotations: each value relative to the smallest one in its column, e.g. "2.5x".
labels = time / time.min(0)
labels = labels.round(1).astype(str) + "x"

# Top row: raw values; bottom row: log1p of the same values.
time = pd.concat([time, time.apply(np.log1p)], axis=1)
# The plotted values are in days, so the axis is labelled "Days" (it used to say "Minutes").
axs = time.plot.bar(title= "Computing time", xlabel= "Dataset", ylabel= "Days", subplots= True, layout= (2,2), figsize= (18,10))
for idx, ax in enumerate(axs.flatten()):
    # Panels 2 and 3 hold the log1p columns.
    if idx > 1:
        ax.set(ylabel= f"(log) {ax.get_ylabel()}")
    # Label column 0 annotates the "Total time" panels, column 1 the "Estimated time" panels.
    _ = ax.bar_label(ax.containers[0], labels= labels.iloc[:,idx % 2])

In [None]:
# Compare the row-wise mean time on the two large datasets against all the others.
large_datasets = ["caltech101", "nuswide"]
small_mean = results.drop(columns=large_datasets).mean(axis=1)
large_mean = results[large_datasets].mean(axis=1)
time = pd.DataFrame([small_mean, large_mean], index=["Small datasets", "Large datasets"]).T

# Divide by 60 (the axis is labelled in minutes) and order rows by the small-dataset mean.
time = time.div(60).sort_values("Small datasets")

# Bar annotations: each value relative to the smallest one in its column, e.g. "2.5x".
labels = time.div(time.min(axis=0)).round(1).astype(str) + "x"

# Top row: raw values; bottom row: log1p of the same values.
time = pd.concat([time, time.apply(np.log1p)], axis=1)
axs = time.plot.bar(
    title="Computing time",
    xlabel="Algorithm",
    ylabel="Minutes",
    subplots=True,
    layout=(2, 2),
    figsize=(18, 10),
)
for idx, ax in enumerate(axs.flatten()):
    # Panels 2 and 3 hold the log1p columns.
    if idx > 1:
        ax.set(ylabel=f"(log) {ax.get_ylabel()}")
    _ = ax.bar_label(ax.containers[0], labels=labels.iloc[:, idx % 2])

In [None]:
# Colour-coded view of the reloaded `results` divided by 60,
# shown without decimals on a single table-wide colour scale.
(
    results.div(60)
    .style.format(precision=0)
    .set_properties(**{"text-align": "center"})
    .background_gradient(cmap="RdYlGn_r", axis=None)
)