## Count vs SP

---

#### SP

In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

# Directory whose entries are listed and loaded as CSV files by the SP cell
# below. NOTE(review): the leading "..." looks like an elided path prefix —
# confirm the real location before running.
PATH = Path(".../evaluate_models/CORRECTIONS/preds/nd_preds")


def read_stuff(full_path_filename, extra_cl=None):
    """Load the score columns of one CSV file, indexed by ``Type``.

    Parameters
    ----------
    full_path_filename : str or path-like
        Comma-delimited CSV file to read.
    extra_cl : iterable of str, optional
        Extra column names to load in addition to the default ones
        (``G0`` .. ``G7``, ``GraphName`` and ``Type``).

    Returns
    -------
    pandas.DataFrame
        The requested columns, with ``Type`` moved into the index.
    """
    # A ``None`` default (instead of ``[]``) avoids sharing one mutable list
    # object between calls; the column list is rebuilt on every call.
    cols = ["G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "GraphName", "Type"]
    if extra_cl is not None:
        cols.extend(extra_cl)
    df = pd.read_csv(full_path_filename, delimiter=",", usecols=cols)
    return df.set_index("Type")

In [None]:
# Value substituted for exact zeros in every loaded table.
EPSILON = 0.0001
names = []
all_cnts = []

tot = 0  # not read anywhere in the visible cells


def _abs_row_gap(group):
    """Absolute difference between the first two rows of one group."""
    return np.abs(group.iloc[0, :] - group.iloc[1, :])


for filename in os.listdir(PATH):
    # Entries whose name contains ".sha256" are not data files.
    if ".sha256" in filename:
        continue
    print(filename)

    counts = read_stuff(PATH / filename).replace(0, EPSILON)
    # One row per GraphName: |row 0 - row 1| of that graph's rows.
    counts = counts.groupby("GraphName").apply(_abs_row_gap)

    # 25 % / 50 % / 75 % quantiles of every column, taken over all graphs.
    quarts = np.quantile(counts, [0.25, 0.5, 0.75], axis=0)
    names.append(filename)
    all_cnts.append(quarts)

# Stack the per-file quantile tables along a new leading (file) axis.
all_cnts = np.array(all_cnts)
print(all_cnts.shape)

# For every (quantile, column) cell find the file with the smallest value,
# then keep the file index that wins most often.
v, c = np.unique(np.argmin(all_cnts, axis=0), return_counts=True)
to_use = v[np.argmax(c)]

#### Count

In [None]:
# Same directory string as PATH in the SP section; only one file is read here.
PATH2 = Path(".../evaluate_models/CORRECTIONS/preds/nd_preds")

for filename in os.listdir(PATH2):
    if ".sha256" not in filename:
        print(filename)
        # Load the first non-".sha256" entry (with DatasetName), zeros
        # replaced by EPSILON, and stop looking.
        true_sps = read_stuff(PATH2 / filename, ["DatasetName"]).replace(0, EPSILON)
        break

# Keep only the rows whose Type index equals "True", ordered by GraphName.
true_sps = true_sps[true_sps.index.isin(["True"])].sort_values(
    by="GraphName", ascending=True
)

In [None]:
# Display the (rows, columns) size of the filtered table built above.
true_sps.shape

In [None]:
# Directory scanned below for raw score files whose names start with "nd".
PATH3 = Path("../nd_d_raw_scores/raw_scores")


def read_stuff2(full_path_filename, name, dname):
    """Read one raw-score CSV and label its rows.

    Parameters
    ----------
    full_path_filename : str or path-like
        Comma-delimited file; whitespace following a delimiter is skipped.
    name : str
        Value written to the ``GraphName`` column of every row.
    dname : str
        Value written to the ``DatasetName`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``occ_original``, ``avg_random`` and ``stdev_random`` from the
        file, followed by ``Graph``, ``GraphName`` and ``DatasetName``.

    Raises
    ------
    ValueError
        If the file does not contain exactly 8 data rows, because ``Graph``
        is filled with the fixed sequence 0..7.
    """
    wanted = ["occ_original", "avg_random", "stdev_random"]
    scores = pd.read_csv(
        full_path_filename,
        usecols=wanted,
        delimiter=",",
        skipinitialspace=True,
    )
    # Rows are numbered 0..7 in file order; the two labels are broadcast.
    return scores.assign(Graph=list(range(8)), GraphName=name, DatasetName=dname)


dfs = []

for filename in sorted(os.listdir(PATH3)):
    if filename.startswith("nd"):
        # The text before "@" is passed on as the dataset name; the text
        # between "@" and ".score" is passed on as the graph name.
        pieces = filename.split("@")
        counts = read_stuff2(
            PATH3 / filename,
            pieces[1].split(".score")[0],
            pieces[0],
        )
        dfs.append(counts)

# Shorter column names used by the z-score computation further down.
column_names = {"occ_original": "y", "avg_random": "Ey", "stdev_random": "sigma_y"}
df = pd.concat(dfs).reset_index(drop=True).rename(columns=column_names)
df = df.sort_values(by="GraphName", ascending=True)

In [None]:
# Eight constants, one per graph index 0..7.
# NOTE(review): presumably per-graph means — confirm the origin of these
# values. `ms` is not referenced by any of the visible cells.
ms = [
    92.15, 59.08, 140.78, 37.04,
    913.2499999999999, 109.96000000000001, 79.50999999999999, 201.59999999999997,
]
# ss[j] enters both the numerator and the denominator of the z-score formula
# for graph j. NOTE(review): presumably the matching standard deviations —
# confirm.
ss = [
    104.06789850861793, 96.61549358151622, 252.91898228484155, 89.70138460469828,
    1780.922847149758, 252.750031454004, 186.38784804809566, 367.91743639028584,
]

In [None]:
# Two z-score variants per row of `df`:
#   z_scores  : (y - Ey - ss[j]) / sqrt(sigma_y**2 + ss[j]**2)
#   z_scores2 : (y - Ey + ss[j]) / sqrt(sigma_y**2 + ss[j]**2)
z_scores = []
z_scores2 = []
for i in range(0, df.shape[0], 8):
    # Every consecutive run of 8 rows is handled as one unit.
    z_score_df = df.iloc[i : i + 8]
    graph = []
    graph2 = []
    for j in range(8):
        vals = z_score_df[z_score_df["Graph"] == j]
        centred = vals["y"] - vals["Ey"]
        spread = np.sqrt(vals["sigma_y"] ** 2 + ss[j] ** 2)
        _t1 = (centred - ss[j]) / spread
        _t2 = (centred + ss[j]) / spread
        # .item() only succeeds when exactly one row matched Graph == j.
        graph.append(_t1.item())
        graph2.append(_t2.item())
    z_scores.append(graph)
    z_scores2.append(graph2)

In [None]:
def _rows_to_unit_norm(table):
    """Return *table* as an array with each row divided by its Euclidean norm."""
    scaled = np.array(table)
    for row in range(scaled.shape[0]):
        scaled[row, :] = scaled[row, :] / np.sqrt(np.sum(scaled[row, :] ** 2))
    return scaled


# Scale every row of both z-score tables to unit length.
z_scores = _rows_to_unit_norm(z_scores)
z_scores2 = _rows_to_unit_norm(z_scores2)

# Sanity check: the squared entries of every row should now sum to 1.
# The second line previously printed `z_scores` again, so `z_scores2` was
# never actually checked.
print(np.sum(z_scores**2, axis=1))
print(np.sum(z_scores2**2, axis=1))

In [None]:
rng = np.random.default_rng(42)

# Inputs that do not change between resamples are prepared once instead of
# being rebuilt in each of the 10000 iterations.
true_vals = true_sps.iloc[:, 0:8].to_numpy()
sp_quarts = all_cnts[to_use, :, :].reshape(-1, 8)
graph_cols = ["G" + str(i) for i in range(8)]
# NOTE(review): 9924 is used as the number of z-score rows per block and
# int(3490 * 0.1) == 349 as the sample size per block — confirm both constants.
block_size = 9924
n_blocks = int(len(z_scores) / block_size)
sample_size = int(3490 * 0.1)

all_quarts = []
for j in range(10000):
    # Draw `sample_size` row indices (rng.choice default: with replacement)
    # from every consecutive block of `block_size` rows.
    choices = [
        rng.choice(np.arange(i * block_size, (i + 1) * block_size), sample_size)
        for i in range(n_blocks)
    ]
    # Index with one flat integer array. Indexing with a Python list of
    # arrays relies on how NumPy interprets nested sequences and needed a
    # hard-coded reshape(11 * 349, 8) afterwards; the row order is the same.
    picked = np.concatenate(choices)

    diffs_count = np.abs(true_vals - z_scores[picked])
    diffs_count2 = np.abs(true_vals - z_scores2[picked])
    # Element-wise, keep whichever z-score variant is closer to the reference.
    diff_final = np.minimum(diffs_count, diffs_count2)
    quarts_count = np.nanquantile(diff_final, [0.25, 0.5, 0.75], axis=0)

    # Rows 0-2: SP quartiles of the selected file; rows 3-5: Count quartiles
    # of this resample. A dedicated name is used so the raw-score frame `df`
    # (read by the z-score cell) is not overwritten by this loop.
    quart_df = pd.DataFrame(
        np.vstack([sp_quarts, quarts_count.reshape(-1, 8)]),
        columns=graph_cols,
    )
    quart_df["Type"] = ["SP", "SP", "SP", "Count", "Count", "Count"]
    quart_df["QS"] = ["0.25", "0.5", "0.75"] * 2
    all_quarts.append(quart_df)

In [None]:
# Numeric part of every resample table: all columns except the last two.
numeric_arrays = [frame.iloc[:, :-2].to_numpy(dtype=float) for frame in all_quarts]

# Stack along a new leading axis and reduce over it, giving one mean table
# and one standard-deviation table across all resamples.
stacked_array = np.stack(numeric_arrays, axis=0)
average_array = stacked_array.mean(axis=0)
std_array = stacked_array.std(axis=0)

graph_columns = ["G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7"]
averaged_df = pd.DataFrame(average_array, columns=graph_columns)
std_df = pd.DataFrame(std_array, columns=graph_columns)

In [None]:
# Copy the row labels of the first resample table onto both summary tables.
for summary in (averaged_df, std_df):
    summary["Type"] = all_quarts[0]["Type"]
    summary["QS"] = all_quarts[0]["QS"]

In [None]:
# Long format: one row per (Type, QS, Graph) with its averaged value.
df_melted = averaged_df.melt(
    id_vars=["Type", "QS"],
    value_vars=["G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7"],
    var_name="Graph",
    value_name="Value",
)

# Wide again: one row per (Graph, Type), one column per quantile label.
df_pivot = df_melted.pivot_table(
    values="Value", index=["Graph", "Type"], columns="QS"
)
df_pivot = df_pivot.reset_index()
df_pivot.columns = ["Graph", "Type", "QS = 0.25", "QS = 0.5", "QS = 0.75"]

# Displayed rounded to three decimals; df_pivot itself is left unrounded.
df_pivot.round(3)

In [None]:
def percentage_change(old_value, new_value):
    """Return the relative change from *old_value* to *new_value* in percent.

    Parameters
    ----------
    old_value : number
        Reference value.
    new_value : number
        Value compared against the reference.

    Returns
    -------
    float
        ``(new_value - old_value) / old_value * 100``. When *old_value* is 0
        the ratio is undefined, so ``0.0`` is returned if *new_value* is also
        0 and ``100.0`` otherwise (regardless of the sign of *new_value*).

    Raises
    ------
    ValueError
        If the arithmetic fails with a ``TypeError`` (non-numeric input). The
        original ``TypeError`` is attached as ``__cause__``.
    """
    try:
        if old_value == 0:
            return 0.0 if new_value == 0 else 100.0
        return ((new_value - old_value) / old_value) * 100
    except TypeError as exc:
        # Chain explicitly so the traceback shows which operation failed.
        raise ValueError("Both values must be numbers.") from exc

# Element-wise wrapper so percentage_change can be applied to whole rows.
vectorized_percentage_change = np.vectorize(percentage_change)

# Percentage change from the first to the second row of every Graph group,
# per quartile column. NOTE(review): with the pivot built above the two rows
# are presumably "Count" then "SP" — confirm the row order.
# The quartile columns are selected by name: slicing by position (`1:`)
# only works when the grouping column is handed to the applied function,
# which depends on the pandas version.
quartile_cols = ["QS = 0.25", "QS = 0.5", "QS = 0.75"]
df_pivot.groupby("Graph")[quartile_cols].apply(
    lambda x: pd.Series(
        np.round(vectorized_percentage_change(x.iloc[0], x.iloc[1]), 3)
    )
).transpose()


In [None]:
# Largest standard deviation among rows 3 onward, with the last two columns
# excluded.
std_df.iloc[3:, :-2].to_numpy().max()

---

## Single vs Multi

In [None]:
df = pd.read_csv(".../multi-vs-isolated.csv", skipinitialspace=True)

# Percentage change from the first to the second row of every "g" group, on
# all columns other than the grouping key and "type".
# The value columns are selected by name: slicing by position (`1:`) only
# works when the grouping column is handed to the applied function, which
# depends on the pandas version.
# NOTE(review): this assumes "g" was the leading column once "type" is
# dropped, as the positional slice implied — confirm against the CSV.
value_cols = [col for col in df.columns if col not in ("g", "type")]
df.groupby("g")[value_cols].apply(
    lambda x: pd.Series(
        np.round(vectorized_percentage_change(x.iloc[0], x.iloc[1]), 3)
    )
).transpose()