In [15]:
import torch
import sys
import os
# Expose only GPU index 5 to this process; set before the first CUDA call visible in this notebook.
os.environ["CUDA_VISIBLE_DEVICES"]="5"
import pandas as pd
import numpy as np
from tqdm import tqdm
import glob
# Put the parent directory on the import path (presumably so the `configs` package imported
# further down resolves when the notebook runs from its own folder).
sys.path.append("../")

In [16]:
# Load the hop-length statistics table; its columns are listed by the next cell.
df = pd.read_csv("/app/nn-runtime-network/notebooks/hopleninfo.csv")

In [19]:
# Display the column names of the loaded table.
df.columns

Index(['path', 'split', 'config_type', 'modeltype', 'graphtype', 'modelname',
       'config_nodes', 'nodes', 'edges', 'config_runtime', 'percent of config',
       'ratio of config nodes', 'ratio of config nodes @ hop 0',
       'ratio of config nodes @ hop 1', 'ratio of config nodes @ hop 2',
       'ratio of config nodes @ hop 3', 'ratio of config nodes @ hop 4',
       'ratio of config nodes @ hop 5'],
      dtype='object')

In [26]:
# Largest value found in the `nodes` column.
df["nodes"].max()

43615

In [25]:
# For hops 1..5, print the largest row-wise product of that hop's
# config-node ratio and the row's node count.
for hop in range(1, 6):
    hop_ratio = df[f"ratio of config nodes @ hop {hop}"]
    scaled = hop_ratio * df["nodes"]
    print(f"name:{hop}", scaled.max())

name:1 9294.0
name:2 14124.0
name:3 21690.0
name:4 32679.0
name:5 39501.0


In [None]:
import json
import matplotlib.pyplot as plt

# How many feature groups (lowest Kendall tau first) are shown in the chart.
# The `top_10_*` variable names below are historical; the slice has always been 13.
NUM_GROUPS_SHOWN = 13

# Load both importance files. Each maps a feature-group name to a sequence
# whose element at index 1 is read as the Kendall tau
# (NOTE(review): the meaning of the other elements is not visible here).
# `with` guarantees the file handles are closed.
with open("/app/nn-runtime-network/workdir/listmle_graphsage_fused_xla_embedding/node_conf_importances_group.json") as conf_file:
    data = json.load(conf_file)
with open("/app/nn-runtime-network/workdir/listmle_graphsage_fused_xla_embedding/node_importances_group.json") as node_file:
    data2 = json.load(node_file)
# Entries of the second file win on duplicate keys (e.g. a shared "original").
data.update(data2)

# Split the un-corrupted baseline ("original") from the per-group results.
kendall_tau_data = {key: value[1] for key, value in data.items() if key != "original"}
original_kendall_tau = data["original"][1] if "original" in data else None

# Ascending sort: the groups with the lowest Kendall tau come first.
top_10_keys = sorted(kendall_tau_data, key=kendall_tau_data.get)[:NUM_GROUPS_SHOWN]
top_10_ktaus = [kendall_tau_data[key] for key in top_10_keys]

# Plotting
plt.figure(figsize=(10, 6))
# Tick labels drop the last "_"-separated token of each key.
bar_labels = ["_".join(key.split("_")[:-1]) for key in top_10_keys]
bar_plot = plt.bar(bar_labels, top_10_ktaus, color='skyblue')

# Baseline line; skipped when no "original" entry exists, because
# axhline(y=None) would raise.
has_baseline = original_kendall_tau is not None
if has_baseline:
    plt.axhline(y=original_kendall_tau, color='r', linestyle='-', label='Original Baseline')

# Adding labels and title
plt.xlabel('Feature groups')
plt.ylabel('Kendall Tau')
plt.title('Performance Change when Feature (Groups) are Corrupted')

# Rotate the x-axis labels
plt.xticks(rotation=90)

# The legend only has an entry when the baseline was drawn.
if has_baseline:
    plt.legend()

# Show the plot
plt.show()

In [None]:
from configs.listmle_gsage_xla_fused import Configs

In [None]:
# Instantiate the experiment configuration. Later cells read its OUTPUTDIR, model,
# test_dataset and stream_dataloder_collate attributes.
CFG = Configs()

In [None]:
# Restore the "bestmodel_opa.pkl" checkpoint from the run's output directory,
# passing map_location="cpu" to the project's loader.
CFG.load_state_dict(os.path.join(CFG.OUTPUTDIR,"bestmodel_opa.pkl"),map_location="cpu")
model = CFG.model
# Move the model to the GPU and switch it to evaluation mode.
model.cuda()
model.eval()
# Trailing literal: the cell displays `1` rather than the value returned by model.eval()
# (presumably to avoid printing the model repr).
1

In [None]:
import ast
import json

def parse_file_to_dicts(file_path):
    """Parse the ``###Iter:`` records of a training log into dictionaries.

    Only lines that start with ``###Iter:`` are read. Each such line has the form
    ``###Iter: <int>  ::  <dict>``; the payload after the ``  ::  `` separator is
    decoded and the iteration number is stored under the ``"iteration"`` key.

    The payload is first decoded the historical way (single quotes swapped for
    double quotes, then ``json.loads``). When that fails -- e.g. the payload is
    a Python dict repr containing ``True``/``False``/``None`` or a string with
    an apostrophe -- it is parsed with ``ast.literal_eval`` instead.

    Args:
        file_path: Path of the log file to read.

    Returns:
        list[dict]: One dictionary per ``###Iter:`` line, in file order.

    Raises:
        IndexError: If a ``###Iter:`` line lacks the ``  ::  `` separator or the
            ``"###Iter: "`` prefix (note the trailing space).
        ValueError: If the iteration is not an integer, or (as
            ``json.JSONDecodeError``) if neither decoder accepts the payload.
    """
    dicts = []
    with open(file_path, 'r') as file:
        for line in file:
            if not line.startswith('###Iter:'):
                continue

            # Split the "###Iter: N" header from the payload.
            parts = line.split('  ::  ', 1)
            json_str = parts[1].strip()
            iteration = int(parts[0].strip().split("###Iter: ")[1])

            try:
                dict_data = json.loads(json_str.replace("'", "\""))
            except json.JSONDecodeError as json_error:
                # Fallback for Python-literal payloads the quote swap cannot
                # turn into valid JSON.
                try:
                    dict_data = ast.literal_eval(json_str)
                except (ValueError, SyntaxError, TypeError):
                    # Surface the original decoding error so failures look the
                    # same as before.
                    raise json_error from None

            dict_data["iteration"] = iteration
            dicts.append(dict_data)
    return dicts

In [None]:
# Parse every listmle* run's logs.txt, keep all records per run in
# `training_info`, and print each run's record with the highest
# ordered_pair_accuracy.
training_info = {}
for i in sorted(glob.glob("/app/nn-runtime-network/workdir/listmle*/logs.txt")):
    # The run name is the directory that holds logs.txt.
    run_name = os.path.basename(os.path.dirname(i))
    training_dicts = parse_file_to_dicts(i)
    training_info[run_name] = training_dicts

    maxim = -1
    tmpdict = {}
    for d in training_dicts:
        val = d.get("ordered_pair_accuracy")
        # `is not None` (rather than a truthiness test) keeps a legitimate
        # accuracy of 0.0; the strict `>` keeps the first record on ties.
        if val is not None and val > maxim:
            maxim = val
            tmpdict = d
    print(run_name, ":", tmpdict)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Subplot titles for the training-curve figure; entry j labels the j-th run of each `datasets` row.
names = ['default:nlp','default+random:xla','random:nlp','random:xla']

In [None]:
import matplotlib.pyplot as plt

# Each inner list is one row of subplots; entries are keys of `training_info`.
datasets = [
    ['listmle_graphsage_default_nlp', 'listmle_graphsage_fused_xla_embedding', 'listmle_graphsage_random_nlp_embedding_redo', "listmle_graphsage_random_xla_embedding"],
]
num_rows = len(datasets)
num_cols = len(datasets[0])

# Divisors that bring the losses onto a scale comparable with OPA / Kendall tau.
VALID_LOSS_SCALE = 200
TRAINING_LOSS_SCALE = 10

# Build the grid once. squeeze=False keeps `axs` two-dimensional, so the same
# indexing works for one row or many, and additional rows land in the same
# figure instead of each producing its own mostly-empty grid.
fig, axs = plt.subplots(num_rows, num_cols, figsize=(24, 6 * num_rows), squeeze=False)

for i, dataset in enumerate(datasets):
    for j, k in enumerate(dataset):
        ax = axs[i, j]

        # Keep only records that carry both losses; every series is built from
        # this one list, so all of them have the same length by construction.
        records = [x for x in training_info[k] if "training_loss" in x and "valid_loss" in x]
        iters = [x["iteration"] for x in records]
        valid_loss = [x["valid_loss"] / VALID_LOSS_SCALE for x in records]
        training_loss = [x["training_loss"] / TRAINING_LOSS_SCALE for x in records]
        opa = [x["ordered_pair_accuracy"] for x in records]
        ktau = [x["kendall_tau"] for x in records]

        ax.plot(iters, valid_loss, label=f'Valid Loss/{VALID_LOSS_SCALE}')
        ax.plot(iters, training_loss, label=f'Training Loss/{TRAINING_LOSS_SCALE}', linestyle='--')
        ax.plot(iters, opa, label='OPA', linestyle=':')
        ax.plot(iters, ktau, label='kendall tau', linestyle='-.')

        ax.set_xlabel('Iterations')
        ax.set_ylabel('Loss')
        ax.set_title(names[j])
        ax.legend()

plt.show()


In [None]:
# Find the logged record with the highest ordered_pair_accuracy for the
# currently loaded configuration and display it.
training_dicts = parse_file_to_dicts(os.path.join(CFG.OUTPUTDIR,"logs.txt"))
maxim = -1
tmpdict = {}
for d in training_dicts:
    val = d.get("ordered_pair_accuracy")
    # `is not None` (rather than a truthiness test) keeps a legitimate
    # accuracy of 0.0; the strict `>` keeps the first record on ties.
    if val is not None and val > maxim:
        maxim = val
        tmpdict = d
tmpdict

In [None]:
def count_parameters(model):
    """Count the elements of a model's parameters.

    Args:
        model: Object whose ``parameters()`` yields items exposing ``numel()``
            and a ``requires_grad`` attribute.

    Returns:
        tuple: ``(total_params, trainable_params)`` where the second value only
        counts parameters whose ``requires_grad`` is truthy.
    """
    total_params = 0
    trainable_params = 0
    for param in model.parameters():
        size = param.numel()
        total_params += size
        if param.requires_grad:
            trainable_params += size
    return total_params, trainable_params

# Report the parameter counts of the model loaded in an earlier cell.
total, trainable = count_parameters(model)
print(f"Total parameters: {total}")
print(f"Trainable parameters: {trainable}")


In [None]:
# Bind `path` to the first match of the glob and stop immediately.
# When nothing matches, the loop body never runs and `path` is left unassigned
# (or keeps whatever value it had before).
for path in glob.glob("../configs/listmle*.py"):
    break

In [None]:
# CFG.test_dataset.files = CFG.test_dataset.files[:8]

In [None]:
# Batch entries that are moved to the GPU and passed to the model as keyword arguments.
USED_KEYS = [
    "node_features",
    "node_config_features",
    "node_separation",
    "node_ops",
    "edges",
    "batches",
]

# One flat prediction tensor per test sample, in dataset order.
pred_sequences = []
for sample in tqdm(CFG.test_dataset):
    chunk_outputs = []
    for chunk in CFG.stream_dataloder_collate([sample]):
        with torch.no_grad():
            gpu_inputs = {key: chunk[key].cuda() for key in USED_KEYS}
            chunk_outputs.append(model(**gpu_inputs).cpu())
    flat_scores = torch.concat(chunk_outputs).flatten()
    # Keep one score per entry of the sample's config_runtimes
    # (presumably the collate step can yield extra trailing outputs -- TODO confirm).
    num_configs = len(sample["config_runtimes"])
    pred_sequences.append(flat_scores[:num_configs])

In [None]:
# Build the submission table: one row per test file, with an ID derived from
# the file name and the config indices ordered by ascending predicted score.
df = pd.DataFrame(CFG.test_dataset.files, columns=["ID"])

# Tile datasets: bare ID and only the first 10 indices.
# Otherwise: "layout:" prefix and the full ordering (a slice bound of None keeps everything).
if CFG.test_dataset.is_tile:
    id_prefix, top_k = "", 10
else:
    id_prefix, top_k = "layout:", None

df["ID"] = df.ID.apply(
    lambda file_path: id_prefix
    + file_path.split("/")[-1].split(".")[0].replace("___", ":").replace("test:", "")
)
df["TopConfigs"] = [
    ";".join(str(index) for index in scores.numpy().argsort().tolist()[:top_k])
    for scores in pred_sequences
]

In [None]:
# Write the submission table to "submission.csv" in the run's output directory, without the index column.
df.to_csv(os.path.join(CFG.OUTPUTDIR,"submission.csv"),index=False)

# Combine multiple files and save

In [None]:
import pandas as pd
import os

In [None]:
# Submission CSVs that get merged into the final submission.
files = [
    "/app/nn-runtime-network/workdir/listmle_graphsage_default_nlp_embedding_hop2/submission.csv",
    "/app/nn-runtime-network/workdir/listmle_graphsage_fused_xla_embedding_hop2/submission.csv",
    '/app/nn-runtime-network/workdir/listmle_graphsage_random_nlp_embedding_hop2/submission.csv',
    "/app/nn-runtime-network/workdir/listmle_graphsage_random_xla_embedding_hop2/submission.csv",
    "/app/nn-runtime-network/workdir/tile_model/results_1697250122338.csv"

]
# One flag per entry of `files`; True means that file's TopConfigs column is
# overwritten with "0;1" when it is loaded. Derived from len(files) so the two
# lists cannot drift apart (they are paired with zip, which would silently drop
# trailing entries on a length mismatch).
make_zero = [False] * len(files)
# To blank out every file instead:
# make_zero = [True] * len(files)

In [None]:
# zip() stops at the shorter input, so a mismatch would silently drop files.
if len(files) != len(make_zero):
    raise ValueError(
        f"files has {len(files)} entries but make_zero has {len(make_zero)}"
    )

# Load every submission part into `pdfs`, one DataFrame per file.
pdfs = []
for x, mask in zip(files, make_zero):
    tdf = pd.read_csv(x)
    if "fused" in x:
        # For paths containing "fused", keep only rows whose ID contains "default".
        # .copy() detaches the filtered rows so the column assignment below
        # does not raise SettingWithCopyWarning.
        tdf = tdf[tdf.ID.apply(lambda sample_id: "default" in sample_id)].copy()
    if mask:
        # Flagged files keep their IDs but get a constant placeholder ranking.
        print("skipping")
        tdf["TopConfigs"] = "0;1"
    pdfs.append(tdf.reset_index(drop=True))

In [None]:
# Stack all parts and drop rows that are exact duplicates.
# NOTE(review): rows sharing an ID but differing in TopConfigs are both kept -- confirm that cannot happen.
tdfs = pd.concat(pdfs).drop_duplicates().reset_index(drop=True)
name="submission_embedding_hop2.csv"
submission_dir = "./submission"
submission_path = os.path.join(submission_dir, name)
if not os.path.exists(submission_path):
    # Create the target folder on first use so to_csv cannot fail on a missing directory.
    os.makedirs(submission_dir, exist_ok=True)
    tdfs.to_csv(submission_path, index=False)
else:
    # An existing submission is never overwritten; say so instead of skipping silently.
    print(f"{submission_path} already exists; not overwriting")