In [None]:
import json
import os
import numpy as np
import pandas as pd
import pickle
import time

import torch
import torch_geometric

from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Input location: the pickled graphs and the end-to-end timing CSV are read from here.
DATA_DIR = "/nfs/rohit/ALL_LOGS"
# Parent directory; every run writes into its own "run_<RUN_NUMBER>" sub-directory.
OUTPUT_DIR = "/nfs/rohit/models2/"
# Identifier of this run; it selects the sub-directory below.
RUN_NUMBER = 2

# The scaler, hyperparameters, training log and checkpoints of this run are written here.
RESULT_DIR = os.path.join(OUTPUT_DIR, f"run_{RUN_NUMBER}")

# makedirs (unlike mkdir) also creates OUTPUT_DIR when it does not exist yet.
# exist_ok is deliberately left at its default of False: an existing run
# directory still raises FileExistsError, so a previous run's artefacts are
# never overwritten by accident.
os.makedirs(RESULT_DIR)

In [None]:
# Load the pickled graph collection; its keys are later treated as ".log"
# filenames by process_datapoint.
# NOTE: pickle.load can execute arbitrary code -- only open trusted files.
graphs_path = os.path.join(DATA_DIR, "pyg_simplified.pkl")
with open(graphs_path, 'rb') as pickle_file:
    data = pickle.load(pickle_file)

In [None]:
# Timing table with columns 'file' and 'time'. Because `names` is passed
# without `header`, the first line of the CSV is read as data, not as a header.
e2e_times_path = os.path.join(DATA_DIR, "all_e2e_times.csv")
all_e2e_times = pd.read_csv(e2e_times_path, names=['file', 'time'])

In [None]:
def process_datapoint(x):
    """Annotate the entry stored under log filename ``x`` with run metadata.

    The name (without its ``.log`` suffix) must split on ``_`` into exactly
    six fields, in this order: program, cpu, mem, file, lvl, run. Each field
    is stored as a string on ``data[x]``; for cpu/mem/lvl/run every occurrence
    of the marker text ('cpu', 'mem', 'lvl', 'run') is removed first.
    ``e2e_time`` is then taken from the ``all_e2e_times`` row whose 'file'
    column equals ``"<name without .log>.info"``.

    Reads the module-level ``data`` and ``all_e2e_times``.

    Args:
        x: Key into ``data``; must end with ".log".

    Returns:
        ``data[x]``, modified in place.

    Raises:
        AssertionError: if ``x`` does not end with ".log" or does not have
            exactly six underscore-separated fields.
        ValueError: raised by ``Series.item()`` when the timing table does not
            contain exactly one matching row.
    """
    assert x.endswith(".log")
    stem = x[:-4]  # strip the ".log" suffix
    fields = stem.split('_')
    assert len(fields) == 6, f"{x}"

    entry = data[x]

    # (attribute name, position in `fields`, marker text to remove or None)
    layout = (
        ('program', 0, None),
        ('cpu', 1, 'cpu'),
        ('mem', 2, 'mem'),
        ('file', 3, None),
        ('lvl', 4, 'lvl'),
        ('run', 5, 'run'),
    )
    for attribute, position, marker in layout:
        raw = fields[position]
        entry[attribute] = raw if marker is None else raw.replace(marker, '')

    # The timing table lists runs under their ".info" name, not the ".log" name.
    is_match = all_e2e_times['file'] == f"{stem}.info"
    entry['e2e_time'] = all_e2e_times.loc[is_match, 'time'].item()
    return entry

# Fixed seed for the train/test partition. Without it every kernel restart
# produces a different split, so the held-out graphs that belong to the saved
# scaler and checkpoints could not be reconstructed later.
SPLIT_SEED = 42

# Attach filename metadata and the end-to-end time to every entry of `data`
# (process_datapoint modifies the entries in place and returns them).
dataset = [process_datapoint(log_name) for log_name in data.keys()]

# Keep only runs with one of the selected CPU settings; 'cpu' is stored as a
# string, hence the string literals.
dataset = [entry for entry in dataset if entry['cpu'] in ['0.1', '0.3', '0.5', '0.7', '1.0']]

# Hold out 1% of the entries for evaluation.
X_train, X_test = train_test_split(dataset, test_size=0.01, random_state=SPLIT_SEED)

In [None]:
# Stack the `x` matrices of all training graphs and fit the min-max scaler on
# them; the test split is not used for fitting.
all_features = torch.concat([graph.x for graph in X_train])
scaler = MinMaxScaler().fit(all_features)

# Persist the fitted scaler next to the other artefacts of this run.
# Note: mode 'wb' overwrites an existing scaler.pkl without any check.
scaler_path = os.path.join(RESULT_DIR, 'scaler.pkl')
with open(scaler_path, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

# Group all training graphs into a single graph to perform sampling.
train_graphs = torch_geometric.data.Batch.from_data_list(list(X_train))
# scaler.transform returns a NumPy array. Wrap it in a float32 tensor so that
# Batch.x is a torch.Tensor, exactly as is done for the individual graphs in
# X_train / X_test further down.
train_graphs.x = torch.from_numpy(scaler.transform(train_graphs.x).astype(np.float32))

# Same treatment for the held-out graphs, scaled with the scaler fitted on the
# training split.
test_graphs = torch_geometric.data.Batch.from_data_list(list(X_test))
test_graphs.x = torch.from_numpy(scaler.transform(test_graphs.x).astype(np.float32))

In [None]:
# Settings for this run. BATCH_SIZE and NUM_NEIGHBORS are passed to the link
# loaders; HIDDEN_CHANNELS, NUM_LAYERS and OUT_CHANNELS configure GraphSAGE;
# LR is the Adam learning rate; NUM_EPOCHS bounds the training loop.
hyperparams = dict(
    BATCH_SIZE=65536,
    NUM_NEIGHBORS=[10, 10],
    HIDDEN_CHANNELS=64,
    NUM_LAYERS=5,
    OUT_CHANNELS=32,
    LR=0.01,
    NUM_EPOCHS=25,
)

# Record the settings alongside the other artefacts of this run.
hyperparams_path = os.path.join(RESULT_DIR, 'hyperparams.json')
with open(hyperparams_path, 'w') as json_file:
    json_file.write(json.dumps(hyperparams))

# The two loaders are configured identically; only the graph they sample
# from differs. One negative edge is sampled per positive edge.
_link_loader_options = dict(
    batch_size=hyperparams['BATCH_SIZE'],
    shuffle=True,
    neg_sampling_ratio=1.0,
    num_neighbors=hyperparams['NUM_NEIGHBORS'],
    num_workers=6,
    persistent_workers=True,
)

train_loader = torch_geometric.loader.LinkNeighborLoader(train_graphs, **_link_loader_options)

eval_loader = torch_geometric.loader.LinkNeighborLoader(test_graphs, **_link_loader_options)

# Replace each individual graph's `x` with its scaled float32 tensor version.
# Caution: this overwrites `x` in place, so running the cell a second time
# applies the scaler to features that are already scaled.
for split in (X_train, X_test):
    for graph in split:
        scaled = scaler.transform(graph.x).astype(np.float32)
        graph.x = torch.from_numpy(scaled)

In [None]:
# Training is pinned to the CUDA device with index 1.
device = torch.device('cuda:1')

# GraphSAGE sized from the hyperparameters; the input width is taken from the
# batched training graphs.
sage_config = dict(
    in_channels=train_graphs.num_features,
    hidden_channels=hyperparams['HIDDEN_CHANNELS'],
    num_layers=hyperparams['NUM_LAYERS'],
    out_channels=hyperparams['OUT_CHANNELS'],
)
model = torch_geometric.nn.GraphSAGE(**sage_config)
model = model.to(device)

learning_rate = hyperparams['LR']
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
from graphsage_unsup import train, evaluate

# Epochs are numbered from 1, so the upper bound must be NUM_EPOCHS + 1 for
# the loop to run NUM_EPOCHS times (range excludes its end value).
for epoch in range(1, hyperparams['NUM_EPOCHS'] + 1):
    # One pass over the training loader, timed.
    start_train = time.time()
    loss = train(model, optimizer, train_loader, device)
    end_train = time.time()

    # One pass over the held-out loader, timed.
    start_val = time.time()
    val_loss = evaluate(model, optimizer, eval_loader, device)
    end_val = time.time()

    logline = {
        'epoch': epoch,
        'loss': loss,
        'val_loss': val_loss,
        'train_time': end_train - start_train,
        'val_time': end_val - start_val
    }
    # Append one JSON object per line, so the log survives an interrupted run.
    with open(os.path.join(RESULT_DIR, 'logs.json'), 'a') as f:
        json.dump(logline, f)
        f.write("\n")

    print(logline)
    # Keep a separate checkpoint for every epoch.
    torch.save(model.state_dict(), os.path.join(RESULT_DIR, f"ckpt_{epoch}.pt"))