In [None]:
import json
import glob
import os
import pandas as pd
import numpy as np

# Load the workflow DAG: a mapping from each job name to the names of the jobs
# it points to.
with open("adjacency_list_dags/1000genome.json", "r") as dag_file:
    adjacency_list = json.load(dag_file)

# Give every job a dense integer id, following the order of the JSON keys.
lookup = {job_name: position for position, job_name in enumerate(adjacency_list)}
counter = len(lookup)

# One [source_id, target_id] pair per DAG edge.
edge_index = [
    [lookup[parent], lookup[child]]
    for parent in adjacency_list
    for child in adjacency_list[parent]
]

In [None]:
graphs = []

# Map every anomaly family (the directory-name prefix before the first "_") to
# an integer label. "normal" is pinned to 0; the other families are numbered in
# sorted order, because os.listdir returns entries in arbitrary order and the
# label ids would otherwise change from machine to machine.
classes = {"normal": 0}
counter = 1
for d in sorted(os.listdir("data")):
    d = d.split("_")[0]
    if d in classes: continue
    classes[d] = counter
    counter += 1


import math

# Per-job columns read from every run CSV, in the order they are stored in a
# node's feature vector.
FEATURE_COLUMNS = ['type', 'is_clustered', 'ready',
                   'submit', 'execute_start', 'execute_end', 'post_script_start',
                   'post_script_end', 'wms_delay', 'pre_script_delay', 'queue_delay',
                   'runtime', 'post_script_delay', 'stage_in_delay', 'stage_out_delay',
                   'stage_in_bytes', 'stage_out_bytes', 'kickstart_user', 'kickstart_site',
                   'kickstart_hostname', 'kickstart_transformations', 'kickstart_executables',
                   'kickstart_executables_argv', 'kickstart_executables_cpu_time', 'kickstart_status',
                   'kickstart_executables_exitcode']

# Columns that are re-based on the run's earliest 'ready' value. Positions are
# derived from the column names so they stay correct if FEATURE_COLUMNS changes
# (the previous hard-coded slice 1:7 started at 'is_clustered' and missed
# 'post_script_end').
TIMESTAMP_COLUMNS = ['ready', 'submit', 'execute_start', 'execute_end',
                     'post_script_start', 'post_script_end']
TIMESTAMP_SLOTS = [FEATURE_COLUMNS.index(name) for name in TIMESTAMP_COLUMNS]
READY_SLOT = FEATURE_COLUMNS.index('ready')

# Integer encoding of the 'type' column; any other value is left unchanged.
JOB_TYPE_CODES = {'auxiliary': 0, 'compute': 1, 'transfer': 2}

# Runs that are skipped entirely. Paths are normalised so the comparison also
# works where os.path.join does not use "/".
# NOTE(review): the last entry used to read "data/norma/..." and therefore could
# never match a file under "data/normal"; it is assumed to be a typo - confirm.
EXCLUDED_RUNS = {
    os.path.normpath(path) for path in (
        "data/cpu_3/1000genome-20200613T072602Z-run0011.csv",
        "data/hdd_100/1000genome-20200901T234910Z-run0048.csv",
        "data/hdd_100/1000genome-20200901T234910Z-run0049.csv",
        "data/loss_1/1000genome-20200520T215721Z-run0014.csv",
        "data/normal/1000genome-20200616T174351Z-run0022.csv",
    )
}

# Build one graph dict per run CSV. Directories and files are visited in sorted
# order so that the content of `graphs` is reproducible.
for d in sorted(os.listdir("data")):
    for f in sorted(glob.glob(os.path.join("data", d, "1000genome*"))):
        if os.path.normpath(f) in EXCLUDED_RUNS: continue

        print(f)

        graph = {"y": classes[d.split("_")[0]], "edge_index": edge_index, "x": []}
        features = pd.read_csv(f, index_col=[0])

        # Row i of the node-feature matrix belongs to the job whose id is i.
        node_rows = [None] * len(lookup)
        for l in lookup:
            # create_dir_* and cleanup_* jobs are matched on the part of the
            # name before the first "-".
            if l.startswith("create_dir_") or l.startswith("cleanup_"):
                new_l = l.split("-")[0]
            else:
                new_l = l

            matches = features.loc[features.index.str.startswith(new_l), FEATURE_COLUMNS]
            if matches.empty:
                raise IndexError(f"no row in {f} has an index starting with {new_l!r}")
            # When several rows share the prefix, the first one is used.
            job_features = matches.values.tolist()[0]

            job_features[0] = JOB_TYPE_CODES.get(job_features[0], job_features[0])
            # NaN is the only value that differs from itself; replace it by -1.
            job_features = [-1 if x != x else x for x in job_features]
            node_rows[lookup[l]] = job_features
        graph['x'] = node_rows

        # Shift the timestamp columns so they are relative to the smallest
        # 'ready' value of this run.
        # NOTE(review): a missing timestamp has already become -1 at this point,
        # so it takes part in the minimum and is shifted like a real value.
        run_start = min(row[READY_SLOT] for row in node_rows)
        for row in node_rows:
            for slot in TIMESTAMP_SLOTS:
                row[slot] = row[slot] - run_start
        graphs.append(graph)
# `graphs` now holds, per run, the pieces needed to build a PyTorch Geometric
# Data object: node features "x", "edge_index" and the label "y".

In [None]:
## Persist the list of graph dicts to graph.pkl.
import pickle
with open('graph.pkl','wb') as pickle_out:
    pickle.dump(graphs, pickle_out)

In [None]:
# Read the list of graph dicts back from graph.pkl.
# Unpickling can execute arbitrary code, so only load a file you trust.
import pickle
with open('graph.pkl','rb') as pickle_in:
    graphs = pickle.load(pickle_in)

In [None]:
import numpy as np
# Collect the label of every graph and show the label range and the number of
# graphs per label.
y_list = [gr['y'] for gr in graphs]
print(min(y_list))
print(max(y_list))
label_array = np.array(y_list)
print(np.unique(label_array, return_counts=True))


In [None]:
from torch_geometric.data import Data
datasets=[]
import numpy
# torch is used below; importing it here keeps this cell runnable on a fresh
# kernel, where no earlier cell has imported it yet.
import torch

# Turn every graph dict into a PyTorch Geometric Data object whose node
# features are min-max scaled into [new_min, new_max].
for element in graphs:
    gx = torch.tensor(numpy.array(element['x']))
    # The edge list is stored as [source, target] pairs; transpose it so the
    # first dimension has size 2 and make the result contiguous.
    ge = torch.tensor(numpy.array(element['edge_index'])).t().contiguous()
    gy = torch.tensor(numpy.array(element['y']).reshape([-1]))

    # One minimum and one maximum are taken over the whole feature matrix of
    # the graph (all nodes and all columns together).
    v_min, v_max = gx.min(), gx.max()
    new_min, new_max = 0, 1
    span = v_max - v_min
    if span != 0:
        unit = (gx - v_min) / span
    else:
        # Every entry is identical: dividing would give NaN, so map everything
        # to new_min instead. Multiplying by 1.0 yields a floating-point tensor
        # just like the division does.
        unit = (gx - v_min) * 1.0
    gx = unit * (new_max - new_min) + new_min
    datasets.append(Data(x=gx, edge_index=ge, y=gy))

In [None]:
import torch
from torch_geometric.datasets import TUDataset

# Short alias for the list of Data objects.
dataset = datasets

print('====================')
print(f'Number of graphs: {len(dataset)}')

# Use the first graph as a sample for the per-graph statistics.
data = dataset[0]
print()
print(data)
print('=============================================================')

# Size of the sample graph.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')

# Structural checks on the sample graph.
print(f'Has isolated nodes: {data.has_isolated_nodes()}')
print(f'Has self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')

In [None]:
torch.manual_seed(12345)
import random
random.seed(12345)

# Number of graphs used for training; every remaining graph is used for testing.
TRAIN_SIZE = 1000

# Shuffle a copy rather than `datasets` itself, so re-running this cell always
# produces the same split instead of reshuffling an already shuffled list.
shuffled_graphs = list(datasets)
random.shuffle(shuffled_graphs)

# Fail here, with a clear message, rather than end up with an empty test set.
assert len(shuffled_graphs) > TRAIN_SIZE, (
    f"need more than {TRAIN_SIZE} graphs to leave a test set, got {len(shuffled_graphs)}"
)

train_dataset = shuffled_graphs[:TRAIN_SIZE]
test_dataset = shuffled_graphs[TRAIN_SIZE:]

print(f'Number of training graphs: {len(train_dataset)}')
print(f'Number of test graphs: {len(test_dataset)}')

In [None]:
from torch_geometric.loader import DataLoader

# Mini-batch loaders: training batches are reshuffled on every pass, test
# batches keep their order.
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

# Preview one pass over the training loader.
for step, data in enumerate(train_loader):
    print(
        f'Step {step + 1}:',
        '=======',
        f'Number of graphs in the current batch: {data.num_graphs}',
        sep='\n',
    )
    print(data)
    print()

In [None]:
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool


class GCN(torch.nn.Module):
    """Graph-level classifier: three GCNConv layers, mean pooling, linear head.

    Args:
        hidden_channels: Width of every graph-convolution layer.
        in_channels: Number of features per node. Defaults to 14, the value
            that used to be hard-coded; pass the actual feature width of the
            data (e.g. ``data.num_node_features``) if it differs.
        num_classes: Number of output logits. Defaults to 4, the value that
            used to be hard-coded.
    """

    def __init__(self, hidden_channels, in_channels=14, num_classes=4):
        super().__init__()
        # Re-seed so the initial weights are the same on every construction.
        torch.manual_seed(12345)
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, num_classes)

    def forward(self, x, edge_index, batch):
        """Return one row of class logits per graph in the batch.

        Args:
            x: Node feature matrix of the batch.
            edge_index: Edge index of the batch.
            batch: Vector assigning every node to its graph.
        """
        # 1. Obtain node embeddings
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)
        # 2. Readout layer
        x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]
        # 3. Apply a final classifier (dropout is active only in training mode)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)
        return x

model = GCN(hidden_channels=64)
print(model)

In [None]:
# from IPython.display import Javascript
# display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

# Fresh float32 model together with the loss and the optimiser used for training.
model = GCN(hidden_channels=64).float()
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)

def train():
    """Run one optimisation pass over every batch of ``train_loader``."""
    model.train()
    for batch in train_loader:
        # Forward pass on float32 features, then loss against the graph labels.
        logits = model(batch.x.float(), batch.edge_index, batch.batch)
        batch_loss = criterion(logits, batch.y)
        # Backpropagate, apply the update, and reset gradients for the next batch.
        batch_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

def test(loader):
    """Return the fraction of graphs in ``loader`` that the model classifies correctly.

    Args:
        loader: Data loader yielding batches with ``x``, ``edge_index``,
            ``batch`` and ``y``; ``loader.dataset`` must support ``len``.

    Returns:
        Number of correct predictions divided by ``len(loader.dataset)``.
    """
    model.eval()
    correct = 0
    # Evaluation needs no gradients; disabling them avoids building an autograd
    # graph for every batch.
    with torch.no_grad():
        for data in loader:  # Iterate in batches over the training/test dataset.
            out = model(data.x.float(), data.edge_index, data.batch)
            pred = out.argmax(dim=1)  # Use the class with the highest score.
            correct += int((pred == data.y).sum())  # Check against ground-truth labels.
    return correct / len(loader.dataset)  # Derive ratio of correct predictions.
import torch.optim.lr_scheduler as lrs

# The learning rate is multiplied by 0.9 each time the scheduler is stepped,
# which happens on every 50th epoch below.
scheduler = lrs.ExponentialLR(optimizer, gamma=0.9)

for epoch in range(1, 1000):
    train()
    # Accuracy is measured after every epoch, on the training set first.
    train_acc, test_acc = test(train_loader), test(test_loader)
    if epoch % 50 != 0:
        continue
    scheduler.step()
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')