# Attention!!!

This is a very simple but bad quality notebook. 
 - I do not use any sort of ranking loss, which would be better.
 - My strategy instead is to min-max scale the times and apply L1-loss
 - My model is also not optimized. It is a relatively simple GNN that embeds the graph and only processes 1 datapoint at a time and is only trained on 1 epoch.
 - The public score would be much better if you paired this submission with a trained model for layout. Since this only contributes to half of the score.
 - Have fun playing around with it!
 
 
 # CHANGES
 - V5 - normalized train and infer targets, use MSE loss, changed evaluation metric to perform top5 mean instead of top5 max for robustness, 5-fold CV
 - V6 - use SAGEConv instead of GCN, add dropout layer, increase number of paramters, changed evaluation metric to perform top50 mean, 10->20 epochs.
 - V13 - fixed problem where model weights weren't being reset leading to heavy overfitting...oops

In [2]:
# !pip install torch-geometric torch-scatter

In [3]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm 

import sklearn,sklearn.model_selection
import torch
from torch import nn
from torch import Tensor
from torch_geometric.nn import GCNConv,SAGEConv
from torch_geometric.datasets import Planetoid
from torch.utils.data import DataLoader, Dataset
from timm.scheduler import CosineLRScheduler
import matplotlib.pyplot as plt
device = 'cuda' if torch.cuda.is_available() else 'cpu'

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
data_dir = "/home/khizbud/predict-ai-model-runtime/predict-ai-model-runtime/"

In [5]:
def load_df(directory):
    splits = ["train", "valid", "test"]
    dfs = dict()
    
    for split in splits:
        path = os.path.join(directory, split)
        files = os.listdir(path)
        list_df = []
        
        for file in files:
            d = dict(np.load(os.path.join(path,file)))
            d['file'] = file
            list_df.append(d)
        dfs[split] = pd.DataFrame.from_dict(list_df)
    return dfs

tile_xla = load_df(os.path.join(data_dir, "npz_all/npz/tile/xla/"))

In [7]:
tile_xla.keys()

dict_keys(['train', 'valid', 'test'])

In [9]:
train_df = tile_xla['train']
len(train_df)

5709

In [14]:
train_df.head(3)

Unnamed: 0,node_feat,node_opcode,edge_index,config_feat,config_runtime,config_runtime_normalizers,file
0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[63, 11, 63, 11, 63, 41, 63, 41, 26, 63, 63, 41]","[[1, 0], [3, 2], [5, 1], [5, 4], [7, 3], [7, 6...","[[32.0, 32.0, 0.0, 0.0, 0.0, 0.0, 64.0, 1024.0...","[263238, 2029255, 1192602, 1027600, 1962135, 5...","[263238, 263238, 263238, 263238, 263238, 26323...",alexnet_train_batch_32_-1bae27a41d70f4dc.npz
1,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[24, 13, 48, 87, 63, 13, 25, 52, 25, 63, 24, 1...","[[1, 0], [3, 1], [3, 2], [5, 4], [6, 5], [7, 3...","[[6.0, 12.0, 2.0, 2.0, 0.0, 0.0, 22.0, 288.0, ...","[155012, 3950817, 2048285, 1528077, 682642, 77...","[155012, 155012, 155012, 155012, 155012, 15501...",alexnet_train_batch_32_-21d9f3b8c41eb3e3.npz
2,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[63, 11, 63, 11, 63, 63, 13, 63, 41, 63, 41, 2...","[[1, 0], [3, 2], [6, 5], [8, 1], [8, 7], [10, ...","[[3.0, 12.0, 4.0, 3.0, 0.0, 0.0, 22.0, 432.0, ...","[113020, 667977, 966760, 5897798, 1554171, 308...","[113020, 113020, 113020, 113020, 113020, 11302...",alexnet_train_batch_32_-282ddd3271de7d28.npz


In [12]:
list(train_df.columns)

['node_feat',
 'node_opcode',
 'edge_index',
 'config_feat',
 'config_runtime',
 'config_runtime_normalizers',
 'file']

In [23]:
for k, v in train_df.iloc[0].to_dict().items():
    print(k, type(v))
    if isinstance(v, np.ndarray):
        print(v.shape, v.dtype)
    print()

node_feat <class 'numpy.ndarray'>
(12, 140) float32

node_opcode <class 'numpy.ndarray'>
(12,) uint8

edge_index <class 'numpy.ndarray'>
(11, 2) int64

config_feat <class 'numpy.ndarray'>
(266, 24) float32

config_runtime <class 'numpy.ndarray'>
(266,) int64

config_runtime_normalizers <class 'numpy.ndarray'>
(266,) int64

file <class 'str'>



# Define Dataset and Model

In [5]:
class TileDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        config_feat = torch.tensor(row['config_feat'].astype(np.float32))
        node_feat = torch.tensor(row['node_feat'].astype(np.float32))
        node_opcode = torch.tensor(row['node_opcode'].astype(np.int64))
        edge_index = torch.tensor(np.swapaxes(row['edge_index'],0,1).astype(np.int64))
        target = (row['config_runtime']/(row['config_runtime_normalizers']+1e-5)).astype(np.float32) #/row['config_runtime_normalizers']
        # minmax scale the target, we only care about order
        target = (target-np.mean(target))/(np.std(target)+1e-5)

#         target = (target-np.mean(target))/(np.std(target))
        target = torch.tensor(target)
        return config_feat,node_feat,node_opcode,edge_index,target

In [6]:
class SimpleModel(torch.nn.Module):
    def __init__(self, 
                 hidden_channels = [32,48,64,84], 
                 graph_in = 64, 
                 graph_out = 64, 
                 hidden_dim=128, 
                 dropout = 0.2):
        super().__init__()
        op_embedding_dim = 4 # I choose 4-dimensional embedding
        self.embedding = torch.nn.Embedding(120, #120 different op-codes
                                            op_embedding_dim,
                                           )
        assert len(hidden_channels)>0
        
        self.linear = nn.Linear(op_embedding_dim+140,graph_in)
        in_channels=graph_in
        self.convs = torch.nn.ModuleList()
        last_dim = hidden_channels[0]
        conv = SAGEConv
        self.convs.append(conv(in_channels, hidden_channels[0]))
        for i in range(len(hidden_channels)-1):
            self.convs.append(conv(hidden_channels[i], hidden_channels[i+1]))
            last_dim = hidden_channels[i+1]
        self.convs.append(conv(last_dim, graph_out))
        
        
        
        self.dense = torch.nn.Sequential(nn.Linear(graph_out*2+24, hidden_dim),
                                         nn.Dropout(p=dropout),
                                         nn.ReLU(),
                                         nn.Linear(hidden_dim, hidden_dim),
                                         nn.Dropout(p=dropout),
                                         nn.ReLU(),
                                         nn.Linear(hidden_dim, 1),
                                        )
#         self.dropout = nn.Dropout(p=dropout)

    def forward(self, x_cfg: Tensor,x_feat: Tensor, x_op: Tensor, edge_index: Tensor) -> Tensor:
        
        #get graph features
        x = torch.concat([x_feat,self.embedding(x_op)],dim = 1)
        x = self.linear(x)
        #pass though conv layers
        for conv in self.convs:
            x = conv(x, edge_index).relu()
        # get 1d graph embedding using average pooling
        x_mean = x.mean(0)
        x_max = x.max(0).values
        
        #put graph data into config data
        x = torch.concat([x_cfg,x_max.repeat((len(x_cfg),1)),x_mean.repeat((len(x_cfg),1))],axis=1)
        #put into dense nn
        x = torch.flatten(self.dense(x))
        x = (x-torch.mean(x))/(torch.std(x)+1e-5)
        return x


# Train One Epoch

In [7]:
df = pd.concat((tile_xla["train"],tile_xla["valid"]),axis=0).reset_index(drop=True)

In [8]:
kfold = sklearn.model_selection.KFold(n_splits=5,shuffle=True,random_state=0)
score_means = []
score_maxs = []
for fold,(tr_idx,va_idx) in enumerate(kfold.split(df)):
    model = SimpleModel().to(device)
    train_dataset = TileDataset(df.iloc[tr_idx])
    val_dataset = TileDataset(df.iloc[va_idx])
    criterion = torch.nn.MSELoss()
    steps = len(train_dataset)*20
    warmup_steps = int(steps*0.2)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4,weight_decay = 1e-4)
    scheduler = CosineLRScheduler(optimizer,t_initial= steps,warmup_t=warmup_steps, warmup_lr_init=1e-6,lr_min=2e-8,)
    
    def score_tile_mean(predictions, df):
        score = 0
        for i in range(len(df)):
            predbest = np.mean(df.iloc[i]['config_runtime'][predictions[i]])
            best = np.mean(np.sort(df.iloc[i]['config_runtime'])[:50])
            score += 2-predbest/best
        score /= len(df)
        return score
    def score_tile_max(predictions, df):
        score = 0
        for i in range(len(df)):
            predbest = np.min(df.iloc[i]['config_runtime'][predictions[i][:5]])
            best = np.min(df.iloc[i]['config_runtime'])
    #         print(best,predbest)
            score += 2 - predbest/best
        score /= len(df)
        return score

    best_score = 0
    best_score_max = 0
    for epoch in range(10):
        model.train()
        pbar = tqdm(range(len(train_dataset)),leave=False)
        loss_sum = 0
        n = 0
        for i in pbar:
            cfg_ft,nd_ft,nd_op,ind,target = train_dataset[i]
            cfg_ft,nd_ft,nd_op,ind,target = cfg_ft.to(device),nd_ft.to(device),nd_op.to(device),ind.to(device),target.to(device)

            out = model(cfg_ft,nd_ft,nd_op,ind)
            loss = criterion(out, target)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1e-2)
            scheduler.step(i+len(train_dataset)*epoch)
            optimizer.step()
            loss_sum+=loss.item()
            n+=1
            pbar.set_description(f'running loss: {(loss_sum/n):.2f},current loss: {(loss.item()):.2f}')
        pbar.close()
        model.eval()

        tile_xla_predictions = []
        pbar = tqdm(range(len(val_dataset)),leave=False)
        for i in pbar:
            cfg_ft,nd_ft,nd_op,ind,target = val_dataset[i]
            cfg_ft,nd_ft,nd_op,ind,target = cfg_ft.to(device),nd_ft.to(device),nd_op.to(device),ind.to(device),target.to(device)

            out = model(cfg_ft,nd_ft,nd_op,ind)
            tile_xla_predictions.append(np.argsort(out.detach().cpu().numpy())[:50])
        pbar.close()
        score_mean = score_tile_mean(tile_xla_predictions, val_dataset.df)
        score_max = score_tile_max(tile_xla_predictions, val_dataset.df)
        print(f'fold {fold} epoch {epoch}, comp_score = {score_max:.3f}, mean_score = {score_mean:.3f},')
        if score_mean>best_score:
            best_score = score_mean
            best_score_max = score_max
            torch.save(model.state_dict(), f'best_model_{fold}.pth')
    score_means.append(best_score)
    score_maxs.append(best_score_max)
print(f'comp_score = {np.mean(score_maxs)}, mean_score = {np.mean(score_means)},')

                                                                                                                                                        

fold 0 epoch 0, comp_score = 0.614, mean_score = 0.289,


                                                                                                                                                        

KeyboardInterrupt: 

# Evaluate on Validation Dataset

**0.31 is not bad considering that this model only trained on 1 epoch and is not on a ranking loss!**

# Predict and Submit (only tile:xla predictions)

In [13]:
print(device)

cpu


In [15]:
dataset = TileDataset(tile_xla["test"])
tile_xla_predictions = [[] for i in range(len(dataset))]
for fold in range(5):
    model.load_state_dict(torch.load(f'best_model_{fold}.pth', map_location=device))
    model.eval()
    pbar = tqdm(range(len(dataset)))
    for i in pbar:
        cfg_ft,nd_ft,nd_op,ind,target = dataset[i]
        cfg_ft,nd_ft,nd_op,ind,target = cfg_ft.to(device),nd_ft.to(device),nd_op.to(device),ind.to(device),target.to(device)

        out = model(cfg_ft,nd_ft,nd_op,ind)
        tile_xla_predictions[i].append(out.detach().numpy())
tile_xla_predictions = [np.argsort(np.mean(pred,axis=0))[:5] for pred in tile_xla_predictions]

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 844/844 [00:07<00:00, 112.90it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 844/844 [00:14<00:00, 57.25it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 844/844 [00:07<00:00, 107.40it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 844/844 [00:06<00:00, 126.68it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 844/844 [00:09<00:00, 85.39it/s]


In [16]:
sub = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))
for i,filename in enumerate(tile_xla["test"]['file'].values):
    id = 'tile:xla:' +filename[:-4]
    sub.loc[sub.ID == id,'TopConfigs'] = ';'.join(tile_xla_predictions[i].astype(str))
sub.to_csv('submission.csv',index=False)
sub

Unnamed: 0,ID,TopConfigs
0,tile:xla:d6f5f54247bd1e58a10b9e7062c636ab,0;1;2;3;4
1,tile:xla:e3a655daa38e34ec240df959b650ac16,827;125;1065;709;281
2,tile:xla:f8c2c1a1098b2a361c26df668b286c87,41;116;101;202;166
3,tile:xla:4dd1716853ed46ee4e7d09ede1732de8,1474;1045;5985;6222;4859
4,tile:xla:d0a69155b6340748c36724e4bfc34be3,655;159;650;151;215
...,...,...
889,layout:nlp:random:60880ed76de53f4d7a1b960b24f2...,0;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18...
890,layout:nlp:random:23559853d9702baaaacbb0c83fd3...,0;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18...
891,layout:nlp:random:f6c146fc5cf10be4f3accbaca989...,0;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18...
892,layout:nlp:random:32531d07a084b319dce484f53a4c...,0;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18...
