In [1]:
# Reload imported modules automatically before each cell runs, so edits to the
# project sources are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Register the `%tensorboard` magic that is used further down in this notebook.
%load_ext tensorboard

In [1]:
from random import random

from SourceCodeTools.models.training_config import get_config, save_config, load_config
from SourceCodeTools.code.data.dataset.Dataset import SourceGraphDataset, filter_dst_by_freq
from SourceCodeTools.models.graph.train.sampling_multitask2 import training_procedure, SamplingMultitaskTrainer
from SourceCodeTools.models.graph.train.objectives.NodeClassificationObjective import NodeClassifierObjective
from SourceCodeTools.models.graph.train.objectives.SubgraphClassifierObjective import SubgraphAbstractObjective, \
    SubgraphClassifierObjective, SubgraphEmbeddingObjective
from SourceCodeTools.models.graph.train.objectives.SubgraphEmbedderObjective import SubgraphEmbeddingObjective, \
    SubgraphMatchingObjective
from SourceCodeTools.models.graph.train.utils import get_name, get_model_base
from SourceCodeTools.models.graph import RGGAN
from SourceCodeTools.tabular.common import compact_property
from SourceCodeTools.code.data.file_utils import unpersist

import dgl
import torch
import numpy as np
from argparse import Namespace
from torch import nn
from datetime import datetime
from os.path import join
from functools import partial

ModuleNotFoundError: No module named 'SourceCodeTools.models.graph.train.objectives.SubgraphEmbedderObjective'

# Prepare parameters and options

The full list of available options can be found in `SourceCodeTools/models/training_options.py`. They are meant to be used as arguments for the CLI trainer. The trainer script can be found in `SourceCodeTools/scripts/train.py`.

For the task of subgraph classification the important options are:
- `subgraph_partition` is the path to the subgraph-based train/val/test split. It is stored as a DataFrame with a subgraph id and partition masks.
- `subgraph_id_column` is the column in the `common_edges` file that stores the subgraph id.
- For the variable misuse task (the same applies to authorship attribution), subgraphs are created for individual functions (for individual files in SCAA). The label is stored in `common_filecontent`.

In [None]:
# Tokenizer model file; passed to `get_config` as `tokenizer_path` below.
tokenizer_path = "sentencepiece_bpe.model"

# Dataset directory. The partition table and the label table are both located
# directly inside it.
data_path = "scaa_9_tasks_python_graph"
subgraph_partition, filecontent_path = (
    join(data_path, file_name)
    for file_name in ("subgraph_partition.json.bz2", "common_filecontent.json.bz2")
)

In [6]:
# Preview the partition table: one row per subgraph `id` with boolean
# `train_mask` / `val_mask` / `test_mask` columns.
unpersist(subgraph_partition)

Unnamed: 0,id,train_mask,val_mask,test_mask
0,489,True,False,False
1,474,False,False,True
2,310,True,False,False
3,80,True,False,False
4,374,True,False,False
...,...,...,...,...
508,48,True,False,False
509,108,True,False,False
510,484,True,False,False
511,406,True,False,False


In [7]:
# Peek at the first 10 rows of the edge table. Its `file_id` column is the one
# passed as `subgraph_id_column` when the config is built below.
unpersist(join(data_path, "common_edges.json.bz2"), nrows=10)

Unnamed: 0,id,type,source_node_id,target_node_id,file_id,mentioned_in
0,0,subword,68048,49684,489,
1,1,name,49684,32062,489,20349.0
2,2,name_rev,32062,49684,489,20349.0
3,3,names,32062,75398,489,20349.0
4,4,names_rev,75398,32062,489,20349.0
5,5,subword,48908,19829,489,
6,6,name,19829,70699,489,20349.0
7,7,name_rev,70699,19829,489,20349.0
8,8,names,70699,75398,489,20349.0
9,9,names_rev,75398,70699,489,20349.0


In [8]:
# Only the options that differ from the defaults are listed here; `get_config`
# supplies the rest (see the printed config below).
config_overrides = {
    # Column of the edge table that assigns each edge to a subgraph.
    "subgraph_id_column": "file_id",
    "data_path": data_path,
    # The model output directory is the dataset directory itself.
    "model_output_dir": data_path,
    "subgraph_partition": subgraph_partition,
    "tokenizer_path": tokenizer_path,
    "objectives": "subgraph_clf",
    "measure_scores": True,
    "dilate_scores": 1,
}
config = get_config(**config_overrides)

In [9]:
# Optional: uncomment to save the configuration to a YAML file.
# save_config(config, "var_misuse_tiny.yaml")

In [10]:
# Display the resulting nested configuration, defaults included.
config

{'DATASET': {'data_path': 'scaa_9_tasks_python_graph',
  'train_frac': 0.9,
  'filter_edges': None,
  'min_count_for_objectives': 5,
  'self_loops': False,
  'use_node_types': False,
  'use_edge_types': False,
  'no_global_edges': False,
  'remove_reverse': False,
  'custom_reverse': None,
  'restricted_id_pool': None,
  'random_seed': None,
  'subgraph_id_column': 'file_id',
  'subgraph_partition': 'scaa_9_tasks_python_graph/subgraph_partition.json.bz2'},
 'TRAINING': {'model_output_dir': 'scaa_9_tasks_python_graph',
  'pretrained': None,
  'pretraining_phase': 0,
  'sampling_neighbourhood_size': 10,
  'neg_sampling_factor': 3,
  'use_layer_scheduling': False,
  'schedule_layers_every': 10,
  'elem_emb_size': 100,
  'embedding_table_size': 200000,
  'epochs': 100,
  'batch_size': 128,
  'learning_rate': 0.001,
  'objectives': 'subgraph_clf',
  'save_each_epoch': False,
  'save_checkpoints': True,
  'early_stopping': False,
  'early_stopping_tolerance': 20,
  'force_w2v_ns': False,
  '

# Create Dataset

In [11]:
# The dataset constructor receives the DATASET and TOKENIZER sections of the
# config merged into one set of keyword arguments. On a key clash, the
# TOKENIZER value wins.
dataset_kwargs = dict(config["DATASET"])
dataset_kwargs.update(config["TOKENIZER"])
dataset = SourceGraphDataset(**dataset_kwargs)

# Declare target loading function (labels)

In [15]:
def load_labels():
    """Load the subgraph labels from the file-content table.

    Reads the table at `filecontent_path`, keeps only the `id` and `user`
    columns, and renames them to `src` and `dst` respectively.

    Returns:
        A table with two columns: `src` (the original `id`) and `dst`
        (the original `user`).
    """
    labels = unpersist(filecontent_path)[["id", "user"]]
    return labels.rename(columns={"id": "src", "user": "dst"})

In [16]:
# Sanity check: show the label table (`src` = subgraph id, `dst` = user name).
load_labels()

Unnamed: 0,src,dst
0,489,thekushalghosh
1,474,shubhi_
2,310,briangodwinlim
3,80,Itachi_uchiha
4,374,erdnase
...,...,...
508,48,DavidEdey
509,108,KimJohnWu
510,484,spelvin
511,406,luctchak


One or several objectives can be used; each one is appended to `self.objectives` inside `create_objectives`.

In [17]:
class Trainer(SamplingMultitaskTrainer):
    """Trainer that registers a single subgraph objective named "SCAAMatching"."""

    def create_objectives(self, dataset, tokenizer_path):
        """Populate `self.objectives` with the objectives to train.

        Args:
            dataset: Dataset forwarded to the objective factory.
            tokenizer_path: Tokenizer path forwarded to the objective factory.
        """
        # The container is assigned before the objective is built, so the
        # factory sees the same trainer state as before this rewrite.
        self.objectives = nn.ModuleList()

        matching_objective = self._create_subgraph_objective(
            objective_name="SCAAMatching",
            objective_class=SubgraphMatchingObjective,
            dataset=dataset,
            tokenizer_path=tokenizer_path,
            labels_fn=load_labels,
        )
        self.objectives.append(matching_objective)

In [28]:
%tensorboard --logdir data_path

In [18]:
# Name the run from the model class and the current timestamp, then derive the
# model base path from the training config.
run_name = get_name(RGGAN, str(datetime.now()))
model_base_path = get_model_base(config["TRAINING"], run_name)

# Train RGGAN with the custom `Trainer`, which supplies the objectives.
training_procedure(
    dataset,
    model_name=RGGAN,
    model_params=config["MODEL"],
    trainer_params=config["TRAINING"],
    model_base_path=model_base_path,
    trainer=Trainer,
)

Number of nodes 80375


Precompute Target Embeddings:   0%|                                                                                                     | 0/1 [00:00<?, ?it/s]
Epoch 0:  25%|████████████████████████████▌                                                                                     | 1/4 [00:19<00:58, 19.42s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.07s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.64s/it]
Epoch 1:   0%|                                                                                                                          | 0/4 [00:00<?, ?it/s]

Epoch: 0, Time: 27 s
{'Accuracy/test/SCAAMatching_': 0.5,
 'Accuracy/train/SCAAMatching_': 0.5,
 'Accuracy/train_avg/SCAAMatching': 0.5,
 'Accuracy/val/SCAAMatching_': 0.5,
 'Loss/test/SCAAMatching_': 0.5998330116271973,
 'Loss/train/SCAAMatching_': 0.5997103452682495,
 'Loss/train_avg/SCAAMatching': 0.5997103452682495,
 'Loss/val/SCAAMatching_': 0.6000617146492004}


Epoch 1:  25%|████████████████████████████▌                                                                                     | 1/4 [00:18<00:54, 18.00s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.66s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.60s/it]
Epoch 2:   0%|                                                                                                                          | 0/4 [00:00<?, ?it/s]

Epoch: 1, Time: 25 s
{'Accuracy/test/SCAAMatching_': 0.5,
 'Accuracy/train/SCAAMatching_': 0.5,
 'Accuracy/train_avg/SCAAMatching': 0.5,
 'Accuracy/val/SCAAMatching_': 0.5,
 'Loss/test/SCAAMatching_': 0.5996013283729553,
 'Loss/train/SCAAMatching_': 0.5993266701698303,
 'Loss/train_avg/SCAAMatching': 0.5993266701698303,
 'Loss/val/SCAAMatching_': 0.5997006893157959}


Epoch 2:  25%|████████████████████████████▌                                                                                     | 1/4 [00:21<01:05, 21.76s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.24s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.48s/it]
Epoch 3:   0%|                                                                                                                          | 0/4 [00:00<?, ?it/s]

Epoch: 2, Time: 30 s
{'Accuracy/test/SCAAMatching_': 0.5,
 'Accuracy/train/SCAAMatching_': 0.5,
 'Accuracy/train_avg/SCAAMatching': 0.5,
 'Accuracy/val/SCAAMatching_': 0.5,
 'Loss/test/SCAAMatching_': 0.5998241901397705,
 'Loss/train/SCAAMatching_': 0.5993597507476807,
 'Loss/train_avg/SCAAMatching': 0.5993597507476807,
 'Loss/val/SCAAMatching_': 0.5999167561531067}


Epoch 3:  25%|████████████████████████████▌                                                                                     | 1/4 [00:18<00:54, 18.30s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.89s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.69s/it]
Epoch 4:   0%|                                                                                                                          | 0/4 [00:00<?, ?it/s]

Epoch: 3, Time: 27 s
{'Accuracy/test/SCAAMatching_': 0.5,
 'Accuracy/train/SCAAMatching_': 0.5,
 'Accuracy/train_avg/SCAAMatching': 0.5,
 'Accuracy/val/SCAAMatching_': 0.5,
 'Loss/test/SCAAMatching_': 0.5998523831367493,
 'Loss/train/SCAAMatching_': 0.5972433090209961,
 'Loss/train_avg/SCAAMatching': 0.5972433090209961,
 'Loss/val/SCAAMatching_': 0.5991601943969727}


Epoch 4:  25%|████████████████████████████▌                                                                                     | 1/4 [00:19<00:57, 19.01s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.01s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.99s/it]
Epoch 5:   0%|                                                                                                                          | 0/4 [00:00<?, ?it/s]

Epoch: 4, Time: 27 s
{'Accuracy/test/SCAAMatching_': 0.5,
 'Accuracy/train/SCAAMatching_': 0.5,
 'Accuracy/train_avg/SCAAMatching': 0.5,
 'Accuracy/val/SCAAMatching_': 0.5,
 'Loss/test/SCAAMatching_': 0.5996302366256714,
 'Loss/train/SCAAMatching_': 0.5981554388999939,
 'Loss/train_avg/SCAAMatching': 0.5981554388999939,
 'Loss/val/SCAAMatching_': 0.5998492240905762}


Epoch 5:  25%|████████████████████████████▌                                                                                     | 1/4 [00:18<00:54, 18.13s/it]
  0%|                                                                                                                                   | 0/1 [00:02<?, ?it/s]


KeyboardInterrupt: 