In [1]:
# Enable IPython's autoreload extension so that edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

Set-up

In [2]:
import os
import sys

# Resolve the parent of the current working directory and make it importable,
# so that project packages imported in later cells (e.g. `utils`) can be found.
root_dir = os.path.abspath('..')
print("Root dir: ", root_dir)
# Guard the append: re-running this cell must not stack duplicate entries.
if root_dir not in sys.path:
    sys.path.append(root_dir)

Root dir:  /Users/user010/Desktop/Programming/ML/STS


In [3]:
from utils.helpers import read_config, print_config
import numpy as np

# The global config is read first; it holds the location of the
# scoring-specific config, which is then read and printed.
glob_cfg = read_config("../config.yaml")
scoring_cfg_path = glob_cfg.configs.scoring
cfg = read_config(scoring_cfg_path)

print_config(cfg)

{
  "dataset": "stacktraces_hf",
  "save_dataset": "sts_traces",
  "model": "mini_lm",
  "scores": {
    "file": 0.15,
    "func": 0.15,
    "edit": 0.1,
    "msg": 0.2,
    "err": 0.4
  }
}


In [4]:
import os
from huggingface_hub import login

# Authenticate against the Hugging Face Hub. The token is read from the
# environment so it is never written into the notebook; a missing variable
# raises KeyError on this line.
# NOTE(review): the variable name suggests a read-only token, yet a later cell
# calls `push_to_hub`, which presumably needs write access — confirm which
# credentials that step relies on.
read_token = os.environ["HUGGING_FACE_HUB_TOKEN_READ"]

login(token=read_token)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /Users/user010/.cache/huggingface/token
Login successful


Preprocessing

In [5]:
from datasets import load_dataset

# Look up the `load_dataset` keyword arguments for the dataset named in the
# scoring config, show them, then load the dataset with them.
dataset_entry = glob_cfg.datasets[cfg.dataset]
dataset_params = dataset_entry.hf_params
print("Params:", dataset_params)
dataset = load_dataset(**dataset_params)

Params: {'path': 'under-tree/stacktrace_dataset'}


In [7]:
# Drop the examples whose `trace_list` is empty and report how many rows of
# the "train" split were removed and how many remain.
n_before = len(dataset['train'])
# Return an explicit boolean: `filter` expects a predicate, not a length.
dataset = dataset.filter(lambda x: len(x["trace_list"]) > 0)
n_after = len(dataset['train'])
print(f"Filtered out {n_before - n_after} examples")
print(f"Dataset size: {n_after}")

Filtered out 0 examples
Dataset size: 248671


In [None]:
# Continue with the "train" split only.
# `DatasetDict` is a `dict` subclass while a single `Dataset` is not, so this
# guard keeps the cell safe to re-run: once `dataset` is already the split,
# indexing it with "train" again would fail.
if isinstance(dataset, dict):
    dataset = dataset["train"]
dataset

Dataset({
    features: ['url', 'interpreter', 'stack_trace', 'trace_list', 'error_type', 'error_msg'],
    num_rows: 248671
})

Scoring

In [8]:
from transformers import AutoTokenizer, AutoModel

# Fetch the `from_pretrained` keyword arguments configured for the selected
# model, show them, then load the tokenizer followed by the model.
model_entry = glob_cfg.models[cfg.model]
params = model_entry.hf_params
print("Params:", params)
tokenizer_kwargs = params.tokenizer
tokenizer = AutoTokenizer.from_pretrained(**tokenizer_kwargs)
model_kwargs = params.model
model = AutoModel.from_pretrained(**model_kwargs)

Params: {'model': {'pretrained_model_name_or_path': 'sentence-transformers/paraphrase-MiniLM-L6-v2'}, 'tokenizer': {'pretrained_model_name_or_path': 'sentence-transformers/paraphrase-MiniLM-L6-v2'}}


In [133]:
from utils.auto_similarity import Scorer
import json
# Build the scorer from the scoring config and the loaded tokenizer/model.
scorer = Scorer(config=cfg, tokenizer=tokenizer, model=model)

# Pick two random examples; the draws are independent, so they may coincide.
n_rows = len(dataset)
obj1 = dataset[np.random.randint(0, n_rows)]
obj2 = dataset[np.random.randint(0, n_rows)]

score = scorer.calculate_trace_score(obj1, obj2)
print("Score:", score)

# Show only the last three frames of each trace to keep the output short.
for title, obj in (("Trace 1", obj1), ("Trace 2", obj2)):
    print(title)
    print(json.dumps(obj['trace_list'][-3:], indent=2))

Score: 0.14089230448007584
Trace 1
[
  {
    "file_name": "/usr/lib/python2.7/dist-packages/nose/case.py",
    "func_name": "runTest",
    "line_num": 197
  },
  {
    "file_name": "/\u00abPKGBUILDDIR\u00bb/build/lib.linux-aarch64-2.7/matplotlib/testing/decorators.py",
    "func_name": "wrapped_function",
    "line_num": 118
  },
  {
    "file_name": "/\u00abPKGBUILDDIR\u00bb/build/lib.linux-aarch64-2.7/matplotlib/tests/test_image.py",
    "func_name": "test_cursor_data",
    "line_num": 183
  }
]
Trace 2
[
  {
    "file_name": "/home/clq/code/ObjectDetection/detectron/tools/train_net.py",
    "func_name": "create_model",
    "line_num": 212
  },
  {
    "file_name": "/home/clq/software/anaconda2/envs/caffe2_py27/lib/python2.7/site-packages/caffe2/python/workspace.py",
    "func_name": "RunNetOnce",
    "line_num": 234
  },
  {
    "file_name": "/home/clq/software/anaconda2/envs/caffe2_py27/lib/python2.7/site-packages/caffe2/python/workspace.py",
    "func_name": "CallWithExceptionInte

In [217]:
from sklearn.metrics.pairwise import cosine_similarity
import torch

# Sample two random rows and score the pair with the full scorer.
sampled = np.random.randint(0, len(dataset), size=2, dtype=int)
# Cast the numpy integers to built-in ints before indexing (presumably what
# the dataset's row lookup expects — TODO confirm).
id1, id2 = (int(i) for i in sampled)
trace1, trace2 = dataset[id1], dataset[id2]

score = scorer.get_full_score(trace1, trace2)
print("Score:", score)

n = 3  # how many trailing frames of each trace to show

print("Trace 1")
print(json.dumps(trace1['trace_list'][-n:], indent=2))
print("Trace 2")
print(json.dumps(trace2['trace_list'][-n:], indent=2))

# Error metadata of both examples, side by side.
print("Error type 1:", trace1['error_type'])
print("Error type 2:", trace2['error_type'])
print("Error msg 1:", trace1['error_msg'])
print("Error msg 2:", trace2['error_msg'])

0.16368641853332522
Score: 0.2502129957079888
Trace 1
[
  {
    "file_name": "/home/test/.local/lib/python3.7/site-packages/meshio/_ply.py",
    "func_name": "read_buffer",
    "line_num": 102
  },
  {
    "file_name": "/home/test/.local/lib/python3.7/site-packages/meshio/_ply.py",
    "func_name": "_read_binary",
    "line_num": 240
  },
  {
    "file_name": "/home/test/.local/lib/python3.7/site-packages/meshio/_ply.py",
    "func_name": "<listcomp>",
    "line_num": 240
  }
]
Trace 2
[
  {
    "file_name": "/usr/local/lib/python3.5/dist-packages/sanic/server.py",
    "func_name": "data_received",
    "line_num": 144
  },
  {
    "file_name": "httptools/parser/parser.pyx",
    "func_name": "httptools.parser.parser.HttpParser.feed_data (httptools/parser/parser.c:2721)",
    "line_num": 171
  }
]
Error type 1: KeyError
Error type 2: httptools.parser.errors.HttpParserCallbackError
Error msg 1: 'int'
Error msg 2: the on_headers_complete callback failed


In [190]:
from tqdm import tqdm

# Target number of pairs to collect per score bucket.
n_group = 6000 # cfg.n_group

# Score thresholds separating the "low" / "mid" / "high" buckets.
t_low = 0.33
t_high = 0.67

# One independent, initially empty list per bucket.
groups = {bucket: [] for bucket in ("low", "mid", "high")}

def condition():
    """Return True while at least one bucket in ``groups`` holds fewer than ``n_group`` items."""
    return any(len(bucket) < n_group for bucket in groups.values())

def gen_pair(dataset):
    """Draw two rows of ``dataset`` at random and return them as a tuple.

    Both positions come from a single ``np.random.randint`` call, so the two
    draws are independent and may refer to the same row.
    """
    picks = np.random.randint(0, len(dataset), size=2)
    # Index with built-in ints rather than numpy integers.
    first, second = (int(p) for p in picks)
    return dataset[first], dataset[second]

def choose_group(score):
    """Map a score to its bucket name.

    Scores below ``t_low`` are "low", remaining scores below ``t_high`` are
    "mid", and anything that passes neither check is "high".
    """
    # Thresholds are tested in ascending order; the first match wins.
    for bucket, upper_bound in (("low", t_low), ("mid", t_high)):
        if score < upper_bound:
            return bucket
    return "high"

# Keep sampling random pairs until every bucket holds `n_group` pairs.
# The progress bar follows the "high" bucket only (as its description says)
# and is closed automatically when the loop ends.
with tqdm(total=n_group, desc="Generating pairs (high)") as bar:
    while condition():
        obj_a, obj_b = gen_pair(dataset)
        score = scorer.get_full_score(obj_a, obj_b)
        # The return type of `get_full_score` is not visible here: accept
        # either a single-element indexable result or a bare scalar.
        try:
            score_value = float(score[0])
        except (IndexError, TypeError):
            score_value = float(score)
        group_name = choose_group(score_value)
        bucket = groups[group_name]
        if len(bucket) >= n_group:
            # This bucket is already full; discard the pair.
            continue
        bucket.append((obj_a, obj_b, score_value))
        # Advance only when a pair was actually stored, so the bar can never
        # run past `total`.
        if group_name == "high":
            bar.update(1)


In [197]:
# Record the output location on the config object, then write the collected
# groups to that file as indented JSON.
cfg.groups_path = "groups.json"
with open(cfg.groups_path, 'w') as groups_file:
    json.dump(groups, groups_file, indent=2)

In [218]:
# Split every group into validation and train parts: the first
# `n_group // 6` pairs go to validation, the remainder to train.
val_size = n_group // 6
groups_val = {label: pairs[:val_size] for label, pairs in groups.items()}
groups_train = {label: pairs[val_size:] for label, pairs in groups.items()}

In [207]:
def group_obj_to_dict(obj):
    """Turn one stored ``(example_a, example_b, score)`` entry into a flat record.

    The record keeps only the ``'stack_trace'`` field of each example, as
    ``text1`` and ``text2``, together with the score.
    """
    return {
        "text1": obj[0]['stack_trace'],
        "text2": obj[1]['stack_trace'],
        "score": obj[2],
    }

# Flatten the per-group lists (in group order) and convert every stored pair
# into a flat text1/text2/score record.
list_val = [group_obj_to_dict(pair) for pairs in groups_val.values() for pair in pairs]
list_train = [group_obj_to_dict(pair) for pairs in groups_train.values() for pair in pairs]

In [211]:
from datasets import DatasetDict, Dataset

# Build one Dataset per split from the flattened record dicts, then display
# the validation split.
ds_val, ds_train = Dataset.from_list(list_val), Dataset.from_list(list_train)
ds_val

Dataset({
    features: ['text1', 'text2', 'score'],
    num_rows: 3000
})

In [214]:
# Bundle both splits under their split names and upload them to the Hub
# under the repository name taken from the scoring config.
# NOTE(review): uploading presumably needs a token with write permission —
# confirm the active login provides it.
splits = {"train": ds_train, "val": ds_val}
ds_final = DatasetDict(splits)
ds_final.push_to_hub(cfg.save_dataset)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]