In [3]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Set-up

In [25]:
import os
import sys

root_dir = os.path.abspath('..')
print("Root dir: ", root_dir)
sys.path.append(root_dir)

Root dir:  /Users/user010/Desktop/Programming/ML/STS


In [40]:
from utils.helpers import read_config, print_config
import numpy as np

glob_cfg = read_config("../config.yaml")
cfg = read_config(glob_cfg.configs.dataset)

print_config(cfg)

{
  "dataset": "stacktraces",
  "n_samples": 25000,
  "test_split": 0.2,
  "save_dataset": "stacktraces_hf"
}


Preprocessing

In [5]:
import json

dataset_path = glob_cfg.datasets[cfg.dataset].path
print("Dataset path:", dataset_path)
with open(dataset_path) as f:
    dataset = json.load(f)

Dataset path: /Users/user010/Desktop/Programming/ML/STS/dataset/stacktraces.json


In [6]:
dataset = dataset['data']

In [7]:
import numpy as np
import pandas as pd

df = pd.DataFrame(dataset)
df.head(3)

Unnamed: 0,0,1,2,3
0,12,https://github.com/jiangxufeng/v2rayL/issues/13,cpython,"Traceback (most recent call last):\r File ""v2..."
1,28,https://github.com/jiangxufeng/v2rayL/issues/31,cpython,mackel@linux-pd0d:[~/Downloads]:Traceback (mos...
2,64,https://github.com/jiangxufeng/v2rayL/issues/69,cpython,root@ubuntu:V2Ray# Traceback (most recent call...


In [8]:
# rename columns
df.rename(columns={
    0: 'id',
    1: 'url',
    2: 'interpreter',
    3: 'stack_trace'
}, inplace=True)
df.head(3)

Unnamed: 0,id,url,interpreter,stack_trace
0,12,https://github.com/jiangxufeng/v2rayL/issues/13,cpython,"Traceback (most recent call last):\r File ""v2..."
1,28,https://github.com/jiangxufeng/v2rayL/issues/31,cpython,mackel@linux-pd0d:[~/Downloads]:Traceback (mos...
2,64,https://github.com/jiangxufeng/v2rayL/issues/69,cpython,root@ubuntu:V2Ray# Traceback (most recent call...


In [9]:
df['stack_trace'] = df['stack_trace'].str.replace('\r|\t|  +|File', '\n', regex=True)

In [10]:
from utils.parser import parse_stack_trace
rnd_idx = np.random.randint(0, len(df))
sample = df.iloc[rnd_idx]
trace_list, final_error = parse_stack_trace(sample.stack_trace)


print("Trace:")
print(sample.stack_trace)

Trace:

22.7
66 | Traceback (most recent call last):

22.7
66 |

 "<string>", line 1, in <module>



In [11]:
trace_list, final_error = parse_stack_trace(sample.stack_trace)
print("Final error:", json.dumps(final_error, indent=2))
print("Trace list:", json.dumps(trace_list, indent=2))

Final error: null
Trace list: [
  {
    "func_name": "<module>",
    "file_name": "<string>",
    "line_num": 1
  }
]


In [12]:
# apply parse_stack_trace to each stack_trace
df['trace_list'], df['final_error'] = zip(*df['stack_trace'].map(parse_stack_trace))
df.head(3)

Unnamed: 0,id,url,interpreter,stack_trace,trace_list,final_error
0,12,https://github.com/jiangxufeng/v2rayL/issues/13,cpython,"Traceback (most recent call last):\n\n\n ""v2ra...","[{'func_name': '<module>', 'file_name': 'v2ray...","{'error_type': 'ImportError', 'error_msg': 'Un..."
1,28,https://github.com/jiangxufeng/v2rayL/issues/31,cpython,mackel@linux-pd0d:[~/Downloads]:Traceback (mos...,"[{'func_name': '<module>', 'file_name': 'v2ray...","{'error_type': 'ImportError', 'error_msg': 'Un..."
2,64,https://github.com/jiangxufeng/v2rayL/issues/69,cpython,root@ubuntu:V2Ray# Traceback (most recent call...,"[{'func_name': 'qt_message_handler', 'file_nam...","{'error_type': 'ValueError', 'error_msg': 'not..."


In [13]:
# count number of Nones in final_error
n_nans = df['final_error'].isna().sum()
print("Number of Nones in final_error:", n_nans, "out of", len(df))
print("Percentage of Nones in final_error:", n_nans / len(df))

Number of Nones in final_error: 109728 out of 377678
Percentage of Nones in final_error: 0.2905332055348736


In [14]:
df.dropna(inplace=True)

In [15]:
# split final error into error type and error message
df['error_type'] = df['final_error'].apply(lambda x: x['error_type'])
df['error_msg'] = df['final_error'].apply(lambda x: x['error_msg'])
df.head(3)

Unnamed: 0,id,url,interpreter,stack_trace,trace_list,final_error,error_type,error_msg
0,12,https://github.com/jiangxufeng/v2rayL/issues/13,cpython,"Traceback (most recent call last):\n\n\n ""v2ra...","[{'func_name': '<module>', 'file_name': 'v2ray...","{'error_type': 'ImportError', 'error_msg': 'Un...",ImportError,Unable to find zbar shared library
1,28,https://github.com/jiangxufeng/v2rayL/issues/31,cpython,mackel@linux-pd0d:[~/Downloads]:Traceback (mos...,"[{'func_name': '<module>', 'file_name': 'v2ray...","{'error_type': 'ImportError', 'error_msg': 'Un...",ImportError,Unable to find zbar shared library
2,64,https://github.com/jiangxufeng/v2rayL/issues/69,cpython,root@ubuntu:V2Ray# Traceback (most recent call...,"[{'func_name': 'qt_message_handler', 'file_nam...","{'error_type': 'ValueError', 'error_msg': 'not...",ValueError,"not enough values to unpack (expected 2, got 1)"


In [16]:
drop_cols = ['id', 'final_error']
df.drop(columns=drop_cols, inplace=True)
df.head(3)

Unnamed: 0,url,interpreter,stack_trace,trace_list,error_type,error_msg
0,https://github.com/jiangxufeng/v2rayL/issues/13,cpython,"Traceback (most recent call last):\n\n\n ""v2ra...","[{'func_name': '<module>', 'file_name': 'v2ray...",ImportError,Unable to find zbar shared library
1,https://github.com/jiangxufeng/v2rayL/issues/31,cpython,mackel@linux-pd0d:[~/Downloads]:Traceback (mos...,"[{'func_name': '<module>', 'file_name': 'v2ray...",ImportError,Unable to find zbar shared library
2,https://github.com/jiangxufeng/v2rayL/issues/69,cpython,root@ubuntu:V2Ray# Traceback (most recent call...,"[{'func_name': 'qt_message_handler', 'file_nam...",ValueError,"not enough values to unpack (expected 2, got 1)"


In [17]:
# shuffle with seed
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head(3)

Unnamed: 0,url,interpreter,stack_trace,trace_list,error_type,error_msg
0,https://github.com/ytdl-org/youtube-dl/issues/...,cpython,"Traceback (most recent call last):\n\n\n ""/usr...","[{'func_name': '_decrypt_signature', 'file_nam...",youtube_dl.utils.ExtractorError,"Unsupported JS expression '[1596180896,'; plea..."
1,https://github.com/Nandaka/PixivUtil2/issues/400,cpython,"2018-08-22 13:42:00,933 - PixivUtil20180815-be...","[{'func_name': 'process_member', 'file_name': ...",AttributeError,'NoneType' object has no attribute 'has_key'
2,https://github.com/frappe/erpnext/issues/16928,cpython,"Traceback (most recent call last):\n\n\n ""/hom...","[{'func_name': 'application', 'file_name': '/h...",ValidationError,Stripe Settings not found


In [21]:
# convert to hf
from datasets import Dataset
hf_dataset = Dataset.from_pandas(df)

In [22]:
hf_dataset

Dataset({
    features: ['url', 'interpreter', 'stack_trace', 'trace_list', 'error_type', 'error_msg'],
    num_rows: 267950
})

In [23]:
# make 3 splits (train, test, other)


n_samples = cfg.n_samples
test_split = cfg.test_split
# take in other len(df) - n_samples samples
hf_dataset = hf_dataset.shuffle(seed=42)
hf_testtrain = hf_dataset.select(range(n_samples))
hf_other = hf_dataset.select(range(n_samples, len(hf_dataset)))
hf_final = hf_testtrain.train_test_split(test_size=test_split, seed=42)
hf_final["other"] = hf_other

print("Train size:", len(hf_final["train"]))
print("Test size:", len(hf_final["test"]))
print("Other size:", len(hf_final["other"]))

Train size: 20000
Test size: 5000
Other size: 242950


In [44]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [45]:
save_params = glob_cfg.datasets[cfg.save_dataset].hf_save_params
print("Save params:", save_params)

hf_dataset.push_to_hub(
    **save_params,
    private=True
)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/268 [00:00<?, ?ba/s]