/
toolchain_run.py
62 lines (44 loc) · 2.1 KB
/
toolchain_run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import os
import shutil
from lib.TimeLogger import TimeLogger
from lib.kotlin_source2ast.source2ast import source2ast
from lib.ast_set2matrix.asts2vectors import asts2vectors
from lib.ast_set2matrix.sparse_transform import sparse_transform
from lib.ast_set2matrix.vectors2matrix import vectors2matrix
from lib.ast_set2matrix.matrix2csv import matrix2csv
from lib.anomaly_detection.autoencoding import autoencoding
from lib.anomaly_detection.anomaly_selection import anomaly_selection
class Paths:
    """Filesystem locations read and written by the toolchain stages."""

    # Root folder holding the data of all intermediate stages
    STAGES_DATA = './data'

    # Input/output files and folders of the individual stages
    AST = STAGES_DATA + '/ast'
    AST_VECTORS = STAGES_DATA + '/ast_vectors'
    AST_SPARSED_VECTORS = STAGES_DATA + '/ast_sparsed_vectors'
    DATASET_JSON = STAGES_DATA + '/dataset.json'
    DATASET_CSV = STAGES_DATA + '/dataset.csv'
    DISTANCES = STAGES_DATA + '/distances.json'

    # Additional files used alongside the stage data
    FEATURES_CONFIG = './features_config.json'
    ALL_FEATURES = AST_VECTORS + '/all_features.json'
    FILES_MAP = STAGES_DATA + '/files_map.json'
# Tuning values handed to autoencoding() by toolchain_run().

# Passed as the second positional argument of autoencoding().
# NOTE(review): presumably the share of the dataset used for training
# (the rest for validation) -- confirm against lib.anomaly_detection.autoencoding.
AUTOENCODER_SPLIT_DATASET_PERCENT = 0.9
# Passed as the third positional argument of autoencoding().
# NOTE(review): presumably the size of the encoded representation as a share
# of the input dimension -- confirm against lib.anomaly_detection.autoencoding.
AUTOENCODER_DIM_PERCENT = 0.8
def toolchain_run(input, output):
    """Run the whole code anomaly detection toolchain, stage by stage.

    The stages are executed in this order, exchanging data through the
    files and folders declared in ``Paths``:

    1. ``source2ast`` -- Kotlin source code parsing.
    2. ``asts2vectors``, ``sparse_transform``, ``vectors2matrix``,
       ``matrix2csv`` -- factorization of the parsed trees into a dataset.
    3. ``autoencoding`` and ``anomaly_selection`` -- anomaly detection.

    The intermediate data folder (``Paths.STAGES_DATA``) is created if it is
    missing and is always removed afterwards, whether the stages succeeded
    or not, so that leftovers of a failed run cannot be picked up by the
    next one. On success the elapsed time is reported through ``TimeLogger``
    and the number of found anomalies is printed.

    Args:
        input: Forwarded unchanged as the first argument of ``source2ast``
            (presumably the location of the Kotlin sources -- confirm
            against lib.kotlin_source2ast).
        output: Forwarded unchanged as the second argument of
            ``anomaly_selection`` (presumably where the selected anomalies
            are written -- confirm against lib.anomaly_detection).

    Returns:
        None.

    Raises:
        Any exception raised by a stage is propagated after the
        intermediate data folder has been removed.
    """
    time_logger = TimeLogger(task_name='Code anomaly detection')

    if not os.path.exists(Paths.STAGES_DATA):
        os.makedirs(Paths.STAGES_DATA)

    try:
        # Kotlin source code parsing
        source2ast(input, Paths.AST)

        # Kotlin CST factorization
        asts2vectors(Paths.AST, Paths.AST_VECTORS, Paths.FEATURES_CONFIG)
        sparse_transform(Paths.AST_VECTORS, Paths.AST_SPARSED_VECTORS, Paths.ALL_FEATURES, 'list')
        vectors2matrix(Paths.AST_SPARSED_VECTORS, Paths.DATASET_JSON)
        matrix2csv(Paths.DATASET_JSON, Paths.DATASET_CSV)

        # Anomaly detection
        autoencoding(Paths.DATASET_CSV, AUTOENCODER_SPLIT_DATASET_PERCENT, AUTOENCODER_DIM_PERCENT, Paths.DISTANCES)
        anomalies_number = anomaly_selection(
            Paths.FILES_MAP, output, use_dbscan=False, differences_file=Paths.DISTANCES)
    finally:
        # Clean up even when a stage fails: the folder is reused if it already
        # exists, so stale files from an aborted run would otherwise leak into
        # the next run.
        shutil.rmtree(Paths.STAGES_DATA)

    time_logger.finish(full_finish=True)
    print('Found %d anomalies' % anomalies_number)