In [126]:
from hydra import initialize, compose

with initialize(version_base=None, config_path="../../src/conf/"):
    cfg = compose(config_name='main.yaml', overrides=["local=default"])
    print(cfg)

{'num_workers': 1, 'num_splits': 10, 'split': 1, 'seed': 42, 'project_path': '${base_path}/data/raw/dataset_with_graphs.csv', 'extraction': {'_target_': 'src.feature.extract.NameFeatureExtraction', 'graph_path': '${arcan_graphs}', 'out_path': '${out_path}/processed/', 'stopwords': 'None'}, 'embedding': {'_target_': 'src.feature.embedding.FastTextEmbedding', 'model': 'fastText', 'path': '${base_path}/data/models/wiki.en.bin'}, 'base_path': '/home/sasce/PycharmProjects/CodeGraphClassification', 'out_path': '${base_path}/data/', 'arcan_out': '${base_path}/data/interim/', 'arcan_graphs': '${arcan_out}/arcanOutput', 'raw_data': '${out_path}/raw/', 'dataset': '${base_path}/data/raw/classification_dataset_lang.csv', 'language': 'JAVA', 'repositories_path': '${base_path}/data/raw/repositories/', 'arcan_script': '${base_path}/scripts/bash/run-arcan.sh', 'arcan_path': '${base_path}/tools/arcan', 'repository_path': '${base_path}/data/raw/repositories', 'logs_path': '${base_path}/logs/', 'model': 

In [127]:

from data.graph import ArcanGraphLoader
from os.path import join
from feature.content import NameContentExtraction
from utils import git_clone, get_versions, git_checkout, filter_by_label
from loguru import logger
import pandas as pd

LABEL = "database"
projects = pd.read_csv("/home/sasce/PycharmProjects/CodeGraphClassification/data/raw/dataset_with_graphs.csv")
content_extractor = NameContentExtraction(graph_path=cfg.arcan_graphs)

In [128]:
projects.head()

Unnamed: 0,name,label,level,version,sha,nodes,edges
0,Norconex|collector-filesystem,"['search engine', 'web crawler']","[3, 5]",133,292a42769f7f6f1beaf4b178c3f7170fddfee282,68,212
1,Norconex|collector-filesystem,"['search engine', 'web crawler']","[3, 5]",63,c01cf7bf4093aa21fafc1fba93ba50ca0343c619,36,110
2,Norconex|collector-filesystem,"['search engine', 'web crawler']","[3, 5]",72,785b7928896b302fd84d81f9437ec47af5d7b330,36,110
3,Norconex|collector-filesystem,"['search engine', 'web crawler']","[3, 5]",21,b08c040bfe57cb1f3220ca487d33b0240a6564fa,39,155
4,Norconex|collector-filesystem,"['search engine', 'web crawler']","[3, 5]",115,30cb73f240f83e7a3a363d4a5a5d27c04e313907,68,212


In [129]:
projects = filter_by_label(projects.copy(deep=True), [LABEL])

2022-10-26 11:18:55.601 | INFO     | utils:filter_by_label:105 - Labels ['database']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = df['label'].apply(tuple)


In [130]:
projects.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3360 entries, 975 to 59154
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     3360 non-null   object
 1   label    3360 non-null   object
 2   level    3360 non-null   object
 3   version  3360 non-null   int64 
 4   sha      3360 non-null   object
 5   nodes    3360 non-null   int64 
 6   edges    3360 non-null   int64 
dtypes: int64(3), object(4)
memory usage: 210.0+ KB


In [131]:
projects = list(set(projects['name'].tolist()))
len(projects)

111

In [132]:
from tqdm import tqdm

graph_loader = ArcanGraphLoader()
logger.info(f"Extracting features for {len(projects)} projects")
identifiers = []
content = {}
for project_name in tqdm(projects):
    project = project_name.replace('|', '/')

    project_url = f'https://github.com/{project}'
    if content_extractor.clone:
        git_clone(project_url, project_name, cfg.repositories_path)


    versions = [get_versions(project_name, cfg.arcan_graphs)[-1]]

    logger.info(f"Found {len(versions)} versions for project {project}")
    for num, sha in tqdm(versions):
        try:
            if content_extractor.clone:
                git_checkout(join(cfg.repositories_path, project_name), sha)
            project_graph = graph_loader.load(join(cfg.arcan_graphs, project_name, f'dependency-graph-{num}_{sha}.graphml'))
            res = content_extractor.extract(project_name, sha, num)
            identifiers.append(res)
        except:
            continue

    text = [" ".join(x).replace(".", " ") for x in identifiers]
    content[project_name] = text

    #repo_path = join(cfg.repository_path, project_name)
    #shutil.rmtree(repo_path, ignore_errors=True)


2022-10-26 11:18:56.302 | INFO     | __main__:<module>:4 - Extracting features for 111 projects
  0%|          | 0/111 [00:00<?, ?it/s]2022-10-26 11:18:56.305 | INFO     | __main__:<module>:17 - Found 1 versions for project nolia/Noodle

100%|██████████| 1/1 [00:00<00:00, 418.68it/s]
2022-10-26 11:18:56.312 | INFO     | __main__:<module>:17 - Found 1 versions for project srotya/sidewinder

100%|██████████| 1/1 [00:00<00:00, 48.76it/s]
2022-10-26 11:18:56.342 | INFO     | __main__:<module>:17 - Found 1 versions for project zapr-oss/druidry

100%|██████████| 1/1 [00:00<00:00, 86.35it/s]
2022-10-26 11:18:56.365 | INFO     | __main__:<module>:17 - Found 1 versions for project cinchapi/concourse

100%|██████████| 1/1 [00:00<00:00, 34.81it/s]
2022-10-26 11:18:56.402 | INFO     | __main__:<module>:17 - Found 1 versions for project DevinZ1993/SimpleDB-Database-System

100%|██████████| 1/1 [00:00<00:00, 171.60it/s]
  5%|▍         | 5/111 [00:00<00:02, 46.88it/s]2022-10-26 11:18:56.413 | INFO   

In [133]:
content

{'nolia|Noodle': ['com com noodle collection collection id storage random access file storage storage storage description storage record collection collection stored converted collection call collection gson converter storage bytes wrapper storage collection converter noodle storage encryption'],
 'srotya|sidewinder': ['',
  'sidewinder core api minuteman rpc sidewinder core functions list single series function sidewinder core predicates between predicate sidewinder core api database ops api minuteman cluster minuteman wal remote wal client sidewinder core storage series output sidewinder core functions list constant function sidewinder ingesters statsd sidewinder core utils time utils sidewinder core storage item not found exception sidewinder ingesters statsd statds decoder minuteman cluster router mod hash routing strategy sidewinder core api grafana target series sidewinder core functions list basic constant functions sidewinder ingesters statsd statsd server sidewinder core predi

In [134]:
with open("/home/sasce/PycharmProjects/CodeGraphClassification/data/raw/java_stopwords", 'rt') as inf:
    java_stopwords = {x.strip() for x in inf.readlines()}

In [135]:
java_stopwords.update(['get', 'set', 'org', 'com', 'exception', 'override', 'java', 'string',
                       'list', 'util', 'value', 'length', 'println'])

In [136]:
import yake
kw_extractor = yake.KeywordExtractor(n=1, stopwords=java_stopwords, top=50)
extracted_kw = {}

In [137]:
for project in content:
   extracted_kw[project] = kw_extractor.extract_keywords(" ".join(content[project]).lower().strip())

In [138]:
from collections import Counter

counter = Counter()
for p in extracted_kw:
    terms = [x[0] for x in extracted_kw[p]]
    counter.update(terms)

In [139]:
counter.most_common()

[('data', 55),
 ('type', 48),
 ('model', 38),
 ('factory', 37),
 ('database', 36),
 ('impl', 35),
 ('table', 34),
 ('query', 34),
 ('handler', 30),
 ('sql', 29),
 ('config', 29),
 ('connection', 28),
 ('service', 27),
 ('column', 25),
 ('result', 24),
 ('client', 23),
 ('manager', 23),
 ('server', 21),
 ('main', 21),
 ('test', 20),
 ('entity', 19),
 ('file', 18),
 ('filter', 18),
 ('activity', 18),
 ('builder', 18),
 ('api', 17),
 ('common', 17),
 ('provider', 17),
 ('request', 16),
 ('transaction', 16),
 ('application', 16),
 ('index', 16),
 ('configuration', 16),
 ('processor', 15),
 ('source', 15),
 ('node', 15),
 ('info', 15),
 ('statement', 15),
 ('user', 15),
 ('view', 15),
 ('core', 14),
 ('function', 14),
 ('reader', 14),
 ('search', 14),
 ('metadata', 14),
 ('field', 14),
 ('mapper', 14),
 ('object', 14),
 ('response', 14),
 ('schema', 14),
 ('writer', 13),
 ('cache', 13),
 ('auth', 13),
 ('dao', 13),
 ('listener', 13),
 ('event', 13),
 ('base', 13),
 ('stream', 13),
 ('securi

In [140]:

from sklearn.metrics.pairwise import cosine_similarity
import spacy

nlp = spacy.load("en_core_web_trf")
label_emb = list(nlp.pipe([LABEL]))[0]._.trf_data.tensors[-1]
keywords = [x[0] for x in counter.most_common()]
print(len(keywords))
similarities = []
for i, doc in tqdm(enumerate(nlp.pipe(keywords))):
    tokvecs = doc._.trf_data.tensors[-1]
    sim = cosine_similarity(tokvecs, label_emb)[0][0]
    similarities.append((keywords[i], sim))

1823


1823it [00:07, 234.37it/s]


In [141]:

similarities.sort(key=lambda x: -x[1])

In [142]:
len(similarities)

1823

In [143]:
similarities[:50]

[('database', 1.0),
 ('network', 0.9952489),
 ('array', 0.9949566),
 ('memory', 0.9948596),
 ('template', 0.99437517),
 ('archive', 0.99385405),
 ('activity', 0.9937778),
 ('item', 0.99373776),
 ('production', 0.993542),
 ('reader', 0.99353135),
 ('pattern', 0.9934614),
 ('instance', 0.9934313),
 ('browser', 0.99339503),
 ('selection', 0.99336755),
 ('storage', 0.9933018),
 ('layer', 0.99299926),
 ('shell', 0.9929731),
 ('record', 0.99292314),
 ('computer', 0.99290323),
 ('performance', 0.9928975),
 ('history', 0.9928813),
 ('platform', 0.9928543),
 ('console', 0.9928153),
 ('definition', 0.9927453),
 ('journal', 0.99247694),
 ('wave', 0.9924642),
 ('product', 0.9924138),
 ('callback', 0.99235606),
 ('depth', 0.99230707),
 ('software', 0.9922871),
 ('property', 0.9922705),
 ('activation', 0.9922137),
 ('collection', 0.9922061),
 ('layout', 0.9921307),
 ('container', 0.99212086),
 ('project', 0.9919642),
 ('expression', 0.9919148),
 ('process', 0.9919102),
 ('loader', 0.99187255),
 ('an

In [144]:
from feature.embedding import FastTextEmbedding

ft = FastTextEmbedding(path='/home/sasce/PycharmProjects/CodeGraphClassification/data/models/wiki.en.bin', model='fastText')



In [145]:
label_emb = [ft.get_embedding(LABEL)]
keywords = [x[0] for x in counter.most_common()]

similarities_ft = []
for i, key in tqdm(enumerate(keywords)):
    tokvecs = [ft.get_embedding(key)]
    sim = cosine_similarity(tokvecs, label_emb)[0][0]
    similarities_ft.append((keywords[i], sim))

similarities_ft.sort(key=lambda x: -x[1])

1823it [00:00, 4475.38it/s]


In [148]:
similarities_ft[:50]

[('database', 1.0),
 ('dbms', 0.57058567),
 ('query', 0.55238044),
 ('resource', 0.5301167),
 ('hmdb', 0.52991575),
 ('metadata', 0.5251692),
 ('mysql', 0.51636124),
 ('lmdb', 0.51181525),
 ('bsql', 0.5078706),
 ('sqlite', 0.5057728),
 ('mssql', 0.50572383),
 ('hsql', 0.5025117),
 ('postgresql', 0.5009709),
 ('sql', 0.49948573),
 ('dataset', 0.496668),
 ('hsqldb', 0.49662802),
 ('odbc', 0.49397185),
 ('datasource', 0.49052164),
 ('catalog', 0.4900716),
 ('dataservices', 0.48991224),
 ('queries', 0.48664963),
 ('repository', 0.48496497),
 ('jdbc', 0.4808603),
 ('hbase', 0.47848862),
 ('server', 0.4743712),
 ('backend', 0.47358575),
 ('repositories', 0.47180155),
 ('dbmetadata', 0.47095165),
 ('annotation', 0.4706476),
 ('annotate', 0.46840617),
 ('directory', 0.46519646),
 ('data', 0.46244183),
 ('browser', 0.45689577),
 ('webservice', 0.455578),
 ('mongodb', 0.45305884),
 ('sparql', 0.45140108),
 ('sqliteparser', 0.450005),
 ('indexserver', 0.44592124),
 ('orientdb', 0.4444078),
 ('log