In [1]:
from hydra import initialize, compose

with initialize(version_base=None, config_path="../../src/conf/"):
    cfg = compose(config_name='main.yaml', overrides=["local=default"])
    print(cfg)

{'num_workers': 1, 'num_splits': 10, 'split': 1, 'seed': 42, 'project_path': '${base_path}/data/raw/dataset_with_graphs.csv', 'extraction': {'_target_': 'src.feature.extract.NameFeatureExtraction', 'graph_path': '${arcan_graphs}', 'out_path': '${out_path}/processed/', 'stopwords': 'None'}, 'embedding': {'_target_': 'src.feature.embedding.FastTextEmbedding', 'model': 'fastText', 'path': '${base_path}/data/models/wiki.en.bin'}, 'base_path': '/home/sasce/PycharmProjects/CodeGraphClassification', 'out_path': '${base_path}/data/', 'arcan_out': '${base_path}/data/interim/', 'arcan_graphs': '${arcan_out}/arcanOutput', 'raw_data': '${out_path}/raw/', 'dataset': '${base_path}/data/raw/classification_dataset_lang.csv', 'language': 'JAVA', 'repositories_path': '${base_path}/data/raw/repositories/', 'arcan_script': '${base_path}/scripts/bash/run-arcan.sh', 'arcan_path': '${base_path}/tools/arcan', 'repository_path': '${base_path}/data/raw/repositories', 'logs_path': '${base_path}/logs/', 'model': 

In [2]:

from data.graph import ArcanGraphLoader
from os.path import join
from feature.content import NameContentExtraction
from utils import git_clone, get_versions, git_checkout, filter_by_label
from loguru import logger
import pandas as pd

LABEL = "machine learning"
projects = pd.read_csv("/home/sasce/PycharmProjects/CodeGraphClassification/data/raw/dataset_with_graphs.csv")
content_extractor = NameContentExtraction(graph_path=cfg.arcan_graphs)

In [3]:
projects.head()

Unnamed: 0,name,label,level,version,sha,nodes,edges
0,Norconex|collector-filesystem,"['search engine', 'web crawler']","[3, 5]",133,292a42769f7f6f1beaf4b178c3f7170fddfee282,68,212
1,Norconex|collector-filesystem,"['search engine', 'web crawler']","[3, 5]",63,c01cf7bf4093aa21fafc1fba93ba50ca0343c619,36,110
2,Norconex|collector-filesystem,"['search engine', 'web crawler']","[3, 5]",72,785b7928896b302fd84d81f9437ec47af5d7b330,36,110
3,Norconex|collector-filesystem,"['search engine', 'web crawler']","[3, 5]",21,b08c040bfe57cb1f3220ca487d33b0240a6564fa,39,155
4,Norconex|collector-filesystem,"['search engine', 'web crawler']","[3, 5]",115,30cb73f240f83e7a3a363d4a5a5d27c04e313907,68,212


In [4]:
projects = filter_by_label(projects.copy(deep=True), [LABEL])

2022-10-26 11:54:44.261 | INFO     | utils:filter_by_label:105 - Labels ['machine learning']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = df['label'].apply(tuple)


In [5]:
projects.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2012 entries, 907 to 59634
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     2012 non-null   object
 1   label    2012 non-null   object
 2   level    2012 non-null   object
 3   version  2012 non-null   int64 
 4   sha      2012 non-null   object
 5   nodes    2012 non-null   int64 
 6   edges    2012 non-null   int64 
dtypes: int64(3), object(4)
memory usage: 125.8+ KB


In [6]:
projects = list(set(projects['name'].tolist()))
len(projects)

100

In [7]:
from tqdm import tqdm

graph_loader = ArcanGraphLoader()
logger.info(f"Extracting features for {len(projects)} projects")
identifiers = []
content = {}
for project_name in tqdm(projects):
    project = project_name.replace('|', '/')

    project_url = f'https://github.com/{project}'
    if content_extractor.clone:
        git_clone(project_url, project_name, cfg.repositories_path)


    versions = [get_versions(project_name, cfg.arcan_graphs)[-1]]

    logger.info(f"Found {len(versions)} versions for project {project}")
    for num, sha in tqdm(versions):
        try:
            if content_extractor.clone:
                git_checkout(join(cfg.repositories_path, project_name), sha)
            project_graph = graph_loader.load(join(cfg.arcan_graphs, project_name, f'dependency-graph-{num}_{sha}.graphml'))
            res = content_extractor.extract(project_name, sha, num)
            identifiers.append(res)
        except:
            continue

    text = [" ".join(x).replace(".", " ") for x in identifiers]
    content[project_name] = text

    #repo_path = join(cfg.repository_path, project_name)
    #shutil.rmtree(repo_path, ignore_errors=True)


2022-10-26 11:54:44.835 | INFO     | __main__:<module>:4 - Extracting features for 100 projects
  0%|          | 0/100 [00:00<?, ?it/s]2022-10-26 11:54:44.838 | INFO     | __main__:<module>:17 - Found 1 versions for project gidim/Babler

100%|██████████| 1/1 [00:00<00:00, 110.98it/s]
2022-10-26 11:54:44.853 | INFO     | __main__:<module>:17 - Found 1 versions for project lsds/Crossbow

100%|██████████| 1/1 [00:00<00:00, 54.60it/s]
2022-10-26 11:54:44.882 | INFO     | __main__:<module>:17 - Found 1 versions for project elki-project/elki

  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:00<00:00,  2.55it/s][A
  3%|▎         | 3/100 [00:00<00:15,  6.22it/s]2022-10-26 11:54:45.321 | INFO     | __main__:<module>:17 - Found 1 versions for project lizhangqu/TensorflowLite

100%|██████████| 1/1 [00:00<00:00, 108.80it/s]
2022-10-26 11:54:45.334 | INFO     | __main__:<module>:17 - Found 1 versions for project viadee/javaAnchorExplainer

100%|██████████| 1/1 [00:00<00:00, 173.17

In [8]:
content

{'gidim|Babler': ['main bing main language id textcat text categorizer main language id com detect language errors main db export to text main bing azure search query main language id com detect language responses main language id cld cld2 main you tube you tube captions scraper main bing php bb scraper main bing php bb scraper bb broker main bing blogspot scraper main language id lingpipe ling pipe main collection babel broker main article extraction main http client main language id com main article extraction language with post count main bing php bb scraper bb job manager main language id lingpipe main article extraction diffbot article main mt http client main url main language id cld main article extraction post extractor main bing php bb scraper bb fetcher and saver main file opener main language id textcat main language data manager main language id textcat my properties main collection babel scraper edu columbia main file saver main language id textcat n gram entry comparator 

In [9]:
with open("/home/sasce/PycharmProjects/CodeGraphClassification/data/raw/java_stopwords", 'rt') as inf:
    java_stopwords = {x.strip() for x in inf.readlines()}

In [10]:
java_stopwords.update(['get', 'set', 'org', 'com', 'exception', 'override', 'java', 'string',
                       'list', 'util', 'value', 'length', 'println'])

In [11]:
import yake
kw_extractor = yake.KeywordExtractor(n=1, stopwords=java_stopwords, top=50)
extracted_kw = {}

In [12]:
for project in content:
   extracted_kw[project] = kw_extractor.extract_keywords(" ".join(content[project]).lower().strip())

In [13]:
from collections import Counter

counter = Counter()
for p in extracted_kw:
    terms = [x[0] for x in extracted_kw[p]]
    counter.update(terms)

In [14]:
counter.most_common()

[('data', 41),
 ('model', 40),
 ('test', 29),
 ('result', 21),
 ('function', 21),
 ('tree', 20),
 ('feature', 20),
 ('config', 20),
 ('main', 19),
 ('service', 19),
 ('classifier', 19),
 ('type', 17),
 ('activity', 17),
 ('example', 17),
 ('factory', 16),
 ('impl', 16),
 ('configuration', 14),
 ('file', 14),
 ('core', 14),
 ('text', 13),
 ('filter', 13),
 ('base', 13),
 ('response', 12),
 ('task', 12),
 ('application', 12),
 ('helper', 12),
 ('image', 12),
 ('api', 12),
 ('client', 11),
 ('random', 11),
 ('node', 11),
 ('object', 10),
 ('loss', 10),
 ('linear', 10),
 ('instance', 10),
 ('matrix', 10),
 ('stream', 10),
 ('functions', 10),
 ('handler', 9),
 ('distance', 9),
 ('vector', 9),
 ('classification', 9),
 ('metric', 9),
 ('loader', 9),
 ('flow', 9),
 ('info', 9),
 ('services', 9),
 ('parser', 9),
 ('view', 9),
 ('context', 9),
 ('models', 8),
 ('error', 8),
 ('search', 8),
 ('parameter', 8),
 ('algorithm', 8),
 ('spark', 8),
 ('analysis', 8),
 ('reader', 8),
 ('learning', 8),
 (

In [15]:

from sklearn.metrics.pairwise import cosine_similarity
import spacy

nlp = spacy.load("en_core_web_trf")
label_emb = list(nlp.pipe([LABEL]))[0]._.trf_data.tensors[-1]
keywords = [x[0] for x in counter.most_common()]
print(len(keywords))
similarities = []
for i, doc in tqdm(enumerate(nlp.pipe(keywords))):
    tokvecs = doc._.trf_data.tensors[-1]
    sim = cosine_similarity(tokvecs, label_emb)[0][0]
    similarities.append((keywords[i], sim))

2022-10-26 11:54:58.590142: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-26 11:54:58.798889: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-10-26 11:54:59.544939: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-10-26 11:54:59.545096: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or 

1783


1783it [00:09, 187.42it/s]


In [16]:

similarities.sort(key=lambda x: -x[1])

In [17]:
len(similarities)

1783

In [18]:
similarities[:50]

[('tooling', 0.98146695),
 ('turing', 0.980965),
 ('scripting', 0.97998285),
 ('scaling', 0.978513),
 ('blocking', 0.97837454),
 ('grouping', 0.97795147),
 ('cleaning', 0.97794044),
 ('racing', 0.97787106),
 ('streaming', 0.9777824),
 ('construction', 0.97773445),
 ('sorting', 0.97772104),
 ('detection', 0.9767931),
 ('utility', 0.9767429),
 ('modeler', 0.97642154),
 ('arithmetic', 0.9762926),
 ('detector', 0.9762422),
 ('structure', 0.97588086),
 ('coordination', 0.97580934),
 ('training', 0.97576797),
 ('featurepruning', 0.975617),
 ('machinelearning', 0.9753634),
 ('registry', 0.9751737),
 ('evaluation', 0.9748831),
 ('recording', 0.97487015),
 ('activity', 0.9748595),
 ('recognition', 0.97475684),
 ('configuration', 0.9741598),
 ('trainer', 0.97412884),
 ('binning', 0.9740665),
 ('polymer', 0.97387385),
 ('sampling', 0.97385454),
 ('cluster', 0.9734316),
 ('execution', 0.9733811),
 ('divergence', 0.97336006),
 ('dependency', 0.9730565),
 ('exploration', 0.9730322),
 ('boundary', 0.

In [19]:
from feature.embedding import FastTextEmbedding

ft = FastTextEmbedding(path='/home/sasce/PycharmProjects/CodeGraphClassification/data/models/wiki.en.bin', model='fastText')



In [20]:
label_emb = [ft.get_embedding(LABEL)]
keywords = [x[0] for x in counter.most_common()]

similarities_ft = []
for i, key in tqdm(enumerate(keywords)):
    tokvecs = [ft.get_embedding(key)]
    sim = cosine_similarity(tokvecs, label_emb)[0][0]
    similarities_ft.append((keywords[i], sim))

similarities_ft.sort(key=lambda x: -x[1])

1783it [00:00, 4118.75it/s]


In [21]:
similarities_ft[:50]

[('machine', 0.8008606),
 ('learning', 0.80086046),
 ('machinelearning', 0.77182347),
 ('modellearning', 0.60952586),
 ('learner', 0.59861034),
 ('learners', 0.5608892),
 ('learn', 0.5550004),
 ('turing', 0.54515105),
 ('weblearner', 0.51500314),
 ('unsupervised', 0.51103646),
 ('statemachine', 0.49880147),
 ('dataflow', 0.48761946),
 ('algorithms', 0.48238376),
 ('letterpredictor', 0.4713303),
 ('neural', 0.469069),
 ('parsing', 0.46866322),
 ('retrieval', 0.46826547),
 ('programming', 0.46619743),
 ('tool', 0.4624128),
 ('autoencoder', 0.46194303),
 ('processing', 0.46188936),
 ('tasks', 0.4562919),
 ('backpropagation', 0.45508614),
 ('mainframe', 0.45230386),
 ('initialization', 0.44877338),
 ('optimizer', 0.44834492),
 ('prover', 0.44755194),
 ('innovation', 0.44578502),
 ('languagetool', 0.44306433),
 ('debugging', 0.44161376),
 ('algorithm', 0.43994573),
 ('recursive', 0.43767765),
 ('dataprocessing', 0.43338186),
 ('reconfigurable', 0.4320647),
 ('visualization', 0.4318238),
 ('