In [1]:
from hydra import initialize, compose

with initialize(version_base=None, config_path="../../src/conf/"):
    cfg = compose(config_name='main.yaml', overrides=["local=default"])
    print(cfg)

{'num_workers': 1, 'num_splits': 10, 'split': 1, 'seed': 42, 'project_path': '${base_path}/data/raw/dataset_with_graphs.csv', 'extraction': {'_target_': 'src.feature.extract.NameFeatureExtraction', 'graph_path': '${arcan_graphs}', 'out_path': '${out_path}/processed/', 'stopwords': 'None'}, 'embedding': {'_target_': 'src.feature.embedding.FastTextEmbedding', 'model': 'fastText', 'path': '${base_path}/data/models/wiki.en.bin'}, 'base_path': '/home/sasce/PycharmProjects/CodeGraphClassification', 'out_path': '${base_path}/data/', 'arcan_out': '${base_path}/data/interim/', 'arcan_graphs': '${arcan_out}/arcanOutput', 'raw_data': '${out_path}/raw/', 'dataset': '${base_path}/data/raw/classification_dataset_lang.csv', 'language': 'JAVA', 'repositories_path': '${base_path}/data/raw/repositories/', 'arcan_script': '${base_path}/scripts/bash/run-arcan.sh', 'arcan_path': '${base_path}/tools/arcan', 'repository_path': '${base_path}/data/raw/repositories', 'logs_path': '${base_path}/logs/', 'model': 

In [2]:

from data.graph import ArcanGraphLoader
from os.path import join
from feature.content import IdentifiersContentExtraction
from utils import git_clone, get_versions, git_checkout, filter_by_label
from loguru import logger
import pandas as pd

projects = pd.read_csv("/home/sasce/PycharmProjects/CodeGraphClassification/data/raw/dataset_with_graphs.csv")
content_extractor = IdentifiersContentExtraction(graph_path=cfg.arcan_graphs,
                                                 repo_path=cfg.repository_path)

In [3]:
projects.head()

Unnamed: 0,name,label,level,version,sha,nodes,edges
0,Norconex|collector-filesystem,"['search engine', 'web crawler']","[3, 5]",133,292a42769f7f6f1beaf4b178c3f7170fddfee282,68,212
1,Norconex|collector-filesystem,"['search engine', 'web crawler']","[3, 5]",63,c01cf7bf4093aa21fafc1fba93ba50ca0343c619,36,110
2,Norconex|collector-filesystem,"['search engine', 'web crawler']","[3, 5]",72,785b7928896b302fd84d81f9437ec47af5d7b330,36,110
3,Norconex|collector-filesystem,"['search engine', 'web crawler']","[3, 5]",21,b08c040bfe57cb1f3220ca487d33b0240a6564fa,39,155
4,Norconex|collector-filesystem,"['search engine', 'web crawler']","[3, 5]",115,30cb73f240f83e7a3a363d4a5a5d27c04e313907,68,212


In [4]:
projects = filter_by_label(projects.copy(deep=True), ['machine learning'])

2022-10-26 09:05:18.330 | INFO     | utils:filter_by_label:105 - Labels ['machine learning']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = df['label'].apply(tuple)


In [5]:
projects.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2012 entries, 907 to 59634
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     2012 non-null   object
 1   label    2012 non-null   object
 2   level    2012 non-null   object
 3   version  2012 non-null   int64 
 4   sha      2012 non-null   object
 5   nodes    2012 non-null   int64 
 6   edges    2012 non-null   int64 
dtypes: int64(3), object(4)
memory usage: 125.8+ KB


In [6]:
projects = set(projects['name'].tolist())
len(projects)

100

In [7]:
from tqdm import tqdm
import shutil

graph_loader = ArcanGraphLoader()
logger.info(f"Extracting features for {len(projects)} projects")
identifiers = []
keywords = {}
for project_name in tqdm(projects):
    project = project_name.replace('|', '/')

    project_url = f'https://github.com/{project}'
    git_clone(project_url, project_name, cfg.repositories_path)


    versions = [get_versions(project_name, cfg.arcan_graphs)[-1]]

    logger.info(f"Found {len(versions)} versions for project {project}")
    for num, sha in tqdm(versions):
        try:
            if content_extractor.clone:
                git_checkout(join(cfg.repositories_path, project_name), sha)
            project_graph = graph_loader.load(join(cfg.arcan_graphs, project_name, f'dependency-graph-{num}_{sha}.graphml'))
            res = content_extractor.extract(project_name, sha, num)
            identifiers.append(res)
        except:
            continue

    content = [" ".join(x) for x in identifiers]
    keywords[project_name] = content

    #repo_path = join(cfg.repository_path, project_name)
    #shutil.rmtree(repo_path, ignore_errors=True)


2022-10-26 09:05:18.878 | INFO     | __main__:<module>:5 - Extracting features for 100 projects
  0%|          | 0/100 [00:00<?, ?it/s]Cloning into '/home/sasce/PycharmProjects/CodeGraphClassification/data/raw/repositories/vishnugh|evo-NEAT'...
2022-10-26 09:05:19.504 | INFO     | __main__:<module>:17 - Found 1 versions for project vishnugh/evo-NEAT

  0%|          | 0/1 [00:00<?, ?it/s][ANote: switching to '8cb412eb1faaefa713c09a94ecd585c64f23b38d'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by switching back to a branch.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -c with the switch command. Example:

  git switch -c <new-branch-name>

Or undo this operation with:

  git switch -

Turn off this advice by setting config variable advice.detachedHead to false

HEAD is now at 8cb412e docs

In [8]:
with open("/home/sasce/PycharmProjects/CodeGraphClassification/data/raw/java_stopwords", 'rt') as inf:
    java_stopwords = {x.strip() for x in inf.readlines()}

In [9]:
java_stopwords.update(['get', 'set', 'org', 'com', 'exception', 'override', 'java', 'string',
                       'list', 'util', 'value', 'length', 'println'])

In [10]:
import yake
kw_extractor = yake.KeywordExtractor(n=1, stopwords=java_stopwords, top=50)
extracted_kw = {}

In [11]:
for project in keywords:
   extracted_kw[project] = kw_extractor.extract_keywords(" ".join(keywords[project]).lower().strip())

In [12]:
from collections import Counter

counter = Counter()
for p in extracted_kw:
    terms = [x[0] for x in extracted_kw[p]]
    counter.update(terms)

In [13]:
counter.most_common()

[('name', 66),
 ('data', 63),
 ('size', 61),
 ('type', 55),
 ('array', 54),
 ('add', 54),
 ('file', 54),
 ('map', 52),
 ('model', 50),
 ('result', 45),
 ('index', 39),
 ('input', 35),
 ('text', 30),
 ('path', 30),
 ('stream', 30),
 ('log', 29),
 ('output', 29),
 ('builder', 29),
 ('label', 28),
 ('node', 27),
 ('out', 27),
 ('count', 27),
 ('object', 27),
 ('logger', 26),
 ('integer', 25),
 ('num', 25),
 ('max', 25),
 ('key', 23),
 ('instance', 22),
 ('context', 22),
 ('create', 21),
 ('vector', 21),
 ('config', 20),
 ('put', 20),
 ('system', 20),
 ('request', 19),
 ('configuration', 18),
 ('image', 18),
 ('core', 17),
 ('info', 17),
 ('min', 17),
 ('tree', 16),
 ('hash', 16),
 ('line', 16),
 ('test', 16),
 ('field', 15),
 ('response', 15),
 ('train', 15),
 ('android', 15),
 ('start', 15),
 ('weight', 14),
 ('error', 14),
 ('height', 14),
 ('args', 14),
 ('matrix', 14),
 ('feature', 14),
 ('reader', 14),
 ('entry', 13),
 ('width', 13),
 ('names', 13),
 ('score', 13),
 ('parameters', 13

In [14]:
kw_extractor = yake.KeywordExtractor(n=1, stopwords=java_stopwords, top=50)
all_text = " ".join([" ".join(keywords[project]) for project in keywords])

all_keywords =  kw_extractor.extract_keywords(all_text)

In [15]:
for x in all_keywords:
    print(x)

('data', 2.5364163643828056e-08)
('name', 3.249414488485974e-08)
('type', 4.8664492873375205e-08)
('add', 5.944575887143773e-08)
('file', 6.063323783804538e-08)
('size', 6.553095029130837e-08)
('result', 7.124376940519278e-08)
('map', 8.854624674965106e-08)
('model', 9.227293667043316e-08)
('index', 9.760248942706905e-08)
('num', 1.003353467599736e-07)
('instance', 1.0478115211397847e-07)
('object', 1.1709826776616213e-07)
('node', 1.2902499287222149e-07)
('config', 1.4609188716585373e-07)
('input', 1.518467073108732e-07)
('instances', 1.6075679225146214e-07)
('array', 1.6452512673919166e-07)
('builder', 1.6622123735069255e-07)
('path', 1.780427693970337e-07)
('output', 1.975313558500376e-07)
('option', 2.0619667109775861e-07)
('max', 2.2310033336598874e-07)
('options', 2.5951338700805383e-07)
('vector', 2.625128668876674e-07)
('log', 2.849889020163885e-07)
('count', 2.856821810054825e-07)
('matrix', 3.0034718289317545e-07)
('names', 3.019885924266424e-07)
('attribute', 3.2237444323351

In [37]:

from sklearn.metrics.pairwise import cosine_similarity
import spacy

nlp = spacy.load("en_core_web_trf")
label_emb = list(nlp.pipe(['machine learning']))[0]._.trf_data.tensors[-1]
keywords = [x[0] for x in counter.most_common()]
print(len(keywords))
similarities = []
for i, doc in tqdm(enumerate(nlp.pipe(keywords))):
    tokvecs = doc._.trf_data.tensors[-1]
    sim = cosine_similarity(tokvecs, label_emb)[0][0]
    similarities.append((keywords[i], sim))

1539


1539it [00:10, 152.52it/s]


In [38]:

similarities.sort(key=lambda x: -x[1])

In [39]:
len(similarities)

1539

In [54]:
similarities[:50]

[('turing', 0.9809649),
 ('instruction', 0.9800265),
 ('streaming', 0.9777824),
 ('construction', 0.97773445),
 ('detection', 0.9767931),
 ('utility', 0.976743),
 ('modeler', 0.9764216),
 ('scanner', 0.97631496),
 ('detector', 0.9762422),
 ('structure', 0.97588086),
 ('training', 0.97576797),
 ('mutation', 0.97568756),
 ('machinelearning', 0.9753635),
 ('specification', 0.97524905),
 ('registry', 0.9751737),
 ('planner', 0.9750017),
 ('evaluation', 0.9748831),
 ('activity', 0.9748595),
 ('recognition', 0.97475684),
 ('rotation', 0.9743566),
 ('pattern', 0.97430557),
 ('configuration', 0.9741598),
 ('trainer', 0.97412884),
 ('animation', 0.97403884),
 ('polymer', 0.97387385),
 ('sampling', 0.97385454),
 ('mining', 0.973853),
 ('trait', 0.9737062),
 ('capacity', 0.9735535),
 ('cluster', 0.9734316),
 ('tracking', 0.97341496),
 ('execution', 0.9733811),
 ('taste', 0.97310764),
 ('dependency', 0.9730565),
 ('analysis', 0.9729489),
 ('storage', 0.97260875),
 ('regularization', 0.97256637),
 

In [55]:
from feature.embedding import FastTextEmbedding

ft = FastTextEmbedding(path='/home/sasce/PycharmProjects/CodeGraphClassification/data/models/wiki.en.bin', model='fastText')



In [58]:
label_emb = [ft.get_embedding('machine learning')]
keywords = [x[0] for x in counter.most_common()]

similarities_ft = []
for i, key in tqdm(enumerate(keywords)):
    tokvecs = [ft.get_embedding(key)]
    sim = cosine_similarity(tokvecs, label_emb)[0][0]
    similarities_ft.append((keywords[i], sim))

similarities_ft.sort(key=lambda x: -x[1])

1539it [00:00, 3457.35it/s]


In [61]:
similarities_ft[:50]

[('learning', 0.80086046),
 ('machinelearning', 0.77182347),
 ('learner', 0.59861034),
 ('learners', 0.5608892),
 ('learn', 0.5550004),
 ('turing', 0.54515105),
 ('weblearner', 0.51500314),
 ('instruction', 0.500135),
 ('dataflow', 0.48761946),
 ('neural', 0.469069),
 ('retrieval', 0.46826547),
 ('learnlib', 0.46304932),
 ('tool', 0.4624128),
 ('processing', 0.46188936),
 ('tasks', 0.4562919),
 ('innovation', 0.44578502),
 ('algorithm', 0.43994573),
 ('visualization', 0.4318238),
 ('tools', 0.4295457),
 ('visualizer', 0.42692038),
 ('inference', 0.42638168),
 ('mapreduce', 0.42484185),
 ('modeler', 0.4239036),
 ('opencv', 0.42344636),
 ('recognizer', 0.4231896),
 ('iterator', 0.42285484),
 ('predictive', 0.4199339),
 ('extractor', 0.41926408),
 ('tokenizer', 0.41307795),
 ('processor', 0.4125301),
 ('memory', 0.41247398),
 ('javacpp', 0.41011852),
 ('communication', 0.41005751),
 ('training', 0.4083612),
 ('ytklearn', 0.4064971),
 ('encoder', 0.4046787),
 ('propositional', 0.40421215),