In [1]:
import glob
import random

import numpy as np
import json
import plotly.express as px

projects_base_path = '/home/sasce/Downloads/SoftwareTopics/data/raw/'
projects_path = glob.glob(projects_base_path + '*.json')

In [2]:
def load_project_files(projects_path):
    contents = []
    project_name = []
    file_labels = []
    
    for project_path in projects_path:
        content = json.load(open(project_path, 'r'))
        project = content['name']
        files = content['versions'][0]['files']
        filenames = [file for file in files]
        identifiers = [files[file]['identifiers'] for file in filenames]
        distributions = [files[file]['annotation']['distribution'] for file in filenames]
        
        labels = [np.argmax(distribution) for distribution in distributions]
        file_labels.extend(labels)
        contents.extend([" ".join(code) for code in identifiers])
        project_name.extend([project]*len(filenames))
    return project_name, contents, file_labels

In [3]:
import re
def split_camelcase(text: str):
    return re.sub(
        '([A-Z][a-z]+)|_', r' \1', re.sub('([A-Z]+)', r' \1', text)
    ).split()


In [4]:
project_name, codes, labels = load_project_files(projects_path)

In [5]:
codes_split = [split_camelcase(code) for code in codes]
codes_split = [" ".join(code) for code in codes_split]

In [6]:
codes_split[0]



In [7]:
from sentence_transformers import SentenceTransformer

# Pre-calculate embeddings
embedding_model = SentenceTransformer("codecompletedeployment/unixcoder-base-nine")


.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/182 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/691 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/444k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/504M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/55.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/938k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [8]:
embeddings = embedding_model.encode(codes, show_progress_bar=True)

Batches:   0%|          | 0/3650 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.50 GiB. GPU 0 has a total capacty of 3.81 GiB of which 1.29 GiB is free. Process 2513 has 55.55 MiB memory in use. Including non-PyTorch memory, this process has 2.47 GiB memory in use. Of the allocated memory 2.35 GiB is allocated by PyTorch, and 44.85 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
precomputes = np.array(embeddings)
with open('precomputes_unixcoder.npy', 'wb') as f:
    np.save(f, precomputes)
#embeddings_split = embedding_model.encode(codes_split, show_progress_bar=True)

In [None]:
with open('precomputes_unixcoder.npy', 'rb') as f:
    precomputes = np.load(f)

In [None]:
from umap import UMAP
from hdbscan import HDBSCAN

umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=150, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

In [None]:

embeddings = precomputes

In [None]:
embeddings = np.array(embeddings)

In [None]:
# Pre-reduce embeddings for visualization purposes
reduced_embeddings = UMAP(n_neighbors=15, n_components=2, min_dist=0.5, metric='cosine', random_state=42).fit(embeddings)
#reduced_embeddings_split = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit(embeddings_split)

In [None]:
labels_plot = []
seen = {}
for name in project_name:
    if name not in seen:
        seen[name] = len(seen)
    labels_plot.append(seen[name])
labels_plot = np.array(labels_plot)
names = {seen[name]: name for name in seen}

In [None]:
import umap.plot
umap.plot.points(reduced_embeddings, labels=labels_plot)

In [None]:
# Pre-reduce embeddings for visualization purposes
reduced_embeddings = UMAP(n_neighbors=15, n_components=2, min_dist=0.15, metric='cosine').fit(embeddings)

umap.plot.points(reduced_embeddings, labels=labels_plot)