In [26]:
import pandas as pd
import torch
from sklearn.cluster import KMeans
from transformers import RobertaTokenizer, RobertaModel


# Load the pre-trained CodeBERT tokenizer and encoder.
# Prefer the first CUDA device, but fall back to the CPU so the notebook still
# runs (slowly) on machines without a GPU instead of crashing in `.to(...)`.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
model = RobertaModel.from_pretrained("microsoft/codebert-base").to(device)



# Read the BigVul training split (JSON Lines) and keep only the rows whose
# `target` column equals 1.
df = pd.read_json("./container_data/bigvul-train.jsonl", lines=True).loc[
    lambda frame: frame['target'] == 1
]
print(df.shape[0])  # number of rows kept after filtering
df.head()

8783


Unnamed: 0,index,target,processed_func,func_after,flaw_line
0,186765,1,BrowserContext* SharedWorkerDevToolsAgentHost:...,BrowserContext* SharedWorkerDevToolsAgentHost...,RenderProcessHost* rph = GetProcess();
49,179392,1,"int perf_config(config_fn_t fn, void *data)\n{...","int perf_config(config_fn_t fn, void *data)\n...",\tchar *repo_config = NULL;/~/\trepo_config = ...
54,182089,1,static int __videobuf_mmap_mapper(struct video...,static int __videobuf_mmap_mapper(struct video...,\tmap = q->bufs[first]->map = kmalloc(sizeof(s...
58,183738,1,void CloudPolicyController::SetState(\nCloudPo...,void CloudPolicyController::SetState(\n C...,backend_.reset(); // Discard any pending re...
106,183899,1,void HTMLElement::setOuterHTML(const String& h...,void HTMLElement::setOuterHTML(const String& h...,RefPtr<DocumentFragment> fragment = create...


In [29]:
def encode_code(code):
    """Embed a piece of source code with the module-level CodeBERT model.

    Args:
        code: Source text handed directly to the module-level ``tokenizer``
            (called with ``truncation=True`` and ``padding=True``).

    Returns:
        numpy.ndarray: the last-layer hidden state at token position 0 (the
        [CLS] slot), squeezed and moved to the CPU. For a single snippet the
        batch axis is squeezed away, leaving a 1-D vector.
    """
    # Tokenize to PyTorch tensors and place them on whatever device the model
    # lives on, so this works on CPU or GPU without a hard-coded device name.
    inputs = tokenizer(code, return_tensors="pt", truncation=True, padding=True).to(model.device)
    with torch.no_grad():  # inference only; no autograd graph needed
        outputs = model(**inputs)
    # Position 0 of the sequence axis is the [CLS] token; it is used as the
    # snippet-level representation. (Mean-pooling over the sequence axis,
    # `outputs.last_hidden_state.mean(dim=1)`, would be the alternative.)
    return outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()

# Embed every function in `df` with `encode_code`.
#   cls_vectors          - embeddings in the same order as the rows of `df`
#   index_to_vector_map  - dataset `index` value -> embedding of that function
cls_vectors = []
index_to_vector_map = {}

# Iterate over just the two columns that are needed instead of materialising a
# full row Series per iteration with `iterrows()`.
for func_index, function in zip(df['index'], df['processed_func']):
    vector = encode_code(str(function))
    cls_vectors.append(vector)
    index_to_vector_map[int(func_index)] = vector

# Stack once into a single tensor with one row per function. Calling
# `torch.tensor` on a Python list of NumPy arrays is very slow and emits a
# warning; converting each array and stacking avoids that path.
# An empty input still produces an empty tensor rather than raising.
encoded_code_vectors = (
    torch.stack([torch.as_tensor(vector) for vector in cls_vectors])
    if cls_vectors
    else torch.empty(0)
)



In [None]:
# Cluster the embeddings into 5 groups; `fit` returns the fitted estimator, so
# it can be chained directly onto the constructor. The seed is fixed at 42.
kmeans = KMeans(n_clusters=5, random_state=42).fit(encoded_code_vectors)

# One cluster id per embedded function, in the order the embeddings were passed in
cluster_labels = kmeans.labels_

In [31]:
# Attach each function's cluster id to its row, then write one JSON Lines file
# per cluster containing only the selected columns.
df['cluster'] = cluster_labels

export_columns = ['index', 'processed_func', 'target', 'flaw_line', 'func_after']
for cluster_id, members in df.groupby('cluster'):
    out_path = f'./container_data/bigvul_vuls_cls_{cluster_id}.jsonl'
    members[export_columns].to_json(out_path, orient='records', lines=True)


In [33]:
# Count how many functions were assigned to each cluster id.
# Six slots are allocated; with the 5-cluster KMeans fit in this notebook only
# ids 0-4 occur, so the final slot stays 0.
arr = [0] * 6
for label in cluster_labels:
    arr[label] += 1

# Author's note from earlier runs:
# avg -> [1238, 1187, 1069, 1372, 3917, 0] order is [4,3,1,0,2] and kmeans indices are 0,1,2,3,4 for 5 clusters
print(arr)

[1351, 1510, 540, 1984, 3398, 0]
