In [6]:
import pandas as pd
import torch
from sklearn.cluster import KMeans
from transformers import RobertaTokenizer, RobertaModel


# Load the pre-trained CodeBERT model and tokenizer
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
model = RobertaModel.from_pretrained("microsoft/codebert-base").to('cuda:0')



# df = pd.read_json("./container_data/bigvul-train.jsonl", lines=True)
df = pd.read_json("./container_data/primevul_train_cleaned_paired_full.jsonl", lines=True)
df = df[df['target'] == 1]
df = df[df['flaw_line'].str.strip() != ''] # filtering only vul lines available ones

print(len(df))
df.head()



2352


Unnamed: 0,index,processed_func,target,flaw_line
0,0,long ssl_get_algorithm2(SSL *s)\n {\n ...,1,if (TLS1_get_version(s) >= TLS1_2_VERSION &&
6,3,"getftp (struct url *u, wgint passed_expected_b...",1,bool pasv_mode_open = false;/~/pasv_mode_open ...
10,8,"add_range(fz_context *ctx, pdf_cmap *cmap, uns...",1,"add_range(ctx, cmap, high+1, new_high, tree[cu..."
12,9,"pdf_show_image(fz_context *ctx, pdf_run_proce...",1,if (image->mask)/~/if (gstate->blendmode)/~/if...
20,28,void ArthurOutputDev::drawImage(GfxState *stat...,1,buffer = (unsigned char *)gmalloc (width * hei...


In [7]:
def encode_code(code): #Tokenization + Input_id transform
    inputs = tokenizer(code, return_tensors="pt", truncation=True, padding=True).to('cuda:0')
    with torch.no_grad():
        outputs = model(**inputs)
    # Use the mean of the hidden states as the vector representation
    # return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    return outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy() # features[:, 0, :] is [cls]

encoded_code_vectors = []
index_to_vector_map = {}

i = 0
for index, row in df.iterrows():
    index = int(row['index'])
    function = str(row['processed_func'])
    vector = encode_code(function)
    encoded_code_vectors.append(vector)
    index_to_vector_map[index] = vector


encoded_code_vectors = torch.tensor(encoded_code_vectors)



In [8]:
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(encoded_code_vectors)

# Get the cluster labels for each code piece
cluster_labels = kmeans.labels_

  super()._check_params_vs_input(X, default_n_init=10)


In [9]:
df['cluster'] = cluster_labels

for cluster_number, group_df in df.groupby('cluster'):
    # file_name = f'./container_data/bigvul_vuls_cls_{cluster_number}.jsonl'
    # group_df[['index', 'processed_func', 'target', 'flaw_line', 'func_after']].to_json(file_name, orient='records', lines=True)
    file_name = f'./container_data/primevul_vuls_cls_{cluster_number}_flaw_only.jsonl'
    group_df[['index', 'processed_func', 'target', 'flaw_line']].to_json(file_name, orient='records', lines=True)


In [None]:
arr = [0,0,0,0,0,0]
for i in cluster_labels:
    arr[i]+=1

print(arr) # avg -> [1238, 1187, 1069, 1372, 3917, 0] order is [4,3,1,0,2] and kmeans indices are 0,1,2,3,4 for 5 clusters of bigvuk
 # for prime vul [558, 224, 961, 1309, 737, 0] so order is [3, 2, 4, 0, 1] and kmeans indices are 0,1,2,3,4 for 5 clusters of primevul
 # for flaw only primevul [807, 396, 308, 146, 695, 0] so order is [0, 4, 1, 2, 3] and kmeans indices are 0,1,2,3,4 for 5 clusters of primevul

[807, 396, 308, 146, 695, 0]
