# Imports 

In [2]:
import os
import sys
import copy
import glob
import tqdm
from torch import nn
import random
import torch
import platform
from typing import Callable, List, Optional, Dict
import numpy as np
import scipy.sparse as sp

import warnings
warnings.filterwarnings('ignore')

from transformers import AutoTokenizer, AutoModel

import torch_geometric
from torch_geometric.data import (
    Data,
    InMemoryDataset,
    Batch
    )
import torch_geometric.datasets as datasets
import torch_geometric.transforms as transforms
from torch_geometric.data import Data
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GATConv
from torch_geometric.nn import global_mean_pool

# Helper function for visualization.
%matplotlib inline
import networkx as nx
import matplotlib.pyplot as plt
from torch_geometric.utils import to_networkx

import umap
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.cluster import Birch
from sklearn.cluster import SpectralClustering

from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, silhouette_score

# To ensure determinism
seed = 1234
def seed_everything(seed: int):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.backends.cudnn.deterministic = True
seed_everything(seed)

# Check versions
print(torch.__version__)
print(torch.version.cuda)
print(platform.python_version())
print(torch_geometric.__version__)



1.8.1+cu101
10.1
3.8.18
1.7.0


# Evalute Using CodeBERT

In [2]:
import gc

#Set GPU
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3,4,5"
device = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu')

# Initialize the models
codebert_tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
codebert_model = AutoModel.from_pretrained("microsoft/codebert-base")
codebert_model = codebert_model.to(device)
max_source_length= 512

def get_code_embeddings_from_codebert(codelines):
    gc.collect()
    torch.cuda.empty_cache()
    code = " ".join(codelines)
    source_tokens = codebert_tokenizer.tokenize(code)[:max_source_length-2]
    source_tokens = [codebert_tokenizer.cls_token]+source_tokens+[codebert_tokenizer.sep_token]
    source_ids =  codebert_tokenizer.convert_tokens_to_ids(source_tokens) 
    padding_length = max_source_length - len(source_ids)
    source_ids+=[codebert_tokenizer.pad_token_id]*padding_length
    source_ids = torch.tensor(source_ids)
    
    # tokens = []
    # for code_line in codelines:
    #     code_tokens = codebert_tokenizer.tokenize(code_line, truncation=True, max_length=510)
    #     if tokens == []:
    #         tokens = [codebert_tokenizer.cls_token] + code_tokens
    #     else:
    #         tokens = tokens + [codebert_tokenizer.sep_token] + code_tokens
    # tokens = tokens + [codebert_tokenizer.eos_token]
    # tokens_ids = torch.tensor(codebert_tokenizer.convert_tokens_to_ids(tokens))
    source_ids = source_ids.to(device)
    context_embeddings = codebert_model(source_ids[None,:])
    cls_token_embedding = context_embeddings.last_hidden_state[0,0,:]
    return cls_token_embedding

In [3]:
import gc
from unixcoder import UniXcoder

#Set GPU
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3,4,5"
device = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu')

# Initialize the models
unixcoder_model = UniXcoder("microsoft/unixcoder-base")
unixcoder_model = unixcoder_model.to(device)
max_source_length= 512

def get_code_embeddings_from_unixcoder(codelines):
    gc.collect()
    torch.cuda.empty_cache()
    code = " ".join(codelines)
    tokens_ids = unixcoder_model.tokenize([code], max_length=512, mode="<encoder-only>")
    source_ids = torch.tensor(tokens_ids).to(device)
    tokens_embeddings, code_embedding = unixcoder_model(source_ids)
    return torch.flatten(code_embedding)

In [4]:
def get_embeddings_from_llms(folders, model):
  embeddings = {}
  for label, folder in tqdm.tqdm(enumerate(folders)):
    folder_name = folder.strip().split("/")[-1]
    print("\nProcessing: {}\n".format(folder_name))
    embeddings[folder_name] = []
    files = glob.glob(os.path.join(folder, '*/*.java'))
    print("\nNumber of files: {}\n".format(len(files)))
    count = 1
    for file in files:
      sample_name = file.split("/")[-2].strip()
      file_name = file.split("/")[-1].strip()
      if(count % 5 == 0):
          print("\nAt file: {}\n".format(count))
                        
      fp = open(file,'r')
      lines = fp.readlines()
      lines = [line for line in lines if not line.startswith("import") and not len(line.strip('\n')) == 0]
      lines = [line.strip('\n').strip(" ") for line in lines]
      if model == "codebert":
        embedding = get_code_embeddings_from_codebert(lines)
      elif model == "unixcoder":
        embedding = get_code_embeddings_from_unixcoder(lines)
      embedding = embedding.detach().numpy()
      embeddings[folder_name].append([file_name, sample_name, embedding])
      count += 1
    
  return embeddings

In [12]:
OUTPUT_FOLDER_LOCATION = "/home/siddharthsa/cs21mtech12001-Tamal/API-Misuse-Prediction/PDG-gen/Repository/Code_kernel_data/after_relabeling"
project_folders = [os.path.join(OUTPUT_FOLDER_LOCATION, name) for name in os.listdir(OUTPUT_FOLDER_LOCATION) if os.path.isdir(os.path.join(OUTPUT_FOLDER_LOCATION, name))]

embeddings = get_embeddings_from_llms(project_folders, "codebert")

0it [00:00, ?it/s]


Processing: Lock.lock


Number of files: 6


At file: 5



1it [00:02,  2.21s/it]


Processing: ExecutorService.submit


Number of files: 26


At file: 5


At file: 10


At file: 15


At file: 20


At file: 25



2it [00:11,  5.81s/it]


In [20]:
ground_truth_cluster_numbers = {}
for folder in project_folders:
    folder_name = folder.strip().split("/")[-1]
    ground_truth_cluster_numbers[folder_name] = len([name for name in os.listdir(folder) if os.path.isdir(os.path.join(folder, name))])
print(ground_truth_cluster_numbers)

{'Lock.lock': 2, 'ExecutorService.submit': 6}


In [8]:
from sklearn.cluster import Birch

def cluster_and_compare(embeddings, ground_truth_cluster_number):
    
    birch_model = Birch(n_clusters = ground_truth_cluster_number)
    clusters_birch = birch_model.fit_predict([emb[2] for emb in embeddings])
    
    total_count, currect_count, wrong_count = 0, 0, 0
    both_right, both_wrong = 0, 0
    confusion_matrix = {"TP": 0, "TN": 0, "FP": 0, "FN": 0}

    original_one_final_two = []
    original_two_final_one = []
    
    for i in tqdm.tqdm(range(len(embeddings))):
        for j in range(i+1, len(embeddings)):
            total_count += 1
            if (embeddings[i][1] == embeddings[j][1]):
                if (clusters_birch[i] == clusters_birch[j]):
                    both_right += 1
                    currect_count += 1
                    confusion_matrix["TP"] += 1
                else:
                    #original_one_final_two.append([embeddings[i][0], embeddings[j][0], embeddings[i][1], clusters_birch[i], clusters_birch[j]])
                    wrong_count += 1
                    confusion_matrix["FN"] += 1
            else:
                if (clusters_birch[i] != clusters_birch[j]):
                    both_wrong += 1
                    currect_count += 1
                    confusion_matrix["TN"] += 1
                else:
                    #original_two_final_one.append([embeddings[i][0], embeddings[j][0], embeddings[i][1], embeddings[j][1], clusters_birch[i]])
                    wrong_count += 1
                    confusion_matrix["FP"] += 1
                    
    print("total_count = {}, currect_count = {}, wrong_count = {}, both_right = {}, both_wrong = {}".format(total_count, currect_count, wrong_count, both_right, both_wrong))
    print(confusion_matrix)
    precision = float(format(confusion_matrix["TP"] / (confusion_matrix["TP"] + confusion_matrix["FP"]), ".3f"))
    recall = float(format(confusion_matrix["TP"] / (confusion_matrix["TP"] + confusion_matrix["FN"]), ".3f"))
    f1_score = float(format(2 * (precision * recall) / (precision + recall), ".3f"))
    accuracy = float(format(currect_count/total_count, ".3f"))
    print("Precision: {}, Recall: {} and F1-Score: {}".format(precision, recall, f1_score))
    print("Accuracy: {}".format(accuracy))
    
    return precision, recall, f1_score, accuracy

In [50]:
total_precision, total_recall, total_f1_score, total_accuracy = 0, 0, 0, 0

for folder in project_folders:
    folder_name = folder.strip().split("/")[-1]
    print("\n\nAnalyzing: ", folder_name)
    precision, recall, f1_score, accuracy = cluster_and_compare(embeddings[folder_name], ground_truth_cluster_numbers[folder_name])
    total_precision += precision
    total_recall += recall
    total_f1_score += f1_score
    total_accuracy += accuracy
    
print("\n\nAverage Precision: {}, Recall: {} and F1-Score: {} and Accuracy: {}".format(total_precision/len(project_folders), total_recall/len(project_folders), total_f1_score/len(project_folders), total_accuracy/len(project_folders)))



Analyzing:  Lock.lock


100%|██████████| 6/6 [00:00<00:00, 35246.25it/s]


total_count = 15, currect_count = 10, wrong_count = 5, both_right = 10, both_wrong = 0
{'TP': 10, 'TN': 0, 'FP': 5, 'FN': 0}
Precision: 0.667, Recall: 1.0 and F1-Score: 0.8
Accuracy: 0.667


Analyzing:  ExecutorService.submit


100%|██████████| 26/26 [00:00<00:00, 13895.50it/s]

total_count = 325, currect_count = 230, wrong_count = 95, both_right = 12, both_wrong = 218
{'TP': 12, 'TN': 218, 'FP': 46, 'FN': 49}
Precision: 0.207, Recall: 0.197 and F1-Score: 0.202
Accuracy: 0.708


Average Precision: 0.437, Recall: 0.5985 and F1-Score: 0.501 and Accuracy: 0.6875





# Evaluate with UnixCoder

In [5]:
OUTPUT_FOLDER_LOCATION = "/home/siddharthsa/cs21mtech12001-Tamal/API-Misuse-Prediction/PDG-gen/Repository/Code_kernel_data/after_relabeling"
project_folders = [os.path.join(OUTPUT_FOLDER_LOCATION, name) for name in os.listdir(OUTPUT_FOLDER_LOCATION) if os.path.isdir(os.path.join(OUTPUT_FOLDER_LOCATION, name))]

embeddings = get_embeddings_from_llms(project_folders, "unixcoder")

0it [00:00, ?it/s]


Processing: Lock.lock


Number of files: 6


At file: 5



1it [00:01,  1.46s/it]


Processing: ExecutorService.submit


Number of files: 26


At file: 5


At file: 10


At file: 15


At file: 20


At file: 25



2it [00:08,  4.04s/it]


In [6]:
ground_truth_cluster_numbers = {}
for folder in project_folders:
    folder_name = folder.strip().split("/")[-1]
    ground_truth_cluster_numbers[folder_name] = len([name for name in os.listdir(folder) if os.path.isdir(os.path.join(folder, name))])
print(ground_truth_cluster_numbers)

{'Lock.lock': 2, 'ExecutorService.submit': 6}


In [9]:
total_precision, total_recall, total_f1_score, total_accuracy = 0, 0, 0, 0

for folder in project_folders:
    folder_name = folder.strip().split("/")[-1]
    print("\n\nAnalyzing: ", folder_name)
    precision, recall, f1_score, accuracy = cluster_and_compare(embeddings[folder_name], ground_truth_cluster_numbers[folder_name])
    total_precision += precision
    total_recall += recall
    total_f1_score += f1_score
    total_accuracy += accuracy
    
print("\n\nAverage Precision: {}, Recall: {} and F1-Score: {} and Accuracy: {}".format(total_precision/len(project_folders), total_recall/len(project_folders), total_f1_score/len(project_folders), total_accuracy/len(project_folders)))



Analyzing:  Lock.lock


100%|██████████| 6/6 [00:00<00:00, 41803.69it/s]


total_count = 15, currect_count = 6, wrong_count = 9, both_right = 4, both_wrong = 2
{'TP': 4, 'TN': 2, 'FP': 3, 'FN': 6}
Precision: 0.571, Recall: 0.4 and F1-Score: 0.47
Accuracy: 0.4


Analyzing:  ExecutorService.submit


100%|██████████| 26/26 [00:00<00:00, 96591.59it/s]

total_count = 325, currect_count = 236, wrong_count = 89, both_right = 17, both_wrong = 219
{'TP': 17, 'TN': 219, 'FP': 45, 'FN': 44}
Precision: 0.274, Recall: 0.279 and F1-Score: 0.276
Accuracy: 0.726


Average Precision: 0.4225, Recall: 0.3395 and F1-Score: 0.373 and Accuracy: 0.563



