# Imports 

In [1]:
import os
import sys
import copy
import glob
import tqdm
from torch import nn
import random
import torch
import platform
from typing import Callable, List, Optional, Dict
import numpy as np
import scipy.sparse as sp

import warnings
warnings.filterwarnings('ignore')

from transformers import AutoTokenizer, AutoModel

import torch_geometric
from torch_geometric.data import (
    Data,
    InMemoryDataset,
    Batch
    )
import torch_geometric.datasets as datasets
import torch_geometric.transforms as transforms
from torch_geometric.data import Data
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GATConv
from torch_geometric.nn import global_mean_pool

# Helper function for visualization.
%matplotlib inline
import networkx as nx
import matplotlib.pyplot as plt
from torch_geometric.utils import to_networkx

import umap
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.cluster import Birch
from sklearn.cluster import SpectralClustering

from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, silhouette_score

# To ensure determinism
seed = 1234
def seed_everything(seed: int):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.backends.cudnn.deterministic = True
seed_everything(seed)

# Check versions
print(torch.__version__)
print(torch.version.cuda)
print(platform.python_version())
print(torch_geometric.__version__)



1.8.1+cu101
10.1
3.8.18
1.7.0


# Evalute Using CodeBERT

In [120]:
import gc

#Set GPU
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3,4,5"
device = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu')

# Initialize the models
codebert_tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
codebert_model = AutoModel.from_pretrained("microsoft/codebert-base")
codebert_model = codebert_model.to(device)
max_source_length= 512

def get_code_embeddings_from_codebert(codelines):
    gc.collect()
    torch.cuda.empty_cache()
    code = " ".join(codelines)
    source_tokens = codebert_tokenizer.tokenize(code)[:max_source_length-2]
    source_tokens = [codebert_tokenizer.cls_token]+source_tokens+[codebert_tokenizer.sep_token]
    source_ids =  codebert_tokenizer.convert_tokens_to_ids(source_tokens) 
    padding_length = max_source_length - len(source_ids)
    source_ids+=[codebert_tokenizer.pad_token_id]*padding_length
    source_ids = torch.tensor(source_ids)
    
    # tokens = []
    # for code_line in codelines:
    #     code_tokens = codebert_tokenizer.tokenize(code_line, truncation=True, max_length=510)
    #     if tokens == []:
    #         tokens = [codebert_tokenizer.cls_token] + code_tokens
    #     else:
    #         tokens = tokens + [codebert_tokenizer.sep_token] + code_tokens
    # tokens = tokens + [codebert_tokenizer.eos_token]
    # tokens_ids = torch.tensor(codebert_tokenizer.convert_tokens_to_ids(tokens))
    source_ids = source_ids.to(device)
    context_embeddings = codebert_model(source_ids[None,:])
    cls_token_embedding = context_embeddings.last_hidden_state[0,0,:]
    return cls_token_embedding

In [121]:
import gc
from unixcoder import UniXcoder

#Set GPU
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3,4,5"
device = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu')

# Initialize the models
unixcoder_model = UniXcoder("microsoft/unixcoder-base")
unixcoder_model = unixcoder_model.to(device)
max_source_length= 512

def get_code_embeddings_from_unixcoder(codelines):
    gc.collect()
    torch.cuda.empty_cache()
    code = " ".join(codelines)
    tokens_ids = unixcoder_model.tokenize([code], max_length=512, mode="<encoder-only>")
    source_ids = torch.tensor(tokens_ids).to(device)
    tokens_embeddings, code_embedding = unixcoder_model(source_ids)
    return torch.flatten(code_embedding)

In [122]:
def get_embeddings_from_llms(folders, model):
  embeddings = {}
  for label, folder in tqdm.tqdm(enumerate(folders)):
    folder_name = folder.strip().split("/")[-1]
    print("\nProcessing: {}\n".format(folder_name))
    embeddings[folder_name] = []
    files = glob.glob(os.path.join(folder, '*/*.java'))
    print("\nNumber of files: {}\n".format(len(files)))
    count = 1
    for file in files:
      sample_name = file.split("/")[-2].strip()
      file_name = file.split("/")[-1].strip()
      if(count % 5 == 0):
          print("\nAt file: {}\n".format(count))
                        
      fp = open(file,'r')
      lines = fp.readlines()
      lines = [line for line in lines if not line.startswith("import") and not len(line.strip('\n')) == 0]
      lines = [line.strip('\n').strip(" ") for line in lines]
      if model == "codebert":
        embedding = get_code_embeddings_from_codebert(lines)
      elif model == "unixcoder":
        embedding = get_code_embeddings_from_unixcoder(lines)
      embedding = embedding.detach().numpy()
      embeddings[folder_name].append([file_name, sample_name, embedding])
      count += 1
    
  return embeddings

In [123]:
OUTPUT_FOLDER_LOCATION = "/home/siddharthsa/cs21mtech12001-Tamal/API-Misuse-Prediction/PDG-gen/Repository/Benchmarks/Code-Kernel-Relabelled/after_preprocessing"
project_folders = [os.path.join(OUTPUT_FOLDER_LOCATION, name) for name in os.listdir(OUTPUT_FOLDER_LOCATION) if os.path.isdir(os.path.join(OUTPUT_FOLDER_LOCATION, name))]

embeddings = get_embeddings_from_llms(project_folders, "codebert")

0it [00:00, ?it/s]


Processing: ExecutorService.submit


Number of files: 26


At file: 5


At file: 10


At file: 15


At file: 20


At file: 25



1it [00:10, 10.72s/it]


Processing: Lock.lock


Number of files: 6


At file: 5



2it [00:13,  5.82s/it]


Processing: Timestamp.compareTo


Number of files: 23


At file: 5



Token indices sequence length is longer than the specified maximum sequence length for this model (526 > 512). Running this sequence through the model will result in indexing errors



At file: 10


At file: 15


At file: 20



3it [00:22,  7.45s/it]


In [124]:
ground_truth_cluster_numbers = {}
for folder in project_folders:
    folder_name = folder.strip().split("/")[-1]
    ground_truth_cluster_numbers[folder_name] = len([name for name in os.listdir(folder) if os.path.isdir(os.path.join(folder, name))])
print(ground_truth_cluster_numbers)

{'ExecutorService.submit': 6, 'Lock.lock': 2, 'Timestamp.compareTo': 3}


In [18]:
from sklearn.cluster import KMeans
from sklearn.cluster import Birch
from sklearn.mixture import GaussianMixture
from sklearn.cluster import AgglomerativeClustering

def cluster_and_compare(embeddings, ground_truth_cluster_number, clustering_algorithm = "Birch"):
    
    if(clustering_algorithm == "Birch"):
        birch_model = Birch(n_clusters = ground_truth_cluster_number)
        clusters_result = birch_model.fit_predict([emb[2] for emb in embeddings])
    elif(clustering_algorithm == "Agglomerative"):
        agglomerative_model = AgglomerativeClustering(n_clusters = ground_truth_cluster_number)
        clusters_result = agglomerative_model.fit_predict([emb[2] for emb in embeddings])
    elif(clustering_algorithm == "KMeans"):
        kmeans_model = KMeans(n_clusters = ground_truth_cluster_number)
        clusters_result = kmeans_model.fit_predict([emb[2] for emb in embeddings])
    elif(clustering_algorithm == "GM"):
        gaussian_model = GaussianMixture(n_components = ground_truth_cluster_number)
        clusters_result = gaussian_model.fit_predict([emb[2] for emb in embeddings])
        
    cluster_count = {}
    cluster_mapping = {}
    for i in range(len(clusters_result)):
        try:
            cluster_count[clusters_result[i]] += 1
            cluster_mapping[clusters_result[i]].append(embeddings[i][0])
        except:
            cluster_count[clusters_result[i]] = 1
            cluster_mapping[clusters_result[i]] = [embeddings[i][0]]
    print("Cluster Counts: ", cluster_count)
    print("======== Cluster Examples =========")
    for cluster_no in cluster_mapping:
        print("{}: {}".format(cluster_no, cluster_mapping[cluster_no][:min(5, len(cluster_mapping[cluster_no]))]))
    print("===================================")
    
    total_count, currect_count, wrong_count = 0, 0, 0
    both_right, both_wrong = 0, 0
    confusion_matrix = {"TP": 0, "TN": 0, "FP": 0, "FN": 0}

    original_one_final_two = []
    original_two_final_one = []
    
    for i in tqdm.tqdm(range(len(embeddings))):
        for j in range(i+1, len(embeddings)):
            total_count += 1
            if (embeddings[i][1] == embeddings[j][1]):
                if (clusters_result[i] == clusters_result[j]):
                    both_right += 1
                    currect_count += 1
                    confusion_matrix["TP"] += 1
                else:
                    #original_one_final_two.append([embeddings[i][0], embeddings[j][0], embeddings[i][1], clusters_result[i], clusters_result[j]])
                    wrong_count += 1
                    confusion_matrix["FN"] += 1
            else:
                if (clusters_result[i] != clusters_result[j]):
                    both_wrong += 1
                    currect_count += 1
                    confusion_matrix["TN"] += 1
                else:
                    #original_two_final_one.append([embeddings[i][0], embeddings[j][0], embeddings[i][1], embeddings[j][1], clusters_result[i]])
                    wrong_count += 1
                    confusion_matrix["FP"] += 1
                    
    print("total_count = {}, currect_count = {}, wrong_count = {}, both_right = {}, both_wrong = {}".format(total_count, currect_count, wrong_count, both_right, both_wrong))
    print(confusion_matrix)
    precision = float(format(confusion_matrix["TP"] / (confusion_matrix["TP"] + confusion_matrix["FP"]), ".3f"))
    recall = float(format(confusion_matrix["TP"] / (confusion_matrix["TP"] + confusion_matrix["FN"]), ".3f"))
    f1_score = float(format(2 * (precision * recall) / (precision + recall), ".3f"))
    accuracy = float(format(currect_count/total_count, ".3f"))
    print("Precision: {}, Recall: {} and F1-Score: {}".format(precision, recall, f1_score))
    print("Accuracy: {}".format(accuracy))
    
    return precision, recall, f1_score, accuracy

In [126]:
total_precision, total_recall, total_f1_score, total_accuracy = 0, 0, 0, 0

for folder in project_folders:
    folder_name = folder.strip().split("/")[-1]
    print("\n\nAnalyzing: ", folder_name)
    precision, recall, f1_score, accuracy = cluster_and_compare(embeddings[folder_name], ground_truth_cluster_numbers[folder_name])
    total_precision += precision
    total_recall += recall
    total_f1_score += f1_score
    total_accuracy += accuracy
    
print("\n\nAverage Precision: {}, Recall: {} and F1-Score: {} and Accuracy: {}".format(total_precision/len(project_folders), total_recall/len(project_folders), total_f1_score/len(project_folders), total_accuracy/len(project_folders)))



Analyzing:  ExecutorService.submit
Cluster Counts:  {4: 8, 2: 6, 1: 4, 3: 4, 0: 3, 5: 1}


100%|██████████| 26/26 [00:00<00:00, 48814.64it/s]


total_count = 325, currect_count = 230, wrong_count = 95, both_right = 12, both_wrong = 218
{'TP': 12, 'TN': 218, 'FP': 46, 'FN': 49}
Precision: 0.207, Recall: 0.197 and F1-Score: 0.202
Accuracy: 0.708


Analyzing:  Lock.lock
Cluster Counts:  {0: 6}


100%|██████████| 6/6 [00:00<00:00, 49932.19it/s]


total_count = 15, currect_count = 10, wrong_count = 5, both_right = 10, both_wrong = 0
{'TP': 10, 'TN': 0, 'FP': 5, 'FN': 0}
Precision: 0.667, Recall: 1.0 and F1-Score: 0.8
Accuracy: 0.667


Analyzing:  Timestamp.compareTo
Cluster Counts:  {1: 12, 2: 8, 0: 3}


100%|██████████| 23/23 [00:00<00:00, 52485.85it/s]

total_count = 253, currect_count = 132, wrong_count = 121, both_right = 47, both_wrong = 85
{'TP': 47, 'TN': 85, 'FP': 50, 'FN': 71}
Precision: 0.485, Recall: 0.398 and F1-Score: 0.437
Accuracy: 0.522


Average Precision: 0.453, Recall: 0.5316666666666667 and F1-Score: 0.4796666666666667 and Accuracy: 0.6323333333333333





# Evaluate Using UnixCoder

In [127]:
OUTPUT_FOLDER_LOCATION = "/home/siddharthsa/cs21mtech12001-Tamal/API-Misuse-Prediction/PDG-gen/Repository/Benchmarks/Code-Kernel-Relabelled/after_preprocessing"
project_folders = [os.path.join(OUTPUT_FOLDER_LOCATION, name) for name in os.listdir(OUTPUT_FOLDER_LOCATION) if os.path.isdir(os.path.join(OUTPUT_FOLDER_LOCATION, name))]

embeddings = get_embeddings_from_llms(project_folders, "unixcoder")

0it [00:00, ?it/s]


Processing: ExecutorService.submit


Number of files: 26


At file: 5


At file: 10


At file: 15


At file: 20


At file: 25



1it [00:08,  8.28s/it]


Processing: Lock.lock


Number of files: 6


At file: 5



2it [00:09,  4.42s/it]


Processing: Timestamp.compareTo


Number of files: 23


At file: 5


At file: 10


At file: 15


At file: 20



3it [00:17,  5.87s/it]


In [128]:
ground_truth_cluster_numbers = {}
for folder in project_folders:
    folder_name = folder.strip().split("/")[-1]
    ground_truth_cluster_numbers[folder_name] = len([name for name in os.listdir(folder) if os.path.isdir(os.path.join(folder, name))])
print(ground_truth_cluster_numbers)

{'ExecutorService.submit': 6, 'Lock.lock': 2, 'Timestamp.compareTo': 3}


In [129]:
total_precision, total_recall, total_f1_score, total_accuracy = 0, 0, 0, 0

for folder in project_folders:
    folder_name = folder.strip().split("/")[-1]
    print("\n\nAnalyzing: ", folder_name)
    precision, recall, f1_score, accuracy = cluster_and_compare(embeddings[folder_name], ground_truth_cluster_numbers[folder_name])
    total_precision += precision
    total_recall += recall
    total_f1_score += f1_score
    total_accuracy += accuracy
    
print("\n\nAverage Precision: {}, Recall: {} and F1-Score: {} and Accuracy: {}".format(total_precision/len(project_folders), total_recall/len(project_folders), total_f1_score/len(project_folders), total_accuracy/len(project_folders)))



Analyzing:  ExecutorService.submit
Cluster Counts:  {4: 6, 0: 9, 2: 3, 3: 2, 1: 4, 5: 2}


100%|██████████| 26/26 [00:00<00:00, 49681.96it/s]


total_count = 325, currect_count = 236, wrong_count = 89, both_right = 17, both_wrong = 219
{'TP': 17, 'TN': 219, 'FP': 45, 'FN': 44}
Precision: 0.274, Recall: 0.279 and F1-Score: 0.276
Accuracy: 0.726


Analyzing:  Lock.lock
Cluster Counts:  {0: 4, 1: 2}


100%|██████████| 6/6 [00:00<00:00, 43690.67it/s]


total_count = 15, currect_count = 6, wrong_count = 9, both_right = 4, both_wrong = 2
{'TP': 4, 'TN': 2, 'FP': 3, 'FN': 6}
Precision: 0.571, Recall: 0.4 and F1-Score: 0.47
Accuracy: 0.4


Analyzing:  Timestamp.compareTo
Cluster Counts:  {0: 9, 1: 5, 2: 9}


100%|██████████| 23/23 [00:00<00:00, 52571.66it/s]

total_count = 253, currect_count = 165, wrong_count = 88, both_right = 56, both_wrong = 109
{'TP': 56, 'TN': 109, 'FP': 26, 'FN': 62}
Precision: 0.683, Recall: 0.475 and F1-Score: 0.56
Accuracy: 0.652


Average Precision: 0.5093333333333333, Recall: 0.38466666666666666 and F1-Score: 0.43533333333333335 and Accuracy: 0.5926666666666667





# Evaluate Using MuGNN

## Prune the PDGs

In [130]:
import os
import sys
import glob
import tqdm

""" ALGORITHM

a. Clean the raw edge info (eg. remove wrongly formatted edges, class edges etc.)
b. Merge same code-lines into a single line/node
c. Consider all nodes that are reachable from the API node
d. Consider all nodes from which API node is reachable
e. Add the all the edges(CD/FD) in the current subgraph

"""

PRUNING_ERROR_COUNT, GOOD_DATA_POINTS, TOTAL_DATA_POINTS = 0, 0, 0
PRUNING_ERROR_COUNT_IN_DATASET, GOOD_DATA_POINTS_IN_DATASET, TOTAL_DATA_POINTS_IN_DATASET = 0, 0, 0
DATASET_STATISTICS = {}

def get_pruned_pdg(pdg_file, output_pdg_file, api_name):
    
    global PRUNING_ERROR_COUNT, GOOD_DATA_POINTS, TOTAL_DATA_POINTS
    
    # all_edges = [bytes(l, 'utf-8').decode('utf-8', 'ignore').strip()
    #              for l in pdg_file.readlines()]
    all_edges = [l.replace("\n", "").replace("\r", "").strip()
                 for l in pdg_file.readlines()]

    # Remove unnecesssary edges("class" edge, wrongly formatted edges etc.)
    all_edges = [edge for edge in all_edges if edge.find(
        "-->") != -1 and edge.count("$$") == 2]
    all_edges = [edge for edge in all_edges if len(edge.split("-->")) == 2 and
                 len(edge.split("-->")[0].split("$$")) == 2 and
                 len(edge.split("-->")[1].split("$$")) == 2]
    all_edges = [edge for edge in all_edges if edge.split("-->")[0].find("Entry") == -1 and
                 edge.split("-->")[0].find("class") == -1]
    #print("ALL EDGES : \n")
    #print(all_edges, "\n")

    # Merge nodes referring to same code-line
    line_mapping, edge_mapping = {}, {}
    for edge in all_edges:
        node_1, node_2 = edge[:edge.rindex("[")].strip().split("-->")
        edge_type = edge[edge.rindex("[") + 1: -1].strip()
        line_numbers = []
        for node in [node_1, node_2]:
            line_number, line_code = node.strip().split("$$")
            line_number, line_code = line_number.strip(), line_code.strip()
            line_numbers.append(line_number)
            if line_number in line_mapping:
                if line_mapping[line_number] != line_code:
                    line_mapping[line_number] = line_code if len(line_code) > len(
                        line_mapping[line_number]) else line_mapping[line_number]
            else:
                line_mapping[line_number] = line_code
        if tuple(line_numbers) in edge_mapping:
            edge_mapping[tuple(line_numbers)] = list(set(edge_mapping[tuple(line_numbers)] + [edge_type]))
        else:
            edge_mapping[tuple(line_numbers)] = [edge_type]

    #print("NODE MAPPING : \n")
    #print(line_mapping, "\n")
    #print("EDGE MAPPING : \n")
    #print(edge_mapping, "\n")

    # Add all the nodes that are reachable to or from the API-NODE
    api_nodes = []
    for line in line_mapping:
        if line_mapping[line].find("." + api_name + "(") != -1:
            api_nodes.append(line)
    #print("API NODES : \n")
    #print(api_nodes, "\n")
    
    # Get vertices that are reachable from the API-NODE
    vertices_from_api_node, previous_vertices = set(api_nodes), set(api_nodes)
    while(True):
        next_vertices = set([])
        for edge in edge_mapping:
            if edge[0] in list(previous_vertices) and edge[1] not in list(vertices_from_api_node):
                next_vertices.add(edge[1])
        if len(next_vertices) == 0:
            break
        else:
            vertices_from_api_node = vertices_from_api_node.union(next_vertices)
            previous_vertices = next_vertices
    
    # Get vertices from which the API-NODE is reachable
    vertices_to_api_node, next_vertices = set(api_nodes), set(api_nodes)
    while(True):
        previous_vertices = set([])
        for edge in edge_mapping:
            if edge[1] in list(next_vertices) and edge[0] not in list(vertices_to_api_node):
                previous_vertices.add(edge[0])
        if len(previous_vertices) == 0:
            break
        else:
            vertices_to_api_node = vertices_to_api_node.union(previous_vertices)
            next_vertices = previous_vertices
    
    # All nodes in the final sub-graph
    subgraph_vertices = list(vertices_from_api_node.union(vertices_to_api_node))

    # Add all the edges(CD/FD) between the subgraph vertices
    sub_graph_edges = {}
    for edge in edge_mapping:
        if edge[0] in subgraph_vertices and edge[1] in subgraph_vertices:
            if edge in sub_graph_edges:
                sub_graph_edges[edge] = list(set(sub_graph_edges[edge] + edge_mapping[edge]))
            else:
                sub_graph_edges[edge] = edge_mapping[edge]
    #print("AFTER ADDING REST OF THE EDGES : \n")
    #print(sub_graph_edges, "\n")

    # Remove self-loops from subgraph
    sub_graph_edges_temp = {}
    for edge in sub_graph_edges:
        if edge[0] != edge[1]:
            sub_graph_edges_temp[edge] = sub_graph_edges[edge]
    sub_graph_edges = sub_graph_edges_temp
    #print("AFTER REMOVING SELF-LOOPS : \n")
    #print(sub_graph_edges, "\n")

    # Save the pruned PDG
    edge_data_list = []
    for edge in sub_graph_edges:
        for edge_type in sub_graph_edges[edge]:
            edge_data = edge[0].strip() + " $$ " + \
                        line_mapping[edge[0]].strip() + " --> " + \
                        edge[1].strip() + " $$ " + \
                        line_mapping[edge[1]].strip() + " [" + \
                        edge_type.strip() + "]\n"
            edge_data_list.append(edge_data)
    #print("FINAL EDGE LIST: \n")
    #print(edge_data_list, "\n")
    if len(edge_data_list) >= 3:
        GOOD_DATA_POINTS += 1
        
    output_pdg_file.writelines(edge_data_list)
    if len(edge_data_list) > 0:
        TOTAL_DATA_POINTS += 1

    return output_pdg_file, len(edge_data_list)

In [131]:
PDG_FOLDER_LOCATION = "/home/siddharthsa/cs21mtech12001-Tamal/API-Misuse-Prediction/PDG-gen/Repository/Benchmarks/Code-Kernel-Relabelled/before_pruning"
OUTPUT_FOLDER_LOCATION = "/home/siddharthsa/cs21mtech12001-Tamal/API-Misuse-Prediction/PDG-gen/Repository/Benchmarks/Code-Kernel-Relabelled/after_pruning"
pdg_folders_list = glob.glob(PDG_FOLDER_LOCATION + "/*/")
print("\nNumber of total APIs: {}\n".format(len(pdg_folders_list)))
for folder in tqdm.tqdm(pdg_folders_list):
    print("\nProcessing: {}\n".format(folder))
    api_name = folder[folder.rindex("/", 0, len(folder) - 1) + 1 : -1]
    pdg_files_list = glob.glob(os.path.join(folder, '*.txt'))
    OUTPUT_API_FOLDER_LOCATION = OUTPUT_FOLDER_LOCATION + "/" + api_name
    if not os.path.exists(OUTPUT_API_FOLDER_LOCATION):
        os.makedirs(OUTPUT_API_FOLDER_LOCATION)
    for pdg_file_location in pdg_files_list:
        pdg_file = open(pdg_file_location, 'r')
        output_file_location = OUTPUT_API_FOLDER_LOCATION + "/" + pdg_file_location[pdg_file_location.rindex("/")+1:]
        output_pdg_file = open(output_file_location, "+w")
        try:
            output_pdg_file, no_of_edges = get_pruned_pdg(pdg_file, output_pdg_file, api_name[api_name.rindex(".") + 1 :].strip())
        except Exception as e:
            PRUNING_ERROR_COUNT += 1
            print("\nERROR WHILE PRUNING PDG\n")
            print("\nFile: {}\n".format(pdg_file_location))
            print("\nERROR: {}\n".format(e))
            pdg_file.close()
            output_pdg_file.close()
            os.remove(output_file_location)
        else:
            output_pdg_file.close()
            if no_of_edges == 0:
                os.remove(output_file_location)
            pdg_file.close()

    print("\nGOOD PDG DATA POINTS: {}\n".format(GOOD_DATA_POINTS))
    print("\nTOTAL PDG DATA POINTS: {}\n".format(TOTAL_DATA_POINTS))
    print("\nTOTAL PRUNING ERROR: {}\n".format(PRUNING_ERROR_COUNT))
    print("\n=================================================================\n")
    PRUNING_ERROR_COUNT_IN_DATASET += PRUNING_ERROR_COUNT
    GOOD_DATA_POINTS_IN_DATASET += GOOD_DATA_POINTS
    TOTAL_DATA_POINTS_IN_DATASET += TOTAL_DATA_POINTS
    DATASET_STATISTICS[api_name] = [TOTAL_DATA_POINTS, GOOD_DATA_POINTS, PRUNING_ERROR_COUNT]
    PRUNING_ERROR_COUNT, GOOD_DATA_POINTS, TOTAL_DATA_POINTS = 0, 0, 0
    
print("\nTOTAL GOOD PDG DATA POINTS IN DATASET: {}\n".format(GOOD_DATA_POINTS_IN_DATASET))
print("\nTOTAL PDG DATA POINTS IN DATASET: {}\n".format(TOTAL_DATA_POINTS_IN_DATASET))
print("\nTOTAL PRUNING ERROR IN DATASET: {}\n".format(PRUNING_ERROR_COUNT_IN_DATASET))
print("\nDATASET STATISTICS: {}\n".format(DATASET_STATISTICS))


Number of total APIs: 3



100%|██████████| 3/3 [00:00<00:00, 119.99it/s]


Processing: /home/siddharthsa/cs21mtech12001-Tamal/API-Misuse-Prediction/PDG-gen/Repository/Benchmarks/Code-Kernel-Relabelled/before_pruning/ExecutorService.submit/


GOOD PDG DATA POINTS: 7


TOTAL PDG DATA POINTS: 24


TOTAL PRUNING ERROR: 0




Processing: /home/siddharthsa/cs21mtech12001-Tamal/API-Misuse-Prediction/PDG-gen/Repository/Benchmarks/Code-Kernel-Relabelled/before_pruning/Lock.lock/


GOOD PDG DATA POINTS: 6


TOTAL PDG DATA POINTS: 6


TOTAL PRUNING ERROR: 0




Processing: /home/siddharthsa/cs21mtech12001-Tamal/API-Misuse-Prediction/PDG-gen/Repository/Benchmarks/Code-Kernel-Relabelled/before_pruning/Timestamp.compareTo/


GOOD PDG DATA POINTS: 19


TOTAL PDG DATA POINTS: 23


TOTAL PRUNING ERROR: 0




TOTAL GOOD PDG DATA POINTS IN DATASET: 32


TOTAL PDG DATA POINTS IN DATASET: 53


TOTAL PRUNING ERROR IN DATASET: 0


DATASET STATISTICS: {'ExecutorService.submit': [24, 7, 0], 'Lock.lock': [6, 6, 0], 'Timestamp.compareTo': [23, 19, 0]}






## Get the Graph Data

In [3]:
def get_nodes_edges(inTextFile, add_reverse_edges = False):
  # FD = 0, CD = 1
  # to support the hetero data object as suggested by the documentation 
  nodes_dict = {}
  edge_indices_CD = []
  edge_indices_FD = []

  #to support the Data object as used by the Entities dat object as used in RGAT source code
  edge_indices = []
  edge_type = []
  
  # nodes_dict is an index_map
  node_count=0
  with open(inTextFile) as fp:
    
    file_name = inTextFile.split("/")[-1].strip()

    Lines = fp.readlines()
    for line in Lines:

      N = line.split('-->')
      N[0], N[1] = N[0].strip(), N[1].strip()
      
      #t1 = N[0].split('$$')   
      src = N[0].strip()   
      if src not in nodes_dict.keys():
        nodes_dict[src] = node_count
        node_count+=1
        
      #t2 = N[1].split('$$')
      right_idx = N[1].rfind('[')
      dst = N[1][:right_idx].strip()
      if dst not in nodes_dict.keys():
        nodes_dict[dst] = node_count
        node_count+=1

      x = N[1].strip()[right_idx + 1 : -1].strip()
      if(x == 'FD'):
        y=0
        edge_type.append(y)
        edge_indices.append([nodes_dict[src], nodes_dict[dst]])
        if add_reverse_edges:
          edge_type.append(y)
          edge_indices.append([nodes_dict[dst], nodes_dict[src]])
        edge_indices_FD.append([nodes_dict[src], nodes_dict[dst]])
      else: 
        y=1
        edge_type.append(y)
        edge_indices.append([nodes_dict[src], nodes_dict[dst]])
        if add_reverse_edges:
          edge_type.append(y)
          edge_indices.append([nodes_dict[dst], nodes_dict[src]])
        edge_indices_CD.append([nodes_dict[src], nodes_dict[dst]])
     
  return nodes_dict, edge_indices_FD, edge_indices_CD, edge_indices, edge_type, file_name

In [4]:
import gc

#Set GPU
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2,3"
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')

# Initialize the models
codebert_tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
codebert_model = AutoModel.from_pretrained("microsoft/codebert-base")
codebert_model = codebert_model.to(device)

def get_node_embedding_from_codebert(nodes):
    list_of_embeddings = []
    for code_line in nodes.keys():
        code_line = code_line.split("$$")[1].strip()
        code_tokens = codebert_tokenizer.tokenize(code_line, truncation=True, max_length=510)
        tokens = [codebert_tokenizer.cls_token]+code_tokens+[codebert_tokenizer.eos_token]
        tokens_ids = torch.tensor(codebert_tokenizer.convert_tokens_to_ids(tokens))
        tokens_ids = tokens_ids.to(device)
        context_embeddings = codebert_model(tokens_ids[None,:])
        cls_token_embedding = context_embeddings.last_hidden_state[0,0,:]
        list_of_embeddings.append(cls_token_embedding.to("cpu"))
        del tokens_ids
        del context_embeddings
        del cls_token_embedding
    gc.collect()
    torch.cuda.empty_cache()
    return torch.stack(list_of_embeddings)

In [5]:
def create_graph_dataset(folders):
  dataset =[]
  for label, folder in tqdm.tqdm(enumerate(folders)):
    print("\nProcessing: {}\n".format(folder))
    files = glob.glob(os.path.join(folder, '*.txt'))
    print("\nNumber of files: {}\n".format(len(files)))
    count = 0
    for file in files:

      if(count % 5 == 0):
          print("\nAt file: {}\n".format(count))
                        
      try:
          nodes_dict, edge_indices_FD, edge_indices_CD, edge_indices, edge_type, file_name = get_nodes_edges(file, add_reverse_edges = True)
      except Exception as e:
          print("\nError: ", e)
          continue
                    
      if(len(nodes_dict) == 0):
          print("\nNo Data: ", file)
          continue
      #print(nodes_dict, edge_indices_CD, edge_indices_FD, edge_type)

      # Node feature matrix with shape [num_nodes, num_node_features]=(N, 768).
      try:
          with torch.no_grad():
            CodeEmbedding = get_node_embedding_from_codebert(nodes_dict)
      except Exception as e :
          print("\nError: ", e)
          print(nodes_dict)
          continue
      #print(CodeEmbedding.shape)

      # FIXING DATA FOTMATS AND SHAPE
      x = torch.tensor(CodeEmbedding)
      # print(x.shape)
  
      # data.y: Target to train against (may have arbitrary shape),
      # graph-level targets of shape [1, *]
      label = 1
      y = torch.tensor([label], dtype=torch.long)
      #print(type(y))

      # edge_index (LongTensor, optional) – Graph connectivity in COO format with shape [2, num_edges]
      edge_index_CD = torch.tensor(edge_indices_CD, dtype=torch.long).t().contiguous()
      edge_index_FD = torch.tensor(edge_indices_FD, dtype=torch.long).t().contiguous()
      edge_index = torch.tensor(edge_indices, dtype=torch.long).t().contiguous()
      edge_attr = torch.tensor(edge_type, dtype=torch.long).t().contiguous()
      #print(edge_index_CD, edge_index_FD, edge_index, edge_type)
  
      data = Data(edge_index=edge_index, edge_attr=edge_attr, x=x)
      data.id = torch.tensor([count])
      data.y = y
      # data.num_nodes = len(nodes_dict)
      data.api = file_name
      dataset.append(data)
      count += 1
    
  return dataset

In [6]:
OUTPUT_FOLDER_LOCATION = "/home/siddharthsa/cs21mtech12001-Tamal/API-Misuse-Prediction/PDG-gen/Repository/Benchmarks/Code-Kernel-Relabelled/after_pruning"
project_folders = [os.path.join(OUTPUT_FOLDER_LOCATION, name) for name in os.listdir(OUTPUT_FOLDER_LOCATION) if os.path.isdir(os.path.join(OUTPUT_FOLDER_LOCATION, name))]
print(project_folders)

gnn_dataset = create_graph_dataset(project_folders)
print("\nLength of the dataset: ", len(gnn_dataset))

['/home/siddharthsa/cs21mtech12001-Tamal/API-Misuse-Prediction/PDG-gen/Repository/Benchmarks/Code-Kernel-Relabelled/after_pruning/ExecutorService.submit', '/home/siddharthsa/cs21mtech12001-Tamal/API-Misuse-Prediction/PDG-gen/Repository/Benchmarks/Code-Kernel-Relabelled/after_pruning/Lock.lock', '/home/siddharthsa/cs21mtech12001-Tamal/API-Misuse-Prediction/PDG-gen/Repository/Benchmarks/Code-Kernel-Relabelled/after_pruning/Timestamp.compareTo']


0it [00:00, ?it/s]


Processing: /home/siddharthsa/cs21mtech12001-Tamal/API-Misuse-Prediction/PDG-gen/Repository/Benchmarks/Code-Kernel-Relabelled/after_pruning/ExecutorService.submit


Number of files: 24


At file: 0


At file: 5


At file: 10


At file: 15


At file: 20



1it [00:05,  5.87s/it]


Processing: /home/siddharthsa/cs21mtech12001-Tamal/API-Misuse-Prediction/PDG-gen/Repository/Benchmarks/Code-Kernel-Relabelled/after_pruning/Lock.lock


Number of files: 6


At file: 0


At file: 5



2it [00:07,  3.28s/it]


Processing: /home/siddharthsa/cs21mtech12001-Tamal/API-Misuse-Prediction/PDG-gen/Repository/Benchmarks/Code-Kernel-Relabelled/after_pruning/Timestamp.compareTo


Number of files: 23


At file: 0


At file: 5


At file: 10


At file: 15


At file: 20



3it [00:19,  6.45s/it]


Length of the dataset:  53





## Build/Load The Model

### Contex-prediction Model

In [18]:
from model import GNN, GNN_graphpred

#set up model
num_layer = 3
emb_dim = 768
gnn_type = "gcn"
num_tasks = 1
JK = "last"
dropout_ratio = 0.5
graph_pooling = "mean"
input_model_file = "/home/siddharthsa/cs21mtech12001-Tamal/API-Misuse-Prediction/PDG-gen/Repository/Graph-Models/MuGNN/output/saved_models/gcn_1_3_5_e100_model_ck_code2seq.pth"

gnn_graphpred_model = GNN_graphpred(num_layer, emb_dim, num_tasks, JK = JK, drop_ratio = dropout_ratio, graph_pooling = graph_pooling, gnn_type = gnn_type)
gnn_graphpred_model.from_pretrained(input_model_file)

gnn_model = GNN(num_layer, emb_dim, JK, drop_ratio = dropout_ratio, gnn_type = gnn_type)
gnn_model.load_state_dict(torch.load(input_model_file))

print("Loaded the model!!")

Loaded the model!!


In [19]:
# Count the total number of parameters
total_params = sum(p.numel() for p in gnn_model.parameters())

# Count the number of trainable parameters
trainable_params = sum(p.numel() for p in gnn_model.parameters() if p.requires_grad)

print("Total parameters:", total_params)
print("Trainable parameters:", trainable_params)

Total parameters: 1882368
Trainable parameters: 1882368


### Clone-detection Model

In [7]:
from model_ng import CustomGCN


input_model_file = "/home/siddharthsa/cs21mtech12001-Tamal/API-Misuse-Prediction/PDG-gen/Repository/Graph-Models/MuGNN/output/saved_models/clone_detection_GCN_L3_e10_850k_model.pth"
gnn_model = CustomGCN(num_node_features= 768)
gnn_model.load_state_dict(torch.load(input_model_file))

print("Loaded the model!!")

Loaded the model!!


In [8]:
# Count the total number of parameters
total_params = sum(p.numel() for p in gnn_model.parameters())

# Count the number of trainable parameters
trainable_params = sum(p.numel() for p in gnn_model.parameters() if p.requires_grad)

print("Total parameters:", total_params)
print("Trainable parameters:", trainable_params)

Total parameters: 85444
Trainable parameters: 85444


## Test The Model

In [9]:
gnn_embeddings = {}
model_name = "clone-detection" # "context-prediction" or "clone-detection"
for i in range(len(gnn_dataset)):
    if model_name == "clone-detection":
        graph_representation = gnn_model(gnn_dataset[i].x, gnn_dataset[i].edge_index, batch = torch.tensor([0]*(len(gnn_dataset[i].x))))[0]
    else:
        node_representation = gnn_model(gnn_dataset[i].x, gnn_dataset[i].edge_index, gnn_dataset[i].edge_attr)
        graph_representation = global_mean_pool(x = node_representation, batch = torch.tensor([0]*(len(node_representation))))[0]
    gnn_dataset[i].embedding = graph_representation.detach().numpy()
    sample_name = gnn_dataset[i].api.split("_")[1].strip()
    api_name = gnn_dataset[i].api.split("_")[2].strip()
    try:
        gnn_embeddings[api_name].append([gnn_dataset[i].api, sample_name, gnn_dataset[i].embedding])
    except:
        gnn_embeddings[api_name] = [[gnn_dataset[i].api, sample_name, gnn_dataset[i].embedding]]

In [10]:
ground_truth_cluster_numbers = {}
for folder in project_folders:
    folder_name = folder.strip().split("/")[-1]
    sample_set = set([name.split("_")[1] for name in os.listdir(folder)])
    ground_truth_cluster_numbers[folder_name] = len(sample_set)
print(ground_truth_cluster_numbers)

{'ExecutorService.submit': 6, 'Lock.lock': 2, 'Timestamp.compareTo': 3}


In [23]:
total_precision, total_recall, total_f1_score, total_accuracy = 0, 0, 0, 0

for folder in project_folders:
    folder_name = folder.strip().split("/")[-1]
    print("\nAnalyzing: ", folder_name)
    precision, recall, f1_score, accuracy = cluster_and_compare(gnn_embeddings[folder_name], ground_truth_cluster_numbers[folder_name], "Birch")
    total_precision += precision
    total_recall += recall
    total_f1_score += f1_score
    total_accuracy += accuracy
    
print("\nAverage Precision: {}, Recall: {} and F1-Score: {} and Accuracy: {}".format(total_precision/len(project_folders), total_recall/len(project_folders), total_f1_score/len(project_folders), total_accuracy/len(project_folders)))


Analyzing:  ExecutorService.submit
Cluster Counts:  {0: 16, 5: 2, 1: 2, 2: 2, 3: 1, 4: 1}
0: ['0_sample-0_ExecutorService.submit_graph_dump.txt', '0_sample-3_ExecutorService.submit_graph_dump.txt', '0_sample-5_ExecutorService.submit_graph_dump.txt', '10_sample-0_ExecutorService.submit_graph_dump.txt', '1_sample-1_ExecutorService.submit_graph_dump.txt']
5: ['11_sample-0_ExecutorService.submit_graph_dump.txt', '4_sample-1_ExecutorService.submit_graph_dump.txt']
1: ['2_sample-3_ExecutorService.submit_graph_dump.txt', '5_sample-0_ExecutorService.submit_graph_dump.txt']
2: ['3_sample-1_ExecutorService.submit_graph_dump.txt', '5_sample-1_ExecutorService.submit_graph_dump.txt']
3: ['4_sample-3_ExecutorService.submit_graph_dump.txt']
4: ['8_sample-0_ExecutorService.submit_graph_dump.txt']


100%|██████████| 24/24 [00:00<00:00, 108123.84it/s]


total_count = 276, currect_count = 146, wrong_count = 130, both_right = 21, both_wrong = 125
{'TP': 21, 'TN': 125, 'FP': 102, 'FN': 28}
Precision: 0.171, Recall: 0.429 and F1-Score: 0.245
Accuracy: 0.529

Analyzing:  Lock.lock
Cluster Counts:  {1: 1, 0: 5}
1: ['0_sample-0_Lock.lock_graph_dump.txt']
0: ['1_sample-0_Lock.lock_graph_dump.txt', '2_sample-0_Lock.lock_graph_dump.txt', '3_sample-0_Lock.lock_graph_dump.txt', '4_sample-1_Lock.lock_graph_dump.txt', '5_sample-0_Lock.lock_graph_dump.txt']


100%|██████████| 6/6 [00:00<00:00, 60349.70it/s]


total_count = 15, currect_count = 7, wrong_count = 8, both_right = 6, both_wrong = 1
{'TP': 6, 'TN': 1, 'FP': 4, 'FN': 4}
Precision: 0.6, Recall: 0.6 and F1-Score: 0.6
Accuracy: 0.467

Analyzing:  Timestamp.compareTo
Cluster Counts:  {1: 1, 0: 21, 2: 1}
1: ['0_sample-0_Timestamp.compareTo_graph_dump.txt']
0: ['0_sample-1_Timestamp.compareTo_graph_dump.txt', '10_sample-2_Timestamp.compareTo_graph_dump.txt', '11_sample-1_Timestamp.compareTo_graph_dump.txt', '12_sample-1_Timestamp.compareTo_graph_dump.txt', '13_sample-1_Timestamp.compareTo_graph_dump.txt']
2: ['1_sample-1_Timestamp.compareTo_graph_dump.txt']


100%|██████████| 23/23 [00:00<00:00, 131429.14it/s]

total_count = 253, currect_count = 129, wrong_count = 124, both_right = 102, both_wrong = 27
{'TP': 102, 'TN': 27, 'FP': 108, 'FN': 16}
Precision: 0.486, Recall: 0.864 and F1-Score: 0.622
Accuracy: 0.51

Average Precision: 0.41900000000000004, Recall: 0.6309999999999999 and F1-Score: 0.48900000000000005 and Accuracy: 0.502



