In [None]:
import textwrap as tr
from typing import List, Optional

import matplotlib.pyplot as plt
import plotly.express as px
from scipy import spatial
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import average_precision_score, precision_recall_curve

from openai import OpenAI
import numpy as np
import pandas as pd


# API key sent with every request; it is blank here and has to be filled in
# before any of the embedding helpers below can succeed.
api_key = ''
# Shared synchronous client used by the embedding helpers. Requests go to the
# base_url given here, and max_retries=5 is handed to the SDK.
client = OpenAI(max_retries=5,api_key = api_key, base_url ="https://api.openai-hub.com/v1")

def get_embedding(text: str, model="text-embedding-3-small", **kwargs) -> List[float]:
    """Return the embedding vector for a single piece of text.

    Newlines in ``text`` are replaced with spaces before the request is made.
    Any extra keyword arguments are forwarded to ``client.embeddings.create``.
    """
    single_line = text.replace("\n", " ")
    result = client.embeddings.create(input=[single_line], model=model, **kwargs)
    first_item = result.data[0]
    return first_item.embedding


async def aget_embedding(
    text: str, model="text-embedding-3-small", **kwargs
) -> List[float]:
    """Return the embedding vector for a single piece of text (coroutine).

    Newlines in ``text`` are replaced with spaces before the request is made.
    Any extra keyword arguments are forwarded to ``client.embeddings.create``.

    The result of the ``create`` call is awaited only when it is awaitable, so
    the coroutine works whether the module-level ``client`` is a synchronous
    or an asynchronous client. With a synchronous client the request blocks
    the event loop while it runs.
    """
    import inspect

    text = text.replace("\n", " ")

    response = client.embeddings.create(input=[text], model=model, **kwargs)
    if inspect.isawaitable(response):
        response = await response

    # Read the result through attributes, like get_embedding does; the SDK
    # response object is not indexable with string keys.
    return response.data[0].embedding


def get_embeddings(
    list_of_text: List[str], model="text-embedding-3-small", **kwargs
) -> List[List[float]]:
    """Return one embedding vector per input text, using a single request.

    The batch may hold at most 2048 texts (checked with an assertion).
    Newlines are replaced with spaces in every text, and extra keyword
    arguments are forwarded to ``client.embeddings.create``.
    """
    batch_size = len(list_of_text)
    assert batch_size <= 2048, "The batch size should not be larger than 2048."

    # Newlines can negatively affect performance, so flatten each text first.
    flattened = []
    for item in list_of_text:
        flattened.append(item.replace("\n", " "))

    response = client.embeddings.create(input=flattened, model=model, **kwargs)
    return [entry.embedding for entry in response.data]


async def aget_embeddings(
    list_of_text: List[str], model="text-embedding-3-small", **kwargs
) -> List[List[float]]:
    """Return one embedding vector per input text (coroutine).

    The batch may hold at most 2048 texts (checked with an assertion).
    Newlines are replaced with spaces in every text, and extra keyword
    arguments are forwarded to ``client.embeddings.create``.

    The result of the ``create`` call is awaited only when it is awaitable, so
    the coroutine works whether the module-level ``client`` is a synchronous
    or an asynchronous client. With a synchronous client the request blocks
    the event loop while it runs.
    """
    import inspect

    assert len(list_of_text) <= 2048, "The batch size should not be larger than 2048."

    # replace newlines, which can negatively affect performance.
    list_of_text = [text.replace("\n", " ") for text in list_of_text]

    response = client.embeddings.create(input=list_of_text, model=model, **kwargs)
    if inspect.isawaitable(response):
        response = await response
    return [d.embedding for d in response.data]


def cosine_similarity(a, b):
    """Return the dot product of ``a`` and ``b`` divided by the product of their norms."""
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_product / norm_product


def plot_multiclass_precision_recall(
    y_score, y_true_untransformed, class_list, classifier_name
):
    """
    Draw precision-recall curves for a multiclass classifier on a new figure.

    The figure shows the micro-averaged curve, one curve per class and four
    reference iso-F1 contours. The micro-averaged average precision is also
    printed.

    Adapted from https://scikit-learn.org/stable/auto_examples/model_selection/plot_precision_recall.html
    """
    n_classes = len(class_list)

    # One boolean indicator column per class, in class_list order.
    indicator_columns = []
    for idx in range(n_classes):
        indicator_columns.append(y_true_untransformed == class_list[idx])
    y_true = pd.concat(indicator_columns, axis=1).values

    # Curve and average precision for every individual class.
    precision, recall, average_precision = {}, {}, {}
    for idx in range(n_classes):
        truth_column = y_true[:, idx]
        score_column = y_score[:, idx]
        precision[idx], recall[idx], _ = precision_recall_curve(
            truth_column, score_column
        )
        average_precision[idx] = average_precision_score(truth_column, score_column)

    # Micro-average: all classes are scored jointly.
    precision_micro, recall_micro, _ = precision_recall_curve(
        y_true.ravel(), y_score.ravel()
    )
    average_precision_micro = average_precision_score(y_true, y_score, average="micro")
    summary = " - Average precision score over all classes: {0:0.2f}".format(
        average_precision_micro
    )
    print(str(classifier_name) + summary)

    plt.figure(figsize=(9, 10))
    lines = []
    labels = []

    # Reference iso-F1 contours.
    recall_grid = np.linspace(0.01, 1)
    for f_score in np.linspace(0.2, 0.8, num=4):
        iso_precision = f_score * recall_grid / (2 * recall_grid - f_score)
        non_negative = iso_precision >= 0
        (handle,) = plt.plot(
            recall_grid[non_negative],
            iso_precision[non_negative],
            color="gray",
            alpha=0.2,
        )
        plt.annotate(
            "f1={0:0.1f}".format(f_score), xy=(0.9, iso_precision[45] + 0.02)
        )

    # Only the last contour gets a legend entry; it stands for all of them.
    lines.append(handle)
    labels.append("iso-f1 curves")

    (handle,) = plt.plot(recall_micro, precision_micro, color="gold", lw=2)
    lines.append(handle)
    labels.append(
        "average Precision-recall (auprc = {0:0.2f})".format(average_precision_micro)
    )

    for idx in range(n_classes):
        (handle,) = plt.plot(recall[idx], precision[idx], lw=2)
        lines.append(handle)
        labels.append(
            "Precision-recall for class `{0}` (auprc = {1:0.2f})".format(
                class_list[idx], average_precision[idx]
            )
        )

    plt.gcf().subplots_adjust(bottom=0.25)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title(f"{classifier_name}: Precision-Recall curve for each class")
    plt.legend(lines, labels)


def distances_from_embeddings(
    query_embedding: List[float],
    embeddings: List[List[float]],
    distance_metric="cosine",
) -> List[List]:
    """Return the distance from the query embedding to each listed embedding.

    ``distance_metric`` is one of "cosine", "L1", "L2" or "Linf". The result
    has one distance per entry of ``embeddings``, in the same order.
    """
    metric_functions = {
        "cosine": spatial.distance.cosine,
        "L1": spatial.distance.cityblock,
        "L2": spatial.distance.euclidean,
        "Linf": spatial.distance.chebyshev,
    }
    results = []
    for candidate in embeddings:
        # The metric is looked up per item, so an unknown name only raises
        # KeyError when there is at least one embedding to measure.
        measure = metric_functions[distance_metric]
        results.append(measure(query_embedding, candidate))
    return results


def indices_of_nearest_neighbors_from_distances(distances) -> np.ndarray:
    """Return the indices that sort ``distances`` in ascending order (nearest first)."""
    ascending_order = np.argsort(distances)
    return ascending_order


def pca_components_from_embeddings(
    embeddings: List[List[float]], n_components=2
) -> np.ndarray:
    """Fit a PCA on the embeddings and return them projected onto ``n_components`` components."""
    embedding_matrix = np.array(embeddings)
    reducer = PCA(n_components=n_components)
    return reducer.fit_transform(embedding_matrix)


def tsne_components_from_embeddings(
    embeddings: List[List[float]], n_components=2, **kwargs
) -> np.ndarray:
    """Fit t-SNE on the embeddings and return the resulting components.

    Extra keyword arguments go to ``TSNE``. When the caller does not supply
    them, ``init`` defaults to "pca" and ``learning_rate`` to "auto".
    """
    # Fill in the preferred defaults without overriding caller choices.
    kwargs.setdefault("init", "pca")
    kwargs.setdefault("learning_rate", "auto")
    embedding_matrix = np.array(embeddings)
    reducer = TSNE(n_components=n_components, **kwargs)
    return reducer.fit_transform(embedding_matrix)


def chart_from_components(
    components: np.ndarray,
    labels: Optional[List[str]] = None,
    strings: Optional[List[str]] = None,
    x_title="Component 0",
    y_title="Component 1",
    mark_size=5,
    **kwargs,
):
    """Return an interactive 2D scatter chart of embedding components.

    Columns 0 and 1 of ``components`` become the x and y coordinates. When
    ``labels`` is given it drives both colour and symbol; when ``strings`` is
    given each string is wrapped at 30 characters and shown as hover text.
    Extra keyword arguments are passed to ``px.scatter``.
    """
    blanks = ["" for _ in components]
    if strings:
        hover_text = ["<br>".join(tr.wrap(entry, width=30)) for entry in strings]
    else:
        hover_text = blanks

    frame = pd.DataFrame(
        {
            x_title: components[:, 0],
            y_title: components[:, 1],
            "label": labels if labels else blanks,
            "string": hover_text,
        }
    )

    label_column = "label" if labels else None
    hover_columns = ["string"] if strings else None
    figure = px.scatter(
        frame,
        x=x_title,
        y=y_title,
        color=label_column,
        symbol=label_column,
        hover_data=hover_columns,
        **kwargs,
    )
    return figure.update_traces(marker=dict(size=mark_size))


def chart_from_components_3D(
    components: np.ndarray,
    labels: Optional[List[str]] = None,
    strings: Optional[List[str]] = None,
    x_title: str = "Component 0",
    y_title: str = "Component 1",
    z_title: str = "Compontent 2",
    mark_size: int = 5,
    **kwargs,
):
    """Return an interactive 3D scatter chart of embedding components.

    Columns 0, 1 and 2 of ``components`` become the x, y and z coordinates.
    When ``labels`` is given it drives both colour and symbol; when
    ``strings`` is given each string is wrapped at 30 characters and shown as
    hover text. Extra keyword arguments are passed to ``px.scatter_3d``.
    """
    blanks = ["" for _ in components]
    if strings:
        hover_text = ["<br>".join(tr.wrap(entry, width=30)) for entry in strings]
    else:
        hover_text = blanks

    frame = pd.DataFrame(
        {
            x_title: components[:, 0],
            y_title: components[:, 1],
            z_title: components[:, 2],
            "label": labels if labels else blanks,
            "string": hover_text,
        }
    )

    label_column = "label" if labels else None
    hover_columns = ["string"] if strings else None
    figure = px.scatter_3d(
        frame,
        x=x_title,
        y=y_title,
        z=z_title,
        color=label_column,
        symbol=label_column,
        hover_data=hover_columns,
        **kwargs,
    )
    return figure.update_traces(marker=dict(size=mark_size))


In [None]:
import pandas as pd
import tiktoken
import subprocess
import json
import pandas as pd
# # from openai._utils import embeddings_utils
# from .utils.embeddings_utils import get_embedding

# Model name passed to get_embedding when the embeddings are generated.
embedding_model = "text-embedding-3-large"
# Name of the tiktoken encoding used to count tokens before embedding.
embedding_encoding = "cl100k_base"
# Rows with more tokens than this are filtered out before embedding.
max_tokens = 8000  # 8191 is quoted as the limit for text-embedding-3-small — TODO confirm it also holds for the model selected above

In [None]:
# Path of the Excel sheet that lists the source files to process. It is read
# with pd.read_excel and must provide a 'SourceCode_Path' column.
# Enable exactly one of the assignments below.
# NPM
# excel_file_path = 'NPM_sourcecode_address.xlsx'

# PyPI
# excel_file_path = 'PyPI_sourcecode_address.xlsx'

# RubyGems (the active option; the path is blank and has to be filled in)
excel_file_path = ''



In [None]:

import ast
import esprima
import os
from json.decoder import JSONDecodeError


def extract_api_calls(code):
    """Return the names of the functions and methods called in Python source.

    The source is parsed with ``ast`` and every call node is inspected:

    * ``obj.method(...)`` where ``obj`` is a plain name is recorded as
      ``"obj.method"``;
    * ``func(...)`` is recorded as ``"func"``;
    * any other call target (for example ``a.b.c()``) is skipped.

    Names are listed in the order ``ast.walk`` visits the call nodes. If the
    source cannot be parsed, the error is printed and an empty list is
    returned.
    """
    api_calls = []

    try:
        tree = ast.parse(code)
    except (SyntaxError, ValueError) as e:
        # ValueError covers source containing null bytes, which some Python
        # versions report with that type instead of SyntaxError.
        print(f"Error while parsing code: {e}")
        return api_calls

    for node in ast.walk(tree):
        if not isinstance(node, ast.Call):
            continue
        target = node.func
        if isinstance(target, ast.Attribute):
            # Method call of the form obj.method(); only plain-name receivers.
            if isinstance(target.value, ast.Name):
                api_calls.append(f"{target.value.id}.{target.attr}")
        elif isinstance(target, ast.Name):
            # Ordinary function call.
            api_calls.append(target.id)

    return api_calls


def extract_js_api_calls(js_code):
    """Return the names of calls made by top-level statements of a JavaScript script.

    Only the statements directly in the program body are examined, and only
    those that are expression statements consisting of a call. A call to an
    identifier is recorded by its name; a call through a member expression
    with an identifier property is recorded as ``"<object name>.<property>"``.
    A parse error is printed and whatever was collected so far is returned.
    """
    found = []

    try:
        # Parse the JavaScript source into an AST.
        program = esprima.parseScript(js_code)

        for statement in program.body:
            if statement.type != 'ExpressionStatement':
                continue
            expression = statement.expression
            if expression.type != 'CallExpression':
                continue

            target = expression.callee
            if target.type == 'Identifier':
                name = target.name
            elif (
                target.type == 'MemberExpression'
                and target.property.type == 'Identifier'
            ):
                # NOTE(review): the object's name is stringified as-is, so a
                # receiver without a name would show up as "None" — confirm.
                name = str(target.object.name) + '.' + str(target.property.name)
            else:
                name = ""

            if name:
                found.append(name)

    except esprima.Error as e:
        print(f"Error while parsing JavaScript code: {e}")

    return found



def parse_rb_content(rb_path):
    """Run ``get_ruby.rb`` on a Ruby file and return its output parsed as JSON.

    The script's raw standard output is printed. If it cannot be decoded as
    JSON, the error is printed and ``None`` is returned.
    """
    # Invoke the Ruby helper script, passing the path of the .rb file.
    ruby_command = ['ruby', 'get_ruby.rb', rb_path]
    ruby_process = subprocess.Popen(ruby_command, stdout=subprocess.PIPE)
    raw_output = ruby_process.communicate()[0]
    print(raw_output)

    try:
        # Interpret the helper's output as JSON.
        return json.loads(raw_output.decode())
    except JSONDecodeError as err:
        print(f"Error decoding JSON: {err}")
        return None

# Load the sheet that lists the source files to analyse.
df = pd.read_excel(excel_file_path)
file_path_column = 'SourceCode_Path'
df['code_api'] = ''  # new column for the extracted API calls, initially empty

# Process every listed file in turn.
for index, row in df.iterrows():
    file_path = row[file_path_column]

    # Blank spreadsheet cells are read as NaN (a float), which is truthy and
    # makes os.path.exists raise TypeError, so only non-empty strings that
    # name an existing file are processed.
    if isinstance(file_path, str) and file_path and os.path.exists(file_path):
        with open(file_path, 'r') as file:
            print(file_path)
            # file_content = file.read().replace('\x00', '')
            # result_ast = parse_rb_content(file_content)
            # NPM
            # result_ast = extract_js_api_calls(file_content)

            # PyPI
            # if len(file_content) > 1:
            #     result_ast = extract_api_calls(file_content)

            # ruby: the helper takes the path itself, so the file handle
            # opened above is not read in this configuration.
            result_ast = parse_rb_content(file_path)
            # Append the string form of the result to the "code_api" column.
            df.at[index, 'code_api'] += str(result_ast)
    else:
        df.at[index, 'code_api'] = None
df

In [None]:
# Make every entry a string so that it can be tokenised.
df["code_api"] = df["code_api"].astype(str)
encoding = tiktoken.get_encoding(embedding_encoding)

# Record each entry's token count; it is used to drop entries that are too
# long to embed.
df["n_tokens"] = df["code_api"].map(lambda text: len(encoding.encode(text)))
df.to_csv('./ruby_tokens.csv')

In [None]:
# Requires a valid API key on the `client` that get_embedding uses.

# Keep only rows whose token count is above 1 and within max_tokens. The
# .copy() makes the result an independent frame, so adding the column below
# does not write into a slice of the previous frame (SettingWithCopyWarning).
df = df[(df.n_tokens <= max_tokens) & (df.n_tokens > 1)].copy()
print(len(df))
# This may take a few minutes
df["embedding"] = df.code_api.apply(lambda x: get_embedding(x, model=embedding_model))
# NOTE(review): the output is named for PyPI although the token counts above
# are written to ruby_tokens.csv — confirm the intended file name.
df.to_csv("pypi_embeddings.csv")

In [None]:
## NPM 
import numpy as np
import pandas as pd
from ast import literal_eval

# load data (the path is blank and has to be filled in)
datafile_path = ''
df = pd.read_csv(datafile_path)
# Each stored embedding is the text form of a list; turn it back into an array.
df["embedding"] = df["embedding"].map(lambda raw: np.array(literal_eval(raw)))
# Stack the per-row arrays into one 2-D matrix, one row per embedding.
matrix = np.vstack(df["embedding"].values)
matrix.shape

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from scipy.spatial.distance import euclidean

def intra_cluster_similarity(cluster_points):
    """Return (mean, max, min) of the Euclidean distances between all pairs of points.

    A cluster with fewer than two points has no pairs and yields (0, 0, 0).
    """
    count = len(cluster_points)
    if count < 2:
        return 0, 0, 0

    pair_distances = []
    for first in range(count):
        for second in range(first + 1, count):
            pair_distances.append(
                euclidean(cluster_points[first], cluster_points[second])
            )
    return np.mean(pair_distances), np.max(pair_distances), np.min(pair_distances)

def inter_cluster_similarity(cluster1_points, cluster2_points):
    """Return (mean, max, min) of the Euclidean distances between every point of one cluster and every point of the other."""
    cross_distances = []
    for left in cluster1_points:
        for right in cluster2_points:
            cross_distances.append(euclidean(left, right))
    return np.mean(cross_distances), np.max(cross_distances), np.min(cross_distances)

def calculate_similarities(matrix, labels, n_clusters):
    """Return the intra-cluster and inter-cluster distance statistics.

    The first result holds one ``intra_cluster_similarity`` tuple per cluster
    0..n_clusters-1. The second holds ``((i, j), stats)`` entries for every
    cluster pair with i < j, where ``stats`` comes from
    ``inter_cluster_similarity``.
    """
    clusters = []
    for cluster_id in range(n_clusters):
        clusters.append(matrix[labels == cluster_id])

    intra_similarities = [intra_cluster_similarity(members) for members in clusters]

    inter_similarities = []
    for i in range(n_clusters):
        for j in range(i + 1, n_clusters):
            pair_stats = inter_cluster_similarity(clusters[i], clusters[j])
            inter_similarities.append(((i, j), pair_stats))
    return intra_similarities, inter_similarities


# Search for a cluster count: k grows from 2 until the mean intra-cluster
# distance changes by less than 0.01 between consecutive fits, or until
# max_clusters has been tried.
# Initial Number Of Clusters
n_clusters = 2
max_clusters = 100  # Maximum Cluster Limit

previous_intra_similarities = None
change = 0
# Iterating with range() leaves n_clusters equal to the k of the last fit even
# when the limit is reached without converging, so the count printed below
# always matches `labels`.
for n_clusters in range(n_clusters, max_clusters + 1):
    kmeans = KMeans(n_clusters=n_clusters, init="k-means++", random_state=42)
    kmeans.fit(matrix)
    labels = kmeans.labels_

    intra_similarities, inter_similarities = calculate_similarities(matrix, labels, n_clusters)

    if previous_intra_similarities is not None:
        current_mean_intra = np.mean([intra[0] for intra in intra_similarities])
        previous_mean_intra = np.mean([intra[0] for intra in previous_intra_similarities])
        change = abs(current_mean_intra - previous_mean_intra)

        # Stop as soon as adding a cluster barely changes the mean distance.
        if change < 0.01:
            break

    previous_intra_similarities = intra_similarities

# Store the labels of the last fit alongside the data.
df["Cluster"] = labels
df.to_csv("./npm_cluster.csv", index=False)

print(f"Final Number Of Clusters: {n_clusters}")

print("\nFinal Intra Cluster Similarity(Average Distance, Maximum Distance,Minimum Distance):")
for i, intra in enumerate(intra_similarities):
    print(f"Cluster {i}: {intra}")

print("\nFinal Inter Cluster Similarity(Cluster Pair: Average Distance, Maximum Distance, Minimum Distance）:")
for (i, j), inter in inter_similarities:
    print(f"Cluster {i} 和 Cluster {j}: {inter}")

print('change:',change)

In [None]:
## ruby 
import numpy as np
import pandas as pd
from ast import literal_eval

# load data (the path is blank and has to be filled in)
datafile_path = ''
df = pd.read_csv(datafile_path)
# Parse each stored embedding string and convert it to a numpy array.
parsed_embeddings = df["embedding"].apply(literal_eval)
df["embedding"] = parsed_embeddings.apply(np.array)
# One matrix row per embedding.
matrix = np.vstack(df["embedding"].values)
matrix.shape

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from scipy.spatial.distance import euclidean

def intra_cluster_similarity(cluster_points):
    """Return (mean, max, min) of the pairwise Euclidean distances inside a cluster.

    Fewer than two points means there are no pairs; (0, 0, 0) is returned.
    """
    if len(cluster_points) < 2:
        return 0, 0, 0

    # Pair every point with each point that comes after it.
    pair_distances = [
        euclidean(point, later_point)
        for position, point in enumerate(cluster_points)
        for later_point in cluster_points[position + 1:]
    ]
    return np.mean(pair_distances), np.max(pair_distances), np.min(pair_distances)

def inter_cluster_similarity(cluster1_points, cluster2_points):
    """Return (mean, max, min) of the Euclidean distances across two clusters, taking every point of the first against every point of the second."""
    cross_distances = [
        euclidean(left, right)
        for left in cluster1_points
        for right in cluster2_points
    ]
    mean_distance = np.mean(cross_distances)
    max_distance = np.max(cross_distances)
    min_distance = np.min(cross_distances)
    return mean_distance, max_distance, min_distance

def calculate_similarities(matrix, labels, n_clusters):
    """Compute distance statistics within and between clusters.

    Returns a pair: a list with one ``intra_cluster_similarity`` result per
    cluster 0..n_clusters-1, and a list of ``((i, j), stats)`` entries for
    each cluster pair with i < j, where ``stats`` is the
    ``inter_cluster_similarity`` result.
    """
    # Rows of the matrix grouped by their cluster label.
    members_by_cluster = [matrix[labels == label] for label in range(n_clusters)]

    within = []
    for members in members_by_cluster:
        within.append(intra_cluster_similarity(members))

    between = []
    for i in range(n_clusters):
        for j in range(i + 1, n_clusters):
            stats = inter_cluster_similarity(
                members_by_cluster[i], members_by_cluster[j]
            )
            between.append(((i, j), stats))
    return within, between


# Search for a cluster count: k grows from 2 until the mean intra-cluster
# distance changes by less than 0.01 between consecutive fits, or until
# max_clusters has been tried.
# Initial Number Of Clusters
n_clusters = 2
max_clusters = 10  # Maximum Cluster Limit

previous_intra_similarities = None
change = 0
# Iterating with range() leaves n_clusters equal to the k of the last fit even
# when the limit is reached without converging, so the count printed below
# always matches `labels`.
for n_clusters in range(n_clusters, max_clusters + 1):
    kmeans = KMeans(n_clusters=n_clusters, init="k-means++", random_state=42)
    kmeans.fit(matrix)
    labels = kmeans.labels_

    intra_similarities, inter_similarities = calculate_similarities(matrix, labels, n_clusters)

    if previous_intra_similarities is not None:
        current_mean_intra = np.mean([intra[0] for intra in intra_similarities])
        previous_mean_intra = np.mean([intra[0] for intra in previous_intra_similarities])
        change = abs(current_mean_intra - previous_mean_intra)

        # Stop as soon as adding a cluster barely changes the mean distance.
        if change < 0.01:
            break

    previous_intra_similarities = intra_similarities

# Store the labels of the last fit alongside the data.
df["Cluster"] = labels
df.to_csv("./ruby_cluster.csv", index=False)

print(f"Final Number Of Clusters: {n_clusters}")

print("\nFinal Intra Cluster Similarity(Average Distance, Maximum Distance,Minimum Distance):")
for i, intra in enumerate(intra_similarities):
    print(f"Cluster {i}: {intra}")

print("\nFinal Inter Cluster Similarity(Cluster Pair: Average Distance, Maximum Distance, Minimum Distance）:")
for (i, j), inter in inter_similarities:
    print(f"Cluster {i} 和 Cluster {j}: {inter}")

print('change:',change)


In [None]:
## pypi 
import numpy as np
import pandas as pd
from ast import literal_eval

# load data
datafile_path = "pypi_embeddings.csv"

df = pd.read_csv(datafile_path)
# The embedding column holds list literals as text; rebuild numpy arrays.
embedding_lists = df["embedding"].apply(literal_eval)
df["embedding"] = embedding_lists.apply(np.array)
# One matrix row per embedding.
matrix = np.vstack(df["embedding"].values)
matrix.shape

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from scipy.spatial.distance import euclidean

def intra_cluster_similarity(cluster_points):
    """Return (mean, max, min) of the Euclidean distances over all point pairs of a cluster.

    With fewer than two points there is nothing to compare and (0, 0, 0) is
    returned.
    """
    total = len(cluster_points)
    if total < 2:
        return 0, 0, 0

    # Index pairs (a, b) with a < b, in ascending order.
    index_pairs = [(a, b) for a in range(total) for b in range(a + 1, total)]
    pair_distances = [
        euclidean(cluster_points[a], cluster_points[b]) for a, b in index_pairs
    ]
    return np.mean(pair_distances), np.max(pair_distances), np.min(pair_distances)

def inter_cluster_similarity(cluster1_points, cluster2_points):
    """Return (mean, max, min) of the Euclidean distances from each point of the first cluster to each point of the second."""
    cross_distances = []
    for source_point in cluster1_points:
        cross_distances.extend(
            euclidean(source_point, target_point) for target_point in cluster2_points
        )
    return np.mean(cross_distances), np.max(cross_distances), np.min(cross_distances)

def calculate_similarities(matrix, labels, n_clusters):
    """Return per-cluster and per-cluster-pair distance statistics.

    The first element lists the ``intra_cluster_similarity`` result of each
    cluster 0..n_clusters-1. The second lists ``((i, j), stats)`` for each
    pair with i < j, where ``stats`` is the ``inter_cluster_similarity``
    result for clusters i and j.
    """
    grouped = [matrix[labels == cluster_index] for cluster_index in range(n_clusters)]
    intra_similarities = list(map(intra_cluster_similarity, grouped))

    cluster_pairs = [
        (i, j) for i in range(n_clusters) for j in range(i + 1, n_clusters)
    ]
    inter_similarities = [
        ((i, j), inter_cluster_similarity(grouped[i], grouped[j]))
        for i, j in cluster_pairs
    ]
    return intra_similarities, inter_similarities


# Search for a cluster count: k grows from 2 until the mean intra-cluster
# distance changes by less than 0.01 between consecutive fits, or until
# max_clusters has been tried.
# Initial Number Of Clusters
n_clusters = 2
max_clusters = 100  # Maximum Cluster Limit

previous_intra_similarities = None
change = 0
# Iterating with range() leaves n_clusters equal to the k of the last fit even
# when the limit is reached without converging, so the count printed below
# always matches `labels`.
for n_clusters in range(n_clusters, max_clusters + 1):
    kmeans = KMeans(n_clusters=n_clusters, init="k-means++", random_state=42)
    kmeans.fit(matrix)
    labels = kmeans.labels_

    intra_similarities, inter_similarities = calculate_similarities(matrix, labels, n_clusters)

    if previous_intra_similarities is not None:
        current_mean_intra = np.mean([intra[0] for intra in intra_similarities])
        previous_mean_intra = np.mean([intra[0] for intra in previous_intra_similarities])
        change = abs(current_mean_intra - previous_mean_intra)

        # Stop as soon as adding a cluster barely changes the mean distance.
        if change < 0.01:
            break

    # Remember this fit's statistics for the next comparison.
    previous_intra_similarities = intra_similarities

# Store the labels of the last fit alongside the data.
df["Cluster"] = labels
df.to_csv("./pypi_cluster.csv", index=False)

print(f"Final Number Of Clusters: {n_clusters}")

print("\nFinal Intra Cluster Similarity(Average Distance, Maximum Distance,Minimum Distance):")
for i, intra in enumerate(intra_similarities):
    print(f"Cluster {i}: {intra}")

print("\nFinal Inter Cluster Similarity(Cluster Pair: Average Distance, Maximum Distance, Minimum Distance）:")
for (i, j), inter in inter_similarities:
    print(f"Cluster {i} 和 Cluster {j}: {inter}")

print('change:',change)

In [None]:
# Horizontally
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from ast import literal_eval

# Palette of dark colours for the cluster scatter plots; plot_tsne picks the
# entry at (cluster number modulo palette length).
colors = [
    '#006400',  # Dark green
    '#00008B',  # Dark (navy) blue

    '#800000',  # Maroon / oxblood red
    '#D2691E',  # Chocolate (orange-brown)
    '#B03060',  # Deep rose
    '#CC8800',  # Dark yellow
    '#1B2A49',  # Deep blue-black
    '#E1C07A',  # Khaki
    '#2F4F4F',  # Dark slate grey
    '#556B2F',  # Dark olive green


    '#5C4033',  # Dark brown
    '#4B0082',  # Indigo (deep purple)
    '#6B8E23',  # Olive drab
    '#CD5B45',  # Deep coral


    '#4169E1',  # Royal blue


    '#2A2D34'   # Dark blue-grey
]

def plot_tsne(ax, embeddings_path, cluster_path, title):
    """Draw a 2-D t-SNE projection of stored embeddings on ``ax``, coloured by cluster.

    Embeddings are read from the ``embedding`` column of ``embeddings_path``
    and cluster labels from the ``Cluster`` column of ``cluster_path``. Every
    cluster is drawn as translucent dots plus an "x" marker at the mean of its
    projected points.

    NOTE(review): rows of the two files are matched purely by position, and
    labels are assumed to run from 0 to (number of distinct labels - 1) —
    confirm against how the files are produced.
    """
    df_embeddings = pd.read_csv(embeddings_path)
    df_embeddings["embedding"] = df_embeddings.embedding.apply(literal_eval).apply(np.array)
    matrix = np.vstack(df_embeddings.embedding.values)

    df_clusters = pd.read_csv(cluster_path)

    projector = TSNE(n_components=2, perplexity=15, random_state=42, init="random", learning_rate=200)
    projected = projector.fit_transform(matrix)
    x_coords = projected[:, 0]
    y_coords = projected[:, 1]

    for category in range(df_clusters['Cluster'].nunique()):
        # Cycle through the palette when there are more clusters than colours.
        color = colors[category % len(colors)]
        in_cluster = df_clusters.Cluster == category
        xs = x_coords[in_cluster]
        ys = y_coords[in_cluster]
        ax.scatter(xs, ys, color=color, alpha=0.6, label=f'Cluster {category}')

        # Mark the centre of the cluster's projected points.
        ax.scatter(xs.mean(), ys.mean(), marker="x", color=color, s=100)

    ax.set_title(title, fontsize=20)
    ax.legend(fontsize=12)
    ax.tick_params(axis='both', which='major', labelsize=20)

# Three panels side by side, one per package ecosystem.
fig, axes = plt.subplots(1, 3, figsize=(30, 8))

# (embeddings file, cluster file, panel title) for each panel, left to right.
panels = [
    ("npm_embeddings.csv", "npm_cluster.csv", "NPM Clusters"),
    ("pypi_embeddings.csv", "pypi_cluster.csv", "PyPI Clusters"),
    ("ruby_embeddings.csv", "ruby_cluster.csv", "Ruby Clusters"),
]
for panel_index, (embeddings_csv, cluster_csv, panel_title) in enumerate(panels):
    plot_tsne(axes[panel_index], embeddings_csv, cluster_csv, panel_title)
plt.tight_layout()

plt.show()