In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_distances
from sklearn.cluster import AgglomerativeClustering
from collections import Counter
import numpy as np
import random
from pathlib import Path


In [None]:
def cluster_mal_df_descs(mal_df, similarity_threshold=0.75, model_name='all-MiniLM-L6-v2'):
    """Cluster the ``desc`` column by semantic similarity and label every row.

    The distinct, non-null descriptions are embedded with a
    SentenceTransformer model (normalized embeddings), their pairwise cosine
    distances are computed, and average-linkage agglomerative clustering is
    cut at ``1 - similarity_threshold``.

    Args:
        mal_df: DataFrame with a ``desc`` column. It is not modified.
        similarity_threshold: Value subtracted from 1 to obtain the
            clustering ``distance_threshold``.
        model_name: Name passed to ``SentenceTransformer``.

    Returns:
        A ``(labelled_df, trend_counts)`` tuple. ``labelled_df`` is ``mal_df``
        left-merged with a ``desc_cluster`` column (NaN where ``desc`` is
        missing). ``trend_counts`` maps each cluster label to the number of
        *distinct* descriptions in that cluster, not the number of rows.

    Raises:
        KeyError: If ``mal_df`` has no ``desc`` column.
    """
    # Only distinct, non-null descriptions are embedded and clustered.
    unique_descs = mal_df['desc'].dropna().drop_duplicates().reset_index(drop=True)
    n_descs = len(unique_descs)

    if n_descs < 2:
        # AgglomerativeClustering rejects fewer than two samples, so zero or
        # one description is handled directly (and without loading the model):
        # a lone description forms cluster 0.
        labels = np.zeros(n_descs, dtype=int)
    else:
        # Semantic embeddings of each distinct description.
        model = SentenceTransformer(model_name)
        embeddings = model.encode(
            unique_descs.tolist(),
            convert_to_tensor=False,
            normalize_embeddings=True,
        )

        # Pairwise cosine distance matrix fed to the clusterer as-is.
        distance_matrix = cosine_distances(embeddings)

        cluster_kwargs = dict(
            n_clusters=None,
            linkage='average',
            distance_threshold=1 - similarity_threshold,
        )
        try:
            # scikit-learn >= 1.2 names this parameter `metric`; `affinity`
            # was removed in 1.4.
            clustering = AgglomerativeClustering(metric='precomputed', **cluster_kwargs)
        except TypeError:
            # Older scikit-learn releases only know `affinity`.
            clustering = AgglomerativeClustering(affinity='precomputed', **cluster_kwargs)
        labels = clustering.fit_predict(distance_matrix)

    # One row per distinct description with its cluster label.
    cluster_lookup = pd.DataFrame({'desc': unique_descs, 'desc_cluster': labels})

    # trend_counts: number of distinct descriptions per cluster (plain ints).
    trend_counts = dict(Counter(labels.tolist()))

    # Drop any label column left by a previous run so the merge cannot produce
    # `desc_cluster_x` / `desc_cluster_y`; then attach labels to every row.
    mal_df = mal_df.drop(columns=['desc_cluster'], errors='ignore').merge(
        cluster_lookup, on='desc', how='left'
    )

    return mal_df, trend_counts

In [None]:
# Read mal_packages_with_desc.csv from the current working directory, then
# attach description-cluster labels and collect the per-cluster counts.
mal_data_path = (Path.cwd() / "mal_packages_with_desc.csv").as_posix()
mal_df = pd.read_csv(mal_data_path)
mal_df, desc_trend_counts = cluster_mal_df_descs(mal_df)

In [None]:
# Adjust sys.path, then import the scoring helper from `trendscore`.
# NOTE(review): the insert below re-adds the *existing* first sys.path entry
# (converted to POSIX form) at position 0 rather than adding a new directory.
# Presumably the intent was to make the notebook's own directory importable;
# confirm.
import sys
from pathlib import Path  # Path is already imported in the first cell
sys.path.insert(0, Path(sys.path[0]).as_posix())
from trendscore import compute_desc_cluster_score

In [None]:
# Ten (feature name, score) pairs for the entropy ranking; every entry carries
# the same score of 1.0, so the names are listed once and paired with it.
entropy_top_features_with_scores = [
    (feature_name, 1.0000)
    for feature_name in (
        "Readline/readline-i.ri",
        "libxml/xmlstring.h",
        "ClassMethods/commands-i.ri",
        "bundler/plugin",
        "Color/set_color-i.ri",
        "HiddenCommand/cdesc-HiddenCommand.ri",
        "Thor/Base",
        "templates/newgem",
        "Actions/inject_into_class-i.ri",
        "source/git",
    )
]

# Ten (feature name, score) pairs for the correlation ranking, in descending
# score order. Written as an ordered mapping and converted to a list of pairs.
corr_top_features_with_scores = list({
    "socket unique ips": 0.3601,
    "socket unique hostnames": 0.2438,
    "file write count": 0.2249,
    "file unique paths": 0.2187,
    "dns unique types": 0.1866,
    "dns total queries": 0.1290,
    "dns unique hosts": 0.1287,
    "file read count": 0.1199,
    "cmd total count": 0.1128,
    "cmd unique commands": 0.1072,
}.items())

# Ten (feature name, score) pairs for the SHAP ranking, in descending score
# order. Written as an ordered mapping and converted to a list of pairs.
shap_top_features_with_scores = list({
    "file write count": 3.5599,
    "cmd total envs": 3.0887,
    "cmd total count": 1.1815,
    "file delete count": 0.9520,
    "file unique paths": 0.8559,
    "cmd total args": 0.8450,
    "cmd unique commands": 0.7696,
    "file read count": 0.4057,
    "socket unique ips": 0.2414,
    "dns unique hosts": 0.1003,
}.items())

# Ten (feature name, score) pairs for the HeteroGAT ranking, in descending
# score order. Written as an ordered mapping and converted to a list of pairs.
# NOTE(review): the "â€“" sequence in the fourth name looks like a mis-encoded
# dash; it is left untouched because the string may need to match feature
# names produced elsewhere — confirm before correcting.
heterogat = list({
    "tmp/pip-ephem-wheel-cache-e2vc lpv": 0.8477,
    "simple/fakerv2": 0.1155,
    "fakerv2 1.0": 0.1100,
    "python3bin/analyze-python.pyâ€“version1.0installfakerv2": 0.0482,
    "tmp/1cggeydu": 0.0392,
    "Aix/FfiHelper": 0.0032,
    "FfiHelper/address to string-c.ri": 0.0016,
    "FfiHelper/log-c.ri": 0.0015,
    "FfiHelper/read interfaces-c.ri": 0.0015,
    "FfiHelper/read load averages-c.ri": 0.0012,
}.items())

# Ten (feature name, score) pairs for the DiffHeteroGAT ranking, in descending
# score order. Written as an ordered mapping and converted to a list of pairs.
dheterogat = list({
    "helpers/arrayWithHoles.js": 0.0119,
    "custom/zalgo.js": 0.0062,
    "custom/trap.js": 0.0045,
    "helpers/assertThisInitialized.js": 0.0041,
    "sns/sns.provider.js": 0.0028,
    "@google-cloud/storage": 0.0002,
    "maps/america.js": 0.0001,
    "storage/CHANGELOG.md": 0.0001,
    "node modules/end-of-stream": 0.0001,
    "end-of-stream/LICENSE": 0.0001,
}.items())

# Ten (feature name, score) pairs for the PNHeteroGAT ranking, in descending
# score order. Written as an ordered mapping and converted to a list of pairs.
# NOTE(review): the variable is spelled `pnhetergat` (no "o"), unlike
# `heterogat` / `dheterogat`; the name is kept because a later cell uses it.
pnhetergat = list({
    "helpers/arrayWithHoles.js": 0.0157,
    "custom/zalgo.js": 0.0106,
    "custom/trap.js": 0.0069,
    "helpers/assertThisInitialized.js": 0.0032,
    "sns/sns.provider.js": 0.0003,
    "maps/america.js": 0.0002,
    "maps/package.json": 0.0002,
    "maps/rainbow.js": 0.0002,
    "@google-cloud/storage": 0.0000,
    "storage/CHANGELOG.md": 0.0000,
}.items())


In [None]:
# Print a header, then score the entropy feature list against mal_df. The call
# is the cell's last expression, so any value it returns is shown as output.
print("---- Desc Trend Score for entropy -----")
compute_desc_cluster_score(entropy_top_features_with_scores, mal_df)

In [None]:
# Print a header, then score the correlation feature list against mal_df. The
# call is the cell's last expression, so any value it returns is shown as output.
print("---- Desc Trend Score for Correlation -----")
compute_desc_cluster_score(corr_top_features_with_scores, mal_df)

In [None]:
# Print a header, then score the SHAP feature list against mal_df. The call is
# the cell's last expression, so any value it returns is shown as output.
print("---- Desc Trend Score for SHAP -----")
compute_desc_cluster_score(shap_top_features_with_scores, mal_df)

In [None]:
# Print a header, then score the `heterogat` feature list against mal_df. The
# call is the cell's last expression, so any value it returns is shown as output.
print("---- Desc Trend Score for HeteroGAT -----")
compute_desc_cluster_score(heterogat, mal_df)

In [None]:
# Print a header, then score the `dheterogat` feature list against mal_df. The
# call is the cell's last expression, so any value it returns is shown as output.
print("---- Desc Trend Score for DiffHeteroGAT -----")
compute_desc_cluster_score(dheterogat, mal_df)

In [None]:
# Print a header, then score the `pnhetergat` feature list against mal_df. The
# call is the cell's last expression, so any value it returns is shown as output.
print("---- Desc Trend Score for PNHeteroGAT -----")
compute_desc_cluster_score(pnhetergat, mal_df)