In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_distances
from sklearn.cluster import AgglomerativeClustering
from collections import Counter
import numpy as np
import random
from pathlib import Path


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def cluster_mal_df_descs(mal_df, similarity_threshold=0.75, model_name='all-MiniLM-L6-v2'):
    # remove duplicates and NaNs
    filtered_df = mal_df.dropna(subset=['Desc']).drop_duplicates(subset=['Desc']).reset_index(drop=True)
    
    # get semantic embeddings
    model = SentenceTransformer(model_name)
    embeddings = model.encode(filtered_df['Desc'].tolist(), convert_to_tensor=False, normalize_embeddings=True)
    
    # calculate cosine distance matrix
    distance_matrix = cosine_distances(embeddings)
    
    # clustering
    clustering = AgglomerativeClustering(
        n_clusters=None,
        # affinity='precomputed',
        linkage='average',
        distance_threshold=1 - similarity_threshold
    )
    labels = clustering.fit_predict(distance_matrix)
    
    # add cluster label to original filtered_df
    filtered_df['desc_cluster'] = labels

    # trend_counts: the count of every cluster
    trend_counts = dict(Counter(labels))
    
    # move label merge to original mal_df
    mal_df = mal_df.merge(filtered_df[['Desc', 'desc_cluster']], on='Desc', how='left')

    return mal_df, trend_counts

In [3]:
# define mal_df data path
mal_data_path = Path.cwd().joinpath("mal_packages_with_desc.csv").as_posix()
mal_df = pd.read_csv(mal_data_path)
mal_df, desc_trend_counts = cluster_mal_df_descs(mal_df)

  out = hierarchy.linkage(X, method=linkage, metric=affinity)


In [4]:
import sys
from pathlib import Path
sys.path.insert(0, Path(sys.path[0]).as_posix())
from trendscore import compute_desc_cluster_score

In [5]:
entropy_top_features_with_scores = [
    ("Readline/readline-i.ri", 1.0000), ("libxml/xmlstring.h", 1.0000),
    ("ClassMethods/commands-i.ri", 1.0000), ("bundler/plugin", 1.0000),
    ("Color/set_color-i.ri", 1.0000), ("HiddenCommand/cdesc-HiddenCommand.ri", 1.0000),
    ("Thor/Base", 1.0000), ("templates/newgem", 1.0000),
    ("Actions/inject_into_class-i.ri", 1.0000), ("source/git", 1.0000)
]

corr_top_features_with_scores = [
    ("socket unique ips", 0.3601),
    ("socket unique hostnames", 0.2438),
    ("file write count", 0.2249),
    ("file unique paths", 0.2187),
    ("dns unique types", 0.1866),
    ("dns total queries", 0.1290),
    ("dns unique hosts", 0.1287),
    ("file read count", 0.1199),
    ("cmd total count", 0.1128),
    ("cmd unique commands", 0.1072)
]

shap_top_features_with_scores = [
    ("file write count", 3.5599),
    ("cmd total envs", 3.0887),
    ("cmd total count", 1.1815),
    ("file delete count", 0.9520),
    ("file unique paths", 0.8559),
    ("cmd total args", 0.8450),
    ("cmd unique commands", 0.7696),
    ("file read count", 0.4057),
    ("socket unique ips", 0.2414),
    ("dns unique hosts", 0.1003)
]

heterogat = [
    ("tmp/pip-ephem-wheel-cache-e2vc lpv", 0.8477),
    ("simple/fakerv2", 0.1155),
    ("fakerv2 1.0", 0.1100),
    ("python3bin/analyze-python.pyâ€“version1.0installfakerv2", 0.0482),
    ("tmp/1cggeydu", 0.0392),
    ("Aix/FfiHelper", 0.0032),
    ("FfiHelper/address to string-c.ri", 0.0016),
    ("FfiHelper/log-c.ri", 0.0015),
    ("FfiHelper/read interfaces-c.ri", 0.0015),
    ("FfiHelper/read load averages-c.ri", 0.0012)
]

dheterogat = [
    ("custom/zalgo.js", 0.0775),
    ("helpers/assertThisInitialized.js", 0.0579),
    ("helpers/arrayWithHoles.js", 0.0477),
    ("custom/trap.js", 0.0122),
    ("@google-cloud/storage", 0.0013),
    ("maps/america.js", 0.0003),
    ("storage/CHANGELOG.md", 0.0001),
    ("node_modules/end-of-stream", 0.0001),
    ("end-of-stream/LICENSE", 0.0001),
    ("sns/sns.provider.js", 0.0001)
]

pnhetergat = [
    ("custom/zalgo.js", 0.0676),
    ("helpers/arrayWithHoles.js", 0.0559),
    ("helpers/assertThisInitialized.js", 0.0489),
    ("custom/trap.js", 0.0233),
    ("storage/CHANGELOG.md", 0.0014),
    ("node_modules/end-of-stream", 0.0014),
    ("end-of-stream/LICENSE", 0.0014),
    ("maps/america.js", 0.0007),
    ("maps/package.json", 0.0007),
    ("maps/rainbow.js", 0.0007)
]


In [6]:
print("---- Desc Trend Score for entropy -----")
compute_desc_cluster_score(entropy_top_features_with_scores, mal_df)

---- Desc Trend Score for entropy -----
[Cluster 0.0] score: 4.09 (desc: "The MicroPayments &#8211; Fans Paysite: Paid Creator Subscriptions, Digital Assets, Tokens Wallet plugin for WordPress is vulnerable to Stored Cross-Site Scripting via the plugin's 'videowhisper_content_upload_guest' shortcode in all versions up to, and including, 2.9.29 due to insufficient input sanitization and output escaping on user supplied attributes. This makes it possible for authenticated attackers, with contributor-level access and above, to inject arbitrary web scripts in pages that will execute whenever a user accesses an injected page.")
[Cluster 1.0] score: 1.62 (desc: "Heap-based buffer overflow in the px_pac_reload function in lib/pac.c in libproxy 0.2.x and 0.3.x allows remote servers to have an unspecified impact via a crafted Content-Length size in an HTTP response header for a proxy.pac file request, a different vulnerability than CVE-2012-4504.")
[Cluster 2.0] score: 4.40 (desc: "Multiple cro

(np.float32(111.6673),
 {0.0: np.float32(4.0875688),
  1.0: np.float32(1.6185412),
  2.0: np.float32(4.3984265),
  3.0: np.float32(1.946642),
  4.0: np.float32(3.67127),
  5.0: np.float32(3.3593225),
  6.0: np.float32(7.138234),
  7.0: np.float32(5.9149394),
  8.0: np.float32(1.9772828),
  9.0: np.float32(7.232401),
  10.0: np.float32(3.201203),
  11.0: np.float32(6.2080584),
  12.0: np.float32(1.8909796),
  13.0: np.float32(4.9470024),
  14.0: np.float32(5.7094526),
  15.0: np.float32(-1.1005417),
  16.0: np.float32(1.7611556),
  17.0: np.float32(2.233552),
  18.0: np.float32(1.5871699),
  19.0: np.float32(3.0477054),
  20.0: np.float32(3.2279465),
  21.0: np.float32(5.900316),
  22.0: np.float32(1.8189753),
  23.0: np.float32(6.3223825),
  24.0: np.float32(4.640504),
  25.0: np.float32(2.555738),
  26.0: np.float32(0.07701656),
  27.0: np.float32(2.8062468),
  28.0: np.float32(7.3048983),
  29.0: np.float32(2.5620239),
  30.0: np.float32(3.620883)})

In [7]:
print("---- Desc Trend Score for Correlation -----")
compute_desc_cluster_score(corr_top_features_with_scores, mal_df)

---- Desc Trend Score for Correlation -----
[Cluster 0.0] score: 2.94 (desc: "The MicroPayments &#8211; Fans Paysite: Paid Creator Subscriptions, Digital Assets, Tokens Wallet plugin for WordPress is vulnerable to Stored Cross-Site Scripting via the plugin's 'videowhisper_content_upload_guest' shortcode in all versions up to, and including, 2.9.29 due to insufficient input sanitization and output escaping on user supplied attributes. This makes it possible for authenticated attackers, with contributor-level access and above, to inject arbitrary web scripts in pages that will execute whenever a user accesses an injected page.")
[Cluster 1.0] score: 1.38 (desc: "Heap-based buffer overflow in the px_pac_reload function in lib/pac.c in libproxy 0.2.x and 0.3.x allows remote servers to have an unspecified impact via a crafted Content-Length size in an HTTP response header for a proxy.pac file request, a different vulnerability than CVE-2012-4504.")
[Cluster 2.0] score: 4.75 (desc: "Multiple

(np.float32(61.214077),
 {0.0: np.float32(2.9437945),
  1.0: np.float32(1.3814166),
  2.0: np.float32(4.745566),
  3.0: np.float32(3.3315191),
  4.0: np.float32(-1.0723442),
  5.0: np.float32(0.62662596),
  6.0: np.float32(7.205499),
  7.0: np.float32(2.6467988),
  8.0: np.float32(0.4098062),
  9.0: np.float32(0.008745551),
  10.0: np.float32(0.9245643),
  11.0: np.float32(0.11264714),
  12.0: np.float32(0.31562027),
  13.0: np.float32(5.298229),
  14.0: np.float32(2.242759),
  15.0: np.float32(-2.3717387),
  16.0: np.float32(0.9782869),
  17.0: np.float32(3.110102),
  18.0: np.float32(3.4133177),
  19.0: np.float32(4.092191),
  20.0: np.float32(0.4360606),
  21.0: np.float32(1.3054968),
  22.0: np.float32(2.0411997),
  23.0: np.float32(2.8348162),
  24.0: np.float32(-0.39223018),
  25.0: np.float32(-0.5603755),
  26.0: np.float32(1.0804791),
  27.0: np.float32(0.87731993),
  28.0: np.float32(4.7573977),
  29.0: np.float32(7.335461),
  30.0: np.float32(1.1550478)})

In [8]:
print("---- Desc Trend Score for SHAP -----")
compute_desc_cluster_score(shap_top_features_with_scores, mal_df)

---- Desc Trend Score for SHAP -----
[Cluster 0.0] score: 3.44 (desc: "The MicroPayments &#8211; Fans Paysite: Paid Creator Subscriptions, Digital Assets, Tokens Wallet plugin for WordPress is vulnerable to Stored Cross-Site Scripting via the plugin's 'videowhisper_content_upload_guest' shortcode in all versions up to, and including, 2.9.29 due to insufficient input sanitization and output escaping on user supplied attributes. This makes it possible for authenticated attackers, with contributor-level access and above, to inject arbitrary web scripts in pages that will execute whenever a user accesses an injected page.")
[Cluster 1.0] score: 2.70 (desc: "Heap-based buffer overflow in the px_pac_reload function in lib/pac.c in libproxy 0.2.x and 0.3.x allows remote servers to have an unspecified impact via a crafted Content-Length size in an HTTP response header for a proxy.pac file request, a different vulnerability than CVE-2012-4504.")
[Cluster 2.0] score: 2.48 (desc: "Multiple cross-

(np.float32(62.65231),
 {0.0: np.float32(3.4399505),
  1.0: np.float32(2.6953359),
  2.0: np.float32(2.4803488),
  3.0: np.float32(2.0694845),
  4.0: np.float32(1.4948192),
  5.0: np.float32(2.0920172),
  6.0: np.float32(11.11108),
  7.0: np.float32(-0.45446423),
  8.0: np.float32(-0.15319045),
  9.0: np.float32(3.748733),
  10.0: np.float32(-1.526169),
  11.0: np.float32(0.42870465),
  12.0: np.float32(1.5645674),
  13.0: np.float32(5.8193445),
  14.0: np.float32(4.5180025),
  15.0: np.float32(-2.0416434),
  16.0: np.float32(0.9553007),
  17.0: np.float32(0.44186845),
  18.0: np.float32(-0.9779908),
  19.0: np.float32(3.178639),
  20.0: np.float32(-2.9246073),
  21.0: np.float32(5.0413837),
  22.0: np.float32(0.06167223),
  23.0: np.float32(2.576133),
  24.0: np.float32(6.0612574),
  25.0: np.float32(0.08416088),
  26.0: np.float32(-0.2386613),
  27.0: np.float32(1.8012967),
  28.0: np.float32(8.97469),
  29.0: np.float32(0.6400014),
  30.0: np.float32(-0.30975887)})

In [9]:
print("---- Desc Trend Score for HeteroGAT -----")
compute_desc_cluster_score(heterogat, mal_df)

---- Desc Trend Score for HeteroGAT -----
[Cluster 0.0] score: 6.11 (desc: "The MicroPayments &#8211; Fans Paysite: Paid Creator Subscriptions, Digital Assets, Tokens Wallet plugin for WordPress is vulnerable to Stored Cross-Site Scripting via the plugin's 'videowhisper_content_upload_guest' shortcode in all versions up to, and including, 2.9.29 due to insufficient input sanitization and output escaping on user supplied attributes. This makes it possible for authenticated attackers, with contributor-level access and above, to inject arbitrary web scripts in pages that will execute whenever a user accesses an injected page.")
[Cluster 1.0] score: 7.99 (desc: "Heap-based buffer overflow in the px_pac_reload function in lib/pac.c in libproxy 0.2.x and 0.3.x allows remote servers to have an unspecified impact via a crafted Content-Length size in an HTTP response header for a proxy.pac file request, a different vulnerability than CVE-2012-4504.")
[Cluster 2.0] score: 5.84 (desc: "Multiple c

(np.float32(189.75403),
 {0.0: np.float32(6.1052947),
  1.0: np.float32(7.9910393),
  2.0: np.float32(5.840805),
  3.0: np.float32(7.1220303),
  4.0: np.float32(5.829635),
  5.0: np.float32(4.1344457),
  6.0: np.float32(6.3591933),
  7.0: np.float32(7.393178),
  8.0: np.float32(2.923182),
  9.0: np.float32(7.1330953),
  10.0: np.float32(6.0112224),
  11.0: np.float32(10.372228),
  12.0: np.float32(4.703109),
  13.0: np.float32(6.379855),
  14.0: np.float32(5.1490383),
  15.0: np.float32(-1.3458419),
  16.0: np.float32(3.9536648),
  17.0: np.float32(6.0214777),
  18.0: np.float32(4.1681867),
  19.0: np.float32(9.128903),
  20.0: np.float32(5.151153),
  21.0: np.float32(7.0320807),
  22.0: np.float32(6.489491),
  23.0: np.float32(8.971465),
  24.0: np.float32(9.045977),
  25.0: np.float32(4.1446657),
  26.0: np.float32(5.294366),
  27.0: np.float32(7.158735),
  28.0: np.float32(5.9839315),
  29.0: np.float32(7.356985),
  30.0: np.float32(7.7514267)})

In [10]:
print("---- Desc Trend Score for DiffHeteroGAT -----")
compute_desc_cluster_score(dheterogat, mal_df)

---- Desc Trend Score for DiffHeteroGAT -----
[Cluster 0.0] score: 7.14 (desc: "The MicroPayments &#8211; Fans Paysite: Paid Creator Subscriptions, Digital Assets, Tokens Wallet plugin for WordPress is vulnerable to Stored Cross-Site Scripting via the plugin's 'videowhisper_content_upload_guest' shortcode in all versions up to, and including, 2.9.29 due to insufficient input sanitization and output escaping on user supplied attributes. This makes it possible for authenticated attackers, with contributor-level access and above, to inject arbitrary web scripts in pages that will execute whenever a user accesses an injected page.")
[Cluster 1.0] score: 2.44 (desc: "Heap-based buffer overflow in the px_pac_reload function in lib/pac.c in libproxy 0.2.x and 0.3.x allows remote servers to have an unspecified impact via a crafted Content-Length size in an HTTP response header for a proxy.pac file request, a different vulnerability than CVE-2012-4504.")
[Cluster 2.0] score: 7.45 (desc: "Multip

(np.float32(114.83855),
 {0.0: np.float32(7.1417747),
  1.0: np.float32(2.4386287),
  2.0: np.float32(7.4498043),
  3.0: np.float32(-2.2580578),
  4.0: np.float32(6.0769854),
  5.0: np.float32(6.931835),
  6.0: np.float32(2.9490542),
  7.0: np.float32(6.618205),
  8.0: np.float32(3.9003015),
  9.0: np.float32(5.5266886),
  10.0: np.float32(0.91093147),
  11.0: np.float32(10.124095),
  12.0: np.float32(-0.73086),
  13.0: np.float32(4.540144),
  14.0: np.float32(8.745538),
  15.0: np.float32(0.6918905),
  16.0: np.float32(0.26743355),
  17.0: np.float32(-2.1744106),
  18.0: np.float32(3.7621002),
  19.0: np.float32(1.8371595),
  20.0: np.float32(5.1058116),
  21.0: np.float32(2.1538358),
  22.0: np.float32(4.300647),
  23.0: np.float32(7.4801297),
  24.0: np.float32(-3.34126),
  25.0: np.float32(2.2882454),
  26.0: np.float32(4.566842),
  27.0: np.float32(0.10222213),
  28.0: np.float32(7.4803185),
  29.0: np.float32(4.788687),
  30.0: np.float32(5.163837)})

In [11]:
print("---- Desc Trend Score for PNHeteroGAT -----")
compute_desc_cluster_score(pnhetergat, mal_df)

---- Desc Trend Score for PNHeteroGAT -----
[Cluster 0.0] score: 7.19 (desc: "The MicroPayments &#8211; Fans Paysite: Paid Creator Subscriptions, Digital Assets, Tokens Wallet plugin for WordPress is vulnerable to Stored Cross-Site Scripting via the plugin's 'videowhisper_content_upload_guest' shortcode in all versions up to, and including, 2.9.29 due to insufficient input sanitization and output escaping on user supplied attributes. This makes it possible for authenticated attackers, with contributor-level access and above, to inject arbitrary web scripts in pages that will execute whenever a user accesses an injected page.")
[Cluster 1.0] score: 3.06 (desc: "Heap-based buffer overflow in the px_pac_reload function in lib/pac.c in libproxy 0.2.x and 0.3.x allows remote servers to have an unspecified impact via a crafted Content-Length size in an HTTP response header for a proxy.pac file request, a different vulnerability than CVE-2012-4504.")
[Cluster 2.0] score: 7.52 (desc: "Multiple

(np.float32(121.85516),
 {0.0: np.float32(7.1939044),
  1.0: np.float32(3.0595741),
  2.0: np.float32(7.5153575),
  3.0: np.float32(-1.3382369),
  4.0: np.float32(6.1331687),
  5.0: np.float32(7.167495),
  6.0: np.float32(3.204744),
  7.0: np.float32(6.6772733),
  8.0: np.float32(4.596539),
  9.0: np.float32(5.4914627),
  10.0: np.float32(1.110796),
  11.0: np.float32(10.085819),
  12.0: np.float32(-0.62353796),
  13.0: np.float32(4.6633487),
  14.0: np.float32(8.846361),
  15.0: np.float32(0.5975715),
  16.0: np.float32(-0.26195714),
  17.0: np.float32(-1.5765668),
  18.0: np.float32(3.9085178),
  19.0: np.float32(1.8463621),
  20.0: np.float32(4.9932795),
  21.0: np.float32(3.242145),
  22.0: np.float32(4.677788),
  23.0: np.float32(7.735542),
  24.0: np.float32(-2.6165671),
  25.0: np.float32(2.218349),
  26.0: np.float32(4.7191505),
  27.0: np.float32(0.3291915),
  28.0: np.float32(8.01856),
  29.0: np.float32(5.1199484),
  30.0: np.float32(5.1198044)})