In [17]:
import pandas as pd
import requests
from mitrecve import crawler
import random
from pathlib import Path
import threading
from tqdm import tqdm
tqdm.pandas()

In [18]:
# Locate the labelled dataset: ../data/label_data.csv relative to the working directory
data_path = (Path.cwd().parent / 'data' / 'label_data.csv').as_posix()
labelled_df = pd.read_csv(data_path, index_col=0)

# Rows whose Label equals 1 are the malicious packages
is_malicious = labelled_df['Label'] == 1
malicious_df = labelled_df.loc[is_malicious]
# Independent copy restricted to the columns needed for the description lookup
mal_df = malicious_df.loc[:, ['Name', 'Version', 'Label']].copy()

In [19]:
# match mitre for CVE Descriptions
def mitre_cve_api(package, timeout=300):
    """Fetch MITRE CVE details for ``package``, giving up after ``timeout`` seconds.

    The crawler calls run in a worker thread so the wait can be bounded with
    ``Thread.join(timeout)``.

    Args:
        package: Search term handed to ``crawler.get_main_page``.
        timeout: Maximum number of seconds to wait for the worker thread.

    Returns:
        The value produced by ``crawler.get_cve_detail``, or ``{}`` if the
        worker finished without storing either a result or an error.

    Raises:
        TimeoutError: The worker was still running after ``timeout`` seconds.
        Exception: Any exception raised inside the crawler calls is re-raised
            unchanged in the calling thread.
    """
    outcome = {}

    def worker():
        try:
            cve_simple = crawler.get_main_page(package)
            outcome['data'] = crawler.get_cve_detail(cve_simple)
        except Exception as exc:
            # Hand the failure back to the caller's thread instead of losing it.
            outcome['error'] = exc

    # A timed-out lookup cannot be cancelled, so the worker is simply abandoned.
    # Running it as a daemon keeps such stragglers from blocking interpreter /
    # kernel shutdown; the name makes them identifiable while debugging.
    thread = threading.Thread(
        target=worker,
        name=f"mitre-cve-{package}",
        daemon=True,
    )
    thread.start()
    thread.join(timeout)

    if thread.is_alive():
        raise TimeoutError(f"Timeout querying {package} after {timeout} seconds.")
    if 'error' in outcome:
        raise outcome['error']
    return outcome.get('data', {})


def extract_random_desc(package_name, timeout=300):
    """Return one randomly chosen CVE description for ``package_name``.

    Queries ``mitre_cve_api`` and gathers the ``'DESC'`` value of every entry
    that has that key. Best-effort: any exception raised during the lookup is
    printed as a warning and an empty string is returned instead.

    Args:
        package_name: Name forwarded to ``mitre_cve_api``.
        timeout: Timeout in seconds forwarded to ``mitre_cve_api``.

    Returns:
        A description picked with ``random.choice``, or ``''`` when no entry
        carries a ``'DESC'`` key or the lookup failed.
    """
    try:
        cve_details = mitre_cve_api(package_name, timeout=timeout)

        candidates = []
        for record in cve_details.values():
            if 'DESC' not in record:
                continue
            candidates.append(record.get('DESC', ''))

        if not candidates:
            return ''
        return random.choice(candidates)
    except Exception as e:
        print(f"⚠️ Error querying {package_name}: {e}")
        return ''

In [21]:
# Attach one randomly picked CVE description per package name (tqdm progress bar);
# each lookup is capped at 300 seconds and failed lookups yield an empty string.
package_names = mal_df['Name']
mal_df['Desc'] = package_names.progress_apply(
    lambda pkg: extract_random_desc(pkg, timeout=300)
)


 12%|█▏        | 279/2259 [08:34<49:45:37, 90.47s/it]

⚠️ Error querying @primeo/shell: Timeout querying @primeo/shell after 300 seconds.


 16%|█▌        | 358/2259 [12:12<26:50:29, 50.83s/it]

⚠️ Error querying @m365-admin/utilities: list index out of range


 23%|██▎       | 527/2259 [18:54<43:29:59, 90.42s/it]

⚠️ Error querying @banana-cake-pop/data: Timeout querying @banana-cake-pop/data after 300 seconds.


 27%|██▋       | 618/2259 [20:37<5:43:49, 12.57s/it] 

⚠️ Error querying @m365-admin/nav: list index out of range


 34%|███▍      | 769/2259 [27:08<37:25:04, 90.41s/it]

⚠️ Error querying @zola-helpers/client: Timeout querying @zola-helpers/client after 300 seconds.


 42%|████▏     | 946/2259 [34:07<32:59:10, 90.44s/it]

⚠️ Error querying @primeo/common: Timeout querying @primeo/common after 300 seconds.


 42%|████▏     | 950/2259 [39:09<40:43:16, 111.99s/it]

⚠️ Error querying @primeo/address: Timeout querying @primeo/address after 300 seconds.


 51%|█████     | 1144/2259 [47:48<28:04:25, 90.64s/it]

⚠️ Error querying @primeo/authentication: Timeout querying @primeo/authentication after 300 seconds.


 63%|██████▎   | 1413/2259 [57:38<21:14:53, 90.42s/it]

⚠️ Error querying @epic-mod-market/ui: Timeout querying @epic-mod-market/ui after 300 seconds.


 66%|██████▋   | 1498/2259 [1:03:41<19:06:51, 90.42s/it]

⚠️ Error querying @snapp/framework: Timeout querying @snapp/framework after 300 seconds.


 68%|██████▊   | 1535/2259 [1:09:04<18:11:10, 90.43s/it]

⚠️ Error querying asset-symlink: Timeout querying asset-symlink after 300 seconds.


 71%|███████   | 1594/2259 [1:14:47<16:42:16, 90.43s/it]

⚠️ Error querying @primeo/platform: Timeout querying @primeo/platform after 300 seconds.


 78%|███████▊  | 1766/2259 [1:22:46<12:23:01, 90.43s/it]

⚠️ Error querying @primeo/analytics: Timeout querying @primeo/analytics after 300 seconds.


 80%|███████▉  | 1796/2259 [1:28:08<11:38:27, 90.51s/it]

⚠️ Error querying @seller-ui/settings: Timeout querying @seller-ui/settings after 300 seconds.


 81%|████████▏ | 1837/2259 [1:33:34<10:36:04, 90.44s/it]

⚠️ Error querying @pagetour/sdk: Timeout querying @pagetour/sdk after 300 seconds.


 84%|████████▍ | 1896/2259 [1:39:09<9:07:06, 90.43s/it] 

⚠️ Error querying @slashkit/core: Timeout querying @slashkit/core after 300 seconds.


 84%|████████▍ | 1902/2259 [1:44:12<10:00:57, 101.00s/it]

⚠️ Error querying @clearing/models: Timeout querying @clearing/models after 300 seconds.


 90%|█████████ | 2039/2259 [1:50:43<5:31:39, 90.45s/it]  

⚠️ Error querying @gpsu/common: Timeout querying @gpsu/common after 300 seconds.


 91%|█████████ | 2054/2259 [1:55:53<5:10:37, 90.91s/it]

⚠️ Error querying @banana-cake-pop/data: Timeout querying @banana-cake-pop/data after 300 seconds.


 97%|█████████▋| 2199/2259 [2:03:35<1:30:33, 90.55s/it]

⚠️ Error querying @bitkub-moonshot/common: Timeout querying @bitkub-moonshot/common after 300 seconds.


100%|█████████▉| 2248/2259 [2:09:08<16:34, 90.42s/it]  

⚠️ Error querying @rustore-web/config: Timeout querying @rustore-web/config after 300 seconds.


100%|██████████| 2259/2259 [2:09:15<00:00,  3.43s/it]


In [22]:
# Write the enriched table to the working directory, leaving the index out of the file
output_csv = 'mal_packages_with_desc.csv'
mal_df.to_csv(output_csv, index=False)