In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# Number of packages per obfuscation technique (values are hard-coded here).
obfuscation_count = {
    'basexx': 40,
    'hex': 6,
    'bytearray': 1,
    'data_reordering': 2,
    'xor': 0,
    'api_obfuscation': 1,
}

# Table 6: one row per technique, with the columns "Obfuscation" and "Count",
# in the insertion order of the dict above.
pd.DataFrame(
    list(obfuscation_count.items()),
    columns=['Obfuscation', 'Count'],
).to_csv("../../claims/table6_obfuscation_live2.csv", index=False)

In [3]:
def analysis_malware_campaigns(data_path="../data/malware_live2_info.csv", claims_dir="../../claims"):
    """Summarise the malware package dataset and export two claim tables.

    Reads the package CSV, prints the total number of packages, the number of
    distinct campaigns, a per-category breakdown (campaigns and packages) and
    the distribution of malicious code locations, then writes two CSV files
    into ``claims_dir``:

    * ``table7_campaigns_live2.csv`` - columns Campaign, Count, Num Packages
    * ``table8_malicious_code_locations_live2.csv`` - columns
      Malicious Code Location, Num Packages

    Parameters
    ----------
    data_path : str or path-like, optional
        CSV file to analyse. The columns ``campaign_id``, ``category`` and
        ``malicious_code_location`` are read from it.
    claims_dir : str or path-like, optional
        Directory that receives the two tables. It is not created here.

    Returns
    -------
    None
        Results are printed and written to disk.
    """
    claims_dir = Path(claims_dir)
    packages = pd.read_csv(data_path)

    total_packages = len(packages)
    print(f"Total number of packages: {total_packages}")

    def _share(count):
        # A header-only CSV has zero packages; report 0% instead of raising
        # ZeroDivisionError.
        return count / total_packages * 100 if total_packages else 0.0

    # Campaign analysis. dropna=False keeps a missing campaign_id counted as
    # its own value, exactly like len(Series.unique()) does.
    num_campaigns = packages['campaign_id'].nunique(dropna=False)
    print(f"\nNumber of campaigns: {num_campaigns}")

    campaign_rows = []
    for category in ('stealer', 'poc', 'dropper', 'trojan'):
        # One entry per campaign of this category; the value is the number of
        # packages belonging to that campaign.
        packages_per_campaign = (
            packages[packages['category'] == category]
            .groupby('campaign_id')
            .size()
        )
        n_campaigns = len(packages_per_campaign)
        n_packages = int(packages_per_campaign.sum())
        print(
            f"Number of {category} campaigns: {n_campaigns}, "
            f"total packages: {n_packages} ({_share(n_packages):.2f}%)"
        )
        campaign_rows.append((category, n_campaigns, n_packages))

    # Table with columns: category, number of campaigns, total packages.
    pd.DataFrame(
        campaign_rows,
        columns=['Campaign', 'Count', 'Num Packages'],
    ).to_csv(claims_dir / "table7_campaigns_live2.csv", index=False)

    # Distribution of malicious code location. Only these three labels are
    # counted; rows with any other value are not reported.
    print("\nMalicious code locations:")
    location_rows = []
    for location in ('other', 'setup.py', '__init__.py'):
        n_packages = int((packages['malicious_code_location'] == location).sum())
        print(f"  {location}: {n_packages} ({_share(n_packages):.1f}%)")
        location_rows.append((location, n_packages))

    # Table with columns: location, number of packages.
    pd.DataFrame(
        location_rows,
        columns=['Malicious Code Location', 'Num Packages'],
    ).to_csv(claims_dir / "table8_malicious_code_locations_live2.csv", index=False)

In [4]:
# Run the analysis: prints the package, campaign and code-location summary and
# writes the table7 / table8 CSV files.
analysis_malware_campaigns()

Total number of packages: 92

Number of campaigns: 22
Number of stealer campaigns: 14, total packages: 66 (71.74%)
Number of poc campaigns: 3, total packages: 13 (14.13%)
Number of dropper campaigns: 3, total packages: 8 (8.70%)
Number of trojan campaigns: 2, total packages: 5 (5.43%)

Malicious code locations:
  other: 38 (41.3%)
  setup.py: 35 (38.0%)
  __init__.py: 19 (20.7%)
