In [29]:
import os
import pickle

import pandas as pd
from androguard.core.bytecodes.apk import APK
from tqdm import tqdm

In [30]:
# Root folder of the APK corpus. The default is the original local path; set the
# APK_DATASET_DIR environment variable to point the notebook at another location
# without editing this cell.
BASE_DATASET = os.environ.get(
    "APK_DATASET_DIR", r"C:/Users/Sonali/Desktop/ml/dataset"
)   # <-- YOUR PATH
# One sub-folder per class, both located directly under BASE_DATASET.
BENIGN_DIR = os.path.join(BASE_DATASET, "benign")
MALWARE_DIR = os.path.join(BASE_DATASET, "malware")

# Echo the resolved folders so a wrong path is spotted before extraction starts.
print("Benign folder:", BENIGN_DIR)
print("Malware folder:", MALWARE_DIR)

Benign folder: C:/Users/Sonali/Desktop/ml/dataset\benign
Malware folder: C:/Users/Sonali/Desktop/ml/dataset\malware


In [31]:
def extract_permissions(apk_path):
    """Return the short permission names requested by one APK.

    Args:
        apk_path: Path of the ``.apk`` file to parse.

    Returns:
        A list with one entry per permission reported by
        ``APK.get_permissions()``, each reduced to the text after its last
        ``"."``. An empty list is returned when parsing fails for any reason.

    Note:
        Only the final dotted component is kept, so permissions declared in
        different namespaces that share the same last component end up with
        the same name.
    """
    try:
        apk = APK(apk_path)
        # Keep only the trailing component: "a.b.C" -> "C".
        return [perm.rsplit(".", 1)[-1] for perm in apk.get_permissions()]
    except Exception as e:
        # Best-effort on purpose: one unreadable APK must not abort a whole
        # batch. Name the file so the failure can be traced afterwards.
        print(f"Error reading {apk_path}:", e)
        return []


In [32]:
apk_records = []

# One entry per class: (progress heading, folder to scan, numeric label).
apk_sources = [
    ("\nüì¶ Extracting from BENIGN APKs...", BENIGN_DIR, 0),     # 0 = benign
    ("\n‚ö†Ô∏è Extracting from MALWARE APKs...", MALWARE_DIR, 1),  # 1 = malware
]

for heading, folder, class_label in apk_sources:
    print(heading)
    # os.listdir() gives no ordering guarantee; sort case-insensitively so the
    # rows come out in the same order on every run and platform.
    for file in tqdm(sorted(os.listdir(folder), key=str.lower)):
        # Compare the extension case-insensitively so "APP.APK" is not skipped.
        if file.lower().endswith(".apk"):
            path = os.path.join(folder, file)
            perms = extract_permissions(path)
            apk_records.append([file, perms, class_label])


# One row per APK: file name, list of permission names, class label.
df = pd.DataFrame(apk_records, columns=["apk_name", "permissions", "label"])
print("\nSample data:")
display(df.head())



üì¶ Extracting from BENIGN APKs...


  0%|                                                                                          | 0/2 [00:00<?, ?it/s]Requested API level 33 is larger than maximum we have, returning API level 28 instead.
Requested API level 29 is larger than maximum we have, returning API level 28 instead.
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:00<00:00,  7.19it/s]



‚ö†Ô∏è Extracting from MALWARE APKs...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:00<00:00, 11.15it/s]



Sample data:


Unnamed: 0,apk_name,permissions,label
0,org.ppsspp.ppssppgold--117010000.apk,"[FULL, READ_EXTERNAL_STORAGE, WRITE_EXTERNAL_S...",0
1,PikaShow_APK_v83 [Original File] - (PikaShowHD...,"[READ_SETTINGS, WRITE_SETTINGS, BADGE_COUNT_WR...",0
2,LuckyPatchers.com_Official_Installer_11.9.6.apk,"[SYSTEM_ALERT_WINDOW, INSTALL_SHORTCUT, RECEIV...",1
3,moan_2.0_APKPure.apk,"[C2D_MESSAGE, RECEIVE, WAKE_LOCK, ACCESS_NETWO...",1


In [33]:
print("\nüîß Building permission vocabulary...")

# Union of every permission name that appears in at least one APK.
all_perms = set().union(*df["permissions"])

# Alphabetical order fixes the position of each feature column.
permission_columns = sorted(all_perms)

print("Total unique permissions:", len(permission_columns))




üîß Building permission vocabulary...
Total unique permissions: 47


In [34]:
print("\n‚öôÔ∏è Converting permissions ‚Üí ML feature rows...")

feature_rows = []

# Walk the two needed columns directly rather than building a Series per row.
for perm_list, apk_label in zip(df["permissions"], df["label"]):
    # A set makes each of the len(permission_columns) membership tests O(1).
    granted = set(perm_list)
    # 1 where the APK requests the permission, 0 otherwise, in vocabulary order.
    features = [1 if p in granted else 0 for p in permission_columns]
    features.append(apk_label)  # add label
    feature_rows.append(features)

# Binary permission matrix with the class label as the last column.
df_features = pd.DataFrame(feature_rows, columns=permission_columns + ["label"])

print("\nFeature preview:")
display(df_features.head())



‚öôÔ∏è Converting permissions ‚Üí ML feature rows...

Feature preview:


Unnamed: 0,ACCESS_FINE_LOCATION,ACCESS_NETWORK_STATE,ACCESS_NOTIFICATION_POLICY,ACCESS_SUPERUSER,ACCESS_WIFI_STATE,AD_ID,Ad_ID,BADGE_COUNT_READ,BADGE_COUNT_WRITE,BILLING,...,UPDATE_COUNT,UPDATE_SHORTCUT,USE_COMPONENT,VIBRATE,WAKE_LOCK,WRITE,WRITE_EXTERNAL_STORAGE,WRITE_MEDIA_STORAGE,WRITE_SETTINGS,label
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,1,1,1,0,1,1,0,1,1,0,...,1,1,0,1,1,1,1,0,1,0
2,0,1,0,1,0,1,1,0,0,1,...,0,0,1,1,1,0,1,1,0,1
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1


In [35]:
# Write the feature matrix (permission columns + label) into the dataset folder,
# without the DataFrame index.
output_path = os.path.join(BASE_DATASET, "extracted_permission.csv")
df_features.to_csv(output_path, index=False)

# Message and path go on separate lines.
print("\nüéâ DONE! ML-Ready file saved at:", output_path, sep="\n")


üéâ DONE! ML-Ready file saved at:
C:/Users/Sonali/Desktop/ml/dataset\extracted_permission.csv


In [36]:
#5. Save the permission column list for prediction
# ==============================================
# The ordered column list is written next to the CSV so that later code can
# reload the exact column order used for the feature matrix.
# `pickle` is imported in the import cell at the top of the notebook.
# NOTE: pickle files can execute code when loaded; only load this file from a
# location you trust.
perm_path = os.path.join(BASE_DATASET, "permission_columns.pkl")
with open(perm_path, "wb") as f:
    pickle.dump(permission_columns, f)

print("üìù Saved permission column list at:", perm_path)

üìù Saved permission column list at: C:/Users/Sonali/Desktop/ml/dataset\permission_columns.pkl
