In [1]:
import pandas as pd
import os
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score



In [2]:
# Dataset and model directories. Each defaults to the original local path but
# can be overridden through an environment variable (APK_DATASET_DIR /
# APK_MODEL_DIR), so the notebook can run on another machine without editing
# this cell.
BASE_DATASET = os.environ.get(
    "APK_DATASET_DIR", r"C:\Users\SHAIKH\OneDrive\Desktop\ML\ML\dataset"
)
MODEL_DIR = os.environ.get(
    "APK_MODEL_DIR", r"C:\Users\SHAIKH\OneDrive\Desktop\ML\ML\models"
)

# CSV file with the extracted permission features, inside the dataset directory.
FEATURES_PATH = os.path.join(BASE_DATASET, "extracted_permission.csv")

# Create the model directory up front; exist_ok=True makes re-running this
# cell safe when the directory already exists.
os.makedirs(MODEL_DIR, exist_ok=True)

In [3]:
# Read the feature CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=FEATURES_PATH)

# Print the column index so the loaded columns can be checked by eye.
print("Columns found:", df.columns)

Columns found: Index(['ACCESS_ADSERVICES_AD_ID', 'ACCESS_ADSERVICES_ATTRIBUTION',
       'ACCESS_COARSE_LOCATION', 'ACCESS_DOWNLOAD_MANAGER',
       'ACCESS_DOWNLOAD_MANAGER_ADVANCED', 'ACCESS_FINE_LOCATION',
       'ACCESS_INSTANT_APPS', 'ACCESS_MEDIA_LOCATION', 'ACCESS_NETWORK_STATE',
       'ACCESS_NOTIFICATION_POLICY',
       ...
       'WRITE_CONTACTS', 'WRITE_DEVICE_CONFIG', 'WRITE_EXTERNAL_STORAGE',
       'WRITE_GSERVICES', 'WRITE_MEDIA_STORAGE', 'WRITE_SECURE_SETTINGS',
       'WRITE_SETTINGS', 'WRITE_SYNC_SETTINGS', 'WRITE_VERIFY_APPS_CONSENT',
       'label'],
      dtype='object', length=185)


In [4]:
# Target: the "label" column on its own.
y = df.loc[:, "label"]

# Features: every column that remains once "label" is dropped.
X = df.drop(columns=["label"])

In [5]:
# Hold out half of the rows for testing (test_size=0.5). stratify=y splits
# with the labels taken into account, and the fixed random_state makes the
# split repeatable.
split_options = {"test_size": 0.5, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [6]:
# Fit a random forest with 100 trees on the training split; the fixed
# random_state keeps repeated runs reproducible.
clf = RandomForestClassifier(random_state=42, n_estimators=100)
clf.fit(X_train, y_train)
print("Model trained successfully!")

# Pickle the fitted classifier into the model directory.
model_path = os.path.join(MODEL_DIR, "apk_classifier.pkl")
with open(model_path, "wb") as model_file:
    pickle.dump(clf, model_file)

print("Model saved at:", model_path)

Model trained successfully!
Model saved at: C:\Users\SHAIKH\OneDrive\Desktop\ML\ML\models\apk_classifier.pkl


In [7]:
# Predict labels for the held-out split.
y_pred = clf.predict(X_test)

# Compute each metric from the true and predicted labels.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

# Print the metrics: accuracy, then the report, then the confusion matrix.
print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)
print("\nConfusion Matrix:\n", matrix)


Accuracy: 0.75

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       0.67      1.00      0.80         2

    accuracy                           0.75         4
   macro avg       0.83      0.75      0.73         4
weighted avg       0.83      0.75      0.73         4


Confusion Matrix:
 [[1 1]
 [0 2]]
