# 1. Install packages

In [1]:
%%capture
!pip install catboost
!pip install skimpy

# 2.Load libraries

In [2]:
# Data manipulation
#=============================================================================
import pandas as pd
pd.reset_option("display.max_columns", None)
import numpy as np

# Data visualization
#=============================================================================
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
sns.set_style('darkgrid')
from skimpy import skim

# Data preprocessing
#=============================================================================
from sklearn.model_selection import train_test_split as tts


# Models
#=============================================================================
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Metrics
#=============================================================================
from sklearn.metrics import (balanced_accuracy_score, confusion_matrix, 
                             classification_report, precision_recall_curve, 
                             roc_curve, roc_auc_score)

# tqdm
#=============================================================================
from tqdm.notebook import tqdm

# Counter
#=============================================================================
from collections import Counter

# warnings
#=============================================================================
import warnings
warnings.filterwarnings('ignore')



# 3.Load Data

In [3]:
data = pd.read_csv("/kaggle/input/android-malware-detection-dataset/Android_Malware_Benign.csv")



In [4]:
data.head()

Unnamed: 0,ACCESS_ALL_DOWNLOADS,ACCESS_CACHE_FILESYSTEM,ACCESS_CHECKIN_PROPERTIES,ACCESS_COARSE_LOCATION,ACCESS_COARSE_UPDATES,ACCESS_FINE_LOCATION,ACCESS_LOCATION_EXTRA_COMMANDS,ACCESS_MOCK_LOCATION,ACCESS_MTK_MMHW,ACCESS_NETWORK_STATE,...,com.android.launcher.permission.UNINSTALL_SHORTCUT,com.sec.android.iap.permission.BILLING,com.htc.launcher.permission.UPDATE_SHORTCUT,com.sec.android.provider.badge.permission.WRITE,android.permission.ACCESS_NETWORK_STATE,com.google.android.finsky.permission.BIND_GET_INSTALL_REFERRER_SERVICE,com.huawei.android.launcher.permission.READ_SETTINGS,android.permission.READ_SMS,android.permission.PROCESS_INCOMING_CALLS,Label
0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,Malware
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,1,0,Malware
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,1,0,Malware
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Malware
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,Malware


# 4. EDA

In [5]:
skim(data)

# 5.Data Preprocessing

In [6]:
TARGET = 'Label'
SEED = 42
TEST_SIZE = 0.3  

X = data.drop(columns = [TARGET])
y = data[TARGET]

X_train, X_test, y_train, y_test = tts( X, 
                                        y, 
                                        test_size = TEST_SIZE, 
                                        random_state = SEED, 
                                        stratify = y)

In [7]:
print(f"y_train: {Counter(y_train)}")
print(f"y_test: {Counter(y_test)}")

y_train: Counter({'Malware': 1773, 'Benign': 1351})
y_test: Counter({'Malware': 760, 'Benign': 580})


In [8]:
from sklearn.preprocessing import LabelEncoder


In [9]:
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

# 6.Models

In [10]:
from sklearn.ensemble import ExtraTreesClassifier

clf1 = LogisticRegression(random_state=SEED, 
                          class_weight='balanced', n_jobs=-1)

clf2 = SVC(class_weight='balanced',
             random_state=SEED)

clf3 = KNeighborsClassifier(n_jobs=-1)

clf4 = RandomForestClassifier(random_state=SEED,
                              class_weight='balanced', n_jobs=-1)

clf5 = ExtraTreesClassifier(bootstrap=True, 
                            class_weight='balanced', random_state=SEED, n_jobs=-1)

clf6 = GradientBoostingClassifier(random_state=SEED)

clf7 = AdaBoostClassifier(random_state=SEED)

clf8 = XGBClassifier(random_state=SEED, n_jobs=-1)

clf9 = LGBMClassifier(random_state=SEED,
                      class_weight='balanced', n_jobs=-1, verbosity=-1)

clf10 = CatBoostClassifier(random_state=SEED, 
                           auto_class_weights='Balanced', verbose=0)

MODELS = [clf1, clf2, clf3, clf4, clf5, clf6, clf7, clf8, clf9, clf10]



NameError: name 'RandomForestClassifier' is not defined

In [None]:
for model in tqdm(MODELS):
    name = type(model).__name__
    model.fit(X_train.to_numpy(dtype = np.float32), y_train)
    
    y_pred_train = model.predict(X_train.to_numpy(dtype = np.float32))
    y_pred_test = model.predict(X_test.to_numpy(dtype = np.float32))
    
    score_train = balanced_accuracy_score(y_train, y_pred_train)
    score_test = balanced_accuracy_score(y_test, y_pred_test)
    
    print("==" * 30)
    print(f"\033[1;33m {name} \033[0;m :\n") 
    print(f' Accuracy Train: {score_train:.4f} |', 
          f'Accuracy Test: {score_test:.4f}\n')
    print("==" * 30)

The model that generalized best is **GradientBoostingClassifier**, so we'll compute some additional metrics and visualizations.

# 7.Metrics

In [None]:
# Predictions !!!
y_pred_train_final = clf6.predict(X_train.to_numpy(dtype = np.float32))
y_pred_test_final = clf6.predict(X_test.to_numpy(dtype = np.float32))

y_pred_prob_train = clf6.predict_proba(X_train.to_numpy(dtype = np.float32))[:,1]
y_pred_prob_test = clf6.predict_proba(X_test.to_numpy(dtype = np.float32))[:,1]

In [None]:
print("##" * 40)
print(" " * 25, "Classification Report Train")
print("##" * 40)
print(classification_report(y_train, y_pred_train_final, target_names = le.classes_))
print("")

print("##" * 40)
print(" " * 25, "Classification Report Test")
print("##" * 40)
print(classification_report(y_test, y_pred_test_final, target_names = le.classes_))

*** Confusion Matrix**

In [None]:
cf_mx_train = confusion_matrix(y_train, y_pred_train_final)
cf_mx_test = confusion_matrix(y_test, y_pred_test_final)

fig,axs = plt.subplots(nrows = 1, ncols = 2, figsize = (9,4))
axs = axs.flat

sns.heatmap(cf_mx_train, cmap = 'Reds', annot = True, annot_kws = {'fontsize':11, 'fontweight':'bold'}, linewidths = 1.5, fmt = '', xticklabels = le.classes_, yticklabels = le.classes_, cbar = False, square = True, ax = axs[0])
sns.heatmap(cf_mx_test, cmap = 'Blues', annot = True, annot_kws = {'fontsize':11, 'fontweight':'bold'}, linewidths = 1.5, fmt = '', xticklabels = le.classes_, yticklabels = le.classes_, cbar = False, square = True, ax = axs[1])
axs[0].set_xlabel('Predicted', fontsize = 12, fontweight = "bold", color = "black")
axs[1].set_xlabel('Predicted', fontsize = 12, fontweight = "bold", color = "black")
axs[0].set_ylabel('True', fontsize = 12, fontweight = "bold", color = "black")
axs[1].set_ylabel('True', fontsize = 12, fontweight = "bold", color = "black")
axs[0].set_title('Confusion Matrix Train', fontsize = 14, fontweight = "bold", color = "black")
axs[1].set_title('Confusion Matrix Test', fontsize = 14, fontweight = "bold", color = "black")

fig.tight_layout()
fig.show()

*** Precision Recall Curve**

In [None]:
precision_train, recall_train, _ = precision_recall_curve(y_train, 
                                                          y_pred_prob_train)
precision_test, recall_test, _ = precision_recall_curve(y_train, 
                                                        y_pred_prob_train)

plt.figure(figsize = (4,3.5))
plt.plot(recall_train, precision_train, linestyle = '--', label = 'Train')
plt.plot(recall_test, precision_test, label = 'Test')
plt.xlabel("Recall", fontsize = 11, fontweight = 'bold', color = 'black')
plt.ylabel("Precision", fontsize = 11, fontweight = 'bold', color = 'black')
plt.title("Precision Recall Curve", fontsize = 14, fontweight = 'bold', color = 'black')
plt.legend()
plt.show()

*** ROC Curve**

In [None]:

fpr_train, tpr_train, _ = roc_curve(y_train, 
                                    y_pred_prob_train,)

fpr_test, tpr_test, _ = roc_curve(y_train, 
                                  y_pred_prob_train)

plt.figure(figsize = (4,3.5))
plt.plot(fpr_train, tpr_train, linestyle = '--', label = f'Train AUC: {roc_auc_score(y_train, y_pred_prob_train):.4f}')
plt.plot(fpr_test, tpr_test, label = f'Test AUC: {roc_auc_score(y_test, y_pred_prob_test):.4f}')
plt.xlabel("FPR", fontsize = 11, fontweight = 'bold', color = 'black')
plt.ylabel("TPR", fontsize = 11, fontweight = 'bold', color = 'black')
plt.title("ROC Curve", fontsize = 14, fontweight = 'bold', color = 'black')
plt.plot([0,1], linestyle = '--', color = 'black')
plt.legend()
plt.show()