<h2> Filter-Based Feature Selection

<h3> Database

In [1]:
import pandas as pd
import numpy as np
from sklearn import model_selection
import matplotlib.pyplot as plt

In [2]:
ds = pd.read_csv("drebin.csv", low_memory=False)

In [3]:
ds.head()


Unnamed: 0,SEND_SMS,READ_PHONE_STATE,GET_ACCOUNTS,RECEIVE_SMS,READ_SMS,USE_CREDENTIALS,MANAGE_ACCOUNTS,WRITE_SMS,READ_SYNC_SETTINGS,AUTHENTICATE_ACCOUNTS,...,READ_CONTACTS,DEVICE_POWER,HARDWARE_TEST,ACCESS_WIFI_STATE,WRITE_EXTERNAL_STORAGE,ACCESS_FINE_LOCATION,SET_WALLPAPER_HINTS,SET_PREFERRED_APPLICATIONS,WRITE_SECURE_SETTINGS,class
0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
1,1,1,0,1,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
2,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,1,0,0,1,0,0,1,0,0,...,0,0,0,1,1,1,0,0,0,1
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,1


<h2> Training Data

In [23]:
from itertools import count

def a (permission,class_id,ds):
    ds_class = ds[ds["class"] == class_id]
    return len(ds_class[permission][ds_class[permission] == 1])

def b (permission, class_id,ds):
    ds_class = ds[ds["class"] == class_id]
    return len(ds_class[permission][ds_class[permission] != 1])

def c (permission_id,class_id,ds):
    ds_class = ds[ds["class"] != class_id]
    return len(ds_class[permission_id][ds_class[permission_id] == 1])

def d (permission_id,class_id,ds):
    ds_class = ds[ds["class"] != class_id]
    return len(ds_class[permission_id][ds_class[permission_id] != 1])

def N (ds):
    return ds.shape[0]

def n1(class_id, ds):
    ds_class = ds[ds['class'] == class_id]
    return len(ds_class)

def n2(class_id, ds):
    ds_class = ds[ds['class'] != class_id]
    return len(ds_class)

def d1(permission, ds):
    ds_permission = ds[ds[permission] == 1]
    return len(ds_permission)

def d2(permission, ds):
    ds_permission = ds[ds[permission] != 1]
    return len(ds_permission)

best_features = []

<h3> Odds ratio

In [46]:
def od(permission_id,class_id,ds):
    return (a(permission_id,class_id,ds) * d(permission_id,class_id,ds)) / (b(permission_id,class_id,ds) * c(permission_id,class_id,ds))

def get_od_list(class_id, ds):
    od_list = []
    for permission in ds.columns[:-1]:
        od_list.append([permission, od(permission, class_id, ds)])
    return sorted(od_list, key=lambda x: x[1], reverse=True)

def get_od_df(class_id, ds):
    od_list = get_od_list(class_id, ds)
    od_df = pd.DataFrame(od_list, columns=['permission', 'od'])
    od_df.rename(columns={'od': 'value'}, inplace=True)
    return od_df
    
od_df = get_od_df(1, ds)
od_df.head(10)

Unnamed: 0,permission,value
0,SEND_SMS,18.67987
1,READ_PHONE_STATE,8.673253
2,RECEIVE_SMS,8.152409
3,READ_SMS,7.2639
4,INSTALL_PACKAGES,7.099093
5,WRITE_APN_SETTINGS,6.957784
6,DELETE_CACHE_FILES,6.886442
7,WRITE_HISTORY_BOOKMARKS,6.550344
8,ACCESS_LOCATION_EXTRA_COMMANDS,6.527379
9,WRITE_SMS,5.675233


<h3> Chi-Square

In [47]:
def chi_squared(permission_id,class_id,ds):
    aa = a(permission_id,class_id,ds)
    bb = b(permission_id,class_id,ds)
    cc = c(permission_id,class_id,ds)
    dd = d(permission_id,class_id,ds)
    return (N(ds)*((aa*dd - bb*cc)**2) / ((aa+bb)*(cc+dd)*(aa+cc)*(bb+dd)))
    

def get_chi_squared_list(class_id, ds):
    chi_squared_list = []
    for permission in ds.columns[:-1]:
        chi_squared_list.append([permission, chi_squared(permission, class_id, ds)])
    return sorted(chi_squared_list, key=lambda x: x[1], reverse=True)

def get_chi_squared_df(class_id, ds):
    chi_squared_list = get_chi_squared_list(class_id, ds)
    chi_squared_df = pd.DataFrame(chi_squared_list, columns=['permission', 'chi_squared'])
    chi_squared_df.rename(columns={'chi_squared': 'value'}, inplace=True)
    return chi_squared_df

chi_squared_df = get_chi_squared_df(1, ds)
chi_squared_df.head(10)

Unnamed: 0,permission,value
0,SEND_SMS,4476.733167
1,READ_PHONE_STATE,2509.602423
2,RECEIVE_SMS,2263.520719
3,READ_SMS,2058.523558
4,GET_ACCOUNTS,2021.06777
5,WRITE_SMS,1074.021107
6,MANAGE_ACCOUNTS,989.819878
7,USE_CREDENTIALS,981.587411
8,WRITE_HISTORY_BOOKMARKS,880.938484
9,INSTALL_PACKAGES,833.6957


<h3> Inverse Document Frequency

In [49]:
def idf(permission_id,class_id,ds):
    return np.log(N(ds) / (a(permission_id,class_id,ds) + c(permission_id,class_id,ds)))

def get_idf_list(class_id, ds):
    idf_list = []
    for permission in ds.columns[:-1]:
        idf_list.append([permission, idf(permission, class_id, ds)])
    return sorted(idf_list, key=lambda x: x[1], reverse=True)

def get_idf_df(class_id, ds):
    idf_list = get_idf_list(class_id, ds)
    idf_df = pd.DataFrame(idf_list, columns=['permission', 'idf'])
    idf_df.rename(columns={'idf': 'value'}, inplace=True)
    return idf_df

idf_df = get_idf_df(1, ds)
idf_df.head(10)

Unnamed: 0,permission,value
0,BIND_VPN_SERVICE,7.538761
1,SET_ACTIVITY_WATCHER,6.845614
2,BIND_TEXT_SERVICE,6.482708
3,ADD_VOICEMAIL,6.217005
4,INSTALL_LOCATION_PROVIDER,6.184215
5,SET_PROCESS_LIMIT,6.121695
6,MOUNT_FORMAT_FILESYSTEMS,5.929323
7,BIND_ACCESSIBILITY_SERVICE,5.880533
8,ACCESS_SURFACE_FLINGER,5.857002
9,CLEAR_APP_USER_DATA,5.834013


<h3> Document Frequency Threshold

In [50]:
def dft(permission_id,class_id,ds):
    return a(permission_id,class_id,ds) + c(permission_id,class_id,ds)

def get_dft_list(class_id, ds):
    dft_list = []
    for permission in ds.columns[:-1]:
        dft_list.append([permission, dft(permission, class_id, ds)])
    return sorted(dft_list, key=lambda x: x[1], reverse=True)

def get_dft_df(class_id, ds):
    dft_list = get_dft_list(class_id, ds)
    dft_df = pd.DataFrame(dft_list, columns=['permission', 'dft'])
    dft_df.rename(columns={'dft': 'value'}, inplace=True)
    return dft_df

dft_df = get_dft_df(1, ds)
dft_df.head(10)

Unnamed: 0,permission,value
0,INTERNET,13111
1,ACCESS_NETWORK_STATE,10889
2,WRITE_EXTERNAL_STORAGE,10016
3,READ_PHONE_STATE,9509
4,WAKE_LOCK,7326
5,ACCESS_WIFI_STATE,6532
6,RECEIVE_BOOT_COMPLETED,5779
7,VIBRATE,5453
8,GET_ACCOUNTS,4492
9,ACCESS_FINE_LOCATION,4373


<h3> Acc and Acc2

In [51]:
def acc(permission_id,class_id,ds):
    return a(permission_id,class_id,ds) - c(permission_id,class_id,ds)

def acc2(permission_id,class_id,ds):
    return (a(permission_id,class_id,ds)/n1(class_id,ds)) - (c(permission_id,class_id,ds)/n2(class_id,ds))

def get_acc_list(class_id, ds):
    acc_list = []
    for permission in ds.columns[:-1]:
        acc_list.append([permission, acc(permission, class_id, ds)])
    return sorted(acc_list, key=lambda x: x[1], reverse=True)

def get_acc_df(class_id, ds):
    acc_list = get_acc_list(class_id, ds)
    acc_df = pd.DataFrame(acc_list, columns=['permission', 'acc'])
    acc_df.rename(columns={'acc': 'value'}, inplace=True)
    return acc_df

acc_df = get_acc_df(1, ds)
acc_df.head(10)

def get_acc2_list(class_id, ds):
    acc2_list = []
    for permission in ds.columns[:-1]:
        acc2_list.append([permission, acc2(permission, class_id, ds)])
    return sorted(acc2_list, key=lambda x: x[1], reverse=True)

def get_acc2_df(class_id, ds):
    acc2_list = get_acc2_list(class_id, ds)
    acc2_df = pd.DataFrame(acc2_list, columns=['permission', 'acc2'])
    acc2_df.rename(columns={'acc2': 'value'}, inplace=True)
    return acc2_df

acc2_df = get_acc2_df(1, ds)
acc2_df.head(10)

Unnamed: 0,permission,value
0,SEND_SMS,0.480397
1,READ_PHONE_STATE,0.408036
2,RECEIVE_SMS,0.31348
3,READ_SMS,0.298702
4,WRITE_SMS,0.175408
5,RECEIVE_BOOT_COMPLETED,0.160685
6,INTERNET,0.14036
7,READ_HISTORY_BOOKMARKS,0.138817
8,WRITE_HISTORY_BOOKMARKS,0.135516
9,INSTALL_PACKAGES,0.124426


<h3> M2 Method

In [52]:
def m2(permission_id,class_id,ds):
    return dft(permission_id,class_id,ds) * (np.absolute(a(permission_id,class_id,ds)/d1(permission_id,ds) - (b(permission_id,class_id,ds)/d2(permission_id,ds))))

def get_m2_list(class_id, ds):
    m2_list = []
    for permission in ds.columns[:-1]:
        m2_list.append([permission, m2(permission, class_id, ds)])
    return sorted(m2_list, key=lambda x: x[1], reverse=True)

def get_m2_df(class_id, ds):
    m2_list = get_m2_list(class_id, ds)
    m2_df = pd.DataFrame(m2_list, columns=['permission', 'm2'])
    m2_df.rename(columns={'m2': 'value'}, inplace=True)
    return m2_df

m2_df = get_m2_df(1, ds)
m2_df.head(10)

Unnamed: 0,permission,value
0,READ_PHONE_STATE,3889.635607
1,INTERNET,3841.6
2,SEND_SMS,2205.130162
3,GET_ACCOUNTS,1736.966237
4,RECEIVE_SMS,1351.347079
5,READ_SMS,1287.011776
6,ACCESS_NETWORK_STATE,1256.416687
7,WAKE_LOCK,1127.217121
8,RECEIVE_BOOT_COMPLETED,914.546829
9,WRITE_SMS,693.090145


<h3> Relevance frequency feature selection

In [53]:
def rffs(permission_id,class_id,ds):
    return dft(permission_id,class_id,ds) * np.absolute(np.log(2 + (a(permission_id,class_id,ds)/c(permission_id,class_id,ds))))

def get_rffs_list(class_id, ds):
    rffs_list = []
    for permission in ds.columns[:-1]:
        rffs_list.append([permission, rffs(permission, class_id, ds)])
    return sorted(rffs_list, key=lambda x: x[1], reverse=True)

def get_rffs_df(class_id, ds):
    rffs_list = get_rffs_list(class_id, ds)
    rffs_df = pd.DataFrame(rffs_list, columns=['permission', 'rffs'])
    rffs_df.rename(columns={'rffs': 'value'}, inplace=True)
    return rffs_df

rffs_df = get_rffs_df(1, ds)
rffs_df.head(10)

Unnamed: 0,permission,value
0,INTERNET,12960.075184
1,READ_PHONE_STATE,10709.099563
2,ACCESS_NETWORK_STATE,10023.010783
3,WRITE_EXTERNAL_STORAGE,9541.940053
4,SEND_SMS,7104.367292
5,WAKE_LOCK,6444.75993
6,ACCESS_WIFI_STATE,6224.220963
7,RECEIVE_BOOT_COMPLETED,6106.762553
8,VIBRATE,4836.028654
9,RECEIVE_SMS,4622.438909


<h3> Information Gain

In [33]:
def pci(class_id,ds):
    return len(ds[ds["class"] == class_id])/len(ds)

def pp(permission_id,ds):
    return len(ds[ds[permission_id] == 1])/len(ds)

def pcip(permission_id,class_id,ds,is_used=True):
    ds_class = ds[ds["class"] == class_id]
    if is_used:
        return len(ds_class[ds_class[permission_id] == 1])/len(ds_class)
    else:
        return len(ds_class[ds_class[permission_id] == 0])/len(ds_class)

In [55]:
def ig(permission_id,ds):
    class_distribution = ds["class"].value_counts()
    a = 0.0
    b = 0.0
    c = 0.0
    for i in class_distribution.index:
        f_value = pci(i,ds)
        if f_value > 0.0:
            a += f_value * np.log2(f_value)
        f_value = pcip(permission_id,i,ds)
        if f_value > 0.0:
            b += f_value * np.log2(f_value)
        f_value = pcip(permission_id,i,ds,False)
        if f_value > 0.0:
            c += f_value * np.log2(f_value)
    b *= pp(permission_id,ds)
    c *= (1.0 - pp(permission_id,ds))
    return -1.0 * (a + b + c)

def get_ig_list(class_id, ds):
    ig_list = []
    for permission in ds.columns[:-1]:
        ig_list.append([permission, ig(permission, ds)])
    return sorted(ig_list, key=lambda x: x[1], reverse=True)

def get_ig_df(class_id, ds):
    ig_list = get_ig_list(class_id, ds)
    ig_df = pd.DataFrame(ig_list, columns=['permission', 'ig'])
    ig_df.rename(columns={'ig': 'value'}, inplace=True)
    return ig_df

ig_df = get_ig_df(1, ds)
ig_df.head(10)

Unnamed: 0,permission,value
0,ACCESS_WIFI_STATE,1.930803
1,WAKE_LOCK,1.926565
2,RECEIVE_BOOT_COMPLETED,1.886873
3,VIBRATE,1.839433
4,WRITE_EXTERNAL_STORAGE,1.822385
5,ACCESS_FINE_LOCATION,1.753622
6,ACCESS_COARSE_LOCATION,1.745439
7,ACCESS_NETWORK_STATE,1.735259
8,READ_PHONE_STATE,1.676185
9,READ_CONTACTS,1.629811


In [61]:
def best_features(class_id, ds):
    best_features_set = set()
    best_features_set.update([x[0] for x in get_acc_list(class_id, ds)[:10]])
    best_features_set.update([x[0] for x in get_acc2_list(class_id, ds)[:10]])
    best_features_set.update([x[0] for x in get_m2_list(class_id, ds)[:10]])
    best_features_set.update([x[0] for x in get_rffs_list(class_id, ds)[:10]])
    best_features_set.update([x[0] for x in get_ig_list(class_id, ds)[:10]])
    best_features_set.update([x[0] for x in get_od_list(class_id, ds)[:10]])
    best_features_set.update([x[0] for x in get_chi_squared_list(class_id, ds)[:10]])
    return best_features_set

best_features_set = best_features(1, ds)

TypeError: 'list' object is not callable