# 老化数据集使用说明
## 1 数据集介绍
我们遵循 Drebin 方法，通过反编译 APK 文件提取了 11 种类型的特征（具体类型名见下表），并为每一个 APK 文件生成了特征向量，以稀疏矩阵的形式存储在当前文件夹下的 .pkl 文件中：
* data-2012_normal.pkl: 包括2012年中所有apk的特征向量
* data-2013-1_normal.pkl: 包括2013年1月所有apk的特征向量
* data-2013-2_normal.pkl: 包括2013年2月所有apk的特征向量（2013年其余月份的文件按 data-2013-<月份>_normal.pkl 的格式依此类推）

## 2 简单实例
下面介绍如何使用该数据集训练一个最简单的线性支持向量机分类器，并在测试集上验证模型老化现象（训练集选取2012年的apk，测试集选取2013年各月的apk）。

In [11]:
import pickle as pkl
import os
import json
import pandas as pd
import numpy as np
import pickle as pkl
import scipy.sparse as sp
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

  from numpy.core.umath_tests import inner1d


In [2]:
pwd

'D:\\Documents\\jupyter_notebook\\drebin_data2'

In [5]:
# Load the 2012 training split. The pickle is expected to sit in the
# current working directory.
# NOTE(review): pickle.load executes arbitrary code from the file; only
# open dataset files obtained from a trusted source.
with open("data-2012_normal.pkl", mode='rb') as f:
    train_set = pkl.load(f)
# Bare expression so the notebook echoes the loaded object.
train_set

{'sha256s': ['19780be5261c3a22235909aa8ba3cc013bb3ecec7bd801f06f55468a17c0af8d',
  '0cbd23c1f4886c186605d6098870c5c8ab922c900dff7f3c8377bfb2dc77dae8',
  '4ab493330e468fba3add9640797e12cce12a911fcb3acbff8d0ba9973b215ee6',
  'c0dec53548d5e1d774ff0b267d8f4cf76d503e9e1f9e2e71cee94dbfe50dc4dc',
  'e14d57bb5e7679e908c86007eafd0398782f9aff34fa9590f961e557e66bf5b0',
  '44bce0c3a767aff0f28457fd074f281ef4e30760eb68081ec2bb17d68f71ba3b',
  'c3d8c04049e61c1b7834e8fc64059b8091e1245aa4c78cfa4a7a5a33b9c92670',
  '91bd2bbd31f5bbafd36a45b3a77ce5318003c6d368c2f45129091ca4744f44cb',
  '0ff8756fe9d8db5ac106cf816c050b04175be6983045a543c8d141c8500ba5b1',
  '21c9232f8d343f1f3b44519db73cc1a0b33031a664840bc01fc393b720aa0fd0',
  '3f3af60ad19d7ee8ae33824e3d7fd829b3a31d51d5007b0b2ee24d9f9fb46732',
  '1c917ef1b01c17d418be72a00df3863f833fe0b68cbf577b1d4ab4e550e25212',
  'b2e233edcd6276ca4acc8e8dfdb4510005e2a9821932810c15e8be761d024ab3',
  'b61e0098229d0c00cbd2c11e10ecc7ab1ac6b54afe5813190e940356387be65d',
  '4e4c0d

In [6]:
train_set.keys()

dict_keys(['sha256s', 'x', 'y'])

In [8]:
print(train_set['x'])

  (0, 2502)	1
  (0, 2504)	1
  (0, 2492)	1
  (0, 2697)	1
  (0, 2491)	1
  (0, 2392)	1
  (0, 2703)	1
  (0, 2696)	1
  (0, 2316)	1
  (0, 2160)	1
  (0, 2496)	1
  (0, 2872)	1
  (0, 3402)	1
  (0, 2698)	1
  (0, 1966)	1
  (0, 2876)	1
  (0, 3212)	1
  (0, 2485)	1
  (0, 2700)	1
  (0, 2701)	1
  (0, 2400)	1
  (0, 2297)	1
  (0, 2490)	1
  (0, 2850)	1
  (0, 2704)	1
  :	:
  (30609, 3781)	1
  (30609, 3019)	1
  (30609, 26067)	1
  (30609, 3025)	1
  (30609, 1962)	1
  (30609, 3024)	1
  (30609, 3037)	1
  (30609, 1954)	1
  (30609, 3425)	1
  (30609, 2731)	1
  (30609, 3038)	1
  (30609, 2739)	1
  (30609, 3316)	1
  (30609, 2003)	1
  (30609, 3030)	1
  (30609, 3426)	1
  (30609, 87812)	1
  (30609, 87815)	1
  (30609, 87813)	1
  (30609, 26068)	1
  (30609, 78439)	1
  (30609, 78441)	1
  (30609, 78442)	1
  (30609, 78440)	1
  (30609, 163327)	1


In [9]:
print(train_set['y'])

[0 0 0 ... 1 1 1]


In [14]:
def evl_index(label, pred, detail=False):
    """
    Score predictions against ground-truth labels.

    The four scores are computed by the matching ``sklearn.metrics``
    functions, each called as ``fn(label, pred)`` with default options.

    Parameters:
        label: ground-truth labels.
        pred: predicted labels.
        detail: when this is exactly ``True``, the confusion matrix is
            printed after the scores have been computed.

    Returns:
        The tuple ``(f1, accuracy, precision, recall)``.
    """
    # Order here fixes the order of the returned tuple.
    scorers = (
        metrics.f1_score,
        metrics.accuracy_score,
        metrics.precision_score,
        metrics.recall_score,
    )
    scores = tuple(score(label, pred) for score in scorers)

    # Identity check on purpose: only the literal True enables the printout.
    if detail is True:
        matrix = metrics.confusion_matrix(label, pred)
        print("\nConfusion_Matrix:\n{}".format(matrix))

    return scores

In [16]:
# Training split: feature matrix and labels from the 2012 data set.
x_train, y_train = train_set['x'], train_set['y']

# Fit a linear SVM through a grid search (5-fold CV, scored by F1, all
# CPU cores). Only C=10 is searched here; a wider candidate list is
# [0.001, 0.01, 0.1, 1, 10, 100, 1000].
Parameters = {'C': [10]}
svm = GridSearchCV(LinearSVC(), Parameters, cv=5, scoring='f1', n_jobs=-1)
svm.fit(x_train, y_train)

# Evaluate the fitted model on every monthly 2013 split, collecting one
# result row per month.
year = 2013
rets = []
for month in range(1, 13):
    period = "{}-{}".format(year, month)
    pkl_name = "data-{}_normal.pkl".format(period)
    # NOTE(review): pickle.load runs code embedded in the file; use
    # trusted dataset files only.
    with open(pkl_name, 'rb') as f:
        testing_set = pkl.load(f)
    x_test, y_test = testing_set['x'], testing_set['y']
    y_pred = svm.predict(x_test)
    f1, acc, precision, recall = evl_index(y_test, y_pred)
    rets.append([period, f1, acc, precision, recall])

# One row per month; `display` is the notebook's rich-output helper.
df = pd.DataFrame(rets, columns=["period", "f1", "acc", "precision", "recall"])
display(df)

Unnamed: 0,period,f1,acc,precision,recall
0,2013-1,0.938296,0.9874,0.919386,0.958
1,2013-2,0.89689,0.980263,0.938356,0.858934
2,2013-3,0.882083,0.977239,0.914286,0.852071
3,2013-4,0.890357,0.978702,0.915761,0.866324
4,2013-5,0.861233,0.974212,0.930952,0.80123
5,2013-6,0.856209,0.974072,0.959707,0.772861
6,2013-7,0.881764,0.976753,0.894309,0.869565
7,2013-8,0.913793,0.98362,0.959276,0.872428
8,2013-9,0.887234,0.978783,0.939189,0.840726
9,2013-10,0.847917,0.970794,0.882863,0.815631
