In [5]:
import psutil #系统内存处理工具，单位是比特
import os

mem = psutil.virtual_memory()
print("总内存： ", mem.total/1024/1024)
print("已使用内存： ", mem.used/1024/1024)
print("空闲内存： ", mem.free/1024/1024)
print("使用占比： ", mem.percent)
print("当前线程： ", os.getpid())

总内存：  385589.26953125
已使用内存：  190206.390625
空闲内存：  111184.1484375
使用占比：  50.7
当前线程：  58045


In [5]:
import pandas as pd

#数据加载
# 对于大数据的处理可以选择chunksize 或者 读取文件指针

def get_data(filename):
    result = []
    for df in pd.read_csv(open(filename, 'r'), chunksize=100000):
        result.append(df)
    result = pd.concat(result, ignore_index=True, axis=0)
    return result

#获取全量数据

train = get_data('./data/security_train.csv')
test = get_data('./data/security_test.csv')

In [7]:
import pickle #管理python对象

# 写成pkl文件
with open('./data/trian.pkl', 'wb') as f:
    pickle.dump(train, f)

with open('./data/test.pkl', 'wb') as f:
    pickle.dump(test, f)

In [1]:
import pandas as pd
import pickle

In [2]:
# 读取pkl 文件
with open('./data/trian.pkl', 'rb') as f:
    train = pickle.load(f)
with open('./data/test.pkl', 'rb') as f:
    test = pickle.load(f)

In [5]:
# 1.数据探索
train['api'].describe()

count                   89806693
unique                       295
top       LdrGetProcedureAddress
freq                    10704305
Name: api, dtype: object

In [3]:
# 对api字段进行LabelEncoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# 存在一种可能性就是在测试集中出现的内容并不一定出现在训练集中（尤其是文本类数据），
# 因此在特征工程工作中，我们将测试集和训练集合并在一起进行处理
df_all = pd.concat([train, test])
df_all['api'] = le.fit_transform(df_all['api'])

In [4]:
df_all['api']

0           135
1           134
2           134
3           134
4           134
           ... 
79288370    266
79288371    266
79288372    152
79288373    281
79288374    197
Name: api, Length: 169095068, dtype: int64

In [5]:
train

Unnamed: 0,file_id,label,api,tid,index
0,1,5,LdrLoadDll,2488,0
1,1,5,LdrGetProcedureAddress,2488,1
2,1,5,LdrGetProcedureAddress,2488,2
3,1,5,LdrGetProcedureAddress,2488,3
4,1,5,LdrGetProcedureAddress,2488,4
...,...,...,...,...,...
89806688,13887,2,NtClose,2336,618
89806689,13887,2,NtClose,2336,619
89806690,13887,2,NtClose,2336,620
89806691,13887,2,NtClose,2336,621


In [6]:
# 分离数据集和测试集
train['api'] = df_all[df_all['label'].notnull()]['api']
test['api'] = df_all[df_all['label'].isnull()]['api']

In [7]:
train

Unnamed: 0,file_id,label,api,tid,index
0,1,5,135,2488,0
1,1,5,134,2488,1
2,1,5,134,2488,2
3,1,5,134,2488,3
4,1,5,134,2488,4
...,...,...,...,...,...
89806688,13887,2,152,2336,618
89806689,13887,2,152,2336,619
89806690,13887,2,152,2336,620
89806691,13887,2,152,2336,621


In [10]:
test

Unnamed: 0,file_id,api,tid,index
0,1,226,2332,0
1,1,13,2332,1
2,1,205,2332,2
3,1,23,2332,3
4,1,226,2468,0
...,...,...,...,...
79288370,12955,266,2740,1446
79288371,12955,266,2740,1447
79288372,12955,152,2740,1448
79288373,12955,281,2740,1449


In [8]:
# 查看某个具体资源的使用情况
import sys
sys.getsizeof(df_all)/1024/1024

7740.55793762207

In [9]:
# 释放资源
import gc
del df_all
gc.collect

<function gc.collect(generation=2)>

In [33]:
# 聚合统计，构造新的特征
def get_features(df):
    #按照 file_id 进行分组
    df_file = df.groupby('file_id')
    if 'label' in df.columns: #如果df是训练集
        df1 = df.drop_duplicates(subset=['file_id', 'label'], keep='first')  # drop_duplicates() ：后续没有处理的话，会造成很大的特征损失
    else: #测试集
        df1 = df.drop_duplicates(subset=['file_id'], keep='first')
    df1 = df1.sort_values('file_id')
    #提取多个统计特征
    features = ['api', 'tid', 'index']
    for f in features:
        df1[f+'_count'] = df_file[f].count().values
        df1[f+'_nunique'] = df_file[f].nunique().values
        df1[f+'_min'] = df_file[f].min().values
        df1[f+'_max'] = df_file[f].max().values
        df1[f+'_mean'] = df_file[f].mean().values
        df1[f+'_median'] = df_file[f].median().values
        df1[f+'_std'] = df_file[f].std().values
        df1[f+'_ptp'] = df1[f+'_max'] - df1[f+'_min']
    return df1



In [34]:
df_train = get_features(train)
df_train

Unnamed: 0,file_id,label,api,tid,index,api_count,api_nunique,api_min,api_max,api_mean,...,tid_std,tid_ptp,index_count,index_nunique,index_min,index_max,index_mean,index_median,index_std,index_ptp
0,1,5,135,2488,0,6786,116,6,298,171.965223,...,83.881299,324,6786,5001,0,5000,2000.806955,1607.5,1510.694221,5000
6786,2,2,95,2320,0,816,30,89,298,159.696078,...,101.506783,284,816,204,0,203,101.500000,101.5,58.925137,203
7602,3,0,151,2208,0,463,42,9,258,164.948164,...,0.000000,0,463,463,0,462,231.000000,231.0,133.800847,462
8065,4,0,95,2284,0,2046,51,9,257,154.939883,...,150.460506,696,2046,1028,0,1027,511.012219,511.0,295.407885,1027
10111,5,0,249,2500,0,10002,65,6,254,201.893421,...,49.556301,176,10002,5001,0,5000,2500.000000,2500.0,1443.736493,5000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89620181,13883,2,95,100,0,178221,71,6,279,156.643100,...,1405.045515,6468,178221,5001,0,5000,401.480987,47.0,1008.636040,5000
89798402,13884,5,95,2592,0,1319,39,6,279,163.025019,...,4.295386,156,1319,1319,0,1318,659.000000,659.0,380.906813,1318
89799721,13885,0,151,2240,0,1033,71,8,259,174.896418,...,33.152020,504,1033,1033,0,1032,516.000000,516.0,298.345717,1032
89800754,13886,1,95,2324,0,5316,80,9,281,168.313017,...,154.796790,512,5316,2503,0,2502,1173.050414,1165.5,755.545651,2502


In [35]:
df_test = get_features(test)
df_test

Unnamed: 0,file_id,api,tid,index,api_count,api_nunique,api_min,api_max,api_mean,api_median,...,tid_std,tid_ptp,index_count,index_nunique,index_min,index_max,index_mean,index_median,index_std,index_ptp
0,1,226,2332,0,97,15,13,262,155.989691,152.0,...,57.218548,236,97,31,0,30,14.443299,14.0,9.210466,30
97,2,226,2472,0,1361,40,6,261,138.025716,138.0,...,104.399149,276,1361,681,0,680,339.750184,340.0,196.515744,680
1458,3,95,2344,0,16,9,16,257,111.375000,134.0,...,0.000000,0,16,16,0,15,7.500000,7.5,4.760952,15
1474,4,135,2452,0,193,34,13,262,172.217617,170.0,...,50.951508,132,193,193,0,192,96.000000,96.0,55.858452,192
1667,5,95,2332,0,803,34,16,261,168.490660,153.0,...,201.826813,448,803,268,0,267,133.333748,133.0,77.317048,267
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79277890,12951,151,2644,0,289,37,9,269,140.536332,151.0,...,75.402526,336,289,145,0,144,71.750865,72.0,41.786414,144
79278179,12952,151,2264,0,112,28,56,261,163.669643,152.0,...,0.000000,0,112,112,0,111,55.500000,55.5,32.475632,111
79278291,12953,135,2324,0,5095,72,6,286,200.063199,214.0,...,196.695730,560,5095,1464,0,1463,538.423749,454.0,393.605016,1463
79283386,12954,135,2424,0,2951,65,9,298,191.007794,139.0,...,126.124152,276,2951,1445,0,1444,596.701796,555.0,397.358069,1444


In [36]:
import pickle
df_train.to_pickle('./data/df_train.pkl')
df_test.to_pickle('./data/df_test.pkl')

In [38]:
import pickle
with open('./data/df_train.pkl', 'rb') as f:
    df_trian = pickle.load(f)

with open('./data/df_test.pkl', 'rb') as f:
    df_test = pickle.load(f)

In [41]:
# 祖传参数进行建模
import lightgbm as lgb
clf = lgb.LGBMClassifier(
            num_leaves=2**5-1, reg_alpha=0.25, reg_lambda=0.25, objective='multiclass',
            max_depth=-1, learning_rate=0.005, min_child_samples=3, random_state=2021,
            n_estimators=2000, subsample=1, colsample_bytree=1)
clf.fit(df_train.drop('label', axis=1), df_train['label'])

In [42]:
result = clf.predict_proba(df_test)
result

array([[1.29909605e-02, 2.97339711e-03, 1.16246386e-01, ...,
        1.83853784e-02, 8.72276580e-03, 7.27260036e-01],
       [8.75509718e-01, 9.38637189e-04, 5.76710702e-03, ...,
        5.39843985e-02, 1.81612459e-02, 3.86443780e-02],
       [9.98084893e-01, 3.75487851e-05, 2.17619646e-04, ...,
        7.40405189e-04, 1.50884474e-04, 6.83814865e-04],
       ...,
       [7.35136405e-04, 2.55666091e-04, 1.29494421e-03, ...,
        9.73190114e-01, 1.77537208e-02, 6.27093167e-03],
       [1.97081231e-03, 2.12985158e-05, 7.65178893e-04, ...,
        9.94096075e-01, 3.71521420e-04, 2.38030394e-03],
       [9.75722977e-04, 1.24267767e-04, 1.79147617e-03, ...,
        9.89291667e-01, 1.04264279e-03, 6.17081464e-03]])

In [45]:
result = pd.DataFrame(result, columns=['prob0', 'prob1', 'prob2', 'prob3', 'prob4', 'prob5', 'prob6', 'prob7'
])
result['file_id'] = df_test['file_id'].values
result

Unnamed: 0,prob0,prob1,prob2,prob3,prob4,prob5,prob6,prob7,file_id
0,0.012991,0.002973,0.116246,0.112650,0.000771,0.018385,0.008723,0.727260,1
1,0.875510,0.000939,0.005767,0.006461,0.000534,0.053984,0.018161,0.038644,2
2,0.998085,0.000038,0.000218,0.000072,0.000013,0.000740,0.000151,0.000684,3
3,0.045739,0.001098,0.004962,0.198713,0.001311,0.132568,0.026041,0.589569,4
4,0.989740,0.000035,0.001995,0.000887,0.000016,0.005407,0.000468,0.001452,5
...,...,...,...,...,...,...,...,...,...
12950,0.833071,0.003057,0.014412,0.003555,0.000654,0.034868,0.003584,0.106798,12951
12951,0.946363,0.000961,0.002724,0.002242,0.013856,0.023608,0.002090,0.008157,12952
12952,0.000735,0.000256,0.001295,0.000469,0.000031,0.973190,0.017754,0.006271,12953
12953,0.001971,0.000021,0.000765,0.000369,0.000025,0.994096,0.000372,0.002380,12954


In [46]:
result_columns =['file_id','prob0', 'prob1', 'prob2', 'prob3', 'prob4', 'prob5', 'prob6', 'prob7'
]
result.to_csv('./data/baseline_lgb_2000.csv', index=False, columns=result_columns)