In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [6]:
# Directory that holds the competition CSV files, relative to the notebook's working directory.
path = '../datasets/'
# Raw API-call logs. Later cells read the columns file_id, api, tid and index from both
# frames, and additionally `label` from the training frame.
train, test = (pd.read_csv(path + csv_name) for csv_name in ('security_train.csv', 'security_test.csv'))

In [7]:
def simple_sts_features(df):
    """Build per-file count / distinct-count features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``file_id``, ``api``, ``tid`` and ``index``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``file_id`` (ordered by ``file_id``) with the columns
        ``file_id_<col>_count`` and ``file_id_<col>_nunique`` for ``api``, ``tid``
        and ``index``. The frame keeps the row labels produced by ``sort_values``
        (they are not reset).
    """
    simple_fea = pd.DataFrame({'file_id': df['file_id'].unique()}).sort_values('file_id')

    per_file = df.groupby('file_id')

    # groupby orders its groups by key, which is the same order as the sorted
    # file_id column above, so the aggregated values can be assigned positionally.
    for column in ('api', 'tid', 'index'):
        grouped_col = per_file[column]
        simple_fea[f'file_id_{column}_count'] = grouped_col.count().values
        simple_fea[f'file_id_{column}_nunique'] = grouped_col.nunique().values

    return simple_fea

In [8]:
def simple_numberical_sts_features(df):
    """Build per-file summary statistics of the numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``file_id``, ``tid`` and ``index``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``file_id`` (ordered by ``file_id``) with the columns
        ``file_id_<col>_<stat>`` for ``col`` in (``tid``, ``index``) and ``stat``
        in (``mean``, ``min``, ``std``, ``max``). The frame keeps the row labels
        produced by ``sort_values`` (they are not reset).
    """
    simple_numberical_fea = pd.DataFrame({'file_id': df['file_id'].unique()}).sort_values('file_id')

    per_file = df.groupby('file_id')

    # Groups come back ordered by file_id, matching the sorted frame above, so the
    # aggregated arrays line up positionally with its rows.
    for column in ('tid', 'index'):
        grouped_col = per_file[column]
        for stat in ('mean', 'min', 'std', 'max'):
            simple_numberical_fea[f'file_id_{column}_{stat}'] = getattr(grouped_col, stat)().values

    return simple_numberical_fea

In [9]:
%%time
# Per-file count / nunique features of api, tid and index for the training log.
simple_train_fea1 = simple_sts_features(train)

CPU times: user 52.7 s, sys: 10.5 s, total: 1min 3s
Wall time: 1min 3s


In [10]:
%%time
# Same count / nunique features, computed on the test log.
simple_test_fea1 = simple_sts_features(test)

CPU times: user 42.1 s, sys: 9.21 ms, total: 42.1 s
Wall time: 42.1 s


In [15]:
%%time
# Per-file mean / min / std / max of tid and index for the training log.
simple_train_fea2 = simple_numberical_sts_features(train)

CPU times: user 5.38 s, sys: 374 ms, total: 5.75 s
Wall time: 5.75 s


In [16]:
%%time
# Same mean / min / std / max features, computed on the test log.
simple_test_fea2 = simple_numberical_sts_features(test)

CPU times: user 4.68 s, sys: 417 ms, total: 5.09 s
Wall time: 5.09 s


In [17]:
# One row per distinct (file_id, label) pair of the training log; the first
# occurrence is the one kept. drop_duplicates() compares all selected columns
# and keeps the first hit by default.
train_label = train.loc[:, ['file_id', 'label']].drop_duplicates()
# One row per distinct file_id of the test log (it has no label to carry along).
test_label = test.loc[:, ['file_id']].drop_duplicates()

In [18]:
# Training table: labels joined with both feature sets on file_id.
# Left joins keep every row of the label table.
train_data = (
    train_label
    .merge(simple_train_fea1, on='file_id', how='left')
    .merge(simple_train_fea2, on='file_id', how='left')
)

# Submission table: test file ids joined with the same two feature sets.
test_submit = (
    test_label
    .merge(simple_test_fea1, on='file_id', how='left')
    .merge(simple_test_fea2, on='file_id', how='left')
)

In [19]:
def lgb_logloss(preds, data):
    """Custom LightGBM evaluation metric: summed one-vs-rest log loss.

    For every sample the loss adds ``log(p)`` for the true class and
    ``log(1 - p)`` for every other class, then the negated total is averaged
    over the samples.

    Parameters
    ----------
    preds : array-like
        Predicted class probabilities. Either a flat array grouped by class
        (all samples of class 0, then all samples of class 1, ...) or a 2-D
        array of shape ``(n_samples, n_classes)``.
    data : object
        Anything exposing ``get_label()`` that returns the integer-valued class
        labels of the evaluated rows (a ``lightgbm.Dataset`` in practice).

    Returns
    -------
    tuple
        ``(metric_name, metric_value, is_higher_better)`` as expected by
        LightGBM's ``feval`` hook; lower is better.

    Raises
    ------
    ValueError
        If there are no labels or ``preds`` cannot be arranged as one
        probability per (sample, class).
    """
    labels_ = np.asarray(data.get_label())
    n_samples = labels_.shape[0]
    if n_samples == 0:
        raise ValueError('lgb_logloss needs at least one labelled row')

    preds = np.asarray(preds, dtype=float)
    if preds.ndim == 2:
        if preds.shape[0] != n_samples:
            raise ValueError('preds has %d rows but there are %d labels' % (preds.shape[0], n_samples))
        prob = preds
    else:
        # The class count comes from the prediction size, not from the labels
        # that happen to be present: a fold that lacks a class would otherwise
        # slice the flat array into the wrong number of chunks.
        n_classes, leftover = divmod(preds.size, n_samples)
        if leftover or n_classes == 0:
            raise ValueError('preds of size %d cannot be split over %d labels' % (preds.size, n_samples))
        prob = preds.reshape(n_classes, n_samples).T

    # Keep the logarithms finite when a probability saturates at 0 or 1.
    eps = 1e-15
    prob = np.clip(prob, eps, 1 - eps)

    # is_true[i, j] is True when class j is the label of sample i.
    is_true = labels_[:, None] == np.arange(prob.shape[1])[None, :]
    log_lik = np.where(is_true, np.log(prob), np.log(1 - prob))

    return 'loss is: ', -1 * (np.sum(log_lik) / n_samples), False

In [20]:
# Every column of the training table except the target and the file identifier is a model input.
train_features = [name for name in train_data.columns if name not in ('label', 'file_id')]
# NOTE: from here on `train_label` is the *name* of the target column (a str). The earlier
# cells bound the same name to the label DataFrame, so re-running the merge cell after this
# one requires re-running the cell that builds the label tables first.
train_label = 'label'

In [21]:
%%time
from sklearn.model_selection import StratifiedKFold, KFold

# LightGBM settings for the 8-class problem.
params = {
    'task': 'train',
    'num_leaves': 255,
    'objective': 'multiclass',
    'num_class': 8,
    'min_data_in_leaf': 50,
    'learning_rate': 0.05,
    'feature_fraction': 0.85,
    'bagging_fraction': 0.85,
    'bagging_freq': 5,
    'max_bin': 128,
    'random_state': 100
}

folds = KFold(n_splits=5, shuffle=True, random_state=15)
# Out-of-fold class probabilities: one row per file of `train_data` (not per raw
# log row of `train`) and one column per class. Each row is filled by the model
# of the fold in which that file was held out.
oof = np.zeros((len(train_data), params['num_class']))

predict_res = 0  # not used by the cells visible here; kept for later cells
models = []
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_data)):
    print(f"fold n°{fold_}")
    trn_part = train_data.iloc[trn_idx]
    val_part = train_data.iloc[val_idx]
    trn_data = lgb.Dataset(trn_part[train_features], label=trn_part[train_label].values)
    val_data = lgb.Dataset(val_part[train_features], label=val_part[train_label].values)

    # NOTE(review): `verbose_eval` / `early_stopping_rounds` are keyword arguments of the
    # LightGBM 3.x `lgb.train`; newer major versions are reported to expect the
    # `lgb.log_evaluation` / `lgb.early_stopping` callbacks instead - confirm before upgrading.
    clf = lgb.train(params, 
                    trn_data,
                    num_boost_round=2000,
                    valid_sets=[trn_data, val_data],
                    verbose_eval=50,
                    early_stopping_rounds=100,
                    feval=lgb_logloss)
    models.append(clf)
    # Record the held-out predictions of this fold.
    oof[val_idx] = clf.predict(val_part[train_features])

fold n°0
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1664
[LightGBM] [Info] Number of data points in the train set: 11109, number of used features: 13
[LightGBM] [Info] Start training from score -1.029745
[LightGBM] [Info] Start training from score -3.344249
[LightGBM] [Info] Start training from score -2.487882
[LightGBM] [Info] Start training from score -2.805253
[LightGBM] [Info] Start training from score -4.861164
[LightGBM] [Info] Start training from score -1.173157
[LightGBM] [Info] Start training from score -3.277640
[LightGBM] [Info] Start training from score -2.214659
Training until validation scores don't improve for 100 rounds
[50]	training's multi_logloss: 0.431298	training's loss is: : 0.771542	valid_1's multi_logloss: 0.681515	valid_1's loss is: : 1.13324


Exception ignored on calling ctypes callback function: <function _log_callback at 0x7f63244241f0>
Traceback (most recent call last):
  File "/home/xuwp/anaconda3/lib/python3.9/site-packages/lightgbm/basic.py", line 76, in _log_callback
    def _log_callback(msg):
KeyboardInterrupt: 


No further splits with positive gain, best gain: -inf


In [None]:
# Importance of each input feature according to `clf`, the booster left over from
# the last fold that was trained, ordered from most to least important.
feature_importance = pd.DataFrame(
    {'fea_name': train_features, 'fea_imp': clf.feature_importance()}
).sort_values(by='fea_imp', ascending=False)

plt.figure(figsize=[20, 10])
sns.barplot(x=feature_importance['fea_name'], y=feature_importance['fea_imp'])

In [None]:
# Average the class probabilities of every trained fold model and write the submission file.
if not models:
    raise ValueError('no trained models available; run the training cell first')

# Divide by the number of models that actually exist: an interrupted training run
# leaves fewer boosters than planned, and a fixed divisor would scale the
# averaged probabilities down.
fold = len(models)
pred_res = np.mean([model.predict(test_submit[train_features]) for model in models], axis=0)

# One probability column per class: prob0, prob1, ...
prob_cols = [f'prob{class_id}' for class_id in range(pred_res.shape[1])]
for class_id, col in enumerate(prob_cols):
    test_submit[col] = pred_res[:, class_id]

test_submit[['file_id'] + prob_cols].to_csv('baseline.csv', index=None)