## Baseline3

### 必要依赖包加载

In [2]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt

import os
import warnings
from tqdm import tqdm,tqdm_notebook

from sklearn.model_selection import KFold,StratifiedKFold,train_test_split
from sklearn import metrics

import lightgbm as lgb
from lightgbm import LGBMClassifier

pd.set_option('display.max_columns',100)
warnings.filterwarnings('ignore')

### 加载数据

In [3]:
# train_path = './input/train'
# test_path = './input/test'
# # os.listdir() returns the names of the entries inside the given directory
# train_file = os.listdir(train_path)
# test_file = os.listdir(test_path)

# Load the prepared record-level tables.
# The .h5 files were already generated by the Baseline1 / Baseline2 notebooks.
train, test = pd.read_hdf('./input/train.h5'), pd.read_hdf('./input/test.h5')

### 特征工程 <br>
- 根据Baseline1和Baseline2对经纬度x,y以及速度v的图表分析，再根据模型最后展现的特征重要性的排名，决定对x,y,v做更多的特征。<br><br>
- 新特征的添加完全是“玄学”，首先上一波描述性统计，如分位数、偏度、中位数、标准差、最大值、最小值，经过对单个特征添加描述性统计的新特征以后，再对这些新特征做交互特征，如x_max减去y_max或是y_max除以x_min。<br><br>
- 客观来说这些特征的实际意义很难分析出来，但通过尝试发现一些特征对模型在训练集上的提分很明显，如v_per_9(速度的九十百分位数)，y_max_over_x_min（y的最大值除以x的最小值）。

In [4]:
# Per-ship decile features of the speed column `v`
def get_v_fea(df):
    """Append per-ship Harrell-Davis deciles of ``v`` to every row of ``df``.

    For each ``ship`` group the 0.1, 0.2, ..., 0.9 quantiles of ``v`` are
    estimated with ``scipy.stats.mstats.hdquantiles`` and left-merged back onto
    ``df`` as the columns ``v_per_1`` ... ``v_per_9``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``ship`` and ``v``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding ``df``'s columns followed by ``v_per_1`` ...
        ``v_per_9``. ``df`` itself is not modified. Any quantile that
        ``hdquantiles`` returns masked is stored as NaN.
    """
    # Imported locally so the function does not depend on `scipy.stats` having
    # been loaded as a side effect of some other import.
    from scipy.stats import mstats

    probs = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])
    names = [f'v_per_{i}' for i in range(1, len(probs) + 1)]

    # One pass over the groups computes all nine quantiles at once, instead of
    # nine separate groupby + merge round trips.
    ships, rows = [], []
    for ship, speeds in df.groupby('ship')['v']:
        estimates = mstats.hdquantiles(speeds.to_numpy(dtype=float), probs)
        ships.append(ship)
        rows.append(np.ma.filled(estimates, np.nan))

    quantiles = pd.DataFrame(rows, columns=names)
    # Keep the key dtype identical to the input so the merge keys line up.
    quantiles.insert(0, 'ship', pd.Series(ships, dtype=df['ship'].dtype))
    return pd.merge(df, quantiles, on='ship', how='left')

In [5]:
# Attach the speed-quantile columns to both the train and the test records.
train_prepare_1, test_prepare_1 = get_v_fea(train), get_v_fea(test)
print(train_prepare_1.shape, test_prepare_1.shape)

(2699638, 16) (782378, 15)


对x也做分位数特征后发现，x的百分之十分位数的特征在特征重要性方面得到相当高的分数，而其他分位数则很低。

In [6]:
def get_x_fea(df):
    """Append the per-ship Harrell-Davis 0.1 quantile of ``x`` as ``x_per_1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``ship`` and ``x``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding ``df``'s columns followed by ``x_per_1``; ``df``
        itself is not modified. A quantile that ``hdquantiles`` returns masked
        is stored as NaN.
    """
    # Imported locally so the function does not depend on `scipy.stats` having
    # been loaded as a side effect of some other import.
    from scipy.stats import mstats

    prob = np.array([0.1])

    def _hd_first_decile(values):
        # hdquantiles returns a one-element masked array; reduce it to a scalar
        # so the aggregation yields exactly one value per ship.
        estimate = mstats.hdquantiles(values.to_numpy(dtype=float), prob)
        return float(np.ma.filled(estimate, np.nan)[0])

    per_ship = (
        df.groupby('ship')['x']
        .agg(_hd_first_decile)
        .rename('x_per_1')
        .reset_index()
    )
    return pd.merge(df, per_ship, on='ship', how='left')

In [7]:
# Attach the x first-decile column on top of the speed-quantile frames.
train_prepare_2, test_prepare_2 = get_x_fea(train_prepare_1), get_x_fea(test_prepare_1)
print(train_prepare_2.shape, test_prepare_2.shape)

(2699638, 17) (782378, 16)


In [9]:
# Feature engineering helpers
def group_feature(df,key,target,aggs):
    """Aggregate one column per group and name the results ``{target}_{agg}``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows; must contain ``key`` and ``target``.
    key : str
        Column to group by.
    target : str
        Column to aggregate.
    aggs : list
        Aggregations understood by ``GroupBy.agg`` (e.g. ``'max'``, ``'skew'``).
        ``'mad'`` is computed here as the mean absolute deviation around the
        group mean, so it also works on pandas versions without ``Series.mad``.

    Returns
    -------
    pandas.DataFrame
        One row per group: the ``key`` column followed by one column per
        requested aggregation, in the order given.
    """
    def _mean_abs_dev(values):
        # Mean absolute deviation around the mean.
        return (values - values.mean()).abs().mean()

    agg_dict = {f'{target}_{ag}': ag for ag in aggs}
    print(agg_dict)

    # Named aggregation (output_name=func) replaces the dict-renaming form of
    # SeriesGroupBy.agg, which newer pandas rejects.
    named = {
        name: (_mean_abs_dev if func == 'mad' else func)
        for name, func in agg_dict.items()
    }
    # Group by `key` and evaluate every requested statistic of `target`.
    t = df.groupby(key)[target].agg(**named).reset_index()
    return t

def extract_feature(df,train):
    """Merge per-ship statistics and derived features onto ``train``.

    Parameters
    ----------
    df : pandas.DataFrame
        Record-level rows used to compute the statistics. Needs the columns
        ``ship``, ``x``, ``y``, ``v``, ``d``, ``hour``, ``date`` and a datetime
        ``time`` column.
    train : pandas.DataFrame
        Frame with a ``ship`` column that receives the features (left merges
        and ``ship``-keyed maps).

    Returns
    -------
    pandas.DataFrame
        ``train`` with the statistic columns, range/ratio/slope/area features,
        ``mode_hour``, ``date_nunique``, ``diff_time``, ``diff_day`` and
        ``diff_second`` appended.
    """
    # Per-ship descriptive statistics of x, y, v and d.
    stat_specs = (
        ('x', ['max','min','mean','std','skew','sum','count','median','mad']),
        ('y', ['max','min','mean','std','skew','sum','median','mad']),
        ('v', ['max','min','mean','std','skew','sum','median','mad']),
        ('d', ['max','min','mean','std','skew','sum']),
    )
    for column, aggs in stat_specs:
        t = group_feature(df, 'ship', column, aggs)
        train = pd.merge(train, t, on='ship', how='left')

    # Author's note: sums and products did not help; differences and ratios
    # improved the model score.
    train['x_max_x_min'] = train['x_max'] - train['x_min']
    train['y_max_y_min'] = train['y_max'] - train['y_min']
    train['y_max_x_min'] = train['y_max'] - train['x_min']
    train['x_max_y_min'] = train['x_max'] - train['y_min']

    # NOTE: these ratios have no zero guard (unlike `slope` below), so a zero
    # minimum produces inf/NaN.
    train['x_max_over_x_min'] = train['x_max'] / train['x_min']
    train['y_max_over_y_min'] = train['y_max'] / train['y_min']
    train['y_max_over_x_min'] = train['y_max'] / train['x_min']
    train['x_max_over_y_min'] = train['x_max'] / train['y_min']

    # A zero x-range is replaced by 0.001 to avoid dividing by zero.
    train['slope'] = train['y_max_y_min'] / np.where(train['x_max_x_min']==0, 0.001, train['x_max_x_min'])
    train['area'] = train['x_max_x_min'] * train['y_max_y_min']

    train['v_max_v_min'] = train['v_max'] - train['v_min']

    # mode_hour: the hour value that occurs most often for each ship.
    mode_hour = df.groupby('ship')['hour'].agg(lambda x:x.value_counts().index[0]).to_dict()
    # Map each ship id to its mode_hour.
    train['mode_hour'] = train['ship'].map(mode_hour)

    # Number of distinct dates on which each ship has records.
    date_nunique = df.groupby('ship')['date'].nunique().to_dict()
    train['date_nunique'] = train['ship'].map(date_nunique)

    # Time span covered by each ship's records (latest minus earliest `time`).
    # Computed from the vectorised group max/min instead of the dict-renaming
    # agg form, which newer pandas rejects.
    time_by_ship = df.groupby('ship')['time']
    t = (time_by_ship.max() - time_by_ship.min()).rename('diff_time').reset_index()
    t['diff_day'] = t['diff_time'].dt.days
    t['diff_second'] = t['diff_time'].dt.seconds
    train = pd.merge(train, t, on='ship', how='left')
    return train

# Time-derived columns
def extract_dt(df):
    """Parse ``time`` and add ``date``, ``hour`` and ``weekday`` columns.

    ``time`` is parsed with the format ``'%m%d %H:%M:%S'``. The format has no
    year field, so the parsed timestamps carry the parser's default year (1900
    in this notebook's outputs); ``weekday`` is therefore the weekday within
    that placeholder year.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``time`` column matching the format above.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``time`` converted to datetime and ``date``, ``hour``
        and ``weekday`` appended. The input frame is left untouched, so
        re-running the calling cell starts again from the raw strings.
    """
    stamps = pd.to_datetime(df['time'], format='%m%d %H:%M:%S')
    # `assign` returns a new frame instead of writing into the caller's object.
    return df.assign(
        time=stamps,
        date=stamps.dt.date,
        hour=stamps.dt.hour,
        weekday=stamps.dt.weekday,
    )

In [10]:
# Derive the time-based columns for the train and test records.
new_train, new_test = extract_dt(train_prepare_2), extract_dt(test_prepare_2)

In [11]:
# Shapes of the train and test frames after the time columns were added.
print(new_train.shape,new_test.shape)

(2699638, 20) (782378, 19)


In [12]:
# Compute every per-ship statistic feature; each frame serves both as the
# source of the statistics and as the frame the features are merged onto.
train_label, test_label = (
    extract_feature(new_train, new_train),
    extract_feature(new_test, new_test),
)

{'x_max': 'max', 'x_min': 'min', 'x_mean': 'mean', 'x_std': 'std', 'x_skew': 'skew', 'x_sum': 'sum', 'x_count': 'count', 'x_median': 'median', 'x_mad': 'mad'}
{'y_max': 'max', 'y_min': 'min', 'y_mean': 'mean', 'y_std': 'std', 'y_skew': 'skew', 'y_sum': 'sum', 'y_median': 'median', 'y_mad': 'mad'}
{'v_max': 'max', 'v_min': 'min', 'v_mean': 'mean', 'v_std': 'std', 'v_skew': 'skew', 'v_sum': 'sum', 'v_median': 'median', 'v_mad': 'mad'}
{'d_max': 'max', 'd_min': 'min', 'd_mean': 'mean', 'd_std': 'std', 'd_skew': 'skew', 'd_sum': 'sum'}
{'x_max': 'max', 'x_min': 'min', 'x_mean': 'mean', 'x_std': 'std', 'x_skew': 'skew', 'x_sum': 'sum', 'x_count': 'count', 'x_median': 'median', 'x_mad': 'mad'}
{'y_max': 'max', 'y_min': 'min', 'y_mean': 'mean', 'y_std': 'std', 'y_skew': 'skew', 'y_sum': 'sum', 'y_median': 'median', 'y_mad': 'mad'}
{'v_max': 'max', 'v_min': 'min', 'v_mean': 'mean', 'v_std': 'std', 'v_skew': 'skew', 'v_sum': 'sum', 'v_median': 'median', 'v_mad': 'mad'}
{'d_max': 'max', 'd_min':

In [13]:
# Encode the target labels as integer codes 0..2 (in order of first appearance)
# and keep the reverse lookup for decoding predictions later.
# NOTE: the `type` column is overwritten, so re-running this cell would rebuild
# type_map from the already-encoded integers.
type_map = {
    name: code
    for name, code in zip(train_label['type'].unique(), np.arange(3))
}
type_map_rev = dict(zip(type_map.values(), type_map.keys()))
train_label['type'] = train_label['type'].map(type_map)

In [14]:
# Row/column count of the record-level frame after the feature merges.
train_label.shape

(2699638, 67)

In [17]:
# Preview the first rows of the record-level feature frame.
train_label.head()

Unnamed: 0,ship,x,y,v,d,time,type,v_per_1,v_per_2,v_per_3,v_per_4,v_per_5,v_per_6,v_per_7,v_per_8,v_per_9,x_per_1,date,hour,weekday,x_max,x_min,x_mean,x_std,x_skew,x_sum,x_count,x_median,x_mad,y_max,y_min,y_mean,y_std,y_skew,y_sum,y_median,y_mad,v_max,v_min,v_mean,v_std,v_skew,v_sum,v_median,v_mad,d_max,d_min,d_mean,d_std,d_skew,d_sum,x_max_x_min,y_max_y_min,y_max_x_min,x_max_y_min,x_max_over_x_min,y_max_over_y_min,y_max_over_x_min,x_max_over_y_min,slope,area,v_max_v_min,mode_hour,date_nunique,diff_time,diff_day,diff_second
0,0,6152038.0,5124873.0,2.59,102,1900-11-10 11:58:19,0,1.042499e-15,1.042499e-15,1.042499e-15,1.042499e-15,1.042499e-15,1.042499e-15,1.042499e-15,1.673567e-09,0.036908,6118352.0,1900-11-10,11,5,6152038.0,6118352.0,6119351.0,5037.320747,5.255558,2533411000.0,414,6118352.0,1905.724573,5130781.0,5124873.0,5130494.0,850.264541,-4.762308,2124025000.0,5130672.0,341.078408,9.39,0.0,0.265966,1.321248,5.520205,110.11,0.0,0.49496,129,0,4.613527,21.24777,4.483093,1910,33686.667453,5907.975523,-987570.399385,1027165.0,1.005506,1.001153,0.838589,1.200427,0.17538,199020000.0,9.39,15,4,2 days 23:48:51,2,85731
1,0,6151230.0,5125218.0,2.7,113,1900-11-10 11:48:19,0,1.042499e-15,1.042499e-15,1.042499e-15,1.042499e-15,1.042499e-15,1.042499e-15,1.042499e-15,1.673567e-09,0.036908,6118352.0,1900-11-10,11,5,6152038.0,6118352.0,6119351.0,5037.320747,5.255558,2533411000.0,414,6118352.0,1905.724573,5130781.0,5124873.0,5130494.0,850.264541,-4.762308,2124025000.0,5130672.0,341.078408,9.39,0.0,0.265966,1.321248,5.520205,110.11,0.0,0.49496,129,0,4.613527,21.24777,4.483093,1910,33686.667453,5907.975523,-987570.399385,1027165.0,1.005506,1.001153,0.838589,1.200427,0.17538,199020000.0,9.39,15,4,2 days 23:48:51,2,85731
2,0,6150421.0,5125563.0,2.7,116,1900-11-10 11:38:19,0,1.042499e-15,1.042499e-15,1.042499e-15,1.042499e-15,1.042499e-15,1.042499e-15,1.042499e-15,1.673567e-09,0.036908,6118352.0,1900-11-10,11,5,6152038.0,6118352.0,6119351.0,5037.320747,5.255558,2533411000.0,414,6118352.0,1905.724573,5130781.0,5124873.0,5130494.0,850.264541,-4.762308,2124025000.0,5130672.0,341.078408,9.39,0.0,0.265966,1.321248,5.520205,110.11,0.0,0.49496,129,0,4.613527,21.24777,4.483093,1910,33686.667453,5907.975523,-987570.399385,1027165.0,1.005506,1.001153,0.838589,1.200427,0.17538,199020000.0,9.39,15,4,2 days 23:48:51,2,85731
3,0,6149612.0,5125907.0,3.29,95,1900-11-10 11:28:19,0,1.042499e-15,1.042499e-15,1.042499e-15,1.042499e-15,1.042499e-15,1.042499e-15,1.042499e-15,1.673567e-09,0.036908,6118352.0,1900-11-10,11,5,6152038.0,6118352.0,6119351.0,5037.320747,5.255558,2533411000.0,414,6118352.0,1905.724573,5130781.0,5124873.0,5130494.0,850.264541,-4.762308,2124025000.0,5130672.0,341.078408,9.39,0.0,0.265966,1.321248,5.520205,110.11,0.0,0.49496,129,0,4.613527,21.24777,4.483093,1910,33686.667453,5907.975523,-987570.399385,1027165.0,1.005506,1.001153,0.838589,1.200427,0.17538,199020000.0,9.39,15,4,2 days 23:48:51,2,85731
4,0,6148803.0,5126252.0,3.18,108,1900-11-10 11:18:19,0,1.042499e-15,1.042499e-15,1.042499e-15,1.042499e-15,1.042499e-15,1.042499e-15,1.042499e-15,1.673567e-09,0.036908,6118352.0,1900-11-10,11,5,6152038.0,6118352.0,6119351.0,5037.320747,5.255558,2533411000.0,414,6118352.0,1905.724573,5130781.0,5124873.0,5130494.0,850.264541,-4.762308,2124025000.0,5130672.0,341.078408,9.39,0.0,0.265966,1.321248,5.520205,110.11,0.0,0.49496,129,0,4.613527,21.24777,4.483093,1910,33686.667453,5907.975523,-987570.399385,1027165.0,1.005506,1.001153,0.838589,1.200427,0.17538,199020000.0,9.39,15,4,2 days 23:48:51,2,85731


In [18]:
# Keep handles on the current frames "just in case".
# NOTE: these are plain aliases, not copies - `tr`/`te` refer to the very same
# objects as `train_label`/`test_label`.
tr, te = train_label, test_label

In [19]:
# Further aliases of the same two frames (no data is copied).
tr_train, tr_test = tr, te

In [20]:
# More elaborate interaction features
def get_interact_fea(df):
    """Add difference and ratio interaction features.

    Adds ``x_over_y`` / ``y_over_x`` from the row-level ``x`` and ``y`` and, for
    each of the pairs (``v_skew``, ``v_std``), (``y_skew``, ``x_skew``) and
    (``y_median``, ``x_median``), both differences and both ratios.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``x``, ``y``, ``v_skew``, ``v_std``, ``x_skew``,
        ``y_skew``, ``x_median`` and ``y_median``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the 14 new columns appended. The input frame is
        not modified, so frames kept as a "backup" by the caller stay as they
        were. Ratios have no zero guard and can be inf/NaN.
    """
    out = df.copy()

    out['x_over_y'] = out['x'] / out['y']
    out['y_over_x'] = out['y'] / out['x']

    # (first column, second column, tag of first, tag of second); the tags are
    # the name fragments used in the generated column names.
    pairs = (
        ('v_skew', 'v_std', 'v_skew', 'v_std'),
        ('y_skew', 'x_skew', 'y_skew', 'x_skew'),
        ('y_median', 'x_median', 'y_me', 'x_me'),
    )
    for first, second, first_tag, second_tag in pairs:
        out[f'{first_tag}_{second_tag}'] = out[first] - out[second]
        out[f'{second_tag}_{first_tag}'] = out[second] - out[first]
        out[f'{first_tag}_over_{second_tag}'] = out[first] / out[second]
        out[f'{second_tag}_over_{first_tag}'] = out[second] / out[first]
    return out

In [22]:
# Add the interaction features to the train and test frames.
tr_temp, te_temp = get_interact_fea(tr), get_interact_fea(te)

In [23]:
# Each ship has many records, but every aggregated feature holds one value per
# ship, so keep a single row (the first one) per ship.
train_label, test_label = (
    tr_temp.drop_duplicates(subset='ship'),
    te_temp.drop_duplicates(subset='ship'),
)

In [24]:
# Row/column count after keeping one row per ship (train).
train_label.shape

(7000, 81)

In [25]:
# Row/column count after keeping one row per ship (test).
test_label.shape

(2000, 80)

In [26]:
# Preview the per-ship training rows, including the interaction features.
train_label.head()

Unnamed: 0,ship,x,y,v,d,time,type,v_per_1,v_per_2,v_per_3,v_per_4,v_per_5,v_per_6,v_per_7,v_per_8,v_per_9,x_per_1,date,hour,weekday,x_max,x_min,x_mean,x_std,x_skew,x_sum,x_count,x_median,x_mad,y_max,y_min,y_mean,y_std,y_skew,y_sum,y_median,y_mad,v_max,v_min,v_mean,v_std,v_skew,v_sum,v_median,v_mad,d_max,d_min,d_mean,d_std,d_skew,d_sum,x_max_x_min,y_max_y_min,y_max_x_min,x_max_y_min,x_max_over_x_min,y_max_over_y_min,y_max_over_x_min,x_max_over_y_min,slope,area,v_max_v_min,mode_hour,date_nunique,diff_time,diff_day,diff_second,x_over_y,y_over_x,v_skew_v_std,v_std_v_skew,v_skew_over_v_std,v_std_over_v_skew,y_skew_x_skew,x_skew_y_skew,y_skew_over_x_skew,x_skew_over_y_skew,y_me_x_me,x_me_y_me,y_me_over_x_me,x_me_over_y_me
0,0,6152038.0,5124873.0,2.59,102,1900-11-10 11:58:19,0,1.042499e-15,1.042499e-15,1.042499e-15,1.042499e-15,1.042499e-15,1.042499e-15,1.042499e-15,1.673567e-09,0.036908,6118352.0,1900-11-10,11,5,6152038.0,6118352.0,6119351.0,5037.320747,5.255558,2533411000.0,414,6118352.0,1905.724573,5130781.0,5124873.0,5130494.0,850.264541,-4.762308,2124025000.0,5130672.0,341.078408,9.39,0.0,0.265966,1.321248,5.520205,110.11,0.0,0.49496,129,0,4.613527,21.24777,4.483093,1910,33686.667453,5907.975523,-987570.4,1027165.0,1.005506,1.001153,0.838589,1.200427,0.17538,199020000.0,9.39,15,4,2 days 23:48:51,2,85731,1.200427,0.833037,4.198958,-4.198958,4.178025,0.239348,-10.017866,10.017866,-0.906147,-1.103574,-987679.4,987679.4,0.838571,1.192505
414,1,6076254.0,5061743.0,3.99,278,1900-11-10 11:40:21,0,1.162404e-15,2.591953e-11,0.002226016,0.04963061,0.05762401,0.287186,2.877807,3.824403,4.496411,6064379.0,1900-11-10,11,5,6102450.0,6049472.0,6091460.0,16543.394419,-1.058454,2345212000.0,385,6102450.0,14574.134228,5112874.0,5042857.0,5094050.0,26764.042729,-0.802446,1961209000.0,5112760.0,24712.754023,10.47,0.0,1.607922,2.412688,1.590284,619.05,0.05,2.006849,336,0,56.153247,91.449382,1.418867,21619,52978.013345,70016.655842,-936597.9,1059593.0,1.008757,1.013884,0.845177,1.210117,1.321617,3709343000.0,10.47,19,4,2 days 23:39:47,2,85187,1.200427,0.833037,-0.822404,0.822404,0.659134,1.517143,0.256008,-0.256008,0.75813,1.319035,-989690.2,989690.2,0.837821,1.193573
799,10,6321032.0,5242805.0,4.48,213,1900-11-10 11:49:36,0,1.47539e-06,0.09985986,0.1100002,0.1424801,0.2199736,0.2293631,0.3215302,2.3651,4.847991,6246119.0,1900-11-10,11,5,6346913.0,6246119.0,6262484.0,32280.567149,1.62304,2486206000.0,397,6246120.0,25318.018154,5265810.0,5229867.0,5242458.0,5975.460236,2.198003,2081256000.0,5240937.0,3556.709015,10.09,0.0,1.313854,2.442825,2.14541,521.6,0.22,1.779098,359,0,108.758186,112.515081,0.727645,43177,100794.674835,35942.703641,-980308.7,1117046.0,1.016137,1.006873,0.843053,1.21359,0.356593,3622833000.0,10.09,23,4,2 days 23:33:53,2,84833,1.205658,0.829422,-0.297415,0.297415,0.87825,1.138629,0.574962,-0.574962,1.35425,0.738416,-1005182.0,1005182.0,0.839071,1.191794
1196,100,6102751.0,5112534.0,0.0,0,1900-10-30 23:50:05,0,0.000127743,1.643454,2.913645,3.2565,3.414852,3.583038,3.764893,3.925214,4.077306,6102751.0,1900-10-30,23,1,6151439.0,6102326.0,6123711.0,14451.941954,0.02186,2516845000.0,411,6123431.0,12392.343016,5112752.0,5069616.0,5085480.0,14020.260117,1.055676,2090132000.0,5082056.0,10839.543586,8.69,0.0,2.965864,1.647069,-0.215287,1218.97,3.45,1.196892,353,0,161.727494,115.409256,-0.020073,66470,49113.022232,43135.705758,-989574.0,1081823.0,1.008048,1.008509,0.837837,1.213393,0.878295,2118525000.0,8.69,11,3,2 days 23:48:47,2,85727,1.193684,0.837742,-1.862355,1.862355,-0.130709,-7.650588,1.033816,-1.033816,48.29267,0.020707,-1041375.0,1041375.0,0.829936,1.204912
1607,1000,6843713.0,5480538.0,2.0,216,1900-11-06 23:42:30,1,0.04343208,0.1354494,0.2249088,0.320698,0.5643282,1.068284,2.515945,5.217504,6.816046,6756150.0,1900-11-06,23,1,6844414.0,6748890.0,6807536.0,26263.537565,-0.77019,2566441000.0,377,6809206.0,18041.318374,5540087.0,5440815.0,5464764.0,30135.645906,1.412544,2060216000.0,5450299.0,23649.086501,8.9,0.0,2.08557,2.649306,1.110173,786.26,0.59,2.247028,358,0,159.143236,101.832626,0.217397,59997,95524.035775,99271.486171,-1208803.0,1403598.0,1.014154,1.018246,0.820889,1.257976,1.03923,9482813000.0,8.9,0,3,2 days 23:37:11,2,85031,1.24873,0.800814,-1.539134,1.539134,0.419043,2.386392,2.182734,-2.182734,-1.83402,-0.54525,-1358907.0,1358907.0,0.800431,1.249327


### 选出特征，准备训练数据

In [27]:
# Every column is a model feature except the id, the target and the raw
# time/date columns.
feats = [
    col for col in train_label.columns
    if col not in ('ship','type','time','diff_time','date')
]
target = 'type'

In [28]:
# Feature matrix (copied) and target vector for training.
X, y = train_label[feats].copy(), train_label[target]

### 交叉验证训练模型

注意数据量只有7000行，10折或许会对训练集过拟合！

In [29]:
# 5-fold stratified cross-validation
fold = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)
# Accumulators: fitted fold models, averaged test probabilities, and the
# out-of-fold (oof) probabilities for the training rows.
models = []
pred = np.zeros((len(test_label),3))
oof = np.zeros((len(X),3))

params = {
    'n_estimators': 5000,
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': 3,
    'early_stopping_rounds': 500,
}

for index, (train_idx, val_idx) in enumerate(fold.split(X, y)):

    train_set = lgb.Dataset(X.iloc[train_idx], y.iloc[train_idx])
    val_set = lgb.Dataset(X.iloc[val_idx], y.iloc[val_idx])

    model = lgb.train(params, train_set, valid_sets=[train_set, val_set], verbose_eval=350)
    models.append(model)
    val_pred = model.predict(X.iloc[val_idx])
    oof[val_idx] = val_pred
    val_y = y.iloc[val_idx]
    val_pred = np.argmax(val_pred, axis=1)
    print(index, 'val f1', metrics.f1_score(val_y, val_pred, average='macro'))

    test_pred = model.predict(test_label[feats])
    # Average over the folds actually configured rather than a literal 5, so
    # the weighting stays correct if n_splits is changed.
    pred += test_pred / fold.get_n_splits()

oof = np.argmax(oof, axis=1)
# f1_score expects (y_true, y_pred), matching the per-fold call above.
print('oof f1', metrics.f1_score(y, oof, average='macro'))

Training until validation scores don't improve for 500 rounds.
[350]	training's multi_logloss: 0.00110168	valid_1's multi_logloss: 0.2768
Early stopping, best iteration is:
[124]	training's multi_logloss: 0.0386674	valid_1's multi_logloss: 0.228699
0 val f1 0.8809932104840357
Training until validation scores don't improve for 500 rounds.
[350]	training's multi_logloss: 0.00113219	valid_1's multi_logloss: 0.255248
Early stopping, best iteration is:
[133]	training's multi_logloss: 0.0333124	valid_1's multi_logloss: 0.222356
1 val f1 0.8898144859109749
Training until validation scores don't improve for 500 rounds.
[350]	training's multi_logloss: 0.00119017	valid_1's multi_logloss: 0.28558
Early stopping, best iteration is:
[159]	training's multi_logloss: 0.0223236	valid_1's multi_logloss: 0.248391
2 val f1 0.8846188692802449
Training until validation scores don't improve for 500 rounds.
[350]	training's multi_logloss: 0.00121903	valid_1's multi_logloss: 0.253208
Early stopping, best itera

In [30]:
# Share of each predicted class on the test set.
# `pred` keeps its per-class probabilities (not overwritten), so this cell can
# be re-run; `assign` builds a new frame instead of writing into a slice of
# `test_label`.
sub = test_label[['ship']].assign(pred=np.argmax(pred, axis=1))
print(sub['pred'].value_counts(normalize=True))

0    0.6390
1    0.2305
2    0.1305
Name: pred, dtype: float64


In [31]:
# 对比训练集标签的比例
print(train_label['type'].value_counts(1))

0    0.623000
1    0.231571
2    0.145429
Name: type, dtype: float64


In [32]:
# Mapping from the original class label to its integer code.
type_map

{'拖网': 0, '围网': 1, '刺网': 2}

In [33]:
# 创建提交文件
# sub['pred'] = sub['pred'].map(type_map_rev)
# sub.to_csv('./output/baseline3_5F_oof89202.csv', index=None, header=None)

In [34]:
# Feature importance: one small table per fold model, then the mean score per
# feature across folds, highest first.
ret = [
    pd.DataFrame({
        'name': fold_model.feature_name(),
        'score': fold_model.feature_importance(),
        'fold': fold_no,
    })
    for fold_no, fold_model in enumerate(models)
]

df = (
    pd.concat(ret)
    .groupby('name', as_index=False)['score'].mean()
    .sort_values(['score'], ascending=False)
)
df

Unnamed: 0,name,score
52,x_per_1,327.8
60,y_max,324.0
66,y_me_x_me,319.2
54,x_skew_over_y_skew,292.2
29,v_per_9,291.8
27,v_per_7,275.4
69,y_min,274.2
61,y_max_over_x_min,272.0
13,slope,271.8
65,y_me_over_x_me,266.4


### 根据重要特征再次训练
- 手动选取比较重要的特征

In [35]:
# df.to_csv('./temp/bl3_fea_importances.csv',index=0,header=0)

In [36]:
# Feature names ordered from most to least important.
good_fea = df['name'].tolist()

In [37]:
# Manual trials showed that the top 40 features work best.
new_feats = list(good_fea[:40])

In [38]:
# Rebuild the training matrix with the selected features only.
X, y = train_label[new_feats].copy(), train_label[target]

In [39]:
def get_oof_score(X,y,test_label,feats,n_splits,num_class=3):
    """Cross-validate a LightGBM multiclass model and average test predictions.

    Prints the macro-F1 of the out-of-fold predictions and the predicted class
    shares on the test set.

    Parameters
    ----------
    X : pandas.DataFrame
        Training feature matrix.
    y : pandas.Series
        Integer class labels aligned with ``X``.
    test_label : pandas.DataFrame
        Test frame; ``test_label[feats]`` is what gets predicted, so ``feats``
        should name the same columns, in the same order, as ``X``.
    feats : list of str
        Feature columns to read from ``test_label``.
    n_splits : int
        Number of stratified folds.
    num_class : int, default 3
        Number of target classes (previously hard-coded to 3).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(test_label), num_class)`` holding the class
        probabilities averaged over the folds.
    """
    fold = StratifiedKFold(n_splits=n_splits,shuffle=True,random_state=42)
    pred = np.zeros((len(test_label), num_class))
    oof = np.zeros((len(X), num_class))

    params = {
        'n_estimators': 5000,'boosting_type': 'gbdt','objective': 'multiclass',
        'num_class': num_class,'early_stopping_rounds': 500,
    }

    for train_idx, val_idx in fold.split(X, y):
        train_set = lgb.Dataset(X.iloc[train_idx], y.iloc[train_idx])
        val_set = lgb.Dataset(X.iloc[val_idx], y.iloc[val_idx])

        model = lgb.train(params, train_set, valid_sets=[train_set, val_set], verbose_eval=0)
        # Out-of-fold probabilities for the rows this fold held out.
        oof[val_idx] = model.predict(X.iloc[val_idx])

        # Each fold contributes an equal share to the test prediction.
        pred += model.predict(test_label[feats]) / n_splits

    oof = np.argmax(oof, axis=1)
    # f1_score expects (y_true, y_pred).
    print('oof f1', metrics.f1_score(y, oof, average='macro'))
    print(pd.DataFrame({'pred':np.argmax(pred,axis=1)})['pred'].value_counts(1))
    return pred

In [40]:
# 手动选取特征
# pred_results = []
# for i in [20,25,30,35,40,45,50,55]:
#     new_feats = good_fea[:i]
#     X = train_label[new_feats].copy()
#     y = train_label[target]
#     print('feature nums: {}'.format(i))
#     pred = get_oof_score(X,y,test_label,new_feats,10)
#     pred_results.append(np.argmax(pred, axis=1))
#     print('-*-'*10)

40个特征将oof的分数从0.881提到0.891，说明特征还是有不少冗余的成分，尽管特征重要性都大于0

In [41]:
# 5-fold run restricted to the 40 most important features.
pred = get_oof_score(X, y, test_label, feats=good_fea[:40], n_splits=5)

oof f1 0.891424920160552
0    0.6340
1    0.2375
2    0.1285
Name: pred, dtype: float64


预测标签比例和训练集标签比例大致相等，可以相信这个结果不会发生严重过拟合

In [42]:
# Predicted class per test ship. `assign` builds a new frame rather than
# writing a column into a slice of `test_label`.
sub = test_label[['ship']].assign(pred=np.argmax(pred, axis=1))
print(sub['pred'].value_counts(normalize=True))

0    0.6340
1    0.2375
2    0.1285
Name: pred, dtype: float64


In [43]:
# Write the submission file.
# The labels are decoded into a separate frame so `sub` keeps its integer
# codes and this cell can be re-run without mapping already-decoded labels
# to NaN.
submission = sub.assign(pred=sub['pred'].map(type_map_rev))
# Make sure the target directory exists before writing.
os.makedirs('./output', exist_ok=True)
submission.to_csv('./output/baseline3_5F_oof89142.csv', index=False, header=False)