## Baseline3

### 必要依赖包加载

In [105]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt

import os
import warnings
from tqdm import tqdm,tqdm_notebook

from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.model_selection import KFold,StratifiedKFold,train_test_split
from sklearn import metrics

from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import lightgbm as lgb

pd.set_option('display.max_columns',100)
warnings.filterwarnings('ignore')

### 加载数据

In [2]:
train = pd.read_hdf('./input/train.h5')
test = pd.read_hdf('./input/test.h5')

test['type'] = -1
data = pd.concat([train,test])

### 特征工程 <br>
- 对速度v做出va加速度的特征来,并且把加速度的各种描述性统计特征做出

In [3]:
# 关于分位数特征
def get_v_fea(df):
    try1 = df
    t = try1.groupby('ship')['v'].agg({'v_per_1':lambda x:sp.stats.mstats.hdquantiles(x,[0.1])}).reset_index()
    try1 = pd.merge(try1, t, on='ship', how='left')
    t = try1.groupby('ship')['v'].agg({'v_per_2':lambda x:sp.stats.mstats.hdquantiles(x,[0.2])}).reset_index()
    try1 = pd.merge(try1, t, on='ship', how='left')
    t = try1.groupby('ship')['v'].agg({'v_per_3':lambda x:sp.stats.mstats.hdquantiles(x,[0.3])}).reset_index()
    try1 = pd.merge(try1, t, on='ship', how='left')
    t = try1.groupby('ship')['v'].agg({'v_per_4':lambda x:sp.stats.mstats.hdquantiles(x,[0.4])}).reset_index()
    try1 = pd.merge(try1, t, on='ship', how='left')
    t = try1.groupby('ship')['v'].agg({'v_per_5':lambda x:sp.stats.mstats.hdquantiles(x,[0.5])}).reset_index()
    try1 = pd.merge(try1, t, on='ship', how='left')
    t = try1.groupby('ship')['v'].agg({'v_per_6':lambda x:sp.stats.mstats.hdquantiles(x,[0.6])}).reset_index()
    try1 = pd.merge(try1, t, on='ship', how='left')
    t = try1.groupby('ship')['v'].agg({'v_per_7':lambda x:sp.stats.mstats.hdquantiles(x,[0.7])}).reset_index()
    try1 = pd.merge(try1, t, on='ship', how='left')
    t = try1.groupby('ship')['v'].agg({'v_per_8':lambda x:sp.stats.mstats.hdquantiles(x,[0.8])}).reset_index()
    try1 = pd.merge(try1, t, on='ship', how='left')
    t = try1.groupby('ship')['v'].agg({'v_per_9':lambda x:sp.stats.mstats.hdquantiles(x,[0.9])}).reset_index()
    try1 = pd.merge(try1, t, on='ship', how='left')
    df = try1
    return df

def get_x_fea(df):
    try1 = df
    t = try1.groupby('ship')['x'].agg({'x_per_1':lambda x:sp.stats.mstats.hdquantiles(x,[0.1])}).reset_index()
    try1 = pd.merge(try1, t, on='ship', how='left')
    df = try1
    return df

In [4]:
def group_feature(df,key,target,aggs):
    agg_dict = {}
    for ag in aggs:
        agg_dict[f'{target}_{ag}'] = ag
    print(agg_dict)
    # 以key进行分组,选择特定的特征提取其函数结果作为新特征,函数包含在字典中
    t = df.groupby(key)[target].agg(agg_dict).reset_index()
    return t

def extract_feature(df,train):
    # ship指船号,对每一艘船的x,y,v,d的数据的特征进行提取
    t = group_feature(df, 'ship','x',['max','min','mean','std','skew','sum','count','median','mad'])
    train = pd.merge(train, t, on='ship', how='left')
    t = group_feature(df, 'ship','y',['max','min','mean','std','skew','sum','median','mad'])
    train = pd.merge(train, t, on='ship', how='left')
    t = group_feature(df, 'ship','v',['max','min','mean','std','skew','sum','median','mad'])
    train = pd.merge(train, t, on='ship', how='left')
    t = group_feature(df, 'ship','d',['max','min','mean','std','skew','sum'])
    train = pd.merge(train, t, on='ship', how='left')
    # 加和乘效果都不好，减和除对模型提分有帮助
    train['x_max_x_min'] = train['x_max'] - train['x_min']
    train['y_max_y_min'] = train['y_max'] - train['y_min']
    train['y_max_x_min'] = train['y_max'] - train['x_min']
    train['x_max_y_min'] = train['x_max'] - train['y_min']

    train['x_max_over_x_min'] = train['x_max'] / train['x_min']
    train['y_max_over_y_min'] = train['y_max'] / train['y_min']
    train['y_max_over_x_min'] = train['y_max'] / train['x_min']
    train['x_max_over_y_min'] = train['x_max'] / train['y_min']
    
    train['slope'] = train['y_max_y_min'] / np.where(train['x_max_x_min']==0, 0.001, train['x_max_x_min'])
    train['area'] = train['x_max_x_min'] * train['y_max_y_min']
    
    train['v_max_v_min'] = train['v_max'] - train['v_min']
    
    # mode_hour特征是指船号对应的出现频率最大的hour
    mode_hour = df.groupby('ship')['hour'].agg(lambda x:x.value_counts().index[0]).to_dict()
    # 用map将船号改为船号对应的mode_hour特征
    train['mode_hour'] = train['ship'].map(mode_hour)
    
    # 提取hour和date的独特数量的特征
    date_nunique = df.groupby('ship')['date'].nunique().to_dict()
    train['date_nunique'] = train['ship'].map(date_nunique)
    
    # 提取时间差特征,时间数据的形式缩写为dt
    t = df.groupby('ship')['time'].agg({'diff_time':lambda x:np.max(x)-np.min(x)}).reset_index()
    t['diff_day'] = t['diff_time'].dt.days
    t['diff_second'] = t['diff_time'].dt.seconds
    train = pd.merge(train, t, on='ship', how='left')
    return train

# 时间特征处理
def extract_dt(df):
    df['time'] = pd.to_datetime(df['time'], format='%m%d %H:%M:%S')
    df['date'] = df['time'].dt.date
    df['hour'] = df['time'].dt.hour
    df['weekday'] = df['time'].dt.weekday
    return df

In [5]:
# 做一些更复杂的交叉特征
def get_interact_fea(df):
    tr_train = df

    tr_train['y_mad_over_x_mad'] = tr_train['y_mad'] / tr_train['x_mad']
    tr_train['x_mad_over_y_mad'] = tr_train['x_mad'] / tr_train['y_mad']
    
    tr_train['x_over_y'] = tr_train['x'] / tr_train['y']
    tr_train['y_over_x'] = tr_train['y'] / tr_train['x']
    
    tr_train['v_skew_v_std'] = tr_train['v_skew'] - tr_train['v_std']
    tr_train['v_std_v_skew'] = tr_train['v_std'] - tr_train['v_skew']
    tr_train['v_skew_over_v_std'] = tr_train['v_skew'] / tr_train['v_std']
    tr_train['v_std_over_v_skew'] = tr_train['v_std'] / tr_train['v_skew']
    
    tr_train['y_skew_x_skew'] = tr_train['y_skew'] - tr_train['x_skew']
    tr_train['x_skew_y_skew'] = tr_train['x_skew'] - tr_train['y_skew']
    tr_train['y_skew_over_x_skew'] = tr_train['y_skew'] / tr_train['x_skew']
    tr_train['x_skew_over_y_skew'] = tr_train['x_skew'] / tr_train['y_skew']

    tr_train['y_me_x_me'] = tr_train['y_median'] - tr_train['x_median']
    tr_train['x_me_y_me'] = tr_train['x_median'] - tr_train['y_median']
    tr_train['y_me_over_x_me'] = tr_train['y_median'] / tr_train['x_median']
    tr_train['x_me_over_y_me'] = tr_train['x_median'] / tr_train['y_median']
    df = tr_train
    return df

In [7]:
t1 = get_v_fea(data)
t2 = get_x_fea(t1)
t3 = extract_dt(t2)
t4 = extract_feature(t3,t3)
t5 = get_interact_fea(t4)
a,b,c,d,e,f = [i.shape[1] for i in [data,t1,t2,t3,t4,t5]]
print('Features from {} -> {} -> {} -> {} -> {} to {}'.format(a,b,c,d,e,f))

{'x_max': 'max', 'x_min': 'min', 'x_mean': 'mean', 'x_std': 'std', 'x_skew': 'skew', 'x_sum': 'sum', 'x_count': 'count', 'x_median': 'median', 'x_mad': 'mad'}
{'y_max': 'max', 'y_min': 'min', 'y_mean': 'mean', 'y_std': 'std', 'y_skew': 'skew', 'y_sum': 'sum', 'y_median': 'median', 'y_mad': 'mad'}
{'v_max': 'max', 'v_min': 'min', 'v_mean': 'mean', 'v_std': 'std', 'v_skew': 'skew', 'v_sum': 'sum', 'v_median': 'median', 'v_mad': 'mad'}
{'d_max': 'max', 'd_min': 'min', 'd_mean': 'mean', 'd_std': 'std', 'd_skew': 'skew', 'd_sum': 'sum'}
Features from 3482016 -> 3482016 -> 3482016 -> 3482016 -> 3482016 to 3482016


In [8]:
# 速度分箱特征研究
def get_bins(df,key,target,n):
    tr = df
    score_list = tr[tr[key]==n][target]
    ma = np.ceil(max(score_list))
    mi = np.floor(min(score_list))
    bins = [mi] + [round((ma-mi)/i,2) for i in range(11,1,-1)] + [ma]
    try:
        score_cat = pd.cut(score_list,bins)
        res = list(pd.value_counts(score_cat))
        return [n,max(res)/sum(res)]
    except ValueError:
        return [n,0]

# 速度加减变化的次数研究
def get_v_change(df,key,target,n):
    tr = df
    p = tr[tr[key]==n][target]
    i = j = k = 0
    for v1,v2 in zip(p[:-1],p[1:]):
        diff = v2 - v1
        if diff > 0:
            i += 1
        elif diff == 0:
            j += 1
        else:
            k += 1
    return [n,i,j,k]

# 速度加减变化的加速度研究
def get_va_change(df,key,target,n):
    tr = df
    p = tr[tr[key]==n][target]
    tt,t = [], []
    for v1,v2 in zip(p[:-1],p[1:]):
        diff = v2 - v1
        t.append(diff)
    va_sum,va_max,va_min = sum(t), max(t), min(t)
    tt += [va_sum,va_max,va_min]
    va_mean,va_median,va_std = np.mean(t),np.median(t),np.std(t)
    tt += [va_mean,va_median,va_std]
    va_skew = sp.stats.skew(t)
    tt += [va_skew]
    return [n] + tt

In [11]:
temp1,temp2,temp3 = [], [], []
cols = ['ship','va_sum','va_max','va_min','va_mean','va_median','va_std','va_skew']

for i in tqdm(range(9000)):
    list1 = get_bins(data,'ship','v',i)
    list2 = get_v_change(data,'ship','v',i)
    list3 = get_va_change(data,'ship','v',i)
    temp_df1 = pd.DataFrame([list1],columns=['ship','por_v'])
    temp_df2 = pd.DataFrame([list2],columns=['ship','v_incre','v_keep','v_decre'])
    temp_df3 = pd.DataFrame([list3],columns=cols)
    temp1.append(temp_df1)
    temp2.append(temp_df2)
    temp3.append(temp_df3)
    
df1 = pd.concat(temp1)
df2 = pd.concat(temp2)
df3 = pd.concat(temp3)

100%|██████████████████████████████████████████████████████████████████████████████| 9000/9000 [03:34<00:00, 41.95it/s]


In [12]:
print(df1.head(),'\n',df2.head(),'\n',df3.head())
print('\n')
print(df1.shape,df2.shape,df3.shape)

new_data = t5
new_data = pd.merge(new_data, df1, on='ship', how='left')
new_data = pd.merge(new_data, df2, on='ship', how='left')
new_data = pd.merge(new_data, df3, on='ship', how='left')

new_data.shape

   ship     por_v
0     0  0.577778
0     1  0.464567
0     2  0.977654
0     3  0.611321
0     4  0.729412 
    ship  v_incre  v_keep  v_decre
0     0       30     354       29
0     1      140     106      138
0     2       87      59       86
0     3      122      89      123
0     4      179      61      160 
    ship  va_sum  va_max  va_min   va_mean  va_median    va_std   va_skew
0     0   -2.59    6.80   -6.31 -0.006271        0.0  0.498274  0.802494
0     1   -3.99    3.19   -4.58 -0.010391        0.0  0.640811 -1.459248
0     2   -0.21   49.97  -50.46 -0.000905        0.0  4.839439 -0.139071
0     3    1.35    8.58   -9.39  0.004042        0.0  1.613107 -0.212757
0     4   -0.22    7.40   -9.39 -0.000550        0.0  2.028397 -0.354715


(9000, 2) (9000, 4) (9000, 8)


(3482016, 94)

In [47]:
# 每艘船都有很多数据,针对不同(x,y,v,d)提取的特征都是单个值,所以需要去重操作
data_label = new_data.drop_duplicates('ship')

{'拖网': 0, '围网': 1, '刺网': 2}
(9000, 94)


In [48]:
data_label.to_csv('bl4_features.csv')

### 选出特征，准备训练数据

In [49]:
feats = [i for i in new_data.columns if i not in ['ship','type','time','diff_time','date']]
target = 'type'

In [50]:
X = data_label[data_label['type']!=-1][feats].copy()
y = data_label[data_label['type']!=-1][target]
T = data_label[data_label['type']==-1][feats]

In [56]:
# 将预测变量改为数值形式
type_map = dict(zip(y.unique(),np.arange(3)))
type_map_rev = {v:k for k,v in type_map.items()}
y = y.map(type_map)

print(type_map)
print(data_label.shape)

{'拖网': 0, '围网': 1, '刺网': 2}
(9000, 94)


In [62]:
scaler1 = MinMaxScaler()
scaler2 = StandardScaler()

X1,T1 = X,T

# X1.replace([np.inf,-np.inf],np.nan)
# T1.replace([np.inf,-np.inf],np.nan)
X1[np.isinf(X1)] = np.nan
T1[np.isinf(T1)] = np.nan

X1.fillna(0,inplace=True)
T1.fillna(0,inplace=True)

X1 = scaler1.fit_transform(X1)
X2 = scaler2.fit_transform(X1)
T1 = scaler1.fit_transform(T1)
T2 = scaler2.fit_transform(T1)

### 交叉验证训练模型

注意数据量只有7000行，10折或许会过对训练集拟合！

In [94]:
def lgb_oof_score(X,y,T,n_splits,verbose=100):
    fold = StratifiedKFold(n_splits=n_splits,shuffle=True,random_state=42)
    pred, oof = np.zeros((len(T),3)), np.zeros((len(X),3))

    params = {
        'n_estimators': 5000,'boosting_type': 'gbdt','objective': 'multiclass',
        'num_class': 3,'early_stopping_rounds': 200,
    }

    for index, (train_idx, val_idx) in enumerate(fold.split(X, y)):
        train_set = lgb.Dataset(X.iloc[train_idx], y.iloc[train_idx])
        val_set = lgb.Dataset(X.iloc[val_idx], y.iloc[val_idx])

        model = lgb.train(params, train_set, valid_sets=[train_set, val_set], verbose_eval=verbose)
        val_pred = model.predict(X.iloc[val_idx])
        oof[val_idx] = val_pred
        val_y = y.iloc[val_idx]
        val_pred = np.argmax(val_pred, axis=1)
        score = metrics.f1_score(val_y, val_pred, average='macro')
        print('Fold ',index+1 ,' Val F1-Score: ', round(score,5))

        test_pred = model.predict(T)
        pred += test_pred/n_splits

    oof = np.argmax(oof, axis=1)
    t_score = metrics.f1_score(oof, y, average='macro')
    print('oof F1-Score',round(t_score,5),'\n' )
    print('Proportion of Prediction Label')
    print(pd.DataFrame({'pred':np.argmax(pred,axis=1)})['pred'].value_counts(1))
    return pred,model

In [66]:
# 对比训练集标签的比例
print(type_map)
print(y.value_counts(1))

{'拖网': 0, '围网': 1, '刺网': 2}
0    0.623000
1    0.231571
2    0.145429
Name: type, dtype: float64


In [84]:
pred, model = lgb_oof_score(X,y,T,5)

Training until validation scores don't improve for 200 rounds.
[100]	training's multi_logloss: 0.0550215	valid_1's multi_logloss: 0.237831
[200]	training's multi_logloss: 0.0101541	valid_1's multi_logloss: 0.241664
[300]	training's multi_logloss: 0.0020164	valid_1's multi_logloss: 0.268311
Early stopping, best iteration is:
[137]	training's multi_logloss: 0.0292724	valid_1's multi_logloss: 0.234268
Fold  1  Val F1-Score:  0.88609
Training until validation scores don't improve for 200 rounds.
[100]	training's multi_logloss: 0.0546533	valid_1's multi_logloss: 0.234812
[200]	training's multi_logloss: 0.0103901	valid_1's multi_logloss: 0.224471
[300]	training's multi_logloss: 0.00206653	valid_1's multi_logloss: 0.240765
Early stopping, best iteration is:
[187]	training's multi_logloss: 0.0128151	valid_1's multi_logloss: 0.223033
Fold  2  Val F1-Score:  0.89458
Training until validation scores don't improve for 200 rounds.
[100]	training's multi_logloss: 0.0546749	valid_1's multi_logloss: 0

In [85]:
# 查看特征重要性
ret = []
for index, model in enumerate([model]):
    df = pd.DataFrame()
    df['name'] = model.feature_name()
    df['score'] = model.feature_importance()
    df['fold'] = index
    ret.append(df)
    
df = pd.concat(ret)
df = df.groupby('name', as_index=False)['score'].mean()
df = df.sort_values(['score'], ascending=False)
df

Unnamed: 0,name,score
64,x_per_1,348
73,y_max,335
79,y_me_x_me,315
82,y_min,304
57,x_max_y_min,295
...,...,...
48,weekday,13
11,hour,10
9,diff_day,7
4,d_min,1


### 特征重选

#### 手动选取重要性靠前的特征

In [87]:
all_feas = list(df.name)
good_feas = all_feas[:45]

In [88]:
lgb_oof_score(X[good_feas],y,T[good_feas],5)

Training until validation scores don't improve for 200 rounds.
[100]	training's multi_logloss: 0.0610283	valid_1's multi_logloss: 0.231908
[200]	training's multi_logloss: 0.0128107	valid_1's multi_logloss: 0.229281
[300]	training's multi_logloss: 0.00276735	valid_1's multi_logloss: 0.246715
Early stopping, best iteration is:
[148]	training's multi_logloss: 0.0289301	valid_1's multi_logloss: 0.225003
Fold  1  Val F1-Score:  0.89311
Training until validation scores don't improve for 200 rounds.
[100]	training's multi_logloss: 0.0605514	valid_1's multi_logloss: 0.229683
[200]	training's multi_logloss: 0.0127828	valid_1's multi_logloss: 0.219484
[300]	training's multi_logloss: 0.00283041	valid_1's multi_logloss: 0.233977
Early stopping, best iteration is:
[181]	training's multi_logloss: 0.0171628	valid_1's multi_logloss: 0.217532
Fold  2  Val F1-Score:  0.89246
Training until validation scores don't improve for 200 rounds.
[100]	training's multi_logloss: 0.0618003	valid_1's multi_logloss: 

(array([[1.41408635e-04, 9.99416104e-01, 4.42487782e-04],
        [9.99516228e-01, 2.90891004e-04, 1.92881251e-04],
        [3.94046693e-02, 9.57297155e-01, 3.29817565e-03],
        ...,
        [4.97882825e-03, 3.47923244e-01, 6.47097928e-01],
        [9.98557392e-01, 8.04798595e-04, 6.37809095e-04],
        [1.53383131e-02, 4.14102219e-01, 5.70559468e-01]]),
 <lightgbm.basic.Booster at 0x1e1636ce7b8>)

#### PCA降维

In [89]:
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X)
T_pca = pca.fit_transform(T)
pca.explained_variance_ratio_

array([9.99804785e-01, 1.94957232e-04, 2.57334479e-07, 1.04026280e-10,
       3.16291421e-12])

In [97]:
_,_ = lgb_oof_score(pd.DataFrame(X_pca),y,pd.DataFrame(T_pca),5,0)

Fold  1  Val F1-Score:  0.78814
Fold  2  Val F1-Score:  0.76164
Fold  3  Val F1-Score:  0.78811
Fold  4  Val F1-Score:  0.78404
Fold  5  Val F1-Score:  0.7735
oof F1-Score 0.77915 

Proportion of Prediction Label
0    0.5530
2    0.2435
1    0.2035
Name: pred, dtype: float64


In [98]:
for i in range(5,16):
    pca = PCA(n_components=i)
    X_pca = pca.fit_transform(X)
    T_pca = pca.fit_transform(T)  
    _,_ = lgb_oof_score(pd.DataFrame(X_pca),y,pd.DataFrame(T_pca),5,0)

Fold  1  Val F1-Score:  0.78814
Fold  2  Val F1-Score:  0.76164
Fold  3  Val F1-Score:  0.78811
Fold  4  Val F1-Score:  0.78404
Fold  5  Val F1-Score:  0.7735
oof F1-Score 0.77915 

Proportion of Prediction Label
0    0.5530
2    0.2435
1    0.2035
Name: pred, dtype: float64
Fold  1  Val F1-Score:  0.78278
Fold  2  Val F1-Score:  0.75495
Fold  3  Val F1-Score:  0.77759
Fold  4  Val F1-Score:  0.80343
Fold  5  Val F1-Score:  0.77105
oof F1-Score 0.77817 

Proportion of Prediction Label
0    0.6135
1    0.1990
2    0.1875
Name: pred, dtype: float64
Fold  1  Val F1-Score:  0.77356
Fold  2  Val F1-Score:  0.75521
Fold  3  Val F1-Score:  0.79022
Fold  4  Val F1-Score:  0.80109
Fold  5  Val F1-Score:  0.77389
oof F1-Score 0.779 

Proportion of Prediction Label
0    0.6905
1    0.2065
2    0.1030
Name: pred, dtype: float64
Fold  1  Val F1-Score:  0.77323
Fold  2  Val F1-Score:  0.76751
Fold  3  Val F1-Score:  0.78456
Fold  4  Val F1-Score:  0.79938
Fold  5  Val F1-Score:  0.7761
oof F1-Score 

### 模型对比

因为一些模型的输入要归一化或标准化，所以先用lgb试一试normlized data

针对T1的预测值偏差太大，0类标签高达0.777，显然过拟合

In [101]:
lgb_oof_score(pd.DataFrame(X1),y,pd.DataFrame(T1),5,0)

Fold  1  Val F1-Score:  0.89141
Fold  2  Val F1-Score:  0.88912
Fold  3  Val F1-Score:  0.8832
Fold  4  Val F1-Score:  0.88258
Fold  5  Val F1-Score:  0.87515
oof F1-Score 0.88423 

Proportion of Prediction Label
0    0.777
1    0.207
2    0.016
Name: pred, dtype: float64


(array([[6.50800362e-04, 9.98200602e-01, 1.14859786e-03],
        [9.98737635e-01, 8.67281186e-04, 3.95083640e-04],
        [1.58283046e-01, 7.96367217e-01, 4.53497373e-02],
        ...,
        [8.69451159e-02, 7.47675733e-01, 1.65379152e-01],
        [9.30889745e-01, 5.33091642e-02, 1.58010913e-02],
        [4.86451764e-01, 2.95419458e-01, 2.18128778e-01]]),
 <lightgbm.basic.Booster at 0x1e1632655c0>)

标准化的数据X2训练后用T2预测也得到过拟合的结果

In [102]:
lgb_oof_score(pd.DataFrame(X2),y,pd.DataFrame(T2),5,0)

Fold  1  Val F1-Score:  0.89541
Fold  2  Val F1-Score:  0.87838
Fold  3  Val F1-Score:  0.88144
Fold  4  Val F1-Score:  0.88105
Fold  5  Val F1-Score:  0.87523
oof F1-Score 0.88228 

Proportion of Prediction Label
0    0.573
1    0.296
2    0.131
Name: pred, dtype: float64


(array([[2.92252615e-04, 9.98559896e-01, 1.14785153e-03],
        [9.94863447e-01, 2.23298309e-03, 2.90356949e-03],
        [3.46825127e-02, 9.60111875e-01, 5.20561244e-03],
        ...,
        [7.50721000e-03, 5.55302914e-01, 4.37189876e-01],
        [9.85980376e-01, 9.21983504e-03, 4.79978849e-03],
        [2.63574584e-02, 6.93902437e-01, 2.79740104e-01]]),
 <lightgbm.basic.Booster at 0x1e1636c2d68>)

In [130]:
def get_model_score(Model,X,y,T,n_split):
    fold = StratifiedKFold(n_splits=n_split,shuffle=True,random_state=42)
    models = []
    pred = np.zeros((len(T),3))
    oof = np.zeros((len(X),1))

    for index, (train_idx, val_idx) in enumerate(fold.split(X, y)):
        train_x = X.iloc[train_idx]
        train_y = y.iloc[train_idx]
        val_x = X.iloc[val_idx]
        val_y = y.iloc[val_idx]
        if type(Model) == CatBoostClassifier:
            model = Model.fit(train_x,train_y,silent=True)
        else:
            model = Model.fit(train_x,train_y)
        models.append(model)
        p = val_pred = model.predict(X.iloc[val_idx])
        oof[val_idx] = val_pred.reshape(-1,1)
        val_y = y.iloc[val_idx]
        
        print(index+1, 'Val F1-Score', metrics.f1_score(val_y, val_pred, average='macro'))

        test_proba = model.predict_proba(T)
        pred += test_proba/n_split
    print('oof f1', metrics.f1_score(oof, y, average='macro'))
    return np.argmax(pred,axis=1),model

In [106]:
clf1 = LogisticRegression()
clf2 = svm.SVC()
clf3 = GaussianNB()
clf4 = KNeighborsClassifier()
clf5 = RandomForestClassifier()
clf6 = AdaBoostClassifier()
clf7 = GradientBoostingClassifier()
clf8 = XGBClassifier()
clf9 = LGBMClassifier()
clf10 = CatBoostClassifier()
clf11 = ExtraTreesClassifier()

clfs = [clf1,clf2,clf3,clf4,clf5,clf6,clf7,clf8,clf9,clf10,clf11]

In [114]:
for clf in clfs:
    print(type(clf))
    _,_ = get_model_score(clf,pd.DataFrame(X2),y,pd.DataFrame(T2),5)
    print('-*-'*30)
    print('\n')

<class 'sklearn.linear_model.logistic.LogisticRegression'>
1 Val F1-Score 0.6398210420056181
2 Val F1-Score 0.6441856360771774
3 Val F1-Score 0.6236970420775785
4 Val F1-Score 0.6297478310078696
5 Val F1-Score 0.6160391891837704
oof f1 0.6311750471744221
-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*-


<class 'sklearn.svm.classes.SVC'>
1 Val F1-Score 0.6888400538615457
2 Val F1-Score 0.6995513124562258
3 Val F1-Score 0.6983390667829065
4 Val F1-Score 0.694829578653509
5 Val F1-Score 0.6856762573411775
oof f1 0.6935640568701166
-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*-


<class 'sklearn.naive_bayes.GaussianNB'>
1 Val F1-Score 0.5595013219952458
2 Val F1-Score 0.5726557234730362
3 Val F1-Score 0.5398686958595917
4 Val F1-Score 0.543938957185248
5 Val F1-Score 0.5709789018872683
oof f1 0.5575281496116499
-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*-


<

#### 对表现较好的模型进行简单调参

极端随机树展现出惊人的实力！

In [135]:
new_clf11 = ExtraTreesClassifier(n_estimators=800)
pred,_ = get_model_score(new_clf11,X[good_feas],y,T[good_feas],5)
print(pd.DataFrame(pred)[0].value_counts(1))

1 Val F1-Score 0.9007356871434542
2 Val F1-Score 0.8899338549139572
3 Val F1-Score 0.9017125988572889
4 Val F1-Score 0.8899536174269292
5 Val F1-Score 0.8857607540369178
oof f1 0.893604078161521
0    0.6405
1    0.2385
2    0.1210
Name: 0, dtype: float64


随机森林也不错！

In [139]:
new_clf5 = RandomForestClassifier(n_estimators=800)
pred,_ = get_model_score(new_clf5,X[good_feas],y,T[good_feas],5)
print(pd.DataFrame(pred)[0].value_counts(1))

1 Val F1-Score 0.893772634787218
2 Val F1-Score 0.8950270431940347
3 Val F1-Score 0.8936606817935931
4 Val F1-Score 0.8898857931040767
5 Val F1-Score 0.8681637187607357
oof f1 0.8880473073375735
0    0.6405
1    0.2375
2    0.1220
Name: 0, dtype: float64


In [157]:
params = {
    'n_estimators':500,
    'learning_rate': 0.01,
    'max_depth': 10,
    'random_state':42
}

XGB有点失望,速度慢,分数也不高

In [158]:
new_clf8 = XGBClassifier(**params)
pred,_ = get_model_score(new_clf8,X[good_feas],y,T[good_feas],5)
print(pd.DataFrame(pred)[0].value_counts(1))

1 Val F1-Score 0.8914344019577346
2 Val F1-Score 0.8875030261498814
3 Val F1-Score 0.8828054858982693
4 Val F1-Score 0.8856627669381975
5 Val F1-Score 0.8759302123254148
oof f1 0.8846495722541001
0    0.639
1    0.242
2    0.119
Name: 0, dtype: float64


CAT效果一般，速度太慢

In [160]:
params = {
    'iterations':500,
    'learning_rate': 0.01,
    'depth': 10,
    'random_state':42
}

In [161]:
new_clf10 = CatBoostClassifier(**params)
pred,_ = get_model_score(new_clf10,X[good_feas],y,T[good_feas],5)
print(pd.DataFrame(pred)[0].value_counts(1))

1 Val F1-Score 0.7901082583518124


KeyboardInterrupt: 

LGB

In [164]:
params = {
    'n_estimators':500,
    'learning_rate': 0.01,
    'random_state':42,
    'silent':True
}

In [165]:
new_clf9 = LGBMClassifier(**params)
pred,_ = get_model_score(new_clf9,X[good_feas],y,T[good_feas],5)
print(pd.DataFrame(pred)[0].value_counts(1))

1 Val F1-Score 0.8737584738357386
2 Val F1-Score 0.8649272415130641
3 Val F1-Score 0.8707301768206724
4 Val F1-Score 0.8739250445906485
5 Val F1-Score 0.8574317988112056
oof f1 0.8681745934137491
0    0.6315
1    0.2430
2    0.1255
Name: 0, dtype: float64


### 模型融合

In [166]:
def Stacking(Models,X,y,T,feats,n_split):
    X = X[feats]
    New_features = np.zeros((X.shape[0],len(Models)))
    New_test = np.zeros((T.shape[0],len(Models)))
    
    for i,clf in enumerate(Models):
        # 第i个模型对测试集的预测
        New_test_i = np.zeros((T.shape[0],3))
        # 第i个模型对训练集的预测概率
        New_features_i = np.zeros((X.shape[0],3))
        
        pred = np.zeros((len(T),1))
        oof = np.zeros((len(X),1))
        
        fold = StratifiedKFold(n_splits=n_split,shuffle=True,random_state=42)
        models = []
        
        for j, (train_idx, val_idx) in enumerate(fold.split(X, y)):
            train_x = X.iloc[train_idx]
            train_y = y.iloc[train_idx]
            val_x = X.iloc[val_idx]
            val_y = y.iloc[val_idx]
            if type(clf) == CatBoostClassifier:
                clf.fit(train_x,train_y,silent=True)
            else:
                clf.fit(train_x,train_y)
            models.append(clf)
            val_pred = clf.predict(X.iloc[val_idx])
            oof[val_idx] = val_pred.reshape(-1,1)
            val_y = y.iloc[val_idx]
            score = metrics.f1_score(val_y, val_pred, average='macro')
            print('Model {} in {} Fold Val F1-Score:'.format(i+1,j),round(score,5))

            val_pred_prob = clf.predict_proba(X.iloc[val_idx])
            New_features_i[val_idx] = val_pred_prob
            
            # 第i个模型在第j折的情况下对全部测试集的预测
            test_prob = clf.predict_proba(T[feats])
            New_test_i += test_prob/n_split
        t_score = metrics.f1_score(oof, y, average='macro')
        print('Model {} oof F1-Score:'.format(i+1),round(t_score,5) )
        
    # 简单算数融合训练集预测值作为新特征
    New_features += New_features_i/len(Models)
    New_test += New_test_i/len(Models)
    # 输出第一层模型得到的新特征，新测试集，以及训练好的模型
    return New_features,New_test,models

第一层：clf5-RF,clf8-XGB,clf11-ET

In [167]:
M1 = [new_clf5,new_clf8,new_clf11]
X_2,T_2,M1_models = Stacking(M1,X,y,T,good_fea[:45],5)

Model 1 in 0 Fold Val F1-Score: 0.89365
Model 1 in 1 Fold Val F1-Score: 0.89599
Model 1 in 2 Fold Val F1-Score: 0.8948
Model 1 in 3 Fold Val F1-Score: 0.88789
Model 1 in 4 Fold Val F1-Score: 0.86771
Model 1 oof F1-Score: 0.88797
Model 2 in 0 Fold Val F1-Score: 0.89143
Model 2 in 1 Fold Val F1-Score: 0.8875
Model 2 in 2 Fold Val F1-Score: 0.88281
Model 2 in 3 Fold Val F1-Score: 0.88566
Model 2 in 4 Fold Val F1-Score: 0.87593
Model 2 oof F1-Score: 0.88465
Model 3 in 0 Fold Val F1-Score: 0.89886
Model 3 in 1 Fold Val F1-Score: 0.8867
Model 3 in 2 Fold Val F1-Score: 0.90107
Model 3 in 3 Fold Val F1-Score: 0.8891
Model 3 in 4 Fold Val F1-Score: 0.88436
Model 3 oof F1-Score: 0.89201


In [168]:
print(X_2.shape,T_2.shape)

(7000, 3) (2000, 3)


第二层：lgb or ET

In [170]:
fold = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)
pred = np.zeros((len(T),3))
oof = np.zeros((len(X),3))

params = {
    'n_estimators': 5000,
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': 3,
    'early_stopping_rounds': 500,
}

for index, (train_idx, val_idx) in enumerate(fold.split(X_2, y)):

    train_set = lgb.Dataset(X.iloc[train_idx], y.iloc[train_idx])
    val_set = lgb.Dataset(X.iloc[val_idx], y.iloc[val_idx])

    model = lgb.train(params, train_set, valid_sets=[train_set, val_set], verbose_eval=300)
    val_pred = model.predict(X.iloc[val_idx])
    oof[val_idx] = val_pred
    val_y = y.iloc[val_idx]
    val_pred = np.argmax(val_pred, axis=1)
    print(index, 'val f1', metrics.f1_score(val_y, val_pred, average='macro'))

    test_pred = model.predict(T_2)
    pred += test_pred/5

oof = np.argmax(oof, axis=1)
print('oof f1', metrics.f1_score(oof, y, average='macro'))

Training until validation scores don't improve for 500 rounds.
[300]	training's multi_logloss: 0.0020164	valid_1's multi_logloss: 0.268311
[600]	training's multi_logloss: 1.65539e-05	valid_1's multi_logloss: 0.384211
Early stopping, best iteration is:
[137]	training's multi_logloss: 0.0292724	valid_1's multi_logloss: 0.234268
0 val f1 0.8860908934907261
Training until validation scores don't improve for 500 rounds.
[300]	training's multi_logloss: 0.00206653	valid_1's multi_logloss: 0.240765
[600]	training's multi_logloss: 1.72e-05	valid_1's multi_logloss: 0.324521
Early stopping, best iteration is:
[187]	training's multi_logloss: 0.0128151	valid_1's multi_logloss: 0.223033
1 val f1 0.8945801905665299
Training until validation scores don't improve for 500 rounds.
[300]	training's multi_logloss: 0.00201991	valid_1's multi_logloss: 0.277985
[600]	training's multi_logloss: 1.67566e-05	valid_1's multi_logloss: 0.38493
Early stopping, best iteration is:
[171]	training's multi_logloss: 0.0164

In [173]:
Final_clf11 = ExtraTreesClassifier(n_estimators=800)
pred,_ = get_model_score(Final_clf11,pd.DataFrame(X_2),y,pd.DataFrame(T_2),5)
print(pd.DataFrame(pred)[0].value_counts(1))

1 Val F1-Score 0.8910725913440749
2 Val F1-Score 0.871716676763367
3 Val F1-Score 0.8855261987123426
4 Val F1-Score 0.8676795395466496
5 Val F1-Score 0.8572779032290363
oof f1 0.8746172085703309
0    0.6315
1    0.2285
2    0.1400
Name: 0, dtype: float64


在LGB和ET分别作为第二层模型的情况下，效果都不如单个模型表现好，尝试使用所用的模型做第二层

In [177]:
clf1 = LogisticRegression()
clf2 = svm.SVC(probability=True)
clf3 = GaussianNB()
clf4 = KNeighborsClassifier()
clf5 = RandomForestClassifier(n_estimators=818)
clf6 = AdaBoostClassifier()
clf7 = GradientBoostingClassifier()
clf8 = XGBClassifier()
clf9 = LGBMClassifier()
clf10 = CatBoostClassifier()
clf11 = ExtraTreesClassifier(n_estimators=818)

clfs = [clf1,clf2,clf3,clf4,clf5,clf6,clf7,clf8,clf9,clf10,clf11]

In [179]:
result = []
i = 0
for clf in clfs:
    i += 1
    if clf == clf9 or clf == clf11:
        continue
    print('Model ',i,type(clf))
    pred,_ = get_model_score(clf,pd.DataFrame(X_2),y,pd.DataFrame(T_2),5)
    result.append(pred)
    print(pd.DataFrame(pred)[0].value_counts(1))
    print('-*-'*30)

Model  1 <class 'sklearn.linear_model.logistic.LogisticRegression'>
1 Val F1-Score 0.8992799715388822
2 Val F1-Score 0.8879686115323421
3 Val F1-Score 0.9007923484039239
4 Val F1-Score 0.8908328440197274
5 Val F1-Score 0.8832457152314848
oof f1 0.8924028053963067
0    0.648
1    0.233
2    0.119
Name: 0, dtype: float64
-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*-
Model  2 <class 'sklearn.svm.classes.SVC'>
1 Val F1-Score 0.8937041758535003
2 Val F1-Score 0.8926043706210587
3 Val F1-Score 0.9070373903957112
4 Val F1-Score 0.8895557881905288
5 Val F1-Score 0.8899510806756107
oof f1 0.8945451587072305
0    0.6325
1    0.2330
2    0.1345
Name: 0, dtype: float64
-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*-
Model  3 <class 'sklearn.naive_bayes.GaussianNB'>
1 Val F1-Score 0.8897350764807016
2 Val F1-Score 0.9007955992272758
3 Val F1-Score 0.9060219912056936
4 Val F1-Score 0.8850131270506068
5 Val F1-Sco

在胡乱实验的情况下发现SVM竟然可以进一步提升效果！

In [185]:
sub = data_label[data_label['type']==-1][['ship']]
sub['pred'] = result[1]
sub['pred'] = sub['pred'].map(type_map_rev)

In [186]:
sub['pred'].value_counts(1)

拖网    0.6325
围网    0.2330
刺网    0.1345
Name: pred, dtype: float64

In [188]:
sub.to_csv('./output/bs4_stacking_89454.csv',index=False,header=False)