In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import os
# from tqdm import tqdm
from tqdm import tqdm_notebook as tqdm
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
import xgboost as xgb
import warnings
import matplotlib.pyplot as plt
from inspect import isfunction
from functools import partial


pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
warnings.filterwarnings('ignore')

In [520]:
def group_feature(df, key, target, aggs):   
    agg_dict = {}
    for ag in aggs:
        if isfunction(ag):
            if (ag==pd.Series.kurt):
                agg_dict[f'{target}_kurt'] = ag
            elif (ag==q_25):
                agg_dict[f'{target}_25'] = ag
            elif (ag==q_75):
                agg_dict[f'{target}_75'] = ag
            else:
                agg_dict[f'{target}_mode'] = ag
        else:
            agg_dict[f'{target}_{ag}'] = ag
    print(agg_dict)
    t = df.groupby(key)[target].agg(agg_dict).reset_index()
    return t

def q_25(x):
    return x.quantile(q=0.25)
def q_75(x):
    return x.quantile(q=0.75)
def modex(x):
    return np.mean(pd.Series.mode(x))
def corrxy(x):
    return train['x'].corr(train['y'])

def extract_feature(df, train):
    t = group_feature(df, 'ship','x',['max','min','median','mean','std','skew','sum',q_25,q_75,pd.Series.kurt,modex])
    train = pd.merge(train, t, on='ship', how='left')
    t = group_feature(df, 'ship','x',['count'])
    train = pd.merge(train, t, on='ship', how='left')
    t = group_feature(df, 'ship','y',['max','min','median','mean','std','skew','sum',q_25,q_75,pd.Series.kurt,modex])
    train = pd.merge(train, t, on='ship', how='left')
    t = group_feature(df, 'ship','v',['max','min','median','mean','std','skew','sum',q_25,q_75,pd.Series.kurt,modex])
    train = pd.merge(train, t, on='ship', how='left')
    t = group_feature(df, 'ship','d',['max','min','median','mean','std','skew','sum',q_25,q_75,pd.Series.kurt,modex])
    train = pd.merge(train, t, on='ship', how='left')

    df['diffx']=df['x'].diff()
    df['diffy']=df['y'].diff()
    df['diffv']=df['v'].diff()
    df['diffd']=df['d'].diff()
    df['difflen']=(df['diffx']**2+df['diffy']**2)**0.5
    t = group_feature(df, 'ship','diffx',['max','min','median','mean','std','skew','sum',q_25,q_75,pd.Series.kurt,modex])
    train = pd.merge(train, t, on='ship', how='left')
    t = group_feature(df, 'ship','diffy',['max','min','median','mean','std','skew','sum',q_25,q_75,pd.Series.kurt,modex])
    train = pd.merge(train, t, on='ship', how='left')
    t = group_feature(df, 'ship','diffv',['max','min','median','mean','std','skew','sum',q_25,q_75,pd.Series.kurt,modex])
    train = pd.merge(train, t, on='ship', how='left')
    t = group_feature(df, 'ship','diffd',['max','min','median','mean','std','skew','sum',q_25,q_75,pd.Series.kurt,modex])
    train = pd.merge(train, t, on='ship', how='left')
    t = group_feature(df, 'ship','difflen',['max','min','median','mean','std','skew','sum',q_25,q_75,pd.Series.kurt,modex])
    train = pd.merge(train, t, on='ship', how='left')
    
    t=df[(df['v']==0)].groupby(['ship','x','y']).size().groupby('ship').idxmax().apply(pd.Series).iloc[:,1:]
    t.rename(columns={1:'x_0', 2:'y_0'},inplace = True)
    train = pd.merge(train, t, on='ship', how='left')
    
    train['x_0_x_max']=train['x_max']-train['x_0']
    train['x_0_x_mean']=train['x_0']-train['x_mean']
    train['x_0_x_min']=train['x_0']-train['x_min']
    train['x_0_x_25']=train['x_0']-train['x_25']
    train['x_0_x_median']=train['x_0']-train['x_median']
    train['x_0_x_75']=train['x_0']-train['x_75']
    train['x_0_x_mode']=train['x_0']-train['x_mode']

    train['y_0_y_max']=train['y_max']-train['y_0']
    train['y_0_y_mean']=train['y_0']-train['y_mean']
    train['y_0_y_min']=train['y_0']-train['y_min']
    train['y_0_y_25']=train['y_0']-train['y_25']
    train['y_0_y_median']=train['y_0']-train['y_median']
    train['y_0_y_75']=train['y_0']-train['y_75']
    train['y_0_y_mode']=train['y_0']-train['y_mode']

    train['x_max_x_min'] = train['x_max'] - train['x_min']
    train['y_max_y_min'] = train['y_max'] - train['y_min']
    
    train['y_max_x_min'] = train['y_max'] - train['x_min']
    train['y_max_x_max'] = train['y_max'] - train['x_max']
    train['x_max_y_min'] = train['x_max'] - train['y_min']
    train['x_min_y_min'] = train['x_min'] - train['y_min']
    
    train['slope'] = train['y_max_y_min'] / np.where(train['x_max_x_min']==0, 0.001, train['x_max_x_min'])
    train['area'] = train['x_max_x_min'] * train['y_max_y_min']

    t = df.groupby('ship')['time'].agg({'diff_time':lambda x:np.max(x)-np.min(x)}).reset_index()
    t['diff_second'] = t['diff_time'].dt.seconds
    train = pd.merge(train, t, on='ship', how='left')
    
    df['v_cos'] = df['v'] * np.cos(df['d'])
    df['v_sin'] = df['v'] * np.sin(df['d'])
    t = group_feature(df, 'ship','v_cos',['max','min','median','mean','std','skew','sum',q_25,q_75,pd.Series.kurt,modex])
    train = pd.merge(train, t, on='ship', how='left')
    t = group_feature(df, 'ship','v_sin',['max','min','median','mean','std','skew','sum',q_25,q_75,pd.Series.kurt,modex])
    train = pd.merge(train, t, on='ship', how='left')  
    
    df['k']=df['y']/df['x']
    df['b']=df['y']-df['k'].mean()*df['x']
    t = group_feature(df, 'ship','k',['max','min','median','mean','std','skew','sum',q_25,q_75,pd.Series.kurt,modex])
    train = pd.merge(train, t, on='ship', how='left')
    t = group_feature(df, 'ship','b',['max','min','median','mean','std','skew','sum',q_25,q_75,pd.Series.kurt,modex])
    train = pd.merge(train, t, on='ship', how='left') 
    
    df_night=df[(df['hour']<6) | (df['hour']>=18)]
    t = group_feature(df_night, 'ship','x',['max','min','median','mean','std','skew','sum',q_25,q_75,pd.Series.kurt,modex])
    t.columns=['ship']+[x+'_night' for x in t.columns[1:].tolist()]
    train = pd.merge(train, t, on='ship', how='left')
    t = group_feature(df_night, 'ship','y',['max','min','median','mean','std','skew','sum',q_25,q_75,pd.Series.kurt,modex])
    t.columns=['ship']+[x+'_night' for x in t.columns[1:].tolist()]
    train = pd.merge(train, t, on='ship', how='left')
    t = group_feature(df_night, 'ship','v',['max','min','median','mean','std','skew','sum',q_25,q_75,pd.Series.kurt,modex])
    t.columns=['ship']+[x+'_night' for x in t.columns[1:].tolist()]
    train = pd.merge(train, t, on='ship', how='left')
    t = group_feature(df_night, 'ship','d',['max','min','median','mean','std','skew','sum',q_25,q_75,pd.Series.kurt,modex])
    t.columns=['ship']+[x+'_night' for x in t.columns[1:].tolist()]
    train = pd.merge(train, t, on='ship', how='left')
    
    df_day=df[(df['hour']>=6) & (df['hour']<18)]    
    t = group_feature(df_day, 'ship','x',['max','min','median','mean','std','skew','sum',q_25,q_75,pd.Series.kurt,modex])
    t.columns=['ship']+[x+'_day' for x in t.columns[1:].tolist()]
    train = pd.merge(train, t, on='ship', how='left')
    t = group_feature(df_day, 'ship','y',['max','min','median','mean','std','skew','sum',q_25,q_75,pd.Series.kurt,modex])
    t.columns=['ship']+[x+'_day' for x in t.columns[1:].tolist()]
    train = pd.merge(train, t, on='ship', how='left')
    t = group_feature(df_day, 'ship','v',['max','min','median','mean','std','skew','sum',q_25,q_75,pd.Series.kurt,modex])
    t.columns=['ship']+[x+'_day' for x in t.columns[1:].tolist()]
    train = pd.merge(train, t, on='ship', how='left')
    t = group_feature(df_day, 'ship','d',['max','min','median','mean','std','skew','sum',q_25,q_75,pd.Series.kurt,modex])
    t.columns=['ship']+[x+'_day' for x in t.columns[1:].tolist()]
    train = pd.merge(train, t, on='ship', how='left')
    
    v_medain_list=np.repeat(df.groupby(['ship'])['v'].median().values,df.groupby(['ship'])['v'].count().values, axis=0)
    df_vsmall=df[df['v']<v_medain_list]
    t = group_feature(df_vsmall, 'ship','x',['max','min','median','mean','std','skew','sum',q_25,q_75,pd.Series.kurt,modex])
    t.columns=['ship']+[x+'_vsamll' for x in t.columns[1:].tolist()]
    train = pd.merge(train, t, on='ship', how='left')
    t = group_feature(df_vsmall, 'ship','y',['max','min','median','mean','std','skew','sum',q_25,q_75,pd.Series.kurt,modex])
    t.columns=['ship']+[x+'_vsamll' for x in t.columns[1:].tolist()]
    train = pd.merge(train, t, on='ship', how='left')
    t = group_feature(df_vsmall, 'ship','v',['max','min','median','mean','std','skew','sum',q_25,q_75,pd.Series.kurt,modex])
    t.columns=['ship']+[x+'_vsamll' for x in t.columns[1:].tolist()]
    train = pd.merge(train, t, on='ship', how='left')
    t = group_feature(df_vsmall, 'ship','d',['max','min','median','mean','std','skew','sum',q_25,q_75,pd.Series.kurt,modex])
    t.columns=['ship']+[x+'_vsamll' for x in t.columns[1:].tolist()]
    train = pd.merge(train, t, on='ship', how='left')

    df_vlarge=df[df['v']>v_medain_list]
    t = group_feature(df_vlarge, 'ship','x',['max','min','median','mean','std','skew','sum',q_25,q_75,pd.Series.kurt,modex])
    t.columns=['ship']+[x+'_vlarge' for x in t.columns[1:].tolist()]
    train = pd.merge(train, t, on='ship', how='left')
    t = group_feature(df_vlarge, 'ship','y',['max','min','median','mean','std','skew','sum',q_25,q_75,pd.Series.kurt,modex])
    t.columns=['ship']+[x+'_vlarge' for x in t.columns[1:].tolist()]
    train = pd.merge(train, t, on='ship', how='left')
    t = group_feature(df_vlarge, 'ship','v',['max','min','median','mean','std','skew','sum',q_25,q_75,pd.Series.kurt,modex])
    t.columns=['ship']+[x+'_vlarge' for x in t.columns[1:].tolist()]
    train = pd.merge(train, t, on='ship', how='left')
    t = group_feature(df_vlarge, 'ship','d',['max','min','median','mean','std','skew','sum',q_25,q_75,pd.Series.kurt,modex])
    t.columns=['ship']+[x+'_vlarge' for x in t.columns[1:].tolist()]
    train = pd.merge(train, t, on='ship', how='left')
    return train

def extract_dt(df):
    df['time'] = pd.to_datetime(df['time'], format='%m%d %H:%M:%S')
    # df['month'] = df['time'].dt.month
    # df['day'] = df['time'].dt.day
    df['date'] = df['time'].dt.date
    df['hour'] = df['time'].dt.hour
    # df = df.drop_duplicates(['ship','month'])
    df['weekday'] = df['time'].dt.weekday
    return df

def map_feature(train):
    m,n=3,3
    df=train
    ymax,ymin,xmax,xmin=df['y'].max(),df['y'].min(),df['x'].max(),df['x'].min()
    yy=(df['y']-ymin)/(ymax-ymin)
    xx=(df['x']-xmin)/(xmax-xmin)
    count,count10,vmean,dmean,vstd,dstd,xstd,ystd=[],[],[],[],[],[],[],[]
    for i in range(m):
        count.append([]),count10.append([]),vmean.append([]),dmean.append([]),dstd.append([]),vstd.append([]),xstd.append([]),ystd.append([])
        for j in range(n):
            if ((i==m)|(j==n)):
                dfij=df[((i/m)<=yy) & (yy<=((i+1)/m)) & ((j/n)<=xx) & (xx<=((j+1)/n))]
            else:
                dfij=df[((i/m)<=yy) & (yy<((i+1)/m)) & ((j/n)<=xx) & (xx<((j+1)/n))]
            cc=dfij.shape[0]
            count[i].append(cc)
            count10[i].append(cc!=0)
            xstd[i].append(dfij['x'].std())
            ystd[i].append(dfij['y'].std())
            vmean[i].append(dfij['v'].mean())
            vstd[i].append(dfij['v'].std())
            dmean[i].append(dfij['d'].mean())
            dstd[i].append(dfij['d'].std())
    map_f=np.array([np.array(count),np.array(count10),np.array(vmean),np.array(dmean),
                    np.array(vstd),np.array(dstd),np.array(xstd),np.array(ystd)]).reshape(m*n*8).T
    return pd.DataFrame(map_f)

In [None]:
train_label

In [None]:
test = pd.read_hdf('/home/sunnyu/yuchuan/data/test.h5')
test = extract_dt(test)
test_label = test.drop_duplicates('ship')
test_label = extract_feature(test, test_label)

In [None]:
test_m_f=test.groupby(['ship']).apply(map_feature).unstack()
test_label=pd.merge(test_label, test_m_f, on='ship', how='left')
test_label.columns=test_label.columns[:130].tolist()+[str(i) for i in range(36)]
test_label.iloc[:,130:]=test_label.iloc[:,130:].fillna(0)

In [522]:
train = pd.read_hdf('/home/sunnyu/yuchuan/data/train2.h5')
train = extract_dt(train)
train_label = train.drop_duplicates('ship')
type_map = dict(zip(train_label['type'].unique(), np.arange(3)))
type_map_rev = {v:k for k,v in type_map.items()}
train_label['type'] = train_label['type'].map(type_map)

In [523]:
train_label = extract_feature(train, train_label)

{'x_max': 'max', 'x_min': 'min', 'x_median': 'median', 'x_mean': 'mean', 'x_std': 'std', 'x_skew': 'skew', 'x_sum': 'sum', 'x_25': <function q_25 at 0x7f232e2ff440>, 'x_75': <function q_75 at 0x7f232e2ff710>, 'x_kurt': <function Series.kurt at 0x7f235107d050>, 'x_mode': <function modex at 0x7f2329fddcb0>}
{'x_count': 'count'}
{'y_max': 'max', 'y_min': 'min', 'y_median': 'median', 'y_mean': 'mean', 'y_std': 'std', 'y_skew': 'skew', 'y_sum': 'sum', 'y_25': <function q_25 at 0x7f232e2ff440>, 'y_75': <function q_75 at 0x7f232e2ff710>, 'y_kurt': <function Series.kurt at 0x7f235107d050>, 'y_mode': <function modex at 0x7f2329fddcb0>}
{'v_max': 'max', 'v_min': 'min', 'v_median': 'median', 'v_mean': 'mean', 'v_std': 'std', 'v_skew': 'skew', 'v_sum': 'sum', 'v_25': <function q_25 at 0x7f232e2ff440>, 'v_75': <function q_75 at 0x7f232e2ff710>, 'v_kurt': <function Series.kurt at 0x7f235107d050>, 'v_mode': <function modex at 0x7f2329fddcb0>}
{'d_max': 'max', 'd_min': 'min', 'd_median': 'median', 'd_

In [525]:
train_m_f=train.groupby(['ship']).apply(map_feature).unstack()
train_m_f.columns=[str(i) for i in range(72)]
train_label=pd.merge(train_label, train_m_f, on='ship', how='left')
# train_label.iloc[:,125:]=train_label.iloc[:,125:].fillna(0)

In [526]:
train_co=train.groupby('ship').apply(lambda x :x['x'].corr(x['y'])).reset_index()
train_co.columns=['ship','corr']
train_label=pd.merge(train_label, train_co, on='ship', how='left')

In [756]:
test_co=test.groupby('ship').apply(lambda x :x['x'].corr(x['y'])).reset_index()
test_co.columns=['ship','corr']
test_label=pd.merge(test_label, test_co, on='ship', how='left')

In [605]:
test_cnn_feature=pd.read_csv('test_cnn_feature.csv')
train_cnn_feature=pd.read_csv('train_cnn_feature.csv')
train_cnn_feature.columns=['ship']+[str(i)+'_cnn' for i in range(16)]
test_cnn_feature.columns=['ship']+[str(i)+'_cnn' for i in range(16)]
train_label=pd.merge(train_label, train_cnn_feature, on='ship', how='left')
test_label=pd.merge(test_label, test_cnn_feature, on='ship', how='left')

In [282]:
# train_label.drop(columns=train_label.columns[-37:-1],inplace=True)
# test_label.drop(columns=test_label.columns[-32:],inplace=True)

In [700]:
features = [x for x in train_label.columns if x not in ['ship','type','time','diff_time','date',

#                                                       'x','y','v','d',                                                    
#                                                          'x_0_x_25','x_0_x_median','x_0_x_75','x_0_x_mean','x_0_x_max','x_0_x_min','x_0_x_mode',
#                                                          'y_0_y_25','y_0_y_median','y_0_y_75','y_0_y_mean','y_0_y_max','y_0_y_min','y_0_y_mode',
#                                                          'y_max_x_min','y_max_x_max','x_max_y_min','x_min_y_min'
                                                       ]+train_label.columns[268:-73].values.tolist()]
target = 'type'
len(features),np.array(features)

(336, array(['x', 'y', 'v', 'd', 'hour', 'weekday', 'x_max', 'x_min',
        'x_median', 'x_mean', 'x_std', 'x_skew', 'x_sum', 'x_25', 'x_75',
        'x_kurt', 'x_mode', 'x_count', 'y_max', 'y_min', 'y_median',
        'y_mean', 'y_std', 'y_skew', 'y_sum', 'y_25', 'y_75', 'y_kurt',
        'y_mode', 'v_max', 'v_min', 'v_median', 'v_mean', 'v_std',
        'v_skew', 'v_sum', 'v_25', 'v_75', 'v_kurt', 'v_mode', 'd_max',
        'd_min', 'd_median', 'd_mean', 'd_std', 'd_skew', 'd_sum', 'd_25',
        'd_75', 'd_kurt', 'd_mode', 'diffx_max', 'diffx_min',
        'diffx_median', 'diffx_mean', 'diffx_std', 'diffx_skew',
        'diffx_sum', 'diffx_25', 'diffx_75', 'diffx_kurt', 'diffx_mode',
        'diffy_max', 'diffy_min', 'diffy_median', 'diffy_mean',
        'diffy_std', 'diffy_skew', 'diffy_sum', 'diffy_25', 'diffy_75',
        'diffy_kurt', 'diffy_mode', 'diffv_max', 'diffv_min',
        'diffv_median', 'diffv_mean', 'diffv_std', 'diffv_skew',
        'diffv_sum', 'diffv_25', 'diff

In [701]:
params = {
     'num_leaves':64,
#        'max_depth':6, 
    'lambda_l1':0.1,
#     'lambda_l2':0.2,
    'n_estimators': 200,
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': 3,
#     'early_stopping_rounds': 200,
    'num_threads':20
}

In [702]:
def mac_f1(predss,mm):
    labell=mm.get_label().values
    predss=np.argmax(predss.reshape(3,len(labell)), axis=0)
    f11=metrics.f1_score(labell,predss,average='macro')
    return 'f1-score',f11,True


fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=513)

X = train_label[features].copy()
y = train_label[target]
models = []
#a=[350,300,400,450,350]
i=0

# pred = np.zeros((len(test_label),3))
oof = np.zeros((len(X), 3))
for index, (train_idx, val_idx) in enumerate(fold.split(X, y)):

    train_set = lgb.Dataset(X.iloc[train_idx], y.iloc[train_idx])
    val_set = lgb.Dataset(X.iloc[val_idx], y.iloc[val_idx])
#    params['n_estimators']=a[i]
    i+=1
    model = lgb.train(params, train_set, valid_sets=[train_set,val_set],feval=mac_f1, verbose_eval=100)
    models.append(model)
    val_pred = model.predict(X.iloc[val_idx])
    oof[val_idx] = val_pred
    val_y = y.iloc[val_idx]
    val_pred = np.argmax(val_pred, axis=1)
    print(index, 'val f1', metrics.f1_score(val_y, val_pred, average='macro'))
    # 0.8695539641133697
    # 0.8866211724839532

#     test_pred = model.predict(test_label[features])
#     pred += test_pred/7

[100]	training's multi_logloss: 0.0102085	training's f1-score: 1	valid_1's multi_logloss: 0.285088	valid_1's f1-score: 0.880102
[200]	training's multi_logloss: 0.00157557	training's f1-score: 1	valid_1's multi_logloss: 0.327029	valid_1's f1-score: 0.882175
0 val f1 0.8821751537838004
[100]	training's multi_logloss: 0.0101008	training's f1-score: 1	valid_1's multi_logloss: 0.253507	valid_1's f1-score: 0.886134
[200]	training's multi_logloss: 0.00156062	training's f1-score: 1	valid_1's multi_logloss: 0.279855	valid_1's f1-score: 0.894683
1 val f1 0.8946830628229248
[100]	training's multi_logloss: 0.00993858	training's f1-score: 1	valid_1's multi_logloss: 0.278343	valid_1's f1-score: 0.880296
[200]	training's multi_logloss: 0.00155869	training's f1-score: 1	valid_1's multi_logloss: 0.316421	valid_1's f1-score: 0.88588
2 val f1 0.8858802277319885
[100]	training's multi_logloss: 0.0100595	training's f1-score: 1	valid_1's multi_logloss: 0.286028	valid_1's f1-score: 0.883848
[200]	training's 

In [537]:
for i in range(5):
    models[i].save_model('/home/sunnyu/yuchuan/sub/fusaisub/model/lgb'+str(i)+'.txt')
pd.DataFrame(features).to_csv('/home/sunnyu/yuchuan/sub/fusaisub/model/features.csv',index=None, header=None)

In [685]:
oof = np.argmax(oof, axis=1)
print('oof f1', metrics.f1_score(oof, y, average='macro'))
# 0.8701544575329372

oof f1 0.8904708820452548


In [82]:
ret = []
for index, model in enumerate(models):
    df = pd.DataFrame()
    df['name'] = model.feature_name()
    df['score'] = model.feature_importance()
    df['fold'] = index
    ret.append(df)
    
df = pd.concat(ret)
df = df.groupby('name', as_index=False)['score'].mean()
df = df.sort_values(['score'],  ascending=False)
del_f_sp=df[df['score']>=400]['name'].tolist()
len(del_f_sp)

77

In [703]:
ret = []
for index, model in enumerate(models):
    df = pd.DataFrame()
    df['name'] = model.feature_name()
    df['score'] = model.feature_importance('gain')
    df['fold'] = index
    ret.append(df)
    
df = pd.concat(ret)
df = df.groupby('name', as_index=False)['score'].mean()
df = df.sort_values(['score'],  ascending=False)
del_f_ga=df[df['score']>=200]['name'].tolist()
len(del_f_ga)

62

In [573]:
features=set(del_f_ga)
len(features),np.array(features)

(62,
 array({'v_std', 'v_std_x', '25', 'v_75_y', 'x_min_y', 'v_mean_y', 'b_75', 'y_max_x_max', 'diffx_25', 'x_max_x', 'v_median_y', 'y_median_x', '18', 'v_median_x', 'y', 'y_mode_y', 'x_max_y_min', 'k_75', '24', 'y_mode_x', 'x_mode_x', 'x_mode_y', 'y_25_y', 'y_min_x', 'diffy_25', 'x_75_x', 'x_min_x', 'v_median', 'b_min', 'k_25', 'x_mode', 'b_median', 'b_mode', 'k_mode', 'x', 'k_median', 'slope', 'd_std', 'diff_second', 'y_max_x_min', 'b_25', 'diffd_std', 'x_min_y_min', 'y_min_y', 'b_max', 'y_std', 'b_sum', 'v_75_x', 'diffd_25', 'v_kurt_x', 'diffd_kurt', 'v_75', 'k_min', 'x_max', 'v_kurt_y', 'y_0', 'x_min', 'k_max', 'y_min', 'x_max_y', 'x_0', 'diffd_75'},
       dtype=object))

In [709]:
ret = []
for index, model in enumerate(models):
    df = pd.DataFrame()
    df['name'] = model.feature_name()
    df['score'] = model.feature_importance('gain')
    df['fold'] = index
    ret.append(df)
    
df = pd.concat(ret)
df = df.groupby('name', as_index=False)['score'].mean()
df = df.sort_values(['score'],  ascending=False)
del_f=df[df['score']<=150]['name'].tolist()
len(del_f)

0

In [705]:
features = [x for x in train_label.columns if x not in ['ship','type','time','diff_time','date',
#                                                        'x','y','v','d',
#                                                          'x_0_x_25','x_0_x_median','x_0_x_75','x_0_x_mean','x_0_x_max','x_0_x_min','x_0_x_mode',
#                                                          'y_0_y_25','y_0_y_median','y_0_y_75','y_0_y_mean','y_0_y_max','y_0_y_min','y_0_y_mode',
#                                                          'y_max_x_min','y_max_x_max','x_max_y_min','x_min_y_min'
                                                       ]+del_f+train_label.columns[268:-73].values.tolist()]
target = 'type'
len(features),np.array(features)

(62, array(['x', 'y', 'x_min', 'x_mode', 'y_min', 'y_median', 'y_25', 'y_mode',
        'v_std', 'v_75', 'v_kurt', 'diffx_25', 'diffy_25', 'diffd_std',
        'diffd_25', 'diffd_75', 'diffd_kurt', 'difflen_median',
        'difflen_75', 'x_0', 'y_0', 'y_max_x_min', 'y_max_x_max',
        'x_max_y_min', 'x_min_y_min', 'diff_second', 'k_max', 'k_min',
        'k_median', 'k_sum', 'k_25', 'k_75', 'k_mode', 'b_max', 'b_min',
        'b_median', 'b_25', 'b_75', 'b_mode', 'x_max_night', 'x_75_night',
        'x_mode_night', 'y_min_night', 'y_mode_night', 'v_median_night',
        'v_75_night', 'v_kurt_night', 'x_max_day', 'x_min_day', 'x_75_day',
        'x_mode_day', 'y_min_day', 'y_mode_day', 'v_median_day',
        'v_std_day', 'v_75_day', 'v_kurt_day', 'd_std_day', '24', '36',
        '44', 'corr'], dtype='<U14'))

In [696]:
features=pd.read_csv('~/yuchuan/sub/best_model/feature.csv',header=None)[0].values.tolist()

In [707]:
params = {
    'num_leaves':40,
#        'max_depth':6,
    'n_estimators':2500,
    'boosting_type': 'dart',
    'objective': 'multiclassova',
    'num_class': 3,
    'num_threads':20,
    #     'early_stopping_rounds': 200,
#     'lambda_l1':0.1,
#   'lambda_l2':0.3,
}

In [712]:
fold = StratifiedKFold(n_splits=6, shuffle=True, random_state=513)

X = train_label[features].copy()
y = train_label[target]
models = []
a40=[900,1450,1370,630,750,610]
i=0

oof = np.zeros((len(X), 3))
for index, (train_idx, val_idx) in enumerate(fold.split(X, y)):

    X_train = pd.concat([X.iloc[train_idx],X.iloc[train_idx][train_label.iloc[train_idx]['type']==1]])
    y_train = pd.concat([y.iloc[train_idx],y.iloc[train_idx][train_label.iloc[train_idx]['type']==1]])   
    train_set = lgb.Dataset(X_train, y_train)
    val_set = lgb.Dataset(X.iloc[val_idx], y.iloc[val_idx])
    params['n_estimators']=a40[i]
    i+=1
    model = lgb.train(params, train_set, valid_sets=val_set,feval=mac_f1, verbose_eval=1500)
    models.append(model)
    val_pred = model.predict(X.iloc[val_idx])
    oof[val_idx] = val_pred
    val_y = y.iloc[val_idx]
    val_pred = np.argmax(val_pred, axis=1)
    print(index, 'val f1', metrics.f1_score(val_y, val_pred, average='macro'))

0 val f1 0.9182937946946083
1 val f1 0.9173847913333896
2 val f1 0.9207770103300387
3 val f1 0.9100071418172314
4 val f1 0.927211641561838
5 val f1 0.9251732473811441


In [436]:
for i in range(7):
    models[i].save_model('/home/sunnyu/yuchuan/sub/fusaisub/model/lgb'+str(i)+'.txt')
pd.DataFrame(features).to_csv('/home/sunnyu/yuchuan/sub/fusaisub/model/features.csv',index=None, header=None)

In [688]:
type_map

{'拖网': 0, '刺网': 1, '围网': 2}

In [156]:
from collections import Counter
Counter(train_label['type'].values)

Counter({0: 3292, 1: 1333, 2: 3541})

In [435]:
oof = np.argmax(oof, axis=1)
print('oof f1', metrics.f1_score(oof, y, average='macro'))
# 0.8701544575329372

oof f1 0.918953702108967


In [768]:
pred = np.argmax(pred, axis=1)
sub = test_label[['ship']]
sub['pred'] = pred

print(sub['pred'].value_counts(1))
sub['pred'] = sub['pred'].map(type_map_rev)
sub.to_csv('210-dart-r513-k7-result.csv', index=None, header=None)

0    0.6350
1    0.2355
2    0.1295
Name: pred, dtype: float64


In [639]:
X = pd.concat([train_label,train_label[train_label['type']==1]])[features].copy()
y = pd.concat([train_label,train_label[train_label['type']==1]])[target]
train_set = lgb.Dataset(X,y)
params = {
    'num_leaves':40,
    'n_estimators': 1000,
    'boosting_type': 'dart',
    'objective': 'multiclassova',
    'num_class': 3,
    'num_threads':20,
}
model1 = lgb.train(params, train_set)
# test_pred = model.predict(test_label[features])
# pred=np.argmax(test_pred, axis=1)
# sub = test_label[['ship']]
# sub['pred'] = pred
# print(sub['pred'].value_counts(1))
# sub['pred'] = sub['pred'].map(type_map_rev)
# sub.to_csv('28-dart-1model-result.csv', index=None, header=None)

In [642]:
model1.save_model('/home/sunnyu/yuchuan/sub/fusaisub/model/lgb_1.txt')
pd.DataFrame(features).to_csv('/home/sunnyu/yuchuan/sub/fusaisub/model/features.csv',index=None, header=None)

In [413]:
# model1.save_model('/home/sunnyu/yuchuan/sub/best_model/lgb_1_89178.txt')
# pd.DataFrame(features).to_csv('/home/sunnyu/yuchuan/sub/best_model/features.csv',index=None, header=None)

In [369]:
ret = []
for index, model in enumerate(models):
    df = pd.DataFrame()
    df['name'] = model.feature_name()
    df['score'] = model.feature_importance(importance_type='gain')
    df['fold'] = index
    ret.append(df)
    
df = pd.concat(ret)
df = df.groupby('name', as_index=False)['score'].mean()
df = df.sort_values(['score'],  ascending=False)
df.T

Unnamed: 0,62,60,35,52,63,12,59,48,47,32,61,20,56,40,46,34,43,30,57,19,2,51,41,4,14,55,29,45,44,5,18,1,37,53,21,24,0,50,54,16,39,42,3,13,38,17,23,31,25,49,58,8,26,65,64,7,27,22,9,11,33,28,36,10,6,15
name,y_min,y_mean,x_0,y_0,y_mode,area,y_max,x_mode,x_min,v_median,y_median,diffd_kurt,y_25,x_25,x_median,v_std,x_max,v_75,y_75,diffd_75,20.0,x_sum,x_75,23.0,d_mean,y_0_y_min,slope,x_mean,x_max_x_min,26.0,diffd_25,19.0,x_0_x_max,y_0_y_max,diffd_std,diffv_mean,18.0,x_std,y_0_y_mean,d_sum,x_0_x_min,x_kurt,22.0,d_75,x_0_x_mean,diff_second,diffv_max,v_kurt,diffv_skew,x_skew,y_kurt,29,diffv_std,y_std,y_skew,28.0,diffy_25,diffv_kurt,31.0,35.0,v_skew,diffy_75,x_0_x_25,33.0,27.0,d_std
score,16173.7,15662.3,11142.6,10142,9863.75,8647.69,6314.14,6278.51,5961.34,5810.51,5610.29,5562.56,5532.92,5458.44,4664.65,4602.79,4326.77,4259.48,4034.69,3633.61,3396.86,3161.74,3087.62,2706.36,2699.26,2655.09,2590.92,2582.83,2482.46,2450.8,2417.95,2361.21,2290.01,2251.8,2199.44,2168.54,2125.25,2064.31,2054.41,2014.38,1998.73,1957.05,1950.58,1857.96,1793.22,1756.07,1734.02,1676.02,1669.58,1608.44,1594.04,1530,1496.84,1489.16,1476.39,1473.05,1413.95,1402.9,1344.01,1289.42,1275.43,1264.09,1214.27,1211.88,1110.31,1086.77


In [None]:
predd=np.argmax(pred, axis=1)
subb = test_label[['ship']]
subb['p0'],subb['p1'],subb['p2'] = pred[:,0],pred[:,1],pred[:,2]
subb=subb.sort_values('ship')
subb.to_csv('model1_p.csv')

In [829]:
submodel2=pd.read_csv('~/yuchuan/baseline/tianchi_ship_2019/working/model2_p.csv')
submodel1=subb.reset_index()

summodel2=(submodel2['p0']+submodel2['p1']+submodel2['p2'])
summodel1=(submodel1['p0']+submodel1['p1']+submodel1['p2'])

final_sub=pd.DataFrame()
final_sub['p0']=submodel1['p0']/summodel1+submodel2['p0']/summodel2
final_sub['p1']=submodel1['p1']/summodel1+submodel2['p1']/summodel2
final_sub['p2']=submodel1['p2']/summodel1+submodel2['p2']/summodel2

In [838]:
final_pred = np.argmax(final_sub.values, axis=1)
sub = submodel1[['ship']]
sub['pred'] = final_pred

print(sub['pred'].value_counts(1))
sub['pred'] = sub['pred'].map(type_map_rev)
sub.to_csv('210-2lgb-result.csv', index=None, header=None)

0    0.639
1    0.229
2    0.132
Name: pred, dtype: float64


In [None]:
final_features=set(features) | set(del_f_ga)
len(final_features),np.array(final_features)

In [908]:
param = {
    'max_depth':7,
    'booster': 'gbtree',
    'objective': 'multi:softprob',
    'num_class': 3,
    'num_threads':20,
}

In [909]:
def mac_f1_xgb(predss,mm):
    labell=mm.get_label()
    predss=np.argmax(predss.reshape(len(labell),3), axis=1)
    f11=metrics.f1_score(labell,predss,average='macro')
    return 'f1-score',1-f11

fold = StratifiedKFold(n_splits=6, shuffle=True, random_state=1)

X = train_label[features].copy()
y = train_label[target]
models = []
i=0

pred = np.zeros((len(test_label),3))
oof = np.zeros((len(X), 3))
for index, (train_idx, val_idx) in enumerate(fold.split(X, y)):
    train_set  = xgb.DMatrix(X.iloc[train_idx], y.iloc[train_idx])
    val_set  = xgb.DMatrix(X.iloc[val_idx], y.iloc[val_idx])
    watchlist = [(train_set,'train'),(val_set,'val')]
    i+=1
    model = xgb.train(param,train_set, num_boost_round=10000, early_stopping_rounds=500,evals=watchlist,feval=mac_f1_xgb, verbose_eval=100)
    models.append(model)

    test_pred = model.predict(xgb.DMatrix(test_label[features]))
    pred += test_pred/6

[0]	train-merror:0.085391	val-merror:0.154966	train-f1-score:0.111723	val-f1-score:0.20642
Multiple eval metrics have been passed: 'val-f1-score' will be used for early stopping.

Will train until val-f1-score hasn't improved in 500 rounds.
[100]	train-merror:0	val-merror:0.096747	train-f1-score:0	val-f1-score:0.125165
[200]	train-merror:0	val-merror:0.103596	train-f1-score:0	val-f1-score:0.134165
[300]	train-merror:0	val-merror:0.100171	train-f1-score:0	val-f1-score:0.129408
[400]	train-merror:0	val-merror:0.10274	train-f1-score:0	val-f1-score:0.133132
[500]	train-merror:0	val-merror:0.103596	train-f1-score:0	val-f1-score:0.13435
[600]	train-merror:0	val-merror:0.10274	train-f1-score:0	val-f1-score:0.133606
Stopping. Best iteration:
[100]	train-merror:0	val-merror:0.096747	train-f1-score:0	val-f1-score:0.125165

[0]	train-merror:0.092405	val-merror:0.158526	train-f1-score:0.125213	val-f1-score:0.210038
Multiple eval metrics have been passed: 'val-f1-score' will be used for early stopp

In [964]:
z = {}
from collections import Counter
for index, model in enumerate(models):
    Model,Z=Counter(model.get_score(importance_type='gain')),Counter(z)
    z=dict(Model+Z)

    
df_xgb=pd.Series(z).reset_index().sort_values(0)
xgb_features=df_xgb[df_xgb[0]>3]['index'].values.tolist()
len(xgb_features),np.array(xgb_features)

(71, array(['x_0_x_max', 'y_0_y_median', '8', 'x_0_x_mean', '2',
        'diffv_median', 'diffd_std', 'd_75', '6', 'x_kurt', '3', 'v_max',
        'd_median', '1', '26', '8_cnn', 'x_0_x_mode', 'x_0_x_median',
        'diffx_25', 'diffx_75', '5_cnn', 'd_max', 'x_0_x_25', 'y_0_y_25',
        '20', 'd_mean', 'x_skew', 'd_25', 'y_max_x_max', 'diffd_mode',
        'y_0_y_75', 'diff_day', 'x_75', 'diffy_25', 'diffy_mode',
        'y_max_y_min', 'x_mean', 'v_25', 'v_std', 'x_max', 'y_0_y_mode',
        'x_mode', 'x_min_y_min', 'y_max', 'diffx_mode', 'x_25', 'diffy_75',
        'diffx_median', 'diffy_median', 'y', 'x_min', 'diffd_25', 'y_25',
        'y_max_x_min', 'y_0', 'y_75', 'v_75', 'x_max_x_min', 'x',
        'diffd_75', 'v_min', 'x_0', 'v_median', 'diffd_kurt', 'area',
        'x_median', 'x_max_y_min', 'y_median', 'y_mode', 'y_min', 'y_mean'],
       dtype='<U12'))

In [985]:
param = {
    'max_depth':5,
    'booster': 'gbtree',
    'objective': 'multi:softprob',
    'num_class': 3,
    'num_threads':20,
}

In [987]:
def mac_f1_xgb(predss,mm):
    labell=mm.get_label()
    predss=np.argmax(predss.reshape(len(labell),3), axis=1)
    f11=metrics.f1_score(labell,predss,average='macro')
    return 'f1-score',1-f11

fold = StratifiedKFold(n_splits=6, shuffle=True, random_state=513)

X = train_label[xgb_features].copy()
y = train_label[target]
models = []
i=0

pred = np.zeros((len(test_label),3))
oof = np.zeros((len(X), 3))
for index, (train_idx, val_idx) in enumerate(fold.split(X, y)):
    train_set  = xgb.DMatrix(X.iloc[train_idx], y.iloc[train_idx])
    val_set  = xgb.DMatrix(X.iloc[val_idx], y.iloc[val_idx])
    watchlist = [(train_set,'train'),(val_set,'val')]
    i+=1
    model = xgb.train(param,train_set, num_boost_round=10000, early_stopping_rounds=500,evals=watchlist,feval=mac_f1_xgb, verbose_eval=100)
    models.append(model)

    test_pred = model.predict(xgb.DMatrix(test_label[xgb_features]),ntree_limit=model.best_ntree_limit)
    pred += test_pred/6

[0]	train-merror:0.166324	val-merror:0.196062	train-f1-score:0.236415	val-f1-score:0.272588
Multiple eval metrics have been passed: 'val-f1-score' will be used for early stopping.

Will train until val-f1-score hasn't improved in 500 rounds.
[100]	train-merror:0	val-merror:0.086473	train-f1-score:0	val-f1-score:0.113288
[200]	train-merror:0	val-merror:0.081336	train-f1-score:0	val-f1-score:0.104929
[300]	train-merror:0	val-merror:0.083904	train-f1-score:0	val-f1-score:0.108672
[400]	train-merror:0	val-merror:0.083048	train-f1-score:0	val-f1-score:0.107902
[500]	train-merror:0	val-merror:0.083048	train-f1-score:0	val-f1-score:0.107748
[600]	train-merror:0	val-merror:0.082192	train-f1-score:0	val-f1-score:0.106978
Stopping. Best iteration:
[119]	train-merror:0	val-merror:0.079623	train-f1-score:0	val-f1-score:0.103034

[0]	train-merror:0.167152	val-merror:0.200514	train-f1-score:0.216015	val-f1-score:0.262214
Multiple eval metrics have been passed: 'val-f1-score' will be used for early s

In [988]:
pred = np.argmax(pred, axis=1)
sub = test_label[['ship']]
sub['pred'] = pred

print(sub['pred'].value_counts(1))
sub['pred'] = sub['pred'].map(type_map_rev)
sub.to_csv('215-xgb-r513-k6-result.csv', index=None, header=None)

0    0.6500
1    0.2315
2    0.1185
Name: pred, dtype: float64


In [984]:
model.predict(xgb.DMatrix(test_label[xgb_features]),ntree_limit=model.best_ntree_limit)

array([[9.9999416e-01, 1.6157942e-06, 4.1886774e-06],
       [9.9999607e-01, 3.5575163e-06, 3.7250729e-07],
       [9.9967515e-01, 3.1354115e-04, 1.1280426e-05],
       ...,
       [9.9917221e-01, 8.0860383e-04, 1.9268904e-05],
       [3.6292691e-02, 3.2079753e-01, 6.4290977e-01],
       [1.5105685e-04, 9.9934143e-01, 5.0752575e-04]], dtype=float32)

In [977]:
model.best_ntree_limit

206