### 算法赛：智慧海洋

In [20]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm,tqdm_notebook

from sklearn.model_selection import KFold,StratifiedKFold,train_test_split
from sklearn import metrics

import lightgbm as lgb
from lightgbm import LGBMClassifier

import warnings

warnings.filterwarnings('ignore')
train_path = './input/train'
test_path = './input/test'
# os.listdir() 方法用于返回指定的文件夹包含的文件或文件夹的名字的列表
train_file = os.listdir(train_path)
test_file = os.listdir(test_path)

In [2]:
print(len(train_file),len(test_file))
print(train_file[:5])
print(test_file[:5])

7000 2000
['0.csv', '1.csv', '10.csv', '100.csv', '1000.csv']
['7000.csv', '7001.csv', '7002.csv', '7003.csv', '7004.csv']


In [3]:
# f-string在形式上是以f修饰字符串（f'xxx'）,用{}标明被替换的字段
df = pd.read_csv(f'{train_path}/1234.csv')
df.head()

Unnamed: 0,渔船ID,x,y,速度,方向,time,type
0,1234,6735123.0,6568010.0,0.22,0,1120 23:55:37,围网
1,1234,6735123.0,6568010.0,0.11,30,1120 23:45:39,围网
2,1234,6735123.0,6568010.0,0.11,339,1120 23:35:35,围网
3,1234,6735212.0,6568014.0,0.11,136,1120 23:15:32,围网
4,1234,6735212.0,6568014.0,0.22,132,1120 23:05:33,围网


In [4]:
df.type.unique()

array(['围网'], dtype=object)

In [5]:
df.shape

(414, 7)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414 entries, 0 to 413
Data columns (total 7 columns):
渔船ID    414 non-null int64
x       414 non-null float64
y       414 non-null float64
速度      414 non-null float64
方向      414 non-null int64
time    414 non-null object
type    414 non-null object
dtypes: float64(3), int64(2), object(2)
memory usage: 22.7+ KB


In [7]:
df.describe()

Unnamed: 0,渔船ID,x,y,速度,方向
count,414.0,414.0,414.0,414.0,414.0
mean,1234.0,6608138.0,6581958.0,1.138188,102.876812
std,0.0,47056.85,4674.156,2.702052,112.674451
min,1234.0,6587116.0,6567792.0,0.0,0.0
25%,1234.0,6587210.0,6583972.0,0.11,0.0
50%,1234.0,6587210.0,6583972.0,0.11,81.0
75%,1234.0,6587299.0,6583972.0,0.22,165.25
max,1234.0,6735310.0,6584560.0,10.09,359.0


baseline1已经将文件全部存为h5格式,直接读取即可

In [8]:
# 读取所有训练文件的所有数据
# res = []
# for file in tqdm(train_file):
#     df = pd.read_csv(f'{train_path}/{file}')
#     res.append(df)
# df = pd.concat(res)
# df.columns = ['ship','x','y','v','d','time','type']
# print(df.shape)
# # hdf形成的h5文件拥有更快的读写速度,远高于csv
# df.to_hdf('./input/train.h5','df',mode='w')

# # 读取测试文件
# res = []
# for file in tqdm(test_file):
#     df = pd.read_csv(f'{test_path}/{file}')
#     res.append(df)
# df = pd.concat(res)
# df.columns = ['ship','x','y','v','d','time']
# print(df.shape)
# df.to_hdf('./input/test.h5','df',mode='w')

100%|█████████████████████████████████████████████████████████████████████████████| 7000/7000 [00:16<00:00, 431.49it/s]


(2699638, 7)


100%|█████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:04<00:00, 450.34it/s]


(782378, 6)


In [9]:
import matplotlib.pyplot as plt
pd.set_option('display.max_columns',100)

In [10]:
train = pd.read_hdf('./input/train.h5')
t = train[train['ship']==1234]
t.head()

Unnamed: 0,ship,x,y,v,d,time,type
0,1234,6735123.0,6568010.0,0.22,0,1120 23:55:37,围网
1,1234,6735123.0,6568010.0,0.11,30,1120 23:45:39,围网
2,1234,6735123.0,6568010.0,0.11,339,1120 23:35:35,围网
3,1234,6735212.0,6568014.0,0.11,136,1120 23:15:32,围网
4,1234,6735212.0,6568014.0,0.22,132,1120 23:05:33,围网


x和y跟type的图表分析在baseline1有说明，经过baseline1的分析发现,x和y与type有密切的数学上的关系！

In [11]:
# 特征工程
def group_feature(df,key,target,aggs):
    agg_dict = {}
    for ag in aggs:
        agg_dict[f'{target}_{ag}'] = ag
    print(agg_dict)
    # 以key进行分组,选择特定的特征提取其函数结果作为新特征,函数包含在字典中
    t = df.groupby(key)[target].agg(agg_dict).reset_index()
    return t

def extract_feature(df,train):
    # ship指船号,对每一艘船的x,y,v,d的数据的特征进行提取
    t = group_feature(df, 'ship','x',['max','min','mean','std','skew','sum','count','median','mad'])
    train = pd.merge(train, t, on='ship', how='left')
    t = group_feature(df, 'ship','y',['max','min','mean','std','skew','sum','median','mad'])
    train = pd.merge(train, t, on='ship', how='left')
    t = group_feature(df, 'ship','v',['max','min','mean','std','skew','sum','median','mad'])
    train = pd.merge(train, t, on='ship', how='left')
    t = group_feature(df, 'ship','d',['max','min','mean','std','skew','sum'])
    train = pd.merge(train, t, on='ship', how='left')
    train['x_max_x_min'] = train['x_max'] - train['x_min']
    train['y_max_y_min'] = train['y_max'] - train['y_min']
    train['y_max_x_min'] = train['y_max'] - train['x_min']
    train['x_max_y_min'] = train['x_max'] - train['y_min']
    train['x_max_over_x_min'] = train['x_max'] / train['x_min']
    train['y_max_over_y_min'] = train['y_max'] / train['y_min']
    train['y_max_over_x_min'] = train['y_max'] / train['x_min']
    train['x_max_over_y_min'] = train['x_max'] / train['y_min']
    train['slope'] = train['y_max_y_min'] / np.where(train['x_max_x_min']==0, 0.001, train['x_max_x_min'])
    train['area'] = train['x_max_x_min'] * train['y_max_y_min']
    
    # mode_hour特征是指船号对应的出现频率最大的hour
    mode_hour = df.groupby('ship')['hour'].agg(lambda x:x.value_counts().index[0]).to_dict()
    # 用map将船号改为船号对应的mode_hour特征
    train['mode_hour'] = train['ship'].map(mode_hour)
    
    # 提取hour的函数特征
    t = group_feature(df, 'ship','hour',['max','min'])
    train = pd.merge(train, t, on='ship', how='left')
    
    # 提取hour和date的独特数量的特征
    hour_nunique = df.groupby('ship')['hour'].nunique().to_dict()
    date_nunique = df.groupby('ship')['date'].nunique().to_dict()
    train['hour_nunique'] = train['ship'].map(hour_nunique)
    train['date_nunique'] = train['ship'].map(date_nunique)
    
    # 提取时间差特征,时间数据的形式缩写为dt
    t = df.groupby('ship')['time'].agg({'diff_time':lambda x:np.max(x)-np.min(x)}).reset_index()
    t['diff_day'] = t['diff_time'].dt.days
    t['diff_second'] = t['diff_time'].dt.seconds
    train = pd.merge(train, t, on='ship', how='left')
    return train

# 时间特征处理
def extract_dt(df):
    df['time'] = pd.to_datetime(df['time'], format='%m%d %H:%M:%S')
    # df['month'] = df['time'].dt.month
    # df['day'] = df['time'].dt.day
    df['date'] = df['time'].dt.date
    df['hour'] = df['time'].dt.hour
    # df = df.drop_duplicates(['ship','month'])
    df['weekday'] = df['time'].dt.weekday
    return df

In [12]:
# 正式加载数据并提取时间特征
train = pd.read_hdf('./input/train.h5')
test = pd.read_hdf('./input/test.h5')

train = extract_dt(train)
test = extract_dt(test)

In [13]:
# 每艘船都有很多数据,针对不同(x,y,v,d)提取的特征都是单个值,所以需要去重操作
train_label = train.drop_duplicates('ship')
test_label = test.drop_duplicates('ship')

In [14]:
# value_counts()第一个参数是normalize,输入1或者true返回元素频率
train_label['type'].value_counts(1)

拖网    0.623000
围网    0.231571
刺网    0.145429
Name: type, dtype: float64

In [15]:
# 将预测变量改为数值形式
type_map = dict(zip(train_label['type'].unique(),np.arange(3)))
type_map_rev = {v:k for k,v in type_map.items()}
train_label['type'] = train_label['type'].map(type_map)

In [16]:
# 提取所有特征
train_label = extract_feature(train,train_label)
test_label = extract_feature(test,test_label)

{'x_max': 'max', 'x_min': 'min', 'x_mean': 'mean', 'x_std': 'std', 'x_skew': 'skew', 'x_sum': 'sum', 'x_count': 'count', 'x_median': 'median', 'x_mad': 'mad'}
{'y_max': 'max', 'y_min': 'min', 'y_mean': 'mean', 'y_std': 'std', 'y_skew': 'skew', 'y_sum': 'sum', 'y_median': 'median', 'y_mad': 'mad'}
{'v_max': 'max', 'v_min': 'min', 'v_mean': 'mean', 'v_std': 'std', 'v_skew': 'skew', 'v_sum': 'sum', 'v_median': 'median', 'v_mad': 'mad'}
{'d_max': 'max', 'd_min': 'min', 'd_mean': 'mean', 'd_std': 'std', 'd_skew': 'skew', 'd_sum': 'sum'}
{'hour_max': 'max', 'hour_min': 'min'}
{'x_max': 'max', 'x_min': 'min', 'x_mean': 'mean', 'x_std': 'std', 'x_skew': 'skew', 'x_sum': 'sum', 'x_count': 'count', 'x_median': 'median', 'x_mad': 'mad'}
{'y_max': 'max', 'y_min': 'min', 'y_mean': 'mean', 'y_std': 'std', 'y_skew': 'skew', 'y_sum': 'sum', 'y_median': 'median', 'y_mad': 'mad'}
{'v_max': 'max', 'v_min': 'min', 'v_mean': 'mean', 'v_std': 'std', 'v_skew': 'skew', 'v_sum': 'sum', 'v_median': 'median', 'v

In [17]:
train_label.head()

Unnamed: 0,ship,x,y,v,d,time,type,date,hour,weekday,x_max,x_min,x_mean,x_std,x_skew,x_sum,x_count,x_median,x_mad,y_max,y_min,y_mean,y_std,y_skew,y_sum,y_median,y_mad,v_max,v_min,v_mean,v_std,v_skew,v_sum,v_median,v_mad,d_max,d_min,d_mean,d_std,d_skew,d_sum,x_max_x_min,y_max_y_min,y_max_x_min,x_max_y_min,x_max_over_x_min,y_max_over_y_min,y_max_over_x_min,x_max_over_y_min,slope,area,mode_hour,hour_max,hour_min,hour_nunique,date_nunique,diff_time,diff_day,diff_second
0,0,6152038.0,5124873.0,2.59,102,1900-11-10 11:58:19,0,1900-11-10,11,5,6152038.0,6118352.0,6119351.0,5037.320747,5.255558,2533411000.0,414,6118352.0,1905.724573,5130781.0,5124873.0,5130494.0,850.264541,-4.762308,2124025000.0,5130672.0,341.078408,9.39,0.0,0.265966,1.321248,5.520205,110.11,0.0,0.49496,129,0,4.613527,21.24777,4.483093,1910,33686.667453,5907.975523,-987570.4,1027165.0,1.005506,1.001153,0.838589,1.200427,0.17538,199020000.0,15,23,0,24,4,2 days 23:48:51,2,85731
1,1,6076254.0,5061743.0,3.99,278,1900-11-10 11:40:21,0,1900-11-10,11,5,6102450.0,6049472.0,6091460.0,16543.394419,-1.058454,2345212000.0,385,6102450.0,14574.134228,5112874.0,5042857.0,5094050.0,26764.042729,-0.802446,1961209000.0,5112760.0,24712.754023,10.47,0.0,1.607922,2.412688,1.590284,619.05,0.05,2.006849,336,0,56.153247,91.449382,1.418867,21619,52978.013345,70016.655842,-936597.9,1059593.0,1.008757,1.013884,0.845177,1.210117,1.321617,3709343000.0,19,23,0,24,4,2 days 23:39:47,2,85187
2,10,6321032.0,5242805.0,4.48,213,1900-11-10 11:49:36,0,1900-11-10,11,5,6346913.0,6246119.0,6262484.0,32280.567149,1.62304,2486206000.0,397,6246120.0,25318.018154,5265810.0,5229867.0,5242458.0,5975.460236,2.198003,2081256000.0,5240937.0,3556.709015,10.09,0.0,1.313854,2.442825,2.14541,521.6,0.22,1.779098,359,0,108.758186,112.515081,0.727645,43177,100794.674835,35942.703641,-980308.7,1117046.0,1.016137,1.006873,0.843053,1.21359,0.356593,3622833000.0,23,23,0,24,4,2 days 23:33:53,2,84833
3,100,6102751.0,5112534.0,0.0,0,1900-10-30 23:50:05,0,1900-10-30,23,1,6151439.0,6102326.0,6123711.0,14451.941954,0.02186,2516845000.0,411,6123431.0,12392.343016,5112752.0,5069616.0,5085480.0,14020.260117,1.055676,2090132000.0,5082056.0,10839.543586,8.69,0.0,2.965864,1.647069,-0.215287,1218.97,3.45,1.196892,353,0,161.727494,115.409256,-0.020073,66470,49113.022232,43135.705758,-989574.0,1081823.0,1.008048,1.008509,0.837837,1.213393,0.878295,2118525000.0,11,23,0,24,3,2 days 23:48:47,2,85727
4,1000,6843713.0,5480538.0,2.0,216,1900-11-06 23:42:30,1,1900-11-06,23,1,6844414.0,6748890.0,6807536.0,26263.537565,-0.77019,2566441000.0,377,6809206.0,18041.318374,5540087.0,5440815.0,5464764.0,30135.645906,1.412544,2060216000.0,5450299.0,23649.086501,8.9,0.0,2.08557,2.649306,1.110173,786.26,0.59,2.247028,358,0,159.143236,101.832626,0.217397,59997,95524.035775,99271.486171,-1208803.0,1403598.0,1.014154,1.018246,0.820889,1.257976,1.03923,9482813000.0,0,23,0,24,3,2 days 23:37:11,2,85031


In [18]:
feats = [i for i in train_label.columns if i not in ['ship','type','time','diff_time','date']]
target = 'type'

In [19]:
X = train_label[feats].copy()
y = train_label[target]

模型融合

In [50]:
class Create_Ensemble(object):
    def __init__(self, n_splits, base_models):
        self.n_splits = n_splits
        self.base_models = base_models

    def predict(self, X, y, T):
        X = np.array(X)
        y = np.array(y)
        T = np.array(T)

        folds = list(StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=2020).split(X, y))

        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((T.shape[0], len(self.base_models)))
        
        for i, clf in enumerate(self.base_models):
            S_test_i = np.zeros((T.shape[0], self.n_splits))

            for j, (train_idx, valid_idx) in enumerate(folds):
                X_train = X[train_idx]
                y_train = y[train_idx]
                X_valid = X[valid_idx]
                y_valid = y[valid_idx]
                
                clf.fit(X_train, y_train)
                valid_pred = clf.predict(X_valid)
                S_train[valid_idx, i] = valid_pred
                S_test_i[:, j] = clf.predict(T)
            
            print( "\n F1-Score for model {} : {}".format(i+1,
                  metrics.f1_score(y_valid, valid_pred, average='macro')))
            S_test[:, i] = S_test_i.mean(axis=1)
            
        return S_train, S_test

In [59]:
# LightGBM params
lgb_params1 = {}
lgb_params1['learning_rate'] = 0.005
lgb_params1['n_estimators'] = 1000
lgb_params1['subsample'] = 1
lgb_params1['colsample_bytree'] = 1   
lgb_params1['random_state'] = 2020

lgb_params2 = {}
lgb_params2['learning_rate'] = 0.001
lgb_params2['n_estimators'] = 2000
lgb_params2['subsample'] = 0.85
lgb_params2['colsample_bytree'] = 0.9
lgb_params2['random_state'] = 2021

lgb_model1 = LGBMClassifier(**lgb_params1)
lgb_model2 = LGBMClassifier(**lgb_params2)

效果不佳

In [60]:
models = [lgb_model1,lgb_model2]
lgb_stack = Create_Ensemble(n_splits = 5, base_models = models)        
X1 = X.copy()
Y1 = y.copy()
T1 = test_label.copy()[feats]
lgb_train_pred, lgb_test_pred = lgb_stack.predict(X1, Y1, T1)


 F1-Score for model 1 : 0.8501510283166436

 F1-Score for model 2 : 0.8154308633651727


不融合,且增加到10折

In [66]:
# 五折交叉验证
fold = StratifiedKFold(n_splits=10,shuffle=True,random_state=42)
# 准备预测
models = []
pred = np.zeros((len(test_label),3))
oof = np.zeros((len(X),3))

params = {
    'n_estimators': 5000,
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': 3,
    'early_stopping_rounds': 500,
}

for index, (train_idx, val_idx) in enumerate(fold.split(X, y)):

    train_set = lgb.Dataset(X.iloc[train_idx], y.iloc[train_idx])
    val_set = lgb.Dataset(X.iloc[val_idx], y.iloc[val_idx])

    model = lgb.train(params, train_set, valid_sets=[train_set, val_set], verbose_eval=500)
    models.append(model)
    val_pred = model.predict(X.iloc[val_idx])
    oof[val_idx] = val_pred
    val_y = y.iloc[val_idx]
    val_pred = np.argmax(val_pred, axis=1)
    print(index, 'val f1', metrics.f1_score(val_y, val_pred, average='macro'))

    test_pred = model.predict(test_label[feats])
    pred += test_pred/5

oof = np.argmax(oof, axis=1)
print('oof f1', metrics.f1_score(oof, y, average='macro'))

Training until validation scores don't improve for 500 rounds.
[500]	training's multi_logloss: 0.000430818	valid_1's multi_logloss: 0.384322
Early stopping, best iteration is:
[122]	training's multi_logloss: 0.0563395	valid_1's multi_logloss: 0.269153
0 val f1 0.8706619820108914
Training until validation scores don't improve for 500 rounds.
[500]	training's multi_logloss: 0.00045991	valid_1's multi_logloss: 0.259468
Early stopping, best iteration is:
[230]	training's multi_logloss: 0.0137709	valid_1's multi_logloss: 0.212877
1 val f1 0.8961136435263538
Training until validation scores don't improve for 500 rounds.
[500]	training's multi_logloss: 0.000452608	valid_1's multi_logloss: 0.294953
Early stopping, best iteration is:
[172]	training's multi_logloss: 0.0290752	valid_1's multi_logloss: 0.238891
2 val f1 0.8771196916162637
Training until validation scores don't improve for 500 rounds.
[500]	training's multi_logloss: 0.00042553	valid_1's multi_logloss: 0.29193
Early stopping, best i

In [67]:
# 创建提交文件
sub = pd.DataFrame()
sub['ship'] = test_label['ship']
sub['type'] = lgb_test_pred.mean(axis=1)
sub.to_csv('result_new_features_10Fold.csv', index=False)

In [68]:
# 查看特征重要性
ret = []
for index, model in enumerate(models):
    df = pd.DataFrame()
    df['name'] = model.feature_name()
    df['score'] = model.feature_importance()
    df['fold'] = index
    ret.append(df)
    
df = pd.concat(ret)
df = df.groupby('name', as_index=False)['score'].mean()
df = df.sort_values(['score'], ascending=False)
df

Unnamed: 0,name,score
37,x_min,569.1
43,y_max,555.1
44,y_max_over_x_min,522.8
46,y_max_x_min,512.8
36,x_median,473.1
21,v_median,473.1
38,x_skew,456.8
49,y_median,456.3
51,y_skew,449.6
32,x_max_over_y_min,448.2
