## 重新整理代码整理代码 2020年6月30号 晚20：21

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
from sklearn.preprocessing import LabelEncoder
import geopandas
import gc
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from shapely.geometry import Point
from gensim.models import Word2Vec
import logging
import gensim
# plt.rcParams['font.sans-serif'] = ['KaiTi']  # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False  # 用来正常显示负号

In [134]:
def reduce_mem(df):
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('{:.2f} Mb, {:.2f} Mb ({:.2f} %)'.format(start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
    gc.collect()
    return df
def get_data(data, model='train'):
    assert model=='train' or model=='test'
    data.sort_values(['loadingOrder','timestamp'],inplace=True)
    if model=='train':
        pass
#         data['vesselNextportETA'] = pd.to_datetime(data['vesselNextportETA'], infer_datetime_format=True) 
    else:
        data['onboardDate'] = pd.to_datetime(data['onboardDate'], infer_datetime_format=True)
    data['timestamp'] = pd.to_datetime(data['timestamp'], infer_datetime_format=True)    
    return data
def get_anchor(df):
    # 转化为360度数
    df['direction']=df['direction']/100
    df['cos_direction']=np.cos(df['direction']/180*np.pi)
    
#     df['direction1']=np.sin(df['direction']/180*np.pi)
    tmp=df.groupby('loadingOrder')
    df['lat_diff'] = tmp['latitude'].diff(1)
    df['lon_diff'] = tmp['longitude'].diff(1)
    df['speed_diff'] = tmp['speed'].diff(1)
    df['direction_diff']=tmp['direction'].diff(1)
    df['diff_seconds'] = tmp['timestamp'].diff(1).dt.total_seconds() 
    ### 这样实际是做了一个采样！！ #可以去除重复的记录
    df['anchor'] =(((df['lat_diff']> 1)|(df['lon_diff'] > 1))&(df['speed']==0)).astype('int')
    ###  这里标记下船几乎停止的地方
    df['stop']=((df['lat_diff'] <= 0.03)&(df['lon_diff'] <= 0.03)&(df['speed'] <= 1)).astype('int')
#     df['delay']=(df['diff_seconds']>3000).astype('int')
    return df
def distance(LatA,LatB,LonA,LonB):
    EARTH_RADIUS = 6378.137 # 千米
    def rad(d):
        return d * np.pi/ 180.0
    s=0
    radLatA = rad(LatA)
    radLatB = rad(LatB)
    a = radLatA-radLatB
    b = rad(LonA)-rad(LonB)
    s= 2 * np.arcsin(np.sqrt(np.power(np.sin(a / 2),2)+ np.cos(radLatA) * np.cos(radLatB)*np.power(np.sin(b / 2),2)))
    s=s* EARTH_RADIUS
    #  保留两位小数
    s = np.round(s * 100)/100
    s = s * 1000 # 转换成m
    return s
def get_feature(df,model='train'):
     #计算移动方便后面计算轨迹长度 m
    df['move_leng']=distance(df.latitude.values,df.groupby('loadingOrder')['latitude'
                ].shift(1).values,df.longitude.values,df.groupby('loadingOrder')['longitude'].shift(1).values)  
    ### 计算下之前的累积距离：
    df['cusum_distance']=df.groupby('loadingOrder')['move_leng'].expanding().sum().reset_index(drop=True)
    df['cusum_speed']=df.groupby('loadingOrder')['speed'].rolling(window=5).mean().reset_index(drop=True)
    
    tmp=df.groupby('loadingOrder')['move_leng'].sum().reset_index()
    tmp.columns=['loadingOrder','sum_distance']
    df=df.merge(tmp,on='loadingOrder',how='left')
    #-----
#     df['cusum_direction']=df.groupby('loadingOrder')['direction_diff'].expanding().mean().reset_index(drop=True)
#     df['cusum_stop']=df.groupby('loadingOrder')['stop'].expanding().sum().reset_index(drop=True)
#     df['stop_time_gap']=df['cusum_stop']*df['diff_seconds']
    
    df['instant_speed']=df['move_leng']/df['diff_seconds']
     # 瞬时加速度 m/s2
    df['instant_acc']=df['instant_speed']/(df['diff_seconds']+0.01)
#     df['instant_speed_change']=df['speed_diff']/df['diff_seconds']
#     df['status_sum']=df['anchor']+df['stop']+df['delay']
    ####---------
#     df['speed_acc']=df['speed']/df['diff_seconds']*60
    # 瞬时加速度 m/s2
#     df['instant_acc']=df['instant_speed']/df['diff_seconds']
#     df['direction_valc']=df['direction_diff']/df['diff_seconds']#
    
    # local
#     df['local_lat_lon']=df['longitude'].astype('int')*10+df['latitude'].astype('int')
    ## 得到最早的时间
#     tmp=df.drop_duplicates('loadingOrder',keep='first').reset_index(drop=True)
#     tmp=tmp[['loadingOrder','timestamp']]
#     tmp.columns=['loadingOrder','start_time']
#     df=df.merge(tmp,on='loadingOrder',how='left')
#     df['have_run_time']=(df['timestamp']-df['start_time']).dt.total_seconds()/3600
    df['have_run_time']=df.groupby('loadingOrder')['diff_seconds'].expanding().sum().reset_index(drop=True)
    df['distanc2taget']=distance(df.latitude.values,df.end_latitude.values,df.longitude.values,df.end_longitude.values)
    df['distanc2start']=distance(df.latitude.values,df.start_latitude.values,df.longitude.values,df.start_longitude.values)
    df['curr_value_time']=(df['distanc2taget']/ df['instant_speed']+0.01)*3600
    df['avg_speed']=df['cusum_distance']/df['have_run_time']*3600
    #-----当前日期：
    df['day_tag']=df['timestamp'].dt.year.values*10000+df['timestamp'].dt.month.values*100+df['timestamp'].dt.day.values
    df['curr_week']=df['timestamp'].dt.week
    df['curr_hour']=df['timestamp'].dt.hour
    ###  除了描述当前状态 还要描述
    ##-----
#     df['long_gap'] = abs(df['end_longitude']-df['longitude'])
#     df['lat_gap'] = abs(df['end_latitude']-df['latitude'])
#     df['start_long_gap'] = abs(df['start_longitude']-df['longitude'])
#     df['start_lat_gap'] = abs(df['start_latitude']-df['latitude'])
#     df['start_long_ratio'] = abs(df['longitude']-df['start_longitude']) / abs(df['end_longitude']-df['start_longitude'])
#     df['start_lat_ratio'] = abs(df['latitude']-df['start_latitude']) / abs(df['end_latitude']-df['start_latitude'])
#     df['end_long_ratio'] = abs(df['longitude']-df['end_longitude']) / abs(df['end_longitude']-df['start_longitude'])
#     df['end_lat_ratio'] = abs(df['latitude']-df['end_latitude']) / abs(df['end_latitude']-df['start_latitude'])   
    return df
def get_labe(df,feat='label'):
    tmp= df.groupby('loadingOrder')['timestamp'].agg({'end_time':'max','mmin':'min'}).reset_index()
    tmp['label']=(tmp['end_time'] - tmp['mmin']).dt.total_seconds()//3600
#     del df['lat_lon_dis']
    return tmp[['loadingOrder',feat]]
def type_encoding(train_data,test_data):
    ### ----对类别进行编码
    for f in ['TRANSPORT_TRACE','carrierName','vesselMMSI']:
        unique_set=set(train_data[f].unique().tolist()+test_data[f].unique().tolist())
        unique_dict={ f:i for i,f in enumerate(unique_set)}
        test_data[f]=test_data[f].map(unique_dict)
        train_data[f]=train_data[f].map(unique_dict)
        
    # 港口名称编码
    unique_set=set(train_data['start_sport'].unique().tolist()+test_data['start_sport'].unique().tolist()
                  +train_data['end_sport'].unique().tolist()+test_data['end_sport'].unique().tolist())
    unique_dict={ f:i for i,f in enumerate(unique_set)}
    for f in ['start_sport','end_sport']:
        test_data[f]=test_data[f].map(unique_dict)
        train_data[f]=train_data[f].map(unique_dict)
    return train_data,test_data

def random_cut(x):
    leng=len(x)
    ## 尽量取中间 这里的起始位可以稍微修正一下
    start=np.random.randint(0,2,leng)
    start
    return start.tolist()


### 处理文件
# 数据流读取
（读取时间较长。。。。。）

In [None]:
###  这里位了核查所有数据 只取基本列信息，并去除重复的
# names= ['loadingOrder','carrierName','timestamp','longitude',
#                   'latitude','vesselMMSI','speed','direction','vesselNextport',
#                   'vesselNextportETA','vesselStatus','vesselDatasource','TRANSPORT_TRACE']

# train_flux=pd.read_csv('./train0523.csv',chunksize=500000,names=names)
# train_df=pd.DataFrame(columns=names)
# for i,data in tqdm(enumerate(train_flux)):
#     del data['vesselDatasource'],data['vesselNextport'],data['vesselNextportETA'],data['vesselStatus']
#     data.drop_duplicates(['loadingOrder','timestamp','carrierName'],inplace=True)# 可能一艘船有好几家运输公司
#     data=reduce_mem_usage(data) 
#     data['direction']=data['direction'].values.astype('int32')
#     train_df=train_df.append(data)       
# train_df=reduce_mem_usage(train_df)
# train_df['speed']=train_df.speed.values.astype('int8')
# train_df['direction']=train_df.direction.values.astype('int32')
# train_df['timestamp']=pd.to_datetime(train_df.timestamp)
# train_df.drop_duplicates(['loadingOrder','timestamp','carrierName'],inplace=True)

# train_df.sort_values(['loadingOrder','timestamp'],inplace=True)
# print(train_df.shape)
# train_df.drop_duplicates(['loadingOrder','timestamp','longitude','latitude'],keep='last',inplace=True)
# print(train_df.shape)


#---
# # 去除trace为nan的值
# print(train_df.shape)
# train_df=train_df.loc[train_df.TRANSPORT_TRACE.notna()]
# print(train_df.shape)

# train_df=pd.to_pickle('./data/complect_data2.pkl') # 保存处理文件
train_df.info()

In [3]:
# 查看测试集状态
test_df=pd.read_csv('./data/testData 0626.csv')
print('总共有{}艘船'.format(test_df.vesselMMSI.nunique()))
print('{}个快递运单'.format(test_df.loadingOrder.nunique()))
print('{}个运货公司'.format(test_df.carrierName.nunique()))
print('{}条运输路径'.format(test_df.TRANSPORT_TRACE.nunique()))
print('运输路段长度：{}'.format(test_df.TRANSPORT_TRACE.apply(lambda x:len(x.split('-'))).unique()))
print('运输过程中的 spped 情况：{}'.format(test_df.speed.unique()))
print('经度跨越：{}'.format(test_df.longitude.max()-test_df.longitude.min()),'纬度跨越：{}'.format(test_df.latitude.max()-test_df.latitude.min()))
print('测试集中，船只的运输的港口：')
print(test_df.TRANSPORT_TRACE.value_counts())
print('测试集时间跨度:')
print('min time:{} max time:{}'.format(test_df.timestamp.min(),test_df.timestamp.max()))

总共有90艘船
228个快递运单
11个运货公司
21条运输路径
运输路段长度：[2 3]
运输过程中的 spped 情况：[24 25 32 33 36 37 40 42 43 41 39 38 27 26 20 22 23 21 18 30  0  8 10 11
 12 17 15 13  1  2  6 29 35 28  7  3 14 31 34 16 19  9  5  4 44 46 47 50
 49]
经度跨越：359.103702 纬度跨越：85.677953
测试集中，船只的运输的港口：
CNYTN-MXZLO          18420
CNYTN-PAONX           4244
CNSHK-MYTPP           2416
CNSHK-CLVAP           2014
CNYTN-ARENA           1574
CNNSA-GHTEM           1358
CNSHK-SGSIN           1120
CNNSA-NAWVB            846
CNYTN-MATNG            818
CNYTN-CAVAN            731
CNSHK-GRPIR            451
CNSHK-INMUN            209
HONGKONG-BU            139
HKHKG-FRFOS             89
CNHKG-ARBUE             80
CNYTN-MTMLA             69
CNSHK-PKQCT             60
CNSHK-SIKOP             48
CNSHK-SGSIN-AEJEA       10
CNSHK-MYPKG             10
CNYTN-MYTPP              6
Name: TRANSPORT_TRACE, dtype: int64
测试集时间跨度:
min time:2019-01-16T17:01:48.000Z max time:2020-03-27T03:48:28.000Z


# 清洗训练集

In [64]:
train_df=pd.read_pickle('./data/sample_complect_data_2.pkl')
train_df.sort_values(['loadingOrder','timestamp'],inplace=True)
train_df['leng']=train_df.groupby('loadingOrder')['timestamp'].transform('count')
# 必须保留起初speed==0的数据，因为test可能中间截取的，不过中间截取也没关系
#我只需要在tran里面的label 也是从speed非0处开始计算，到末尾非0处非0即可
# 去除稀疏的数据---
print(train_df.shape)
train_df=train_df.loc[train_df.leng>3]
print(train_df.shape)
del train_df['leng']

#--------------
# 加载港口坐标信息
event=pd.read_csv('./data/port_2.csv')
event=event[['TRANS_NODE_NAME','LONGITUDE','LATITUDE']]
print(event.TRANS_NODE_NAME.nunique())
# event=event.loc[~((event.LATITUDE<1)&(event.LONGITUDE==-0.386592))]
# print(event.TRANS_NODE_NAME.nunique())

#  计算起始和终止时刻经纬度误差
end_trace=train_df.drop_duplicates('loadingOrder','last')
end_trace=end_trace[['loadingOrder','TRANSPORT_TRACE','latitude','longitude']]
end_trace['TRANS_NODE_NAME']=end_trace['TRANSPORT_TRACE'].apply(lambda x :x.split('-')[-1])
end_trace=end_trace.merge(event,on='TRANS_NODE_NAME',how='left')
end_trace['end_error']=np.abs((end_trace['longitude']-end_trace['LONGITUDE']).values)+np.abs((end_trace['latitude']-end_trace['LATITUDE']).values)
end_trace=end_trace.loc[end_trace.end_error<5].reset_index(drop=True)
#-----------------------------------------
start_trace=train_df.drop_duplicates('loadingOrder','first')
start_trace=start_trace[['loadingOrder','TRANSPORT_TRACE','latitude','longitude']]
start_trace['TRANS_NODE_NAME']=start_trace['TRANSPORT_TRACE'].apply(lambda x :x.split('-')[0])
start_trace=start_trace.merge(event,on='TRANS_NODE_NAME',how='left')
start_trace['end_error']=np.abs((start_trace['longitude']-start_trace['LONGITUDE']).values)+np.abs((start_trace['latitude']-start_trace['LATITUDE']).values)
start_trace=start_trace.loc[start_trace.end_error<5].reset_index(drop=True)
#---------------------------------
start_trace['tag']=1
end_trace['tag']=1
select_loading=pd.concat([start_trace,end_trace],axis=0)
select_loading=select_loading.groupby('loadingOrder')['tag'].sum().reset_index()
print(select_loading.shape)
select_loading=select_loading.loc[select_loading.tag==2]
print(select_loading.shape)
del start_trace,end_trace

print('选取 loadingOrder：',select_loading.loadingOrder.nunique())
#--- 接下来就是从中选取我们需要的loadingOrder即可
print('清洗前：',train_df.shape)
train_df=train_df.loc[train_df.loadingOrder.isin(select_loading.loadingOrder)]
print('清洗后的数据：',train_df.shape)
del select_loading
#  这里得保留end_trace,因为他是我们能获取的终点坐标的均值 可以后来给数据标记终点
# 将speed转化下 否则计算差值可能有问题
train_df['speed']=train_df['speed'].values.astype('int32')
# del trace_dict,mmsi_dict,carri_dict,ni_carri_dict,ni_mmsi_dict,ni_trace_dict
gc.collect()
###--------------------
## 计算统计特征 属于训练集特征统计
# #首先 还是先划分起点和终点吧
train_df['start_sport']=train_df['TRANSPORT_TRACE'].apply(lambda x:x.split('-')[0])
#-----
train_df['end_sport']=train_df['TRANSPORT_TRACE'].apply(lambda x:x.split('-')[-1])

test_df=pd.read_csv('./data/testData 0626.csv')
test_df['start_sport']=test_df['TRANSPORT_TRACE'].apply(lambda x:x.split('-')[0])
test_df['end_sport']=test_df['TRANSPORT_TRACE'].apply(lambda x:x.split('-')[-1])
print(train_df.loadingOrder.nunique())
# train_df=train_df.loc[(train_df.end_sport.isin(test_df.end_sport))|(train_df.TRANSPORT_TRACE.isin(test_df.TRANSPORT_TRACE))].reset_index(drop=True)#(train_df.end_sport.isin(test_df.end_sport))|
train_df['leng']=train_df.TRANSPORT_TRACE.apply(lambda x:len(x.split('-')))
train_df=train_df.loc[(train_df.leng==3)|(train_df.leng==2)].reset_index(drop=True)
# print(train_df.loadingOrder.nunique())
#-------------------
sport=pd.read_csv('./data/port_2.csv')
sport=sport[['TRANS_NODE_NAME','LONGITUDE','LATITUDE']]
sport.columns=['end_sport','end_longitude','end_latitude',]
test_df=test_df.merge(sport,on='end_sport',how='left')

sport.columns=['start_sport','start_longitude','start_latitude']
test_df=test_df.merge(sport,on='start_sport',how='left')
# 
sport=train_df.drop_duplicates('loadingOrder','last')[['loadingOrder','longitude','latitude']]
# sport=sport.groupby('end_sport')[['longitude','latitude']].mean().reset_index()
sport.columns=['loadingOrder','end_longitude','end_latitude',]
train_df=train_df.merge(sport,on='loadingOrder',how='left')
#
sport=train_df.drop_duplicates('loadingOrder','first')[['loadingOrder','longitude','latitude']]
# sport=sport.groupby('end_sport')[['longitude','latitude']].mean().reset_index()
sport.columns=['loadingOrder','start_longitude','start_latitude',]
train_df=train_df.merge(sport,on='loadingOrder',how='left')
print(train_df.loadingOrder.nunique())

train_df=train_df.loc[(train_df.end_latitude.notna())&(train_df.start_latitude.notna())]
print(train_df.loadingOrder.nunique())

# #------
train_label=get_labe(train_df)
#----
train_df=train_df.merge(train_label,on='loadingOrder',how='left')
# #----- 删除航行时间过短的
# train_df=train_df.loc[train_df.label>20].reset_index(drop=True)

#-------------对始发港统计label时间
tmp=train_df.drop_duplicates('loadingOrder','last')

start_sport_label=tmp.groupby('start_sport')['label'].mean().reset_index()
start_sport_label.columns=['start_sport','start_mean_time'] 
# ----
end_sport_label=tmp.groupby('end_sport')['label'].mean().reset_index()
end_sport_label.columns=['end_sport','end_mean_time']
#----
trace_label=tmp.groupby('TRANSPORT_TRACE')['label'].mean().reset_index()
trace_label.columns=['TRANSPORT_TRACE','TRANSPORT_TRACE_mean_time']
## 还可以做 当前其实坐标与起始港坐标距离，因为test中有些并不是和起始港一致的
del train_df['label']

train_df=train_df.merge(start_sport_label,on='start_sport',how='left')
train_df=train_df.merge(end_sport_label,on='end_sport',how='left')
train_df=train_df.merge(trace_label,on='TRANSPORT_TRACE',how='left')
#--------  ----- ----- ---- ------

test_df=test_df.merge(start_sport_label,on='start_sport',how='left')
test_df=test_df.merge(end_sport_label,on='end_sport',how='left')
test_df=test_df.merge(trace_label,on='TRANSPORT_TRACE',how='left')

(38382156, 10)
(38381781, 10)
193
(6060, 2)
(3337, 2)
选取 loadingOrder： 3337
清洗前： (38381781, 9)
清洗后的数据： (21665825, 9)
3337
2896
2896


is deprecated and will be removed in a future version


In [138]:
train_df=get_data(train_df,model='train')# 转换下时间并排序
train_df=get_anchor(train_df)
test_df=get_data(test_df,model='test')
test_df=get_anchor(test_df)

In [139]:
# 做一下降采样 和去除噪声数据
print(train_df.shape)
train_df=train_df.loc[train_df.diff_seconds>=180].reset_index(drop=True)
print(train_df.shape)
# train_df=train_df.loc[train_df.direction!=-1].reset_index(drop=True)
# print(train_df.shape)

(5480900, 49)
(4559407, 49)


In [None]:
def get_feature(df,model='train'):
     #计算移动方便后面计算轨迹长度 m
    df['move_leng']=distance(df.latitude.values,df.groupby('loadingOrder')['latitude'
                ].shift(1).values,df.longitude.values,df.groupby('loadingOrder')['longitude'].shift(1).values)  
    #计算下之前的累计距离
    df['cusum_distance'] = df.groupby('loadingOrder')['move_leng'].cumsum()
    
    #-----------
    #df['cusum_direction'] = df.groupby('loadingOrder')['direction_diff'].expanding().mean().reset_index(drop=True)
    #df['cusum_mean_speed'] = df.groupby('loadingOrder')['speed'].expanding().mean().reset_index(drop=True)
    df['cusum_stop'] = df.groupby('loadingOrder')['stop'].cumsum()
    #------------------------------------------------------
    df['direction_valc']=df['direction_diff']/df['diff_seconds']#
    df['mean_speed'] = df['move_leng']/(df['diff_seconds']+0.01)
    # 瞬时加速度 m/s2
    df['instant_acc']=df['mean_speed']/(df['diff_seconds']+0.01)
    df['cusum_speed']=df.groupby('loadingOrder')['speed'].rolling(window=5).mean().reset_index(drop=True)
    #获取船航行经度和维度的行驶比例和总航行占比
    df['long_gap'] = abs(df['end_longitude']-df['longitude'])
    df['lat_gap'] = abs(df['end_latitude']-df['latitude'])
    df['start_long_gap'] = abs(df['start_longitude']-df['longitude'])
    df['start_lat_gap'] = abs(df['start_latitude']-df['latitude'])
    df['start_long_ratio'] = abs(df['longitude']-df['start_longitude']) / abs(df['end_longitude']-df['start_longitude'])
    df['start_lat_ratio'] = abs(df['latitude']-df['start_latitude']) / abs(df['end_latitude']-df['start_latitude'])
    df['end_long_ratio'] = abs(df['longitude']-df['end_longitude']) / abs(df['end_longitude']-df['start_longitude'])
    df['end_lat_ratio'] = abs(df['latitude']-df['end_latitude']) / abs(df['end_latitude']-df['start_latitude'])
    #获取总差距
    df['all_start_gap'] = df['start_long_gap'] + df['start_lat_gap']
    df['all_start_ratio'] = df['all_start_gap'] / (abs(df['end_longitude']-df['start_longitude'])+abs(df['end_latitude']-df['start_latitude']))
    df['all_end_gap'] = df['long_gap'] + df['lat_gap']
    df['all_end_ratio'] = df['all_end_gap'] / (abs(df['end_longitude']-df['start_longitude'])+abs(df['end_latitude']-df['start_latitude']))
    
    #获取年月日等时间特征
    df['month'] = df['timestamp'].dt.month
    df['day'] = df['timestamp'].dt.day
    df['hour'] = df['timestamp'].dt.hour
    df['time'] = df['month'].astype(str)+'-'+df['day'].astype(str)
    
    ## 得到最早的时间
    tmp=df.drop_duplicates('loadingOrder',keep='first').reset_index(drop=True)
    tmp=tmp[['loadingOrder','timestamp','direction']]
    tmp.columns=['loadingOrder','start_time','start_direction']
    df=df.merge(tmp,on='loadingOrder',how='left')
    df['have_run_time']=(df['timestamp']-df['start_time']).dt.total_seconds()//3600
    df['distanc2taget']=distance(df.latitude.values,df.end_latitude.values,df.longitude.values,df.end_longitude.values)
    df['cusum_mean_speed'] = df['cusum_distance']/(df['have_run_time']+0.01)
    # 瞬时加速度 m/s2
    df['cusum_instant_acc']=df['cusum_mean_speed']/(df['have_run_time']+0.01)
    return df

In [140]:
train_label=get_labe(train_df,feat='end_time')
train_data=get_feature(train_df)
train_data=train_data.merge(train_label,on='loadingOrder',how='left')
train_data['label']=(train_data['end_time']-train_data['timestamp']).dt.total_seconds()//3600
print('get train data:',train_data.shape)

test = get_feature(test_df)
print('get test :',test.shape)

train_data,test=type_encoding(train_data,test)
gc.collect()

is deprecated and will be removed in a future version


get train data: (4559407, 58)
get test : (34712, 56)


119

In [96]:
features = [c for c in train_data.columns if c not in ['index','loadingOrder', 'label','start_time','end_time',
            'geohash_doc','timestamp','isTest', 'end_longitude', 'end_latitude', 'start_longitude', 'start_latitude',
                            'anchor', 'stop', 'delay','speed_acc',
            'predict','cusum_direction','local_lat_lon', 'direction_diff','speed_diff','direction_valc',
                        'gps_label', 'cos_direction','target_erro','diff_seconds','curr_week','curr_hour','leng'
                                        
]]
print(features,len(features))

['TRANSPORT_TRACE', 'carrierName', 'direction', 'latitude', 'longitude', 'speed', 'vesselMMSI', 'start_sport', 'end_sport', 'start_mean_time', 'end_mean_time', 'TRANSPORT_TRACE_mean_time', 'lat_diff', 'lon_diff', 'move_leng', 'cusum_distance', 'cusum_speed', 'mean_speed', 'instant_acc', 'long_gap', 'lat_gap', 'start_long_gap', 'start_lat_gap', 'start_long_ratio', 'start_lat_ratio', 'end_long_ratio', 'end_lat_ratio', 'all_start_gap', 'all_start_ratio', 'all_end_gap', 'all_end_ratio', 'month', 'day', 'hour', 'time', 'start_direction', 'have_run_time', 'distanc2taget', 'cusum_mean_speed', 'cusum_instant_acc'] 40


In [145]:
features = ['direction', 'latitude', 'longitude', 'speed', 'diff_seconds', 'TRANSPORT_TRACE', 'vesselMMSI', 'carrierName', 'start_sport', 
 'end_sport', 'end_longitude', 'end_latitude', 'start_longitude', 'start_latitude', 'lat_diff', 'lon_diff', 'speed_diff', 'direction_diff', 'anchor',
 'stop', 'move_leng', 'cusum_distance', 'cusum_stop', 'direction_valc', 'mean_speed', 'instant_acc', 'long_gap', 
 'lat_gap', 'start_long_gap', 'start_lat_gap', 'all_start_gap', 'all_end_gap', 'start_direction', 'have_run_time', 
 'cusum_mean_speed', 'cusum_instant_acc','cusum_speed']

print(features)
print(len(features))
gc.collect()

['direction', 'latitude', 'longitude', 'speed', 'diff_seconds', 'TRANSPORT_TRACE', 'vesselMMSI', 'carrierName', 'start_sport', 'end_sport', 'end_longitude', 'end_latitude', 'start_longitude', 'start_latitude', 'lat_diff', 'lon_diff', 'speed_diff', 'direction_diff', 'anchor', 'stop', 'move_leng', 'cusum_distance', 'cusum_stop', 'direction_valc', 'mean_speed', 'instant_acc', 'long_gap', 'lat_gap', 'start_long_gap', 'start_lat_gap', 'all_start_gap', 'all_end_gap', 'start_direction', 'have_run_time', 'cusum_mean_speed', 'cusum_instant_acc']
36


19

In [40]:
from sklearn.metrics import mean_squared_error,explained_variance_score
from sklearn.model_selection import KFold,train_test_split
from  lightgbm.sklearn import LGBMRegressor
def build_model(train_data, test, pred, label, seed=1080, is_shuffle=True):
    train_pred = np.zeros((train_data.shape[0], ))
    test_pred = np.zeros((test.shape[0], ))
    n_splits = 5
    # Kfold
    fold = KFold(n_splits=n_splits, shuffle=is_shuffle, random_state=seed)
    kf_way = fold.split(train_data[pred])
    # params
#     test_x=np.concatenate([test[pred].values,geohash_test],axis=1)
    # train
    for n_fold, (train_idx, valid_idx) in enumerate(kf_way, start=1):
        print('******第{}折**********'.format(n_fold))
        train_x, train_y = train_data[pred].iloc[train_idx].values, train_data[label].iloc[train_idx]
        valid_x, valid_y = train_data[pred].iloc[valid_idx].values, train_data[label].iloc[valid_idx]
#         geohash_tr_x,geohash_val_x=geohash_train[train_idx],geohash_train[valid_idx]
#         train_x=np.concatenate([train_x,geohash_tr_x],axis=1)
#         valid_x=np.concatenate([valid_x,geohash_val_x],axis=1)
        
        # 数据加载
        clf=LGBMRegressor( learning_rate=0.05,
        n_estimators=5000,
        num_leaves=156,
        subsample=0.8,
        njobs=-1,
        max_depth=6,
        reg_lambda=0,
        colsample_bytree=0.8,
        random_state=1090,  # 2019
        metric=['mse'])
        
        clf.fit(
        train_x, train_y,
        eval_set=[(valid_x, valid_y)],
        eval_metric=['mse'],
        categorical_feature='auto',
        early_stopping_rounds=50,
        verbose=100)        
        
        train_pred[valid_idx] = clf.predict(valid_x, num_iteration=clf.best_iteration_)
 
        test_pred += clf.predict(test[pred], num_iteration=clf.best_iteration_)/fold.n_splits
    train_data['predict']=train_pred
    print('mean_squared_error:',mean_squared_error(train_data[label].values,train_pred))
    test['label'] = test_pred
    return test[['loadingOrder', 'label']],clf


result,clf = build_model(train_data, test,pred= features,label= 'label', is_shuffle=True)

******第1折**********
Training until validation scores don't improve for 50 rounds
[100]	valid_0's l2: 290.398
[200]	valid_0's l2: 137.217
[300]	valid_0's l2: 87.9246
[400]	valid_0's l2: 64.1095
[500]	valid_0's l2: 50.345
[600]	valid_0's l2: 39.4002
[700]	valid_0's l2: 33.6221
[800]	valid_0's l2: 28.4568
[900]	valid_0's l2: 24.1927
[1000]	valid_0's l2: 21.0066
[1100]	valid_0's l2: 18.4622
[1200]	valid_0's l2: 16.4171
[1300]	valid_0's l2: 14.8983
[1400]	valid_0's l2: 13.5368
[1500]	valid_0's l2: 12.4583
[1600]	valid_0's l2: 11.3806
[1700]	valid_0's l2: 10.54
[1800]	valid_0's l2: 9.80811
[1900]	valid_0's l2: 9.17254
[2000]	valid_0's l2: 8.63606
[2100]	valid_0's l2: 8.17645
[2200]	valid_0's l2: 7.72456
[2300]	valid_0's l2: 7.32529
[2400]	valid_0's l2: 6.93869
[2500]	valid_0's l2: 6.64196
[2600]	valid_0's l2: 6.34846
[2700]	valid_0's l2: 6.0834
[2800]	valid_0's l2: 5.8377
[2900]	valid_0's l2: 5.59295
[3000]	valid_0's l2: 5.39187
[3100]	valid_0's l2: 5.19327
[3200]	valid_0's l2: 5.00739
[3300

In [146]:
from sklearn.metrics import mean_squared_error,explained_variance_score
from sklearn.model_selection import KFold,train_test_split
from  lightgbm.sklearn import LGBMRegressor
train_x,valid_x, train_y, valid_y=train_test_split(train_data[features].values,train_data['label'].values,test_size=0.25,random_state=1068)
print(train_x.shape)
print(valid_x.shape)
 # 数据加载
clf=LGBMRegressor( learning_rate=0.05,
n_estimators=5500,
num_leaves=156,
subsample=0.8,
njobs=-1,
max_depth=6,
reg_lambda=0.5,
colsample_bytree=0.8,
random_state=1090,  # 2019
metric=['mse'])

clf.fit(
train_x, train_y,
eval_set=[(valid_x, valid_y)],
eval_metric=['mse'],
categorical_feature='auto',
early_stopping_rounds=50,
verbose=100)        

test_pred = clf.predict(test[features], num_iteration=clf.best_iteration_)
# train_data['predict']=train_pred
# print('mean_squared_error:',mean_squared_error(valid_y,train_pred))
test['label'] = test_pred

(3419555, 36)
(1139852, 36)
Training until validation scores don't improve for 50 rounds
[100]	valid_0's l2: 3869.89
[200]	valid_0's l2: 2148.24
[300]	valid_0's l2: 1454.68
[400]	valid_0's l2: 1070.06
[500]	valid_0's l2: 812.235
[600]	valid_0's l2: 642.752
[700]	valid_0's l2: 524.619
[800]	valid_0's l2: 442.545


KeyboardInterrupt: 

In [104]:
import_dict={features[i]: f for i,f in enumerate(clf.feature_importances_)}
# import_dict.update({ i:v for i,v in enumerate(clf.feature_importances_[60:])})
sorted(import_dict.items(),key=lambda x:x[1],reverse=True)


[('vesselMMSI', 22410),
 ('start_direction', 21916),
 ('start_longitude', 21052),
 ('cusum_mean_speed', 20447),
 ('start_latitude', 20155),
 ('end_latitude', 19405),
 ('end_longitude', 17286),
 ('long_gap', 16044),
 ('TRANSPORT_TRACE', 16041),
 ('cusum_instant_acc', 15206),
 ('have_run_time', 14519),
 ('all_end_gap', 13953),
 ('end_sport', 12783),
 ('cusum_distance', 12754),
 ('lat_gap', 12530),
 ('latitude', 11852),
 ('longitude', 9118),
 ('start_lat_gap', 9067),
 ('start_long_gap', 8989),
 ('all_start_gap', 7901),
 ('speed', 7554),
 ('direction', 5730),
 ('lat_diff', 3905),
 ('start_sport', 3654),
 ('lon_diff', 2795),
 ('mean_speed', 2626),
 ('move_leng', 1137),
 ('instant_acc', 1018),
 ('diff_seconds', 970),
 ('speed_diff', 615),
 ('direction_diff', 329)]

## 后处理

In [131]:
# result=test.loc[test.have_run_time!=0]
result=test.copy()
# result['diff_label']=result.groupby('loadingOrder')['label'].diff(1)
# result1=result.loc[(result.diff_label>=0)]
# result=pd.concat([result1,result.loc[~(result.loadingOrder.isin(result1.loadingOrder))]],axis=0)

# result=test.drop_duplicates('loadingOrder','last')

result['ETA']=(result['timestamp']+result['label'].apply(lambda x:pd.Timedelta(hours=x)))
result['avg_time']=(result['ETA']-result['onboardDate']).dt.total_seconds()//3600

# #-----------
# # 箱型去除异常算法
# result['q1']=result.groupby('loadingOrder')['avg_time'].transform(lambda x:x.quantile(0.9))# q1.columns=['loadingOrder','q1']
# result['q2']=result.groupby('loadingOrder')['avg_time'].transform(lambda x:x.quantile(0.2))
# result['q1_bool']=(result['avg_time']>result['q1']).astype('int')
# result['q2_bool']=(result['avg_time']<result['q2']).astype('int')
# result['avg_time']=result['q1_bool']*result['q1']+result['avg_time']*(-result['q1_bool']+1)
# result['avg_time']=result['q2_bool']*result['q2']+result['avg_time']*(-result['q2_bool']+1)

result=result.groupby('loadingOrder')['avg_time'].mean().reset_index()

In [132]:
subs=test_df
subs=subs.merge(result[['loadingOrder','avg_time']],on='loadingOrder',how='left')
subs['ETA']=(subs['onboardDate']+subs['avg_time'].apply(lambda x:pd.Timedelta(hours=x))).apply(lambda x:x.strftime('%Y/%m/%d  %H:%M:%S'))
# del subs['label'],subs['start_time']

# subs = test_df.merge(subs, on='loadingOrder', how='left')
subs['onboardDate'] = test_df['onboardDate'].apply(lambda x:x.strftime('%Y/%m/%d  %H:%M:%S'))
subs['creatDate'] = pd.datetime.now().strftime('%Y/%m/%d  %H:%M:%S')
subs['timestamp'] =test_df['timestamp'].apply(lambda x:x.strftime('%Y-%m-%dT%H:%M:%S.000Z'))
# 整理columns顺序
subs =subs[['loadingOrder', 'timestamp', 'longitude', 'latitude', 'carrierName', 'vesselMMSI', 'onboardDate', 'ETA', 'creatDate']]
subs.to_csv('submitte.csv',index=False,)

25.639749942383038

## 文件提交

In [None]:
subs=test_df
subs=subs.merge(result[['loadingOrder','avg_time']],on='loadingOrder',how='left')
subs['ETA']=(subs['onboardDate']+subs['avg_time'].apply(lambda x:pd.Timedelta(hours=x))).apply(lambda x:x.strftime('%Y/%m/%d  %H:%M:%S'))
subs['onboardDate'] = test_df['onboardDate'].apply(lambda x:x.strftime('%Y/%m/%d  %H:%M:%S'))
subs['creatDate'] = pd.datetime.now().strftime('%Y/%m/%d  %H:%M:%S')
subs['timestamp'] =test_df['timestamp'].apply(lambda x:x.strftime('%Y-%m-%dT%H:%M:%S.000Z'))
# 整理columns顺序
subs =subs[['loadingOrder', 'timestamp', 'longitude', 'latitude', 'carrierName', 'vesselMMSI', 'onboardDate', 'ETA', 'creatDate']]
subs.to_csv('submmite.csv',index=False,)

## 多模多种子 多参数多分数 融合

In [None]:
result2['onboardDate'] = pd.to_datetime(result2['onboardDate'])
result2['onboardDate'] = result2['onboardDate'].apply(lambda x:x.strftime('%Y/%m/%d  %H:%M:%S'))
result2['ETA'] = pd.to_datetime(result2['ETA'])
result2['ETA'] = result2['ETA'].apply(lambda x:x.strftime('%Y/%m/%d  %H:%M:%S'))
result2['creatDate'] = pd.to_datetime(result2['creatDate'])
result2['creatDate'] = result2['creatDate'].apply(lambda x:x.strftime('%Y/%m/%d  %H:%M:%S'))

In [None]:
result1['timestamp'] = pd.to_datetime(result1['timestamp'])
result1['timestamp'] = result1['timestamp'].apply(lambda x:x.strftime('%Y/%m/%d  %H:%M:%S'))
result1['base_time'] = '2019/01/01  00:00:00'
result1['base_time'] = pd.to_datetime(result1['base_time'])
result1['ETA'] = pd.to_datetime(result1['ETA'])
result1['time_gap'] = (result1['ETA'] - result1['base_time']).dt.total_seconds()
result1['gap'] = result1.groupby('loadingOrder')['time_gap'].transform('median')
result1['ETA1'] = (result1['base_time']+result1['gap'].apply(lambda x:pd.Timedelta(seconds=x))).apply(lambda x:x.strftime('%Y/%m/%d  %H:%M:%S'))

In [None]:
result2['timestamp'] = pd.to_datetime(result2['timestamp'])
result2['timestamp'] = result2['timestamp'].apply(lambda x:x.strftime('%Y/%m/%d  %H:%M:%S'))
result2['base_time'] = '2019/01/01  00:00:00'
result2['base_time'] = pd.to_datetime(result1['base_time'])
result2['ETA'] = pd.to_datetime(result2['ETA'])
result2['time_gap'] = (result2['ETA'] - result2['base_time']).dt.total_seconds()
result2['gap'] = result2.groupby('loadingOrder')['time_gap'].transform('mean')

In [None]:
#求均值
result1['gap1'] = result2['gap']
result1['gap2'] = (result1['gap'] + result1['gap1'])/2

result1['ETA2'] = (result1['base_time']+result1['gap2'].apply(lambda x:pd.Timedelta(seconds=x))).apply(lambda x:x.strftime('%Y/%m/%d  %H:%M:%S'))