In [1]:
import glob
import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from sklearn.model_selection import KFold
import lightgbm as lgb
import requests
from sklearn.model_selection import StratifiedKFold
from lightgbm.sklearn import LGBMClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, f1_score,mean_squared_error,explained_variance_score
from scipy.stats import entropy, kurtosis
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split 
import pickle
import matplotlib.pyplot as plt
from tqdm import tqdm
import time
import datetime
import gc
import warnings

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

In [2]:
def reduce_mem(df):
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('{:.2f} Mb, {:.2f} Mb ({:.2f} %)'.format(start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
    gc.collect()
    return df

In [2]:
#data = pd.read_pickle('anlydata/complect_2.pkl')
data = pd.read_pickle('anlydata/samp_data.pkl')
#读取相关字典
carri_dict = np.load('anlydata/carri_dict.npy',allow_pickle=True).item()
mmsi_dict = np.load('anlydata/mmsi_dict.npy',allow_pickle=True).item()
trace_dict = np.load('anlydata/trace_dict.npy',allow_pickle=True).item()
#data = reduce_mem(data)
gc.collect()

93

In [3]:
#将表格中的值反编回字典trace_dict中的关键字
d1 = pd.DataFrame.from_dict(trace_dict,orient='index',columns =['TRANSPORT_TRACE'])
d1 = d1.reset_index().rename(columns={'index':'trace'})
data = data.merge(d1,on='TRANSPORT_TRACE',how='left').reset_index(drop=True)
del data['TRANSPORT_TRACE']
data = data.rename(columns={'trace':'TRANSPORT_TRACE'})

In [4]:
#将表格中的值反编回字典mmsi_dict中的关键字
d1 = pd.DataFrame.from_dict(mmsi_dict,orient='index',columns =['vesselMMSI'])
d1 = d1.reset_index().rename(columns={'index':'mmsi'})
data = data.merge(d1,on='vesselMMSI',how='left').reset_index(drop=True)
del data['vesselMMSI']
data = data.rename(columns={'mmsi':'vesselMMSI'})

In [5]:
#将表格中的值反编回字典carri_dict中的关键字
d1 = pd.DataFrame.from_dict(carri_dict,orient='index',columns =['carrierName'])
d1 = d1.reset_index().rename(columns={'index':'caname'})
data = data.merge(d1,on='carrierName',how='left').reset_index(drop=True)
del data['carrierName']
data = data.rename(columns={'caname':'carrierName'})

In [6]:
test = pd.read_csv('train0523/testData 0626.csv')
port = pd.read_csv('train0523/port_2.csv')

In [7]:
data1 = data.drop_duplicates([c for c in data.columns if c not in ['loadingOrder']])

In [8]:
data1 = data1.loc[data1['TRANSPORT_TRACE'].notnull()]
data1['len'] = data1['TRANSPORT_TRACE'].str.split('-')
data1['len'] = data1['len'].str.len()
data1 = data1.loc[(data1['len']>=2)&(data1['len']<=3)]
gc.collect()

14

In [9]:
#获取起始点和终点岗口
def get_pot(df):
    df['start_pot'] = df['TRANSPORT_TRACE'].str.split('-').apply(lambda x:x[0])
    df['end_pot'] = df['TRANSPORT_TRACE'].str.split('-').apply(lambda x:x[-1])
    return df
train = get_pot(data1)
del data1
#train = reduce_mem(train)
test = get_pot(test)

In [10]:
#test添加起始港口和终点港口的坐标
port1 = port.rename(columns={'TRANS_NODE_NAME':'start_pot','LONGITUDE':
                            'start_long','LATITUDE':'start_lat'})
port2 = port.rename(columns={'TRANS_NODE_NAME':'end_pot','LONGITUDE':
                            'end_long','LATITUDE':'end_lat'})
test = test.merge(port1,on='start_pot',how='left')
test = test.merge(port2,on='end_pot',how='left')
del port1,port2
gc.collect()

56

In [11]:
port1 = port.rename(columns={'TRANS_NODE_NAME':'start_pot','LONGITUDE':
                            'start_long_1','LATITUDE':'start_lat_1'})
port2 = port.rename(columns={'TRANS_NODE_NAME':'end_pot','LONGITUDE':
                            'end_long_1','LATITUDE':'end_lat_1'})
train = train.merge(port1,on='start_pot',how='left')
train = train.merge(port2,on='end_pot',how='left')
del port1,port2
gc.collect()

20

In [12]:
#train添加起始港口和终点港口的坐标
tmp=train.drop_duplicates('loadingOrder',keep='last')
tmp = tmp[['loadingOrder','longitude','latitude']].rename(columns={'longitude':
                                            'end_long','latitude':'end_lat'})
tmp1=train.drop_duplicates('loadingOrder',keep='first')
tmp1 = tmp1[['loadingOrder','longitude','latitude']].rename(columns={'longitude':
                                            'start_long','latitude':'start_lat'})
train = train.merge(tmp,on='loadingOrder',how='left')
train = train.merge(tmp1,on='loadingOrder',how='left')
#train = reduce_mem(train)
gc.collect()

0

In [13]:
gc.collect()

20

In [13]:
train = train.loc[train['start_long'].notnull()]
train = train.loc[train['start_lat'].notnull()]
train = train.loc[train['end_long'].notnull()]
train = train.loc[train['end_lat'].notnull()]
gc.collect()

20

In [14]:
train = train.loc[(abs((train['start_long'] - train['start_long_1'])+(train['start_lat'] - train['start_lat_1']))<5)] 
train = train.loc[(abs((train['end_long'] - train['end_long_1'])+(train['end_lat'] - train['end_lat_1']))<5)]
del train['start_long_1'],train['start_lat_1'],train['end_long_1'],train['end_lat_1']
#train = reduce_mem(train)
gc.collect()

26

In [15]:
def get_data(data, model='train'):
    #转换成时间戳，并且将每个运单按照时间排序
    assert model=='train' or model=='test'
    data.sort_values(['loadingOrder','timestamp'],inplace=True)
    if model=='train':
        pass
#         data['vesselNextportETA'] = pd.to_datetime(data['vesselNextportETA'], infer_datetime_format=True) 
    else:
        data['onboardDate'] = pd.to_datetime(data['onboardDate'], infer_datetime_format=True)
    data['timestamp'] = pd.to_datetime(data['timestamp'], infer_datetime_format=True)    
    return data
def get_anchor(df):
    # 转化为360度数
    df['direction']=df['direction'].values/10
    tmp=df.groupby('loadingOrder')
    df['lat_diff'] = tmp['latitude'].diff(1)
    df['lon_diff'] = tmp['longitude'].diff(1)
    df['lat_diff_1'] = tmp['latitude'].diff(-1)
    df['lon_diff_1'] = tmp['longitude'].diff(-1)
    df['speed_diff'] = tmp['speed'].diff(1)
    df['speed_diff_1'] = tmp['speed'].diff(-1)
    df['direction_diff']=tmp['direction'].diff(1)
    df['diff_seconds'] = tmp['timestamp'].diff(1).dt.total_seconds()
    ### 这样实际是做了一个采样！！ #可以去除重复的记录
    df['anchor'] =((df['lat_diff']<= 0.03)&(df['lon_diff'] <= 0.03)&(df['speed_diff'] <= 0.3)).astype('int')
    ###  这里标记下船几乎停止的地方
    df['stop']=((df['lat_diff'] <= 0.03)&(df['lon_diff'] <= 0.03)&(df['speed'] <= 1)).astype('int')
    df['delay']=(df['diff_seconds']>3000).astype('int')
    return df
def distance(LatA,LatB,LonA,LonB):
    EARTH_RADIUS = 6378.137 # 千米
    def rad(d):
        return d * np.pi/ 180.0
    s=0
    radLatA = rad(LatA)
    radLatB = rad(LatB)
    a = radLatA-radLatB
    b = rad(LonA)-rad(LonB)
    s= 2 * np.arcsin(np.sqrt(np.power(np.sin(a / 2),2)+ np.cos(radLatA) * np.cos(radLatB)*np.power(np.sin(b / 2),2)))
    s=s* EARTH_RADIUS
    #  保留两位小数
    s = np.round(s * 100)/100
    s = s * 1000 # 转换成m
    return s
def get_feature(df,model='train'):
     #计算移动方便后面计算轨迹长度 m
    df['move_leng']=distance(df.latitude.values,df.groupby('loadingOrder')['latitude'
                ].shift(1).values,df.longitude.values,df.groupby('loadingOrder')['longitude'].shift(1).values)  
    #计算下之前的累计距离
    df['cusum_distance'] = df.groupby('loadingOrder')['move_leng'].cumsum()
    
    #-----------
    #df['cusum_direction'] = df.groupby('loadingOrder')['direction_diff'].expanding().mean().reset_index(drop=True)
    #df['cusum_mean_speed'] = df.groupby('loadingOrder')['speed'].expanding().mean().reset_index(drop=True)
    df['cusum_stop'] = df.groupby('loadingOrder')['stop'].cumsum()
    df['cusum_speed']=df.groupby('loadingOrder')['speed'].rolling(window=5).mean().reset_index(drop=True)
    #------------------------------------------------------
    df['direction_valc']=df['direction_diff']/df['diff_seconds']#
    df['mean_speed'] = df['move_leng']/(df['diff_seconds']+0.01)
    # 瞬时加速度 m/s2
    df['instant_acc']=df['mean_speed']/(df['diff_seconds']+0.01)
    
    #获取船航行经度和维度的行驶比例和总航行占比
    df['long_gap'] = abs(df['end_long']-df['longitude'])
    df['lat_gap'] = abs(df['end_lat']-df['latitude'])
    df['start_long_gap'] = abs(df['start_long']-df['longitude'])
    df['start_lat_gap'] = abs(df['start_lat']-df['latitude'])
    df['start_long_ratio'] = abs(df['longitude']-df['start_long']) / abs(df['end_long']-df['start_long'])
    df['start_lat_ratio'] = abs(df['latitude']-df['start_lat']) / abs(df['end_lat']-df['start_lat'])
    df['end_long_ratio'] = abs(df['longitude']-df['end_long']) / abs(df['end_long']-df['start_long'])
    df['end_lat_ratio'] = abs(df['latitude']-df['end_lat']) / abs(df['end_lat']-df['start_lat'])
    #获取总差距
    df['all_start_gap'] = df['start_long_gap'] + df['start_lat_gap']
    df['all_start_ratio'] = df['all_start_gap'] / (abs(df['end_long']-df['start_long'])+abs(df['end_lat']-df['start_lat']))
    df['all_end_gap'] = df['long_gap'] + df['lat_gap']
    df['all_end_ratio'] = df['all_end_gap'] / (abs(df['end_long']-df['start_long'])+abs(df['end_lat']-df['start_lat']))
    
    #获取年月日等时间特征
    df['month'] = df['timestamp'].dt.month
    df['day'] = df['timestamp'].dt.day
    df['hour'] = df['timestamp'].dt.hour
    df['time'] = df['month'].astype(str)+'-'+df['day'].astype(str)
    
    ## 得到最早的时间
    tmp=df.drop_duplicates('loadingOrder',keep='first').reset_index(drop=True)
    tmp=tmp[['loadingOrder','timestamp','direction']]
    tmp.columns=['loadingOrder','start_time','start_direction']
    df=df.merge(tmp,on='loadingOrder',how='left')
    df['have_run_time']=(df['timestamp']-df['start_time']).dt.total_seconds()//3600
    df['distanc2taget']=distance(df.latitude.values,df.end_lat.values,df.longitude.values,df.end_long.values)
    df['cusum_mean_speed'] = df['cusum_distance']/(df['have_run_time']+0.01)
    # 瞬时加速度 m/s2
    df['cusum_instant_acc']=df['cusum_mean_speed']/(df['have_run_time']+0.01)
    return df
def type_encoding(train_data,test_data):
    ### ----对类别进行编码
    for f in ['TRANSPORT_TRACE','carrierName','vesselMMSI','time']:
        unique_set=set(train_data[f].unique().tolist()+test_data[f].unique().tolist())
        unique_dict={ f:i for i,f in enumerate(unique_set)}
        test_data[f]=test_data[f].map(unique_dict)
        train_data[f]=train_data[f].map(unique_dict)
        
    # 港口名称编码
    unique_set=set(train_data['start_pot'].unique().tolist()+test_data['start_pot'].unique().tolist()
                  +train_data['end_pot'].unique().tolist()+test_data['end_pot'].unique().tolist())
    unique_dict={ f:i for i,f in enumerate(unique_set)}
    for f in ['start_pot','end_pot']:
        test_data[f]=test_data[f].map(unique_dict)
        train_data[f]=train_data[f].map(unique_dict)
    return train_data,test_data
def get_label(df):
    tmp = df.groupby('loadingOrder')['timestamp'].agg({'time_max':'max'})
    df = df.merge(tmp,on='loadingOrder',how='left')
    df['label'] = (df['time_max'] - df['timestamp']).dt.total_seconds()//3600
    return df

In [16]:
train = get_data(train,model='train')
train = get_anchor(train)
train = get_feature(train)
train = get_label(train)
gc.collect()

10

In [17]:
test = get_data(test,model='test')
test = get_anchor(test)
test = get_feature(test)

In [20]:
train = get_sample(train)
test = get_sample1(test)
gc.collect()

121

In [18]:
train,test1 = type_encoding(train,test)
gc.collect()

0

In [101]:
train['leng']=train.groupby('loadingOrder')['timestamp'].transform('count')
train1 = train.loc[(train['leng']>2)]
gc.collect()

20

In [None]:
['direction', 'latitude', 'longitude', 'speed', 'diff_seconds', 'TRANSPORT_TRACE', 'vesselMMSI', 'start_pot', 'end_pot',
 'end_long', 'end_lat', 'start_long', 'start_lat', 'lat_diff', 'lon_diff', 'speed_diff', 'direction_diff', 'move_leng',
 'cusum_distance', 'cusum_stop', 'mean_speed', 'instant_acc', 'long_gap', 'lat_gap', 'start_long_gap', 'start_lat_gap', 
 'all_start_gap', 'all_end_gap', 'start_direction', 'have_run_time', 'cusum_mean_speed', 'cusum_instant_acc']14.25

In [None]:
['direction', 'latitude', 'longitude', 'speed', 'diff_seconds', 'TRANSPORT_TRACE', 'vesselMMSI', 'carrierName', 'start_pot', 
 'end_pot', 'end_long', 'end_lat', 'start_long', 'start_lat', 'lat_diff', 'lon_diff', 'speed_diff', 'direction_diff', 'anchor',
 'stop', 'delay', 'move_leng', 'cusum_distance', 'cusum_stop', 'direction_valc', 'mean_speed', 'instant_acc', 'long_gap', 
 'lat_gap', 'start_long_gap', 'start_lat_gap', 'all_start_gap', 'all_end_gap', 'start_direction', 'have_run_time', 
 'cusum_mean_speed', 'cusum_instant_acc']14.06

In [100]:
features = [c for c in train.columns if c in ['direction', 'latitude', 'longitude', 'speed', 'diff_seconds', 'TRANSPORT_TRACE', 'vesselMMSI', 'carrierName', 'start_pot', 
 'end_pot', 'end_long', 'end_lat', 'start_long', 'start_lat', 'lat_diff', 'lon_diff', 'speed_diff', 'direction_diff', 'anchor',
 'stop', 'delay', 'move_leng', 'cusum_distance', 'cusum_stop', 'direction_valc', 'mean_speed', 'instant_acc', 'long_gap', 
 'lat_gap', 'start_long_gap', 'start_lat_gap', 'all_start_gap', 'all_end_gap', 'start_direction', 'have_run_time', 
 'cusum_mean_speed', 'cusum_instant_acc','cusum_speed']]
print(features)
print(len(features))
gc.collect()

['direction', 'latitude', 'longitude', 'speed', 'diff_seconds', 'TRANSPORT_TRACE', 'vesselMMSI', 'carrierName', 'start_pot', 'end_pot', 'end_long', 'end_lat', 'start_long', 'start_lat', 'lat_diff', 'lon_diff', 'speed_diff', 'direction_diff', 'anchor', 'stop', 'delay', 'move_leng', 'cusum_distance', 'cusum_stop', 'direction_valc', 'mean_speed', 'instant_acc', 'long_gap', 'lat_gap', 'start_long_gap', 'start_lat_gap', 'all_start_gap', 'all_end_gap', 'start_direction', 'have_run_time', 'cusum_mean_speed', 'cusum_instant_acc', 'cusum_speed']
38


20

In [None]:
from sklearn.metrics import mean_squared_error,explained_variance_score
from sklearn.model_selection import KFold
from  lightgbm.sklearn import LGBMRegressor
def mse_score_eval(preds, valid):
    labels = valid.get_label()
    scores = mean_squared_error(y_true=labels, y_pred=preds)
    return 'mse_score', scores, True

def build_model(train_data, test, pred, label, seed=1099, is_shuffle=True):
    train_pred = np.zeros((train_data.shape[0], ))
    test_pred = np.zeros((test.shape[0], ))
    n_splits = 5
    # Kfold
    fold = KFold(n_splits=n_splits, shuffle=is_shuffle, random_state=seed)
    kf_way = fold.split(train_data[pred])
    # params
#     test_x=np.concatenate([test[pred].values,geohash_test],axis=1)
    # train
    for n_fold, (train_idx, valid_idx) in enumerate(kf_way, start=1):
        train_x, train_y = train_data[pred].iloc[train_idx].values, train_data[label].iloc[train_idx]
        valid_x, valid_y = train_data[pred].iloc[valid_idx].values, train_data[label].iloc[valid_idx]
#         geohash_tr_x,geohash_val_x=geohash_train[train_idx],geohash_train[valid_idx]
#         train_x=np.concatenate([train_x,geohash_tr_x],axis=1)
#         valid_x=np.concatenate([valid_x,geohash_val_x],axis=1)
        
        # 数据加载
        clf=LGBMRegressor( learning_rate=0.5,
        n_estimators=4000,
        boosting_type = 'gbdt',
        objective = 'regression',
        num_leaves=256,
        subsample=0.8,
        njobs=-1,
        max_depth=6,
        reg_lambda=0,
        colsample_bytree=0.8,
        random_state=2019,  # 2019
        metric=['mse'])
        
        clf.fit(
        train_x, train_y,
        eval_set=[(valid_x, valid_y)],
        eval_metric=['mse'],
        categorical_feature='auto',
        early_stopping_rounds=100,
        verbose=100)        
        
        train_pred[valid_idx] = clf.predict(valid_x, num_iteration=clf.best_iteration_)
        
        
        test_pred += clf.predict(test[pred], num_iteration=clf.best_iteration_)/fold.n_splits
    
    print('mean_squared_error:',mean_squared_error(train_data[label].values,train_pred))
    test['label'] = test_pred
    return test[['loadingOrder', 'label']],clf


def bulid_onetrain(train_data, test,pred= features,label= 'label',seed=1099, is_shuffle=True):
    train_x,train_y=train_data[features].values,train_data[label].values
    clf=LGBMRegressor( learning_rate=0.05,
    boosting_type = 'gbdt',
    objective = 'regression',
    n_estimators=6000,
    num_leaves=156,
    subsample=0.8,
    njobs=-1,
    max_depth=6,
    reg_lambda=0,
    colsample_bytree=0.8,
    random_state=2019,  # 2019
    metric=['mse'])

    clf.fit(
    train_x, train_y,
    eval_set=[(train_x, train_y)],
    eval_metric=['mse'],
    categorical_feature='auto',
    early_stopping_rounds=100,
    verbose=100)        

    #train_pred= clf.predict(train_x, num_iteration=clf.best_iteration_)


    test_pred= clf.predict(test[pred], num_iteration=clf.best_iteration_)

    #print('mean_squared_error:',mean_squared_error(train_y,train_pred))
    test['label'] = test_pred
    return test[['loadingOrder', 'label']],clf
#result,clf = build_model(train1, test,pred= features,label= 'label', is_shuffle=True)
result,clf=bulid_onetrain(train1, test1,pred= features,label= 'label',is_shuffle=True)

Training until validation scores don't improve for 100 rounds
[100]	training's l2: 3956.7
[200]	training's l2: 2066.48
[300]	training's l2: 1417.67
[400]	training's l2: 1037.52
[500]	training's l2: 817.842
[600]	training's l2: 647.177
[700]	training's l2: 536.525


In [83]:
test['onboardDate'] = pd.to_datetime(test['onboardDate'])
test['timestamp'] = pd.to_datetime(test['timestamp'])
test['timestamp'] = test['timestamp'].apply(lambda x:x.strftime('%Y/%m/%d  %H:%M:%S'))
test['timestamp'] = pd.to_datetime(test['timestamp'])
test['ETA']=(test['timestamp']+test['label'].apply(lambda x:pd.Timedelta(hours=x))).apply(lambda x:x.strftime('%Y/%m/%d  %H:%M:%S'))
#test.drop(['direction','TRANSPORT_TRACE'],axis=1,inplace=True)
test['onboardDate'] = test['onboardDate'].apply(lambda x:x.strftime('%Y/%m/%d  %H:%M:%S'))
test['onboardDate'] = pd.to_datetime(test['onboardDate'])
test['creatDate'] = pd.datetime.now().strftime('%Y/%m/%d  %H:%M:%S')
#test['timestamp'] =test['timestamp'].apply(lambda x:x.strftime('%Y-%m-%dT%H:%M:%S.000Z'))
test['sub_onboard'] = (test['timestamp']-test['onboardDate']).dt.total_seconds()

In [84]:
result = test[['loadingOrder', 'timestamp', 'longitude', 'latitude', 'carrierName', 'vesselMMSI', 'onboardDate', 'ETA', 'creatDate']]

In [85]:
#预测结果取均值
result['base_time'] = '2019/01/01  00:00:00'
result['base_time'] = pd.to_datetime(result['base_time'])
result['ETA'] = pd.to_datetime(result['ETA'])
result['time_gap'] = (result['ETA'] - result['base_time']).dt.total_seconds()
result['gap'] = result.groupby('loadingOrder')['time_gap'].transform('mean')
result['ETA1'] = (result['base_time']+result['gap'].apply(lambda x:pd.Timedelta(seconds=x))).apply(lambda x:x.strftime('%Y/%m/%d  %H:%M:%S'))

In [86]:
result = result[['loadingOrder', 'timestamp', 'longitude', 'latitude', 'carrierName', 'vesselMMSI', 'onboardDate', 'ETA1', 'creatDate']].rename(columns={'ETA1':'ETA'})

In [34]:
base = pd.read_csv('B10.csv')

In [95]:
np.abs(((pd.to_datetime(result2.ETA)-pd.to_datetime(base.ETA)).dt.total_seconds().values//3600)).mean()

16.752160636091265

In [94]:
result2

Unnamed: 0,loadingOrder,timestamp,longitude,latitude,carrierName,vesselMMSI,onboardDate,ETA,creatDate
0,AE223035353902,2019-07-03T21:16:48.000Z,120.093858,22.581320,OIEQNT,C2075927370,2019/07/02 04:12:48,2019/07/26 15:35:46,2020/07/01 22:12:14
1,AE223035353902,2019-07-03T21:34:48.000Z,120.035707,22.617522,OIEQNT,C2075927370,2019/07/02 04:12:48,2019/07/26 15:35:46,2020/07/01 22:12:14
2,AE223035353902,2019-07-03T21:51:18.000Z,119.981800,22.658465,OIEQNT,C2075927370,2019/07/02 04:12:48,2019/07/26 15:35:46,2020/07/01 22:12:14
3,AE223035353902,2019-07-03T21:54:18.000Z,119.970845,22.668688,OIEQNT,C2075927370,2019/07/02 04:12:48,2019/07/26 15:35:46,2020/07/01 22:12:14
4,AE223035353902,2019-07-03T22:11:08.000Z,119.953628,22.756897,OIEQNT,C2075927370,2019/07/02 04:12:48,2019/07/26 15:35:46,2020/07/01 22:12:14
...,...,...,...,...,...,...,...,...,...
34707,ZZ524449869421,2020-03-17T04:02:38.000Z,103.776707,1.252897,BHSOUA,P2595193878,2020/03/13 06:07:28,2020/04/05 02:53:03,2020/07/01 22:12:14
34708,ZZ524449869421,2020-03-17T04:03:18.000Z,103.776312,1.253418,BHSOUA,P2595193878,2020/03/13 06:07:28,2020/04/05 02:53:03,2020/07/01 22:12:14
34709,ZZ524449869421,2020-03-17T04:05:18.000Z,103.775175,1.254865,BHSOUA,P2595193878,2020/03/13 06:07:28,2020/04/05 02:53:03,2020/07/01 22:12:14
34710,ZZ524449869421,2020-03-17T04:05:58.000Z,103.774803,1.255285,BHSOUA,P2595193878,2020/03/13 06:07:28,2020/04/05 02:53:03,2020/07/01 22:12:14


In [88]:
result1 = result[['loadingOrder','ETA']].drop_duplicates('loadingOrder')

In [89]:
test3 = pd.read_csv('train0523/testData 0626.csv')

In [90]:
test3 = test3.merge(result1,on='loadingOrder',how='left')

In [93]:
result2 = test3[['loadingOrder', 'timestamp', 'longitude', 'latitude', 'carrierName', 'vesselMMSI', 'onboardDate', 'ETA', 'creatDate']]

In [91]:
test3['creatDate'] = pd.datetime.now().strftime('%Y/%m/%d  %H:%M:%S')

In [37]:
#添加终点港口的时间
tmp=test2.drop_duplicates('loadingOrder',keep='last')
tmp = tmp[['loadingOrder','timestamp']].rename(columns={'timestamp':
                                            'timestamp1'})
test2 = test2.merge(tmp,on='loadingOrder',how='left')

In [208]:
B1['onboardDate'] = pd.to_datetime(B1['onboardDate'])
B1['onboardDate'] = B1['onboardDate'].apply(lambda x:x.strftime('%Y/%m/%d  %H:%M:%S'))
B1['ETA'] = pd.to_datetime(B1['ETA'])
B1['ETA'] = B1['ETA'].apply(lambda x:x.strftime('%Y/%m/%d  %H:%M:%S'))
B1['creatDate'] = pd.datetime.now().strftime('%Y/%m/%d  %H:%M:%S')

In [30]:
test2['timestamp'] = pd.to_datetime(test2['timestamp'])
test2['timestamp'] = test2['timestamp'].apply(lambda x:x.strftime('%Y/%m/%d  %H:%M:%S'))

In [None]:
test2 = test2.merge(result,on=['vesselMMSI','onboardDate'],how='left')

In [35]:
result = test[['loadingOrder', 'timestamp', 'longitude', 'latitude', 'carrierName', 'vesselMMSI', 'onboardDate', 'ETA', 'creatDate']]

In [36]:
result['base_time'] = '2019/01/01  00:00:00'
result['base_time'] = pd.to_datetime(result['base_time'])
result['ETA'] = pd.to_datetime(result['ETA'])
result['time_gap'] = (result['ETA'] - result['base_time']).dt.total_seconds()
result['gap'] = result.groupby('loadingOrder')['time_gap'].transform('mean')
result['ETA1'] = (result['base_time']+result['gap'].apply(lambda x:pd.Timedelta(seconds=x))).apply(lambda x:x.strftime('%Y/%m/%d  %H:%M:%S'))

In [None]:
#多结果均值融合

In [70]:
result1 = pd.read_csv('result5.csv')
result2 = pd.read_csv('change1.csv')

In [103]:
result2['onboardDate'] = pd.to_datetime(result2['onboardDate'])
result2['onboardDate'] = result2['onboardDate'].apply(lambda x:x.strftime('%Y/%m/%d  %H:%M:%S'))
result2['ETA'] = pd.to_datetime(result2['ETA'])
result2['ETA'] = result2['ETA'].apply(lambda x:x.strftime('%Y/%m/%d  %H:%M:%S'))
result2['creatDate'] = pd.to_datetime(result2['creatDate'])
result2['creatDate'] = result2['creatDate'].apply(lambda x:x.strftime('%Y/%m/%d  %H:%M:%S'))

In [83]:
result1['timestamp'] = pd.to_datetime(result1['timestamp'])
result1['timestamp'] = result1['timestamp'].apply(lambda x:x.strftime('%Y/%m/%d  %H:%M:%S'))
result1['base_time'] = '2019/01/01  00:00:00'
result1['base_time'] = pd.to_datetime(result1['base_time'])
result1['ETA'] = pd.to_datetime(result1['ETA'])
result1['time_gap'] = (result1['ETA'] - result1['base_time']).dt.total_seconds()
result1['gap'] = result1.groupby('loadingOrder')['time_gap'].transform('median')
result1['ETA1'] = (result1['base_time']+result1['gap'].apply(lambda x:pd.Timedelta(seconds=x))).apply(lambda x:x.strftime('%Y/%m/%d  %H:%M:%S'))

In [84]:
result2['timestamp'] = pd.to_datetime(result2['timestamp'])
result2['timestamp'] = result2['timestamp'].apply(lambda x:x.strftime('%Y/%m/%d  %H:%M:%S'))
result2['base_time'] = '2019/01/01  00:00:00'
result2['base_time'] = pd.to_datetime(result1['base_time'])
result2['ETA'] = pd.to_datetime(result2['ETA'])
result2['time_gap'] = (result2['ETA'] - result2['base_time']).dt.total_seconds()
result2['gap'] = result2.groupby('loadingOrder')['time_gap'].transform('mean')

In [85]:
#求均值
result1['gap1'] = result2['gap']
result1['gap2'] = (result1['gap'] + result1['gap1'])/2

In [86]:
result1['ETA2'] = (result1['base_time']+result1['gap2'].apply(lambda x:pd.Timedelta(seconds=x))).apply(lambda x:x.strftime('%Y/%m/%d  %H:%M:%S'))

In [37]:
result = result[['loadingOrder', 'timestamp', 'longitude', 'latitude', 'carrierName', 'vesselMMSI', 'onboardDate', 'ETA1', 'creatDate']].rename(columns={'ETA1':'ETA'})

In [43]:
result['ETA'] = pd.to_datetime(result['ETA'])
result['ETA'] = result['ETA'].apply(lambda x:x.strftime('%Y/%m/%d  %H:%M:%S'))

In [46]:
#转化格式
result['carrierName'] = test3['carrierName']
result['vesselMMSI'] = test3['vesselMMSI']

In [24]:
result = result[['loadingOrder', 'timestamp', 'longitude', 'latitude', 'carrierName', 'vesselMMSI', 'onboardDate', 'ETA1', 'creatDate']].rename(columns={'ETA1':'ETA'})

In [47]:
result['timestamp'] = pd.to_datetime(result['timestamp'])
result['timestamp'] =result['timestamp'].apply(lambda x:x.strftime('%Y-%m-%dT%H:%M:%S.000Z'))

In [44]:
test3['creatDate'] = pd.datetime.now().strftime('%Y/%m/%d  %H:%M:%S')

In [51]:
import_dict={features[i]: f for i,f in enumerate(clf.feature_importances_)}
# import_dict.update({ i:v for i,v in enumerate(clf.feature_importances_[60:])})
sorted(import_dict.items(),key=lambda x:x[1],reverse=True)

[('have_run_time', 25738),
 ('lat_gap', 19945),
 ('long_gap', 19503),
 ('cusum_distance', 18154),
 ('vesselMMSI', 15338),
 ('start_long_gap', 14088),
 ('start_lat', 13775),
 ('start_long', 13485),
 ('start_lat_gap', 13342),
 ('end_lat', 12374),
 ('cusum_mean_speed', 12285),
 ('latitude', 11822),
 ('cusum_instant_acc', 11706),
 ('end_long', 11373),
 ('lat_diff', 11083),
 ('TRANSPORT_TRACE', 10643),
 ('mean_speed', 10521),
 ('longitude', 10402),
 ('lon_diff', 8642),
 ('speed', 7767),
 ('diff_seconds', 5406),
 ('instant_acc', 4888),
 ('speed_diff', 2315)]

In [None]:
#速度特征
#位移特征
#轨迹特征