In [1]:
import os, random, csv, datetime, json
import pandas as pd
import numpy as np
import argparse
import time
from tqdm import tqdm

In [2]:
def _link_time_stat_frames(link, key, prefix):
    """Pivot per-order `link_time` statistics by `key` into wide frames.

    Groups `link` by (`order_id`, `key`) and returns six frames indexed by
    `order_id`: count, sum, mean, std, max and min of `link_time`, in that order.
    Each frame has one column per distinct value of `key`, named
    `<prefix>_no_<value>`, `<prefix>_time_sum_<value>`, and so on.
    Combinations that do not occur are filled with 0.
    """
    grouped = link.groupby(['order_id', key])['link_time']
    named_stats = [
        ('no', grouped.size()),
        ('time_sum', grouped.sum()),
        ('time_avg', grouped.mean()),
        ('time_std', grouped.std()),
        ('time_max', grouped.max()),
        ('time_min', grouped.min()),
    ]
    return [
        stat.unstack(level=-1, fill_value=0).add_prefix(prefix + '_' + label + '_')
        for label, stat in named_stats
    ]


def gen_feats(task, head, path, nl, link_feats, date=None):
    """Build one feature row per order from the order table and its route rows.

    Parameters
    ----------
    task : str
        'train' adds a `date` column to the result; any other value does not.
    head : pandas.DataFrame
        Order-level table. Columns read here: `order_id`, `slice_id`,
        `driver_id`, `distance`, `simple_eta`.
    path : pandas.DataFrame
        Route rows. Columns read here: `order_id`, `link_id`, `link_time`,
        `link_ratio`, `link_current_status`, `cross_flag`. Rows with
        `cross_flag == 0` are treated as links, rows with `cross_flag == 1`
        as crosses.
    nl : pandas.Series
        Lookup keyed by link id; the mapped value becomes `next_link`
        (ids without an entry get 0).
    link_feats : pandas.DataFrame
        Lookup indexed by link id with a `link_label` column; the mapped value
        becomes `link_type` (ids without an entry get 0).
    date : optional
        Value stored in the `date` column when `task == 'train'`. When omitted,
        the first 8 characters of the notebook-level name `f` are used, which
        is how this function historically obtained the date.

    Returns
    -------
    pandas.DataFrame
        A new frame whose first two columns are `slice_id` and `order_id`.
        Missing values are replaced by 0. The pivoted column sets depend on
        which status / next_link / link_type values occur in `path`, so two
        calls on different data may return different columns.

    Notes
    -----
    Neither `head` nor `path` is modified; earlier versions re-indexed `head`
    in place and added helper columns to `path`.
    """
    # Work on a copy so the caller's frame keeps its columns and index.
    head = head.copy()

    ########################
    ### head features
    ########################
    head['speed'] = head['distance'] / head['simple_eta']
    slice_speed = head.groupby('slice_id')['speed'].agg(['mean', 'std', 'min', 'max'])
    head['speed_avg'] = head['slice_id'].map(slice_speed['mean'])
    head['speed_std'] = head['slice_id'].map(slice_speed['std'])
    head['speed_min'] = head['slice_id'].map(slice_speed['min'])
    head['speed_max'] = head['slice_id'].map(slice_speed['max'])
    head['volume'] = head['slice_id'].map(head.groupby('slice_id').size())
    head['old_driver'] = head['driver_id'].map(head.groupby('driver_id').size())

    head = head.set_index('order_id')

    # Ratio-weighted time per row and its running total per order, bucketed
    # into whole multiples of 300. Computed over ALL rows (links and crosses)
    # as standalone Series so `path` itself is left untouched.
    link_eta = path['link_time'] * path['link_ratio']
    cum_time = (link_eta.groupby(path['order_id']).cumsum() / 300).astype('int')

    ########################
    ### link features
    ########################
    keep_link = (path['cross_flag'] == 0) & (path['link_time'] > 0)
    # Explicit copy: columns are added below and must not write through to a
    # view of `path` (this is what triggers SettingWithCopyWarning).
    link = path.loc[keep_link].copy()
    link['link_eta'] = link_eta[keep_link].values
    link['cum_time'] = cum_time[keep_link].values
    link['link_id'] = link['link_id'].astype('int')
    link['link_type'] = link['link_id'].map(link_feats['link_label']).fillna(0).astype('int')
    link['next_link'] = link['link_id'].map(nl).fillna(0).astype('int')
    link['slice_id'] = link['order_id'].map(head['slice_id'])

    cross = path.loc[path['cross_flag'] == 1]

    per_order = link.groupby('order_id')
    head['link_no'] = head.index.map(per_order.size())
    # The sum uses the ratio-weighted time, the other statistics the raw time.
    head['link_time_sum'] = head.index.map(per_order['link_eta'].sum())
    head['link_time_avg'] = head.index.map(per_order['link_time'].mean())
    head['link_time_std'] = head.index.map(per_order['link_time'].std())
    head['link_time_max'] = head.index.map(per_order['link_time'].max())
    head['link_time_min'] = head.index.map(per_order['link_time'].min())

    head['time_delay_max'] = head.index.map(per_order['cum_time'].max())
    head['time_delay_avg'] = head.index.map(per_order['cum_time'].mean())
    head['time_delay_std'] = head.index.map(per_order['cum_time'].std())

    # Same six statistics, pivoted by three different categorical keys.
    for key, prefix in (('link_current_status', 'current'),
                        ('next_link', 'next'),
                        ('link_type', 'type')):
        for frame in _link_time_stat_frames(link, key, prefix):
            head = head.join(frame)

    ########################
    ### cross features
    ########################
    per_order_cross = cross.groupby('order_id')
    head['cross_no'] = head.index.map(per_order_cross.size())
    head['cross_sum'] = head.index.map(per_order_cross['link_time'].sum())
    head['cross_avg'] = head.index.map(per_order_cross['link_time'].mean())
    head['cross_std'] = head.index.map(per_order_cross['link_time'].std())
    head['cross_max'] = head.index.map(per_order_cross['link_time'].max())
    head['cross_min'] = head.index.map(per_order_cross['link_time'].min())

    ########################
    ### slice-level link status counts
    ########################
    head = head.fillna(0).reset_index().set_index('slice_id')
    status_per_slice = link.groupby(['slice_id', 'link_current_status']).size()
    head = head.join(status_per_slice.unstack(level=-1, fill_value=0).add_prefix('link_status_no_'))
    head = head.fillna(0).reset_index()

    if task == 'train':
        if date is None:
            # Legacy behaviour: callers that do not pass `date` rely on the
            # notebook-level file name `f` (e.g. '20200801.csv').
            legacy_file_name = globals().get('f')
            if legacy_file_name is None:
                raise ValueError(
                    "gen_feats(task='train') needs `date` or a global file name `f`"
                )
            date = legacy_file_name[:8]
        head['date'] = date
    return head

In [3]:
# Successor lookup: each row of nextlinks.txt holds a link id and a
# comma-separated list of following links. `nl` maps link_id to the number of
# listed successors, capped at 4.
network = pd.read_table(
    '/media/fan/hdd/giscup/giscup_2021/nextlinks.txt',
    sep=' ',
    names=['link_id', 'next_link'],
)
network['next_link_no'] = network['next_link'].map(
    lambda successors: min(len(successors.split(',')), 4)
)
nl = network.set_index('link_id')['next_link_no']

In [4]:
# Link attribute table, indexed by the CSV column labelled 'Unnamed: 0'
# (presumably the link id written as an unnamed index column -- TODO confirm).
link_feats = (
    pd.read_csv('/media/fan/hdd/giscup/giscup_2021/link_feats.csv')
    .set_index('Unnamed: 0')
)

In [5]:
# Build train.csv by appending one feature table per daily file.
TRAIN_FILES = ['202008'+str(i).zfill(2)+'.csv' for i in range(1,32)]

# Column layout of the header row. It is fixed by the first NON-EMPTY day, so
# the header is still written when the first file has no rows (the previous
# `i == 0` check skipped it in that case).
train_columns = None

# NOTE: the loop variable must stay named `f`: gen_feats('train', ...) reads
# that notebook-level name to fill its `date` column when no date is passed.
for f in tqdm(TRAIN_FILES):
    head = pd.read_csv('/media/fan/hdd/giscup/giscup_2021/train_head/'+f)
    if head.shape[0] == 0:
        # Nothing to featurise; skip before loading the (large) path file.
        continue
    path = pd.read_csv('/media/fan/hdd/giscup/giscup_2021/train_path/'+f)
    head = gen_feats('train',head,path,nl,link_feats)
    if train_columns is None:
        train_columns = list(head.columns)
        head.to_csv('/media/fan/hdd/giscup/train.csv',index=False)
    else:
        # gen_feats derives its pivot columns from the values present in each
        # day, so the column set can differ between days. Rows appended without
        # a header must follow the header's layout exactly: missing columns are
        # filled with 0, columns unknown to the header are dropped.
        head = head.reindex(columns=train_columns, fill_value=0)
        head.to_csv('/media/fan/hdd/giscup/train.csv',mode='a',header=False,index=False)

31it [30:07, 58.30s/it]


In [6]:
# Test-set features: same pipeline as training, in 'test' mode, with the date
# column set to a fixed value afterwards.
head = pd.read_csv('/media/fan/hdd/giscup/giscup_2021/test_head.csv')
path = pd.read_csv('/media/fan/hdd/giscup/giscup_2021/test_path.csv')

head = gen_feats('test', head, path, nl, link_feats).assign(date='20200901')
head.to_csv('/media/fan/hdd/giscup/test.csv', index=False)

In [None]:
# Scratch: keep only the rows of `path` whose cross_flag is 0.
link = path.loc[path['cross_flag'] == 0]

In [None]:
# Scratch: show the column labels of the current `head` frame.
head.columns

In [None]:
# Scratch: display the link_id -> next_link_no lookup series.
nl

In [None]:
# Scratch: distinct values taken by the `cross_no` column of `head`.
head.loc[:, 'cross_no'].unique()