In [1]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

df_tr = pd.read_csv("train.csv")

def polyline_to_trip_duration(polyline):
  return max(polyline.count("[") - 2, 0) * 15

df_tr["LEN"] = df_tr["POLYLINE"].apply(polyline_to_trip_duration)

def parse_time(x):
  dt = datetime.fromtimestamp(x["TIMESTAMP"])
  return dt.year, dt.month, dt.day, dt.hour, dt.weekday()

df_tr[["YR", "MON", "DAY", "HR", "WK"]] = df_tr[["TIMESTAMP"]].apply(parse_time, axis=1, result_type="expand")

In [2]:
df_tst = pd.read_csv("test_public.csv")
df_tr_ = df_tr.copy(deep=True)
df_tr_ = df_tr_.drop(['TRIP_ID','YR','DAY_TYPE','ORIGIN_CALL','ORIGIN_STAND','TAXI_ID','TIMESTAMP','MISSING_DATA','POLYLINE'], axis=1)
df_tr_['CALL'] = [ord(x)-ord('A') for x in df_tr_['CALL_TYPE']]
df_tr_ = df_tr_.drop(['CALL_TYPE'], axis=1)

In [3]:
# input should be (N, 1), output is (N, dim)
def one_hot(in_dat, dim):
    out_dat = np.zeros((in_dat.shape[0], dim))
    for ind in range(in_dat.shape[0]):
        out_dat[ind, in_dat[ind]] = 1
    return out_dat

# HR, CALL, DAY one-hot encoded
# input should be (N, 3)
def converter(in_dat):
    hr_dat = one_hot(in_dat[:, 0], 24)
    call_dat = one_hot(in_dat[:, 1], 3)
    
    return np.concatenate((hr_dat, call_dat), axis=1)

In [4]:
hr_call_in = converter(df_tr_[["HR", "CALL"]].to_numpy())

In [5]:
df_tst_ = df_tst.copy(deep=True)
df_tst_[["YR", "MON", "DAY", "HR", "WK"]] = df_tst_[["TIMESTAMP"]].apply(parse_time, axis=1, result_type="expand")
df_tst_ = df_tst_.drop(['YR','DAY_TYPE','ORIGIN_CALL','ORIGIN_STAND','TAXI_ID','TIMESTAMP','MISSING_DATA'], axis=1)
df_tst_['CALL'] = [ord(x)-ord('A') for x in df_tst_['CALL_TYPE']]

tst_np = converter(df_tst_[["HR","CALL", "MON"]].to_numpy())
print(tst_np.shape)
print(hr_call_in.shape)

(320, 27)
(1710670, 27)


In [6]:
all_dat = np.concatenate((hr_call_in, tst_np), axis=0)
all_dat.shape

(1710990, 27)

In [7]:
%%time

from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=4, algorithm='arpack', n_iter=7, random_state=42)
all_dat_trans = svd.fit_transform(all_dat)

CPU times: user 1min 7s, sys: 55 s, total: 2min 2s
Wall time: 19.2 s


In [44]:
all_dat_trans.shape

(1710990, 4)

In [45]:
train_trans = all_dat_trans[:hr_call_in.shape[0]]
tst_trans = all_dat_trans[hr_call_in.shape[0]:]

In [46]:
print(train_trans.shape)
print(tst_trans.shape)

(1710670, 4)
(320, 4)


In [47]:
%%time

from sklearn.neighbors import KNeighborsRegressor

neigh = KNeighborsRegressor(n_neighbors=3, weights='distance', algorithm='ball_tree', leaf_size=512, n_jobs=-1)
neigh.fit(train_trans, df_tr["LEN"].to_numpy(dtype='float'))

CPU times: user 2min, sys: 949 ms, total: 2min 1s
Wall time: 2min


KNeighborsRegressor(algorithm='ball_tree', leaf_size=512, n_jobs=-1,
                    n_neighbors=3)

In [48]:
%%time

df_sample = pd.read_csv("sampleSubmission.csv")

# print(df_sample)
for index in range(len(tst_np)):
    if index % 10 == 0:
        print(index)
    df_sample.at[index, 'TRAVEL_TIME'] = neigh.predict(tst_trans[index:index+1])

df_sample.to_csv("my_pred.csv", index=None)

0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
CPU times: user 2.59 s, sys: 865 ms, total: 3.45 s
Wall time: 2.83 s
