In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import zipfile
import json
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from datetime import datetime
import time
from math import *

In [None]:
# Read only the columns used below; the context managers close both the
# archive and the extracted member once the CSV has been parsed.
with zipfile.ZipFile('../input/train.csv.zip') as zf, zf.open('train.csv') as train_csv:
    train = pd.read_csv(train_csv, usecols=['DAY_TYPE','TIMESTAMP','POLYLINE'])

# POLYLINE is stored as JSON text; decode each cell into a Python list of points.
train['POLYLINE'] = train['POLYLINE'].apply(json.loads)
train.head()

In [None]:
def _trip_geometry(points):
    """Summarise one trajectory by its first and last point.

    ``points`` is a sequence of ``[longitude, latitude]`` pairs. Returns the
    tuple ``(start_long, start_lat, end_long, end_lat, distance, bearing)``;
    an empty trajectory yields 0 for every field.
    """
    if len(points) == 0:
        return 0, 0, 0, 0, 0, 0
    start_long, start_lat = points[0][0], points[0][1]
    end_long, end_lat = points[-1][0], points[-1][1]
    return (
        start_long,
        start_lat,
        end_long,
        end_lat,
        get_dist(start_long, start_lat, end_long, end_lat),
        calcBearing(start_long, start_lat, end_long, end_lat),
    )


def data_processing(train):
    """Build the per-trip feature table from a raw trips DataFrame.

    Parameters
    ----------
    train : pandas.DataFrame
        Must provide the columns ``POLYLINE`` (already decoded into a list of
        ``[longitude, latitude]`` pairs), ``TIMESTAMP`` (Unix seconds) and
        ``DAY_TYPE`` (a single character).

    Returns
    -------
    pandas.DataFrame
        Indexed like ``train`` with the columns ``length`` (number of points),
        ``TIMESTAMP`` (hour of day, 0-23, in UTC), ``start_long``,
        ``start_lat``, ``end_long``, ``end_lat``, ``distance``, ``bearing``
        and ``d_type`` (``DAY_TYPE`` as an offset from ``'A'``).
    """
    geometry_columns = ['start_long', 'start_lat', 'end_long', 'end_lat', 'distance', 'bearing']
    # One pass over the trajectories instead of one pass per derived column.
    geometry = pd.DataFrame(
        [_trip_geometry(points) for points in train['POLYLINE']],
        columns=geometry_columns,
        index=train.index,
    )

    ts = pd.DataFrame(index=train.index)
    ts['length'] = train['POLYLINE'].apply(len)
    # gmtime (UTC) rather than localtime: the hour feature must not change
    # with the timezone of the machine that runs the notebook.
    ts['TIMESTAMP'] = train['TIMESTAMP'].apply(lambda x: time.gmtime(int(x)).tm_hour)
    for column in geometry_columns:
        ts[column] = geometry[column]
    ts['d_type'] = train['DAY_TYPE'].apply(lambda x: ord(x) - ord('A'))
    return ts

In [None]:
### Get Haversine distance
def get_dist(lon1, lat1, lon2, lat2):
    """Return the haversine (great-circle) distance between two points.

    All four arguments are angles in degrees. The result is scaled by 6371,
    the Earth's mean radius in kilometres.
    """
    earth_radius = 6371
    # degrees * pi / 360 is half of the angular difference in radians,
    # which is exactly the argument the haversine formula needs.
    half_dlon = np.abs(lon1 - lon2) * np.pi / 360.0
    half_dlat = np.abs(lat1 - lat2) * np.pi / 360.0
    cos_product = np.cos(lat1 * np.pi / 180.0) * np.cos(lat2 * np.pi / 180.0)
    hav = np.sin(half_dlat) ** 2 + cos_product * np.sin(half_dlon) ** 2
    return 2 * earth_radius * np.arctan2(np.sqrt(hav), np.sqrt(1 - hav))


In [None]:
# Bearing
def calcBearing(lon1, lat1, lon2, lat2):
    """Return the initial bearing from point 1 to point 2, in degrees.

    All four arguments are scalar angles in degrees. The result lies in
    [-180, 180]: 0 is due north, 90 due east, negative values point west.
    """
    # The math trig functions work in radians, so convert the degree inputs
    # first; feeding degrees straight in produces meaningless bearings.
    phi1 = radians(lat1)
    phi2 = radians(lat2)
    dLon = radians(lon2 - lon1)
    y = sin(dLon) * cos(phi2)
    x = cos(phi1) * sin(phi2) \
        - sin(phi1) * cos(phi2) * cos(dLon)
    return degrees(atan2(y, x))

In [None]:
def get_features(ts):
    """Assemble the model's design matrix from the processed trip table.

    Returns a float64 array of shape (len(ts), 8) whose columns are, in order:
    d_type, start_long, start_lat, distance, TIMESTAMP, bearing, end_long,
    end_lat.
    """
    # The column order is part of the contract: a model fitted on this
    # matrix must be given prediction inputs laid out the same way.
    column_order = (
        'd_type',
        'start_long',
        'start_lat',
        'distance',
        'TIMESTAMP',
        'bearing',
        'end_long',
        'end_lat',
    )
    as_columns = [
        np.array(ts[name], dtype='float64').reshape(-1, 1)
        for name in column_order
    ]
    return np.concatenate(as_columns, axis=1)

In [None]:
ts = data_processing(train)
features = get_features(ts)

# Regression target: 15 * (number of points - 1), overwriting the 'length'
# column in place as before (presumably 15 s per GPS sample -- confirm
# against the dataset description). Clipped at 0 so that an empty
# trajectory yields 0 rather than the meaningless value -15.
ts['length'] = 15 * (ts['length'] - 1).clip(lower=0)
Y_train = np.array(ts['length'], dtype='float64')

lr = LinearRegression()
lr.fit(features, Y_train)
# Fixed seed so that re-running the notebook reproduces the same forest.
rm = RandomForestRegressor(random_state=0)
rm.fit(features, Y_train)

In [None]:
# Load the test trips; the context managers close both the archive and the
# extracted member once the CSV has been parsed.
with zipfile.ZipFile('../input/test.csv.zip') as zf, zf.open('test.csv') as test_csv:
    test = pd.read_csv(test_csv, usecols=['DAY_TYPE','TIMESTAMP','POLYLINE','TRIP_ID'])
test['POLYLINE'] = test['POLYLINE'].apply(json.loads)

# Same feature pipeline as for training, so the columns line up with the
# matrices the models were fitted on.
tst = data_processing(test)
fea = get_features(tst)

ids = test['TRIP_ID']
y_predict = lr.predict(fea)  # linear-regression predictions
Y_predict = rm.predict(fea)  # random-forest predictions

# One submission file per model, each with the columns TRIP_ID, TRAVEL_TIME.
tf = pd.DataFrame({'TRIP_ID': ids, 'TRAVEL_TIME': y_predict})
rf = pd.DataFrame({'TRIP_ID': ids, 'TRAVEL_TIME': Y_predict})
tf.to_csv('lr.csv', index=False)
rf.to_csv('rf.csv', index=False)
