In [1]:
import numpy as np
import pandas as pd
from math import *
from copy import deepcopy

In [2]:
# Load the raw CSV (malformed lines are skipped) and preview the first rows.
# NOTE(review): the file appears to have no header row -- read_csv takes the
# first record as column names (the labels shown below are data values such as
# '15395' and '116.2887650'), so that record is lost from the data. Passing
# header=None would keep it, but the following cells refer to the value-like
# labels ('4', '50#', 'AYJ') and would have to be changed together.
# NOTE(review): error_bad_lines was deprecated in pandas 1.3 and removed in
# pandas 2.0; on those versions the equivalent is on_bad_lines='skip'.
df = pd.read_csv('./data/20111123/13301104001.csv', error_bad_lines=False)
df.head()

Unnamed: 0,15395,AYJ,13301104001,20111123000134,116.2887650,39.8632050,428709232,146956322,000,176,0,4,50#
0,26859,AYJ,13301104001,20111123000228,116.288765,39.863182,428709234,146956236,0,176,0,4,50#
1,39112,AYJ,13301104001,20111123000321,116.288765,39.863186,428709225,146956251,0,176,0,4,50#
2,50949,AYJ,13301104001,20111123000415,116.28878,39.863186,428709278,146956257,0,176,0,4,50#
3,2581,AYJ,13301104001,20111123000508,116.288803,39.86319,428709372,146956244,0,290,0,4,50#
4,11729,AYJ,13301104001,20111123000606,116.288818,39.863186,428709418,146956228,0,290,0,4,50#


In [3]:
# Discard the two trailing columns, which are irrelevant for this analysis.
to_drop = ['4', '50#']
df = df.drop(columns=to_drop)

In [4]:
df.shape

(1428, 11)

In [5]:
# The 'AYJ' column seems redundant: count the rows whose value is not 'AYJ'.
print((df['AYJ'] != 'AYJ').sum())
# The column is then removed (unconditionally).
df = df.drop(columns=['AYJ'])

0


In [6]:
# Build the old-label -> new-name mapping by pairing each current column,
# in order, with the candidate name at the same position.
rename_mapping_cands = ['point_id', 'taxi_id', 'time', 'x', 'y', 'OSM_st', 'OSM_ed', 'speed', 'theta', 'status']
rename_mapping = {
    old_label: rename_mapping_cands[position]
    for position, old_label in enumerate(df.columns.values)
}


In [7]:
rename_mapping

{'15395': 'point_id',
 '13301104001': 'taxi_id',
 '20111123000134': 'time',
 '116.2887650': 'x',
 '39.8632050': 'y',
 '428709232': 'OSM_st',
 '146956322': 'OSM_ed',
 '000': 'speed',
 '176': 'theta',
 '0': 'status'}

In [8]:
# Apply the column renaming built above.
df = df.rename(columns=rename_mapping)

In [9]:
df.head()

Unnamed: 0,point_id,taxi_id,time,x,y,OSM_st,OSM_ed,speed,theta,status
0,26859,13301104001,20111123000228,116.288765,39.863182,428709234,146956236,0,176,0
1,39112,13301104001,20111123000321,116.288765,39.863186,428709225,146956251,0,176,0
2,50949,13301104001,20111123000415,116.28878,39.863186,428709278,146956257,0,176,0
3,2581,13301104001,20111123000508,116.288803,39.86319,428709372,146956244,0,290,0
4,11729,13301104001,20111123000606,116.288818,39.863186,428709418,146956228,0,290,0


In [10]:
# Data types: the timestamp column is read as an integer; store it as a
# string so that its digits can be sliced later.
df['time'] = df['time'].astype(str)

In [11]:
df.time

0       20111123000228
1       20111123000321
2       20111123000415
3       20111123000508
4       20111123000606
5       20111123000700
6       20111123000755
7       20111123000850
8       20111123000944
9       20111123001040
10      20111123001134
11      20111123001228
12      20111123001415
13      20111123001509
14      20111123001604
15      20111123001658
16      20111123001752
17      20111123001847
18      20111123001942
19      20111123002036
20      20111123002130
21      20111123002224
22      20111123002319
23      20111123002413
24      20111123002507
25      20111123002600
26      20111123002610
27      20111123002655
28      20111123002748
29      20111123002902
             ...      
1398    20111123233108
1399    20111123233203
1400    20111123233256
1401    20111123233350
1402    20111123233443
1403    20111123233632
1404    20111123233727
1405    20111123233821
1406    20111123233916
1407    20111123234010
1408    20111123234105
1409    20111123234159
1410    201

In [12]:
# remove unreal points: keep only rows whose coordinates lie inside the box
# NOTE: the keys are swapped relative to the values they hold -- 'lat' bounds
# the x column (values around 116) and 'long' bounds the y column (values
# around 39). The keys are used consistently, so the filter itself is correct.
limits = {'long': [39.768522, 40.028584], 'lat': [116.218923, 116.550119]}
x_lo, x_hi = limits['lat']
y_lo, y_hi = limits['long']
# Both bounds are inclusive.
df = df[df.x.between(x_lo, x_hi) & df.y.between(y_lo, y_hi)]


In [13]:
# Keep only the first row for every (taxi_id, time) pair.
df = df.drop_duplicates(subset=['taxi_id', 'time'])

In [14]:
df

Unnamed: 0,point_id,taxi_id,time,x,y,OSM_st,OSM_ed,speed,theta,status
0,26859,13301104001,20111123000228,116.288765,39.863182,428709234,146956236,0,176,0
1,39112,13301104001,20111123000321,116.288765,39.863186,428709225,146956251,0,176,0
2,50949,13301104001,20111123000415,116.288780,39.863186,428709278,146956257,0,176,0
3,2581,13301104001,20111123000508,116.288803,39.863190,428709372,146956244,0,290,0
4,11729,13301104001,20111123000606,116.288818,39.863186,428709418,146956228,0,290,0
5,23084,13301104001,20111123000700,116.288818,39.863182,428709430,146956214,0,290,0
6,35297,13301104001,20111123000755,116.288834,39.863178,428709487,146956210,0,290,0
7,47432,13301104001,20111123000850,116.288849,39.863171,428709541,146956194,0,290,0
8,59881,13301104001,20111123000944,116.288864,39.863171,428709598,146956204,0,290,0
9,7022,13301104001,20111123001040,116.288864,39.863171,428709599,146956174,0,290,0


In [15]:
# Remove high speed points
df.speed

0       0
1       0
2       0
3       0
4       0
5       0
6       0
7       0
8       0
9       0
10      0
11      0
12      0
13      0
14      0
15      0
16      0
17      0
18      0
19      0
20      0
21      0
22      0
23      0
24      0
25      0
26      0
27      0
28      0
29      0
       ..
1398    0
1399    0
1400    0
1401    0
1402    0
1403    0
1404    0
1405    0
1406    0
1407    0
1408    0
1409    0
1410    0
1411    0
1412    0
1413    0
1414    0
1415    0
1416    0
1417    0
1418    0
1419    0
1420    0
1421    0
1422    0
1423    0
1424    0
1425    0
1426    0
1427    0
Name: speed, Length: 1428, dtype: int64

In [16]:
# Discard every row whose reported speed is above 90.
too_fast = df.speed > 90
df.drop(df.index[too_fast.values], inplace=True)

In [17]:
df

Unnamed: 0,point_id,taxi_id,time,x,y,OSM_st,OSM_ed,speed,theta,status
0,26859,13301104001,20111123000228,116.288765,39.863182,428709234,146956236,0,176,0
1,39112,13301104001,20111123000321,116.288765,39.863186,428709225,146956251,0,176,0
2,50949,13301104001,20111123000415,116.288780,39.863186,428709278,146956257,0,176,0
3,2581,13301104001,20111123000508,116.288803,39.863190,428709372,146956244,0,290,0
4,11729,13301104001,20111123000606,116.288818,39.863186,428709418,146956228,0,290,0
5,23084,13301104001,20111123000700,116.288818,39.863182,428709430,146956214,0,290,0
6,35297,13301104001,20111123000755,116.288834,39.863178,428709487,146956210,0,290,0
7,47432,13301104001,20111123000850,116.288849,39.863171,428709541,146956194,0,290,0
8,59881,13301104001,20111123000944,116.288864,39.863171,428709598,146956204,0,290,0
9,7022,13301104001,20111123001040,116.288864,39.863171,428709599,146956174,0,290,0


In [18]:
# great-circle distance between two points u and v
def dis(u, v):
    """Return the great-circle distance between u and v.

    Both arguments are indexable as [long, lat], with the angles given in
    radians. The result is the central angle multiplied by R = 6371.004
    (presumably the Earth radius in kilometres, which makes the result
    kilometres -- confirm).
    """
    R = 6371.004
    lon_u, lat_u = u[0], u[1]
    lon_v, lat_v = v[0], v[1]
    cos_angle = sin(lat_u) * sin(lat_v) + cos(lat_u) * cos(lat_v) * cos(lon_u - lon_v)
    # Rounding can push the value marginally outside [-1, 1], where acos is
    # undefined, so clamp it back into range.
    if cos_angle < -1:
        cos_angle = -1
    if cos_angle > 1:
        cos_angle = 1
    return acos(cos_angle) * R
    

In [19]:
df.shape

(1427, 10)

In [20]:
# Remove position jumps: a point is discarded when its distance (as computed
# by dis) to the next recorded point exceeds MAX_JUMP.
# MAX_JUMP is in the unit returned by dis (presumably kilometres -- confirm).
MAX_JUMP = 2
# dis expects [long, lat] in radians; x holds the longitude and y the latitude.
# np.radians uses full-precision pi instead of the truncated 3.1415926 literal.
coords = np.radians(df[['x', 'y']].values)
# Collect index *labels*, not positions: earlier filters leave gaps in the
# index, so position i is not necessarily the label of the i-th row, and
# df.drop works on labels.
to_drop = [
    df.index[i]
    for i in range(len(coords) - 1)
    if dis(coords[i], coords[i + 1]) > MAX_JUMP
]
# Apply the result here: to_drop is reassigned by a later cell, so without
# this line the jump filter would never take effect.
df.drop(to_drop, inplace=True)

In [21]:
df.shape

(1427, 10)

In [22]:
# remove laggy points
class Time(object):
    """Timestamp split into integer fields, with minimal duration arithmetic.

    An instance is used in two ways: as a point in time (all six fields set,
    see gen_Time) and as the result of a subtraction, where only ``minute``
    and ``second`` carry information and the other fields are 0.
    """

    def __init__(self, year=None, month=None, day=None, hour=None, minute=None, second=None):
        self.year = year
        self.month = month
        self.day = day
        self.hour = hour
        self.minute = minute
        self.second = second

    def __abs__(self):
        """Return a copy whose numeric fields are replaced by their absolute value.

        Fields that are None stay None. The instance itself is not modified.
        """
        t = deepcopy(self)
        for k, v in list(vars(t).items()):
            if v is not None:
                setattr(t, k, abs(v))
        # Without this return, abs(t) evaluated to None.
        return t

    def __sub__(self, other):
        """Return the elapsed time ``self - other``.

        The result is a Time whose year, month, day and hour are 0, whose
        ``minute`` is the total number of whole minutes elapsed (hours and
        days folded in) and whose ``second`` is the remainder. Both carry the
        sign of the difference, so they are negative when ``other`` is later.

        An hour of None is treated as 0. When the two dates differ they must
        be valid calendar dates, otherwise datetime.date raises ValueError.
        """
        self_date = (self.year, self.month, self.day)
        other_date = (other.year, other.month, other.day)
        if self_date == other_date:
            # Same day (including two placeholder dates): no date arithmetic needed.
            day_gap = 0
        else:
            from datetime import date
            day_gap = date(*self_date).toordinal() - date(*other_date).toordinal()
        total = (
            day_gap * 86400
            + ((self.hour or 0) - (other.hour or 0)) * 3600
            + (self.minute - other.minute) * 60
            + (self.second - other.second)
        )
        # Split the magnitude and re-apply the sign, so that minute and second
        # never end up with opposite signs.
        sign = -1 if total < 0 else 1
        minutes, seconds = divmod(abs(total), 60)
        return Time(0, 0, 0, 0, sign * minutes, sign * seconds)

In [23]:
def gen_Time(time):
    """Parse a 'YYYYMMDDHHMMSS' string into a Time.

    The string is cut at fixed offsets into year (4 digits) followed by
    month, day, hour, minute and second (2 digits each); every piece is
    converted with int().
    """
    cuts = (0, 4, 6, 8, 10, 12, 14)
    fields = [int(time[start:stop]) for start, stop in zip(cuts, cuts[1:])]
    return Time(*fields)

In [24]:
# Sanity check of Time subtraction on two hand-written timestamps:
# t1 (00:03:30) is 1 minute 2 seconds after t0 (00:02:28), and t0 - t1 is
# expected to print `0 -2` for the hour and second fields.
t0 = gen_Time('20111123000228')
t1 = gen_Time('20111123000330')
t2 = t0 - t1
print (t2.hour, t2.second)

0 -2


In [26]:
# Remove laggy points: a point is flagged when the next recorded point is
# 10 minutes or more later.
MAX_GAP = pd.Timedelta(minutes=10)
# Parse the 'YYYYMMDDHHMMSS' strings into real timestamps so that gaps across
# minute and hour boundaries are measured correctly.
timestamps = pd.to_datetime(df.time.astype(str), format='%Y%m%d%H%M%S')
# Gap between each row and the row that follows it. The last row has no
# successor, so its gap is NaT and it is never flagged.
gap_to_next = timestamps.shift(-1) - timestamps
# Collect index labels (not positions), because df.drop works on labels and
# the index has gaps after the earlier filters.
to_drop = gap_to_next[gap_to_next >= MAX_GAP].index.tolist()

In [28]:
# Remove the rows collected in to_drop (matched by index label).
df = df.drop(index=to_drop)

In [29]:
# remove stationary points
# TODO: not implemented yet -- how to detect a stationary point is still to be decided

In [37]:
# remove waiting points (rows whose status equals 3)
not_waiting = df.status.ne(3)
df = df[not_waiting]

In [38]:
df.shape

(1427, 10)