In [1]:
import numpy as np
import pandas as pd
from math import *

In [2]:
df = pd.read_csv('./data/20111123/13301104001.csv', error_bad_lines=False)
df.head()

Unnamed: 0,15395,AYJ,13301104001,20111123000134,116.2887650,39.8632050,428709232,146956322,000,176,0,4,50#
0,26859,AYJ,13301104001,20111123000228,116.288765,39.863182,428709234,146956236,0,176,0,4,50#
1,39112,AYJ,13301104001,20111123000321,116.288765,39.863186,428709225,146956251,0,176,0,4,50#
2,50949,AYJ,13301104001,20111123000415,116.28878,39.863186,428709278,146956257,0,176,0,4,50#
3,2581,AYJ,13301104001,20111123000508,116.288803,39.86319,428709372,146956244,0,290,0,4,50#
4,11729,AYJ,13301104001,20111123000606,116.288818,39.863186,428709418,146956228,0,290,0,4,50#


In [3]:
# drop irrelevant cols
to_drop = ['4', '50#']
df.drop(to_drop, inplace=True, axis=1)

In [4]:
df.head()

Unnamed: 0,15395,AYJ,13301104001,20111123000134,116.2887650,39.8632050,428709232,146956322,000,176,0
0,26859,AYJ,13301104001,20111123000228,116.288765,39.863182,428709234,146956236,0,176,0
1,39112,AYJ,13301104001,20111123000321,116.288765,39.863186,428709225,146956251,0,176,0
2,50949,AYJ,13301104001,20111123000415,116.28878,39.863186,428709278,146956257,0,176,0
3,2581,AYJ,13301104001,20111123000508,116.288803,39.86319,428709372,146956244,0,290,0
4,11729,AYJ,13301104001,20111123000606,116.288818,39.863186,428709418,146956228,0,290,0


In [5]:
# AYJ seems redundent
print (sum (df[:]['AYJ'] != 'AYJ'))
# remove it
df.drop(['AYJ'], inplace=True, axis=1)

0


In [6]:
df.head()

Unnamed: 0,15395,13301104001,20111123000134,116.2887650,39.8632050,428709232,146956322,000,176,0
0,26859,13301104001,20111123000228,116.288765,39.863182,428709234,146956236,0,176,0
1,39112,13301104001,20111123000321,116.288765,39.863186,428709225,146956251,0,176,0
2,50949,13301104001,20111123000415,116.28878,39.863186,428709278,146956257,0,176,0
3,2581,13301104001,20111123000508,116.288803,39.86319,428709372,146956244,0,290,0
4,11729,13301104001,20111123000606,116.288818,39.863186,428709418,146956228,0,290,0


In [7]:
# renaming cols
rename_mapping_cands = ['point_id', 'taxi_id', 'time', 'x', 'y', 'OSM_st', 'OSM_ed', 'speed', 'theta', 'status']
rename_mapping = {}
for i, name in enumerate(df.columns.values):
    rename_mapping[name] = rename_mapping_cands[i]


In [8]:
rename_mapping

{'15395': 'point_id',
 '13301104001': 'taxi_id',
 '20111123000134': 'time',
 '116.2887650': 'x',
 '39.8632050': 'y',
 '428709232': 'OSM_st',
 '146956322': 'OSM_ed',
 '000': 'speed',
 '176': 'theta',
 '0': 'status'}

In [9]:
df.rename(rename_mapping, axis=1, inplace=True)

In [10]:
df.head()

Unnamed: 0,point_id,taxi_id,time,x,y,OSM_st,OSM_ed,speed,theta,status
0,26859,13301104001,20111123000228,116.288765,39.863182,428709234,146956236,0,176,0
1,39112,13301104001,20111123000321,116.288765,39.863186,428709225,146956251,0,176,0
2,50949,13301104001,20111123000415,116.28878,39.863186,428709278,146956257,0,176,0
3,2581,13301104001,20111123000508,116.288803,39.86319,428709372,146956244,0,290,0
4,11729,13301104001,20111123000606,116.288818,39.863186,428709418,146956228,0,290,0


In [11]:
# modifying data types
# convert time from int to string
df.time = df.time.astype('str')

In [12]:
df.time

0       20111123000228
1       20111123000321
2       20111123000415
3       20111123000508
4       20111123000606
5       20111123000700
6       20111123000755
7       20111123000850
8       20111123000944
9       20111123001040
10      20111123001134
11      20111123001228
12      20111123001415
13      20111123001509
14      20111123001604
15      20111123001658
16      20111123001752
17      20111123001847
18      20111123001942
19      20111123002036
20      20111123002130
21      20111123002224
22      20111123002319
23      20111123002413
24      20111123002507
25      20111123002600
26      20111123002610
27      20111123002655
28      20111123002748
29      20111123002902
             ...      
1398    20111123233108
1399    20111123233203
1400    20111123233256
1401    20111123233350
1402    20111123233443
1403    20111123233632
1404    20111123233727
1405    20111123233821
1406    20111123233916
1407    20111123234010
1408    20111123234105
1409    20111123234159
1410    201

In [13]:
# remove unreal points
limits = {'long': [39.768522, 40.028584], 'lat': [116.218923, 116.550119]}
# print (sum(df.where(df.loc[:]['x'] >= limits['lat'][0]).loc[:, 'x'] <= limits['lat'][1]))
# print (sum (df.where(df.loc[:, 'y'] >= limits['long'][0]).loc[:, 'y'] <= limits['long'][1]))
df.x = df[(df.x >= limits['lat'][0]) & (df.x <= limits['lat'][1])]
df.y = df[(df.y >= limits['long'][0]) & (df.y <= limits['long'][1])]


In [14]:
# remove duplicate timing points
df.drop_duplicates(['taxi_id', 'time'], inplace=True)

In [15]:
df

Unnamed: 0,point_id,taxi_id,time,x,y,OSM_st,OSM_ed,speed,theta,status
0,26859,13301104001,20111123000228,26859,26859,428709234,146956236,0,176,0
1,39112,13301104001,20111123000321,39112,39112,428709225,146956251,0,176,0
2,50949,13301104001,20111123000415,50949,50949,428709278,146956257,0,176,0
3,2581,13301104001,20111123000508,2581,2581,428709372,146956244,0,290,0
4,11729,13301104001,20111123000606,11729,11729,428709418,146956228,0,290,0
5,23084,13301104001,20111123000700,23084,23084,428709430,146956214,0,290,0
6,35297,13301104001,20111123000755,35297,35297,428709487,146956210,0,290,0
7,47432,13301104001,20111123000850,47432,47432,428709541,146956194,0,290,0
8,59881,13301104001,20111123000944,59881,59881,428709598,146956204,0,290,0
9,7022,13301104001,20111123001040,7022,7022,428709599,146956174,0,290,0


In [16]:
# Remove high speed points
df.speed

0       0
1       0
2       0
3       0
4       0
5       0
6       0
7       0
8       0
9       0
10      0
11      0
12      0
13      0
14      0
15      0
16      0
17      0
18      0
19      0
20      0
21      0
22      0
23      0
24      0
25      0
26      0
27      0
28      0
29      0
       ..
1398    0
1399    0
1400    0
1401    0
1402    0
1403    0
1404    0
1405    0
1406    0
1407    0
1408    0
1409    0
1410    0
1411    0
1412    0
1413    0
1414    0
1415    0
1416    0
1417    0
1418    0
1419    0
1420    0
1421    0
1422    0
1423    0
1424    0
1425    0
1426    0
1427    0
Name: speed, Length: 1428, dtype: int64

In [17]:
df.drop(df[df.speed > 90].index.tolist(), inplace=True)

In [18]:
df

Unnamed: 0,point_id,taxi_id,time,x,y,OSM_st,OSM_ed,speed,theta,status
0,26859,13301104001,20111123000228,26859,26859,428709234,146956236,0,176,0
1,39112,13301104001,20111123000321,39112,39112,428709225,146956251,0,176,0
2,50949,13301104001,20111123000415,50949,50949,428709278,146956257,0,176,0
3,2581,13301104001,20111123000508,2581,2581,428709372,146956244,0,290,0
4,11729,13301104001,20111123000606,11729,11729,428709418,146956228,0,290,0
5,23084,13301104001,20111123000700,23084,23084,428709430,146956214,0,290,0
6,35297,13301104001,20111123000755,35297,35297,428709487,146956210,0,290,0
7,47432,13301104001,20111123000850,47432,47432,428709541,146956194,0,290,0
8,59881,13301104001,20111123000944,59881,59881,428709598,146956204,0,290,0
9,7022,13301104001,20111123001040,7022,7022,428709599,146956174,0,290,0


In [19]:
# calculating distance between points u and v
# u[long, lat]
def dis(u, v):
    return acos((sin(u[1]) * sin(v[1])) + (cos(u[1]) * cos(v[1]) * cos(u[0] - v[0]) )) * 6371.004
    

In [20]:
to_drop = []
for i in range(df.shape[0] - 1):
    print (df.x[i])
    # u = [df.x[i], df.y[i]]
    # v = [df.x[i + 1], df.y[i + 1]]
    # if dis(u, v) > 2:
    #     to_drop.append(i)

26859
39112
50949
2581
11729
23084
35297
47432
59881
7022
17236
28595
53015
2729
11582
22264
34536
46863
58986
6682
17370
28504
40813
52999
2191
10698
38468
21616
33800
50630
591
9433
20659
32798
45381
57598
4585
14783
26160
38847
51247
66
8568
19845
31329
43657
56080
14076
26724
39472
52416
65400
10616
23290
36011
48729
61340
17096
29989
42891
55523
68074
25795
10287
22750
38283
51259
63804
6769
19368
32432
44853
57205
1577
39574
13235
43041
25715
38275
55681
68460
9068
20398
32179
44462
57127
4418
14827
26050
38109
50629
62878
20160
32665
44850
57322
13969
25994
38220
50351
62651
19406
31583
43883
56370
11653
23351
35429
47770
60206
7257
18960
31147
43629
56045
3316
12985
24700
36720
48930
61385
6785
17947
29978
42154
54337
2196
12031
24064
36712
48852
61133
6289
17694
29868
42047
54384
2698
12677
24292
36184
48259
60318
5804
17352
29330
41359
53407
1263
12945
24962
36971
48994
61042
4852
15822
27657
39721
51659
120
9537
21263
33260
45481
27658
39533
51432
63295
9516
21790
33614
4550

KeyError: 621