In [15]:
import pandas as pd
'''
读取原始文件
增加列名、去重
'''

raw_df=pd.read_csv(r"L:\pycharm projects\Bike_Scrapper\RecoveredBikeData\2022-11\16\allBikes.csv",sep='\t',header=None)
raw_df.columns=['time','bikeno','lat','lng']
raw_df.drop_duplicates(subset=["time","bikeno"],keep="last",inplace=True)
print(raw_df)

                    time      bikeno         lat        lng
0       2022-11-15 23:59  9030249852  120.693405  27.929291
1       2022-11-15 23:59  9020146466  120.694123  27.929934
2       2022-11-15 23:59  9020035769  120.694528  27.930329
3       2022-11-15 23:59  9020155216  120.705297  27.930061
4       2022-11-15 23:59  9020158766  120.706612  27.930275
...                  ...         ...         ...        ...
307827  2022-11-16 23:58  9020228011  120.707673  27.918834
307828  2022-11-16 23:58  9020141361  120.707727  27.918790
307829  2022-11-16 23:58  9020156776  120.707734  27.918791
307830  2022-11-16 23:58  9020241304  120.707742  27.918808
307831  2022-11-16 23:59  9020186956  120.707750  27.918754

[307832 rows x 4 columns]


In [16]:
import pandas as pd
import time,datetime,pickle
'''
从原始文件构建观察矩阵
并持久化变量
'''
def getUnavailableTimestamps(raw_df,bikeno):
    '''
    返回bikeno所有的不可用时间，返回格式为set
    '''
    all_timestamps=set(raw_df["time"])
    bikeno_timestamps=set(raw_df.query("bikeno==@bikeno")["time"])
    return all_timestamps-bikeno_timestamps
        
def toMatrix(raw_df):
    '''
    将原始pd转化成matrix格式
    columns为时间点
    index为bikeno
    0表示unavailable,1表示available
    '''
    time1=time.time()
    all_bikes=list(set(raw_df["bikeno"]))
    all_timestamps=sorted([datetime.datetime.strptime(i,"%Y-%m-%d %H:%M") for i in (set(raw_df["time"]))])
    bikes_timestamps_matrix=pd.DataFrame(data=1,index=all_bikes,columns=all_timestamps,dtype=int)#初始化所有值都为1:available
    for x,bike in enumerate(all_bikes):
        print("%s of %s time cost:%s s."%(x,len(all_bikes),int(time.time()-time1)),end='\r')
        unavailable_timestamps=getUnavailableTimestamps(raw_df,bike)
        for timestamp in unavailable_timestamps:
            bikes_timestamps_matrix.loc[bike,timestamp]=0            
    return bikes_timestamps_matrix

print(raw_df)
matrix=toMatrix(raw_df)#从原始文件构建为观察矩阵
pickle.dump(matrix, open("matrix.pkl", "wb"))#持久化变量

                    time      bikeno         lat        lng
0       2022-11-15 23:59  9030249852  120.693405  27.929291
1       2022-11-15 23:59  9020146466  120.694123  27.929934
2       2022-11-15 23:59  9020035769  120.694528  27.930329
3       2022-11-15 23:59  9020155216  120.705297  27.930061
4       2022-11-15 23:59  9020158766  120.706612  27.930275
...                  ...         ...         ...        ...
307827  2022-11-16 23:58  9020228011  120.707673  27.918834
307828  2022-11-16 23:58  9020141361  120.707727  27.918790
307829  2022-11-16 23:58  9020156776  120.707734  27.918791
307830  2022-11-16 23:58  9020241304  120.707742  27.918808
307831  2022-11-16 23:59  9020186956  120.707750  27.918754

[307832 rows x 4 columns]
3875 of 3876 time cost:392 s.

In [44]:
import re
'''
从观察矩阵中发现骑行
将骑行信息持久化存储
'''
def getEachCyclingPos(bikeno,raw_df,matrix):
    '''
    line为matrix中的一行，即各个时间点的status列表
    line smaple:[0,0,0,0,1,1,1,1,1,1,1,0,0,0,0,1,1,1,0,1]
    找到所有形如正则中10+1的所有结果
    返回start和end
    return smaple:[(10,16),(17,20)]左闭右开
    '''
    line=matrix.loc[bikeno,:]
    s=''.join([str(i) for i in line])#list转换成str
    res=[]
    for i in range(len(s)):#对s的每个字串进行分别匹配，返回匹配结果，解决re只能找到1001001中一个1001的不足
        temp=[i.span() for i in re.finditer('10+1',s[i:])]#利用现成的re库
        temp=[(x[0]+i,x[1]+i) for x in temp]
        res+=temp
    res=set(res)
    timestamps=matrix.columns
    rides=[]
    for i in res:
        '''
        这一段需要注意开始骑行的时间与获取位置的时间不完全一致：
        假设一段骑行为100001
        开始时间按照第一个0的时间计算，结束时间按照最后一个1计算
        开始位置按照第一个1查找，结束位置按照最后一个1查找
        '''
        start_time=timestamps[i[0]+1].strftime("%Y-%m-%d %H:%M") #开始时间为状态1的下一个时间点
        end_time=timestamps[i[1]-1].strftime("%Y-%m-%d %H:%M")#结束时间就是状态1的时间点,注意是左闭右开，所以此处要减去1
        start_time_act=timestamps[i[0]].strftime("%Y-%m-%d %H:%M")#查询开始位置应该是开始使用前的status==1的时间点
        
        start_item=raw_df.query("bikeno==@bikeno").query("time==@start_time_act")#获取开始时间的位置信息只能由状态1获得
        start_pos=[round(float(i),6) for i in (start_item['lat'],start_item['lng'])]
        
        end_item=raw_df.query("bikeno==@bikeno").query("time==@end_time")
        end_pos=[round(float(i),6) for i in (end_item['lat'],end_item['lng'])]
        
        rides.append({"start_time":start_time,"end_time":end_time,"start_pos":start_pos,"end_pos":end_pos})
    return rides

def getALLCyclingPos(raw_df,matrix):
    bikes=matrix.index
    res=[]
    start_time=time.time()
    for x,bike in enumerate(bikes):
        print("正在处理第%s辆 车辆编号为%s ,已用时%s s."%(x,bike,int(time.time()-start_time)),end='\r')
        resdict={}
        resdict["bikeno"]=bike
        resdict["cyclings"]=getEachCyclingPos(bike,raw_df,matrix)
        res.append(resdict)
    return res

matrix = pickle.load(open("matrix.pkl", "rb"))#加载观察矩阵
all_cyclings_pos=getALLCyclingPos(raw_df,matrix)#发现骑行
pickle.dump(all_cyclings_pos, open("all_cyclings_pos.pkl", "wb"))#持久化变量

正在处理第3524辆 车辆编号为9080086525 ,已用时322 s.

In [6]:
import requests,json,pickle
'''
获取骑行规划
持久化该过程得到的变量
'''
def getCyclingPath(start_pos,end_pos,act_duration,key):
    """
    利用高德地图API，查找起点到终点的路线，选择第一条为最优路线
    提取返回结果中的polylines，返回路径点列表
    """
    if start_pos[0]<start_pos[1]:
        start_pos.reverse()
    if end_pos[0]<end_pos[1]:
        end_pos.reverse()

    url="https://restapi.amap.com/v4/direction/bicycling"
    params={
        "key":key,
        "origin":','.join([str(i) for i in start_pos]),
        "destination":','.join([str(i) for i in end_pos]),
    }
    r=requests.get(url,params=params)
    rdata=json.loads(r.text)
    if rdata["errcode"]==0:
        rpaths=rdata["data"]["paths"]
        rdurations=[abs(int(i["duration"])-act_duration) for i in rpaths]#计算规划时长与实际时长的差值
        path_index=rdurations.index(min(rdurations))
        steps=rpaths[path_index]["steps"]
        points=[]
        for step in steps:
            points+=step["polyline"].split(';')
        return {"paths":points,"act_duration":act_duration,"duration":rpaths[path_index]["duration"],"distance":rpaths[path_index]["distance"]}
    else:
        print(r.text)
        raise Exception("API Exception")
        
import time,datetime
def addCyclingPaths(bikeinfo):
    '''
    将路径信息添加到原字典中
    '''
    keys=[
        "69e8d3a2002eec46b1a73a7becd320dc",
        '250698c861c56b47eba7496f5d11f6fd',
        '7c7c15f223aabf7a50695f405c1bbe8b'

    ]

    count=0
    spoint=time.time()
    use=0
    for x,item in enumerate(bikeinfo):
        cycs=item["cyclings"]
        for y,cyc in enumerate(cycs):
            count+=1
            print("%s正在获取第%s辆车辆%s 的第%s次骑行路径,已用时%ss."%(count,x+1,item["bikeno"],y+1,int(time.time()-spoint)),end='\r')
            start_time=datetime.datetime.strptime(cyc["start_time"],"%Y-%m-%d %H:%M")
            end_time=datetime.datetime.strptime(cyc["end_time"],"%Y-%m-%d %H:%M")
            act_duration=(end_time-start_time).seconds
            try:
                cycs[y]["path"]=getCyclingPath(cyc["start_pos"],cyc["end_pos"],act_duration,keys[use])
            except :
                use+=1
                if use==len(keys):
                    raise Exception
                cycs[y]["path"]=getCyclingPath(cyc["start_pos"],cyc["end_pos"],act_duration,keys[use])

            bikeinfo[x]["cyclings"]=cycs
    return bikeinfo


all_cyclings_pos = pickle.load(open("all_cyclings_pos.pkl", "rb"))#加载骑行信息
print(all_cyclings_pos)
breakpoint()
all_cyclings_paths=addCyclingPaths(all_cyclings_pos)#获取骑行规划
pickle.dump(all_cyclings_paths, open("all_cyclings_paths.pkl", "wb"))#持久化变量

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



{"errcode":10044,"errmsg":"USER_DAILY_QUERY_OVER_LIMIT"}
10607正在获取第414辆车辆9020162765 的第24次骑行路径,已用时903s.

Exception: 

In [5]:
all_cyclings_paths2 = pickle.load(open("all_cyclings_paths.pkl", "rb"))#加载骑行路径信息
all_cyclings_paths2[0]#查看一条记录

FileNotFoundError: [Errno 2] No such file or directory: 'all_cyclings_paths.pkl'