In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import heapq


In [25]:
def read_clean_data(taxi_ids=None, data_dir='taxi_log_2008_by_id'):
    """Load per-taxi GPS logs and keep only in-area fixes with a sane time gap.

    Each taxi is read from ``<data_dir>/<taxi id>.txt``, a header-less CSV with
    the columns ``taxi id, date time, longitude, latitude``.

    Parameters
    ----------
    taxi_ids : iterable of int, optional
        Taxis to load. Defaults to the six ids originally hard-coded here.
    data_dir : str, optional
        Directory that contains the ``<taxi id>.txt`` files.

    Returns
    -------
    pandas.DataFrame
        Columns ``taxi id, date time, longitude, latitude, diff`` where
        ``diff`` is the time elapsed since the previous retained fix of the
        *same* taxi. Only rows with ``1 s <= diff <= 5 min`` are kept, and the
        index is reset to 0..n-1.

    Raises
    ------
    ValueError
        If ``taxi_ids`` is empty.
    """
    colum_names = ['taxi id', 'date time', 'longitude', 'latitude']

    if taxi_ids is None:
        taxi_ids = [6275, 3015, 3557, 3579, 8179, 2560]
    taxi_ids = list(taxi_ids)
    if not taxi_ids:
        raise ValueError('taxi_ids must contain at least one taxi id')

    # Read every file first and concatenate once: concatenating inside the
    # loop is quadratic, and starting from an empty frame is deprecated.
    frames = [
        pd.read_csv('{}/{}.txt'.format(data_dir, taxi), names=colum_names)
        for taxi in taxi_ids
    ]
    df = pd.concat(frames, axis=0).drop_duplicates()
    df['date time'] = pd.to_datetime(df['date time'], format='%Y-%m-%d %H:%M:%S')

    # Hard-coded bounding box (presumably the study area -- confirm);
    # both bounds are inclusive.
    in_area = (
        df['longitude'].between(116.215140, 116.586700)
        & df['latitude'].between(39.757610, 40.079850)
    )
    df = df[in_area].reset_index(drop=True)

    # Compute the gap per taxi so the first fix of one taxi is never measured
    # against the last fix of the previous one. The first fix of each taxi
    # gets NaT and is removed by the filter below.
    df['diff'] = df.groupby('taxi id')['date time'].diff()
    plausible_gap = df['diff'].between(pd.Timedelta(seconds=1), pd.Timedelta(minutes=5))
    return df[plausible_gap].reset_index(drop=True)

In [26]:
def connect_two_point(df):
    """Pair every row with the row that follows it and average duplicate pairs.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``longitude``, ``latitude`` and ``diff`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``(longitude_a, latitude_a, longitude_b,
        latitude_b)`` combination, with ``diff`` holding the mean of the
        ``diff`` values taken from the *second* row of each pair.
    """
    endpoint_cols = ['longitude_a', 'latitude_a', 'longitude_b', 'latitude_b']

    pairs = (
        df[['longitude', 'latitude']]
        .rename(columns={'longitude': 'longitude_a', 'latitude': 'latitude_a'})
        .assign(
            # shift(-1) pulls the values of the following row onto this row.
            longitude_b=df['longitude'].shift(-1),
            latitude_b=df['latitude'].shift(-1),
            diff=df['diff'].shift(-1),
        )
        # The last row has no successor, so its shifted values are missing.
        .dropna()
        .reset_index(drop=True)
    )

    averaged = pairs.groupby(endpoint_cols)['diff'].mean()
    return averaged.reset_index()

In [27]:
def create_adjency_list(df_adjency):
    """Turn an edge table into a ``{node: [(neighbour, seconds), ...]}`` dict.

    Nodes are identified by the string ``"<longitude> <latitude>"``.

    Parameters
    ----------
    df_adjency : pandas.DataFrame
        Needs ``longitude_a``, ``latitude_a``, ``longitude_b``, ``latitude_b``
        and a timedelta ``diff`` column. NOTE: the frame is modified in place;
        ``point_a`` and ``point_b`` string columns are added to it.

    Returns
    -------
    dict
        Every node that appears in the table is a key. Nodes that only ever
        appear as a destination map to an empty list, so a search can expand
        them without a ``KeyError``.
    """
    # These assignments deliberately write to the caller's frame: other code
    # in this notebook expects `point_a` / `point_b` to exist afterwards.
    df_adjency['point_a'] = df_adjency['longitude_a'].astype(str) + ' ' + df_adjency['latitude_a'].astype(str)
    df_adjency['point_b'] = df_adjency['longitude_b'].astype(str) + ' ' + df_adjency['latitude_b'].astype(str)

    adjacency_list = {}
    # zip over the columns instead of iterrows(): same values, no per-row
    # Series construction.
    for origin, target, duration in zip(df_adjency['point_a'], df_adjency['point_b'], df_adjency['diff']):
        adjacency_list.setdefault(origin, []).append((target, duration.total_seconds()))
        # Register the destination too so dead-end nodes are still valid keys.
        adjacency_list.setdefault(target, [])

    return adjacency_list

In [28]:
def find_path(adjacency_list, start, end):
    """Return the cheapest path from ``start`` to ``end`` as a list of nodes.

    The search is a uniform-cost (Dijkstra) search: the priority of a node is
    its accumulated cost and no heuristic is used.

    Parameters
    ----------
    adjacency_list : dict
        ``{node: [(neighbour, cost), ...]}``. Nodes missing from the dict are
        treated as having no outgoing edges.
    start, end : hashable
        Source and target nodes.

    Returns
    -------
    list
        Nodes from ``start`` to ``end`` inclusive; ``[start]`` when
        ``start == end``; an empty list when ``end`` cannot be reached.
    """
    def uniform_cost_search(graph, source, goal):
        frontier = [(0, source)]
        came_from = {source: None}
        cost_so_far = {source: 0}

        while frontier:
            cost, current = heapq.heappop(frontier)

            if current == goal:
                break

            # A cheaper route to `current` was found after this entry was
            # pushed; the entry is stale and can be skipped.
            if cost > cost_so_far[current]:
                continue

            # .get(): a node that is only ever a destination has no entry.
            for neighbour, weight in graph.get(current, ()):
                new_cost = cost_so_far[current] + weight
                if neighbour not in cost_so_far or new_cost < cost_so_far[neighbour]:
                    cost_so_far[neighbour] = new_cost
                    heapq.heappush(frontier, (new_cost, neighbour))
                    came_from[neighbour] = current

        return came_from, cost_so_far

    # Use the caller's endpoints (previously a fixed point was used for both
    # the search and the backtracking, so the arguments had no effect).
    came_from, _ = uniform_cost_search(adjacency_list, start, end)

    if end not in came_from:
        return []

    # Walk the predecessor links back from the goal, then flip the order.
    path = []
    current = end
    while current is not None:
        path.append(current)
        current = came_from[current]
    path.reverse()
    return path

In [29]:
def to_df(path):
    """Convert a list of ``"<longitude> <latitude>"`` strings to a DataFrame.

    Parameters
    ----------
    path : list of str
        Each entry holds two space-separated numbers.

    Returns
    -------
    pandas.DataFrame
        Float columns ``longitude`` and ``latitude``, one row per entry, in
        the same order as ``path``.
    """
    tokens = [point.split(' ') for point in path]
    coords = pd.DataFrame({
        'longitude': [parts[0] for parts in tokens],
        'latitude': [parts[1] for parts in tokens],
    })
    # The pieces are still strings at this point; convert both columns.
    return coords.astype(float)

In [30]:
def usable_data(df_path, df_connect):
    """Look up the travel time of every consecutive step of a path.

    Parameters
    ----------
    df_path : pandas.DataFrame
        Ordered path with ``longitude`` and ``latitude`` columns. It is not
        modified.
    df_connect : pandas.DataFrame
        Edge table with ``longitude_a``, ``latitude_a``, ``longitude_b``,
        ``latitude_b`` and ``diff``. ``point_a`` / ``point_b`` columns are
        tolerated and removed from the result when present.

    Returns
    -------
    pandas.DataFrame
        One row per step (previous point -> current point) with the columns
        ``longitude_a, longitude_b, latitude_a, latitude_b`` followed by the
        columns contributed by ``df_connect``. Steps with no matching edge
        keep a missing ``diff`` because the merge is a left join.
    """
    endpoint_cols = ['longitude_a', 'latitude_a', 'longitude_b', 'latitude_b']

    # assign() works on a copy, so the caller's path frame keeps its columns.
    steps = (
        df_path.assign(
            longitude_a=df_path['longitude'].shift(1),
            longitude_b=df_path['longitude'],
            latitude_a=df_path['latitude'].shift(1),
            latitude_b=df_path['latitude'],
        )
        # The first point has no predecessor, so its *_a values are missing.
        .dropna()
        .drop(['longitude', 'latitude'], axis=1)
        .astype({name: float for name in endpoint_cols})
    )

    df_usable = pd.merge(steps, df_connect, on=endpoint_cols, how='left')
    # The label columns only exist if the edge table was labelled beforehand;
    # do not fail when they are absent.
    df_usable = df_usable.drop(['point_a', 'point_b'], axis=1, errors='ignore')

    return df_usable


In [31]:
def all_thing(df_connect, start, end):
    """Run the whole routing pipeline for one start/end pair.

    Builds the adjacency list from ``df_connect``, searches a path between
    ``start`` and ``end``, converts it to a coordinate frame and sums the
    ``diff`` values of the path's steps.

    Returns
    -------
    tuple
        ``(total, route)`` where ``total`` is the sum of the ``diff`` column
        returned by ``usable_data`` and ``route`` is the frame produced by
        ``to_df`` for the found path.
    """
    graph = create_adjency_list(df_connect)
    route = to_df(find_path(graph, start, end))
    steps = usable_data(route, df_connect)
    return steps['diff'].sum(), route

In [32]:
def main():
    """Load the data, build the edge table and print one routing result."""
    observations = read_clean_data()
    edges = connect_two_point(observations)

    # NOTE(review): origin and destination are the same point here, so the
    # requested route is trivial -- confirm whether a different destination
    # was intended.
    origin = '116.21612 39.89987'
    destination = '116.21612 39.89987'

    total_time, route = all_thing(edges, origin, destination)
    print(total_time)
    print(route)


main()