In [4]:
# pandas is imported here because no other visible cell imports it; without
# this line the cell raises NameError on a fresh kernel.
import pandas as pd

# Housing records; later cells read the `lat` / `lon` columns of this frame.
df = pd.read_csv('clean_dataset.csv')
# Bus-stop records; later cells read the `lat` / `lng` columns of this frame.
# NOTE(review): the absolute '/content/...' path presumably targets a Colab
# runtime -- confirm before running elsewhere.
df_bus = pd.read_csv('/content/公車站點資料.csv')

In [1]:
from geopy.distance import geodesic

def distance(x):
    """Return the geopy geodesic distance, in meters, from `ori` to `x`.

    Parameters
    ----------
    x : mapping
        Destination point; must support ``x['lat']`` and ``x['lon']``.

    Returns
    -------
    The ``.meters`` value of ``geodesic(origin, destination)``.

    Notes
    -----
    The origin is not passed in: it is read from the module-level ``ori``,
    which must be a 2-item ``(lat, lon)`` sequence set before calling.
    """
    global ori  # origin point, i.e. the current row of df
    origin_lat, origin_lon = ori
    destination = (x['lat'], x['lon'])
    return geodesic((origin_lat, origin_lon), destination).meters


In [8]:
from sklearn.neighbors import BallTree
import numpy as np

# df_bus holds the bus-stop coordinates; take them as a 2-column array in
# [lat, lng] order.
bus_coords = df_bus.loc[:, ['lat', 'lng']].to_numpy()
# Index the stops in radians so the haversine metric can be used for
# nearest-neighbour lookups.
ball_tree = BallTree(np.radians(bus_coords), metric='haversine')

def near_Bus(x, y):
    """Return the distance in meters from the point (x, y) to the nearest bus stop.

    Parameters
    ----------
    x : float
        Latitude of the query point in degrees (callers pass ``row.lat``).
    y : float
        Longitude of the query point in degrees (callers pass ``row.lon``).

    Returns
    -------
    float
        Haversine distance to the closest entry of the module-level
        ``ball_tree``, scaled to meters.

    Notes
    -----
    The query point is kept in a local variable. The earlier version stored
    it in the global ``ori`` and read it back, so when this function ran in
    several threads one thread could pick up another thread's coordinates.
    It also returned a 1-element array; a plain float is returned now.
    """
    # BallTree.query expects a 2-D array of points, here a single
    # [lat, lon] row converted to radians to match the indexed data.
    query_point = np.radians([[x, y]])
    nearest_dist, _ = ball_tree.query(query_point, k=1)
    # The haversine result is an angle in radians; multiply by the Earth
    # radius (6371 km) and by 1000 to express it in meters.
    return float(nearest_dist[0, 0] * 6371 * 1000)


In [9]:
from concurrent.futures import ThreadPoolExecutor

def parallel_distance_calculation(row):
    """Return ``near_Bus`` evaluated at the ``lat`` / ``lon`` attributes of `row`.

    `row` is any object exposing ``.lat`` and ``.lon`` (the notebook passes
    the named tuples produced by ``df.itertuples``).
    """
    latitude, longitude = row.lat, row.lon
    return near_Bus(latitude, longitude)

print('Calculating distance between bus and house...')
# Kept for backward compatibility: the `distance` helper reads the global
# `ori`, and `bus_stn` is the bus-stop table with `lng` renamed to `lon`.
ori = (0, 0)
bus_stn = df_bus[['lat', 'lng']].rename(columns={'lat': 'lat', 'lng': 'lon'}) # all bus station

# Query the BallTree once for every house instead of issuing one query per
# row from a thread pool. A single batched call avoids the per-row Python
# overhead, does not depend on any shared global state, and yields a float
# column directly.
house_coords_rad = np.radians(df[['lat', 'lon']].to_numpy())
nearest_dist, _ = ball_tree.query(house_coords_rad, k=1)
# The haversine result is an angle in radians; multiply by the Earth radius
# (6371 km) and by 1000 to express it in meters.
df['d_bus'] = nearest_dist[:, 0] * 6371 * 1000


Calculating distance between bus and house...


In [11]:
# Display floats with two fixed decimal places (this is fixed-point
# formatting, not scientific notation).
pd.set_option('display.float_format', '{:.2f}'.format)

# Store the d_bus column as plain floats.
df['d_bus'] = df['d_bus'].astype(float)
df

Unnamed: 0,ID,lon,lat,house_age,residence_housing,congregate_housing,commercial_use,industrial_use,apartment,building_low,...,building_area,main_building_area,balcony_area,auxiliary_area,floor,total_floor,parking_area,parking_number,unit_price,d_bus
0,TR-1,121.55,25.02,32.58,1,0,0,0,0,0,...,-0.17,0.39,0.18,-0.44,11,11,-0.82,0.00,4.63,187.80
1,TR-2,121.50,25.02,24.17,1,0,0,0,0,0,...,0.31,-0.32,0.61,-0.44,7,12,-0.82,0.00,1.89,215.01
2,TR-3,120.37,22.64,6.17,0,1,0,0,0,0,...,0.42,-0.10,-0.36,1.53,10,15,0.16,1.00,1.49,253.50
3,TR-4,121.46,25.06,8.83,0,1,0,0,0,0,...,0.16,-0.07,0.32,0.23,9,14,0.52,1.00,2.05,168.64
4,TR-5,121.47,25.02,11.00,1,0,0,0,0,0,...,0.99,0.79,1.72,-0.44,41,43,0.53,1.00,3.27,86.15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11746,TR-11747,121.30,24.94,28.33,1,0,0,0,1,0,...,-0.27,0.15,-0.18,-0.17,4,5,0.48,1.00,0.90,169.21
11747,TR-11748,120.97,24.80,29.25,1,0,0,0,0,1,...,-0.33,-0.18,-0.07,-0.16,2,7,-0.82,0.00,1.04,114.29
11748,TR-11749,121.66,25.07,22.83,1,0,0,0,0,0,...,-0.85,-0.90,-0.50,-0.17,15,17,-0.82,0.00,2.14,138.75
11749,TR-11750,121.45,24.98,25.08,1,0,0,0,0,0,...,-1.20,-1.33,-0.71,-0.44,12,16,-0.82,0.00,2.29,117.25


In [12]:
# Flag each row whose distance to the nearest bus stop (d_bus) is below 500.
# The comparison produces the boolean column directly: True when
# d_bus < 500, False otherwise (including missing distances).
df['distance_less_than_500m'] = df['d_bus'] < 500
df

Unnamed: 0,ID,lon,lat,house_age,residence_housing,congregate_housing,commercial_use,industrial_use,apartment,building_low,...,main_building_area,balcony_area,auxiliary_area,floor,total_floor,parking_area,parking_number,unit_price,d_bus,distance_less_than_500m
0,TR-1,121.55,25.02,32.58,1,0,0,0,0,0,...,0.39,0.18,-0.44,11,11,-0.82,0.00,4.63,187.80,True
1,TR-2,121.50,25.02,24.17,1,0,0,0,0,0,...,-0.32,0.61,-0.44,7,12,-0.82,0.00,1.89,215.01,True
2,TR-3,120.37,22.64,6.17,0,1,0,0,0,0,...,-0.10,-0.36,1.53,10,15,0.16,1.00,1.49,253.50,True
3,TR-4,121.46,25.06,8.83,0,1,0,0,0,0,...,-0.07,0.32,0.23,9,14,0.52,1.00,2.05,168.64,True
4,TR-5,121.47,25.02,11.00,1,0,0,0,0,0,...,0.79,1.72,-0.44,41,43,0.53,1.00,3.27,86.15,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11746,TR-11747,121.30,24.94,28.33,1,0,0,0,1,0,...,0.15,-0.18,-0.17,4,5,0.48,1.00,0.90,169.21,True
11747,TR-11748,120.97,24.80,29.25,1,0,0,0,0,1,...,-0.18,-0.07,-0.16,2,7,-0.82,0.00,1.04,114.29,True
11748,TR-11749,121.66,25.07,22.83,1,0,0,0,0,0,...,-0.90,-0.50,-0.17,15,17,-0.82,0.00,2.14,138.75,True
11749,TR-11750,121.45,24.98,25.08,1,0,0,0,0,0,...,-1.33,-0.71,-0.44,12,16,-0.82,0.00,2.29,117.25,True


In [16]:
# Sanity check: list the distinct values present in the flag column.
unique_values = pd.unique(df['distance_less_than_500m'])
print(unique_values)


[ True False]


In [17]:
# Write the frame, including the added d_bus and flag columns, to CSV
# without the index column.
df.to_csv(path_or_buf='clean_dataset_distancebus.csv', index=False)