In [32]:
import pandas as pd
import os

In [40]:
folder_path = "./data/flight_list/2022/"

# Collect every parquet file in the folder. os.listdir() returns entries in
# arbitrary order, so sort the names to make the row order of the merged
# frame reproducible from run to run.
files = sorted(f for f in os.listdir(folder_path) if f.endswith(".parquet"))

dfs = []

for file in files:
    file_path = os.path.join(folder_path, file)
    try:
        df = pd.read_parquet(file_path)
        dfs.append(df)
    except Exception as e:
        # Best effort: report the unreadable file and carry on with the rest.
        print(f"⚠️ 無法讀取 {file}: {e}")

# pd.concat() fails with an opaque "No objects to concatenate" on an empty
# list; raise the same exception type with a message that names the folder.
if not dfs:
    raise ValueError(f"No parquet files could be read from {folder_path!r}")

# Merge into a single DataFrame (only the files that were read successfully)
full_df = pd.concat(dfs, ignore_index=True)

print(f"成功合併 {len(dfs)} 個檔案")
print(full_df.shape)
print(full_df.head())

成功合併 12 個檔案
(13390934, 17)
                                                  id  icao24    flt_id  \
0  174f6dd5074bbd3d6fad9407703b4b2054f1dab8788a2d...  440a8c  TAY8151    
1  ddfcc8c8549f1843ea281b28cdedcf1b9ebfb99ffedf2a...  406d4c  VIR364     
2  6aec7e5c404cafca05ee14b9d15a114e3cc171c0bc2733...  392adb  RX19       
3  ec781dc3bca30d8c2c96ce46ac356887ad495fe3d14e86...  490031      None   
4  90019a408b4ccd08b14ec39581e3e449f79100144473ae...  77058f  ALK504     

         dof  adep  ades adep_p ades_p registration                    model  \
0 2022-01-01  None  None   None   None       OE-IFM              B747-4KZ(F)   
1 2022-01-01  None  None   None   None       G-VCRU  BOEING 787-9 Dreamliner   
2 2022-01-01  None  None   None   None         RX19                     None   
3 2022-01-01  None  None   None   None          TWR                     None   
4 2022-01-01  None  None   None   None       4R-ALO                A330 343E   

  typecode icao_aircraft_class icao_operator   

In [42]:
# Approach 2: treat the empty string '' as a missing value as well, and filter
# both kinds out in one pass.
has_adep = full_df['adep'].notna() & (full_df['adep'] != '')
has_ades = full_df['ades'].notna() & (full_df['ades'] != '')

# Keep only flights whose departure AND destination codes are both present
df_cleaned = full_df[has_adep & has_ades]

# Inspect the result
print(df_cleaned.shape)
print(df_cleaned[['flt_id', 'adep', 'ades']].head())

(4376444, 17)
       flt_id  adep  ades
141  ZXP24     EHDL  EHGR
279  PBW13     EDNY  ETHL
284  CHX11     EDTD  LSZH
286  BOX195    EGNX  EDDP
289  THY8AH    UUWW  LTFM


In [44]:
# Compare (rows, columns) before and after the adep/ades filter
original_shape = full_df.shape
cleaned_shape = df_cleaned.shape
print(f"原本資料量：{original_shape}")
print(f"清理後資料量：{cleaned_shape}")

原本資料量：(13390934, 17)
清理後資料量：(4376444, 17)


In [46]:
# Load the airport -> latitude/longitude lookup table
airport_df = pd.read_csv('your_airport_lookup.csv')  # replace with your own file name!

# Keep only the columns we need, and exactly one usable row per ICAO code.
# Rows without coordinates are dropped first so that de-duplication keeps a
# row that actually has a position. A duplicated code would multiply flight
# rows in the joins below and inflate any later route counts.
airport_df = (
    airport_df[['icao', 'latitude', 'longitude']]
    .dropna(subset=['icao', 'latitude', 'longitude'])
    .drop_duplicates(subset='icao')
)


def _attach_airport_coords(frame, lookup, code_col, lat_name, lon_name):
    """Left-join airport coordinates onto ``frame``.

    Parameters
    ----------
    frame : DataFrame holding an airport code column named ``code_col``.
    lookup : DataFrame with columns ``icao``, ``latitude``, ``longitude``.
    code_col : name of the column in ``frame`` to match against ``lookup['icao']``.
    lat_name, lon_name : names given to the joined latitude/longitude columns.

    Returns
    -------
    A new DataFrame with the same rows as ``frame`` plus the two coordinate
    columns (NaN where the code is not in ``lookup``).

    Raises
    ------
    pandas.errors.MergeError if ``lookup`` has duplicate ``icao`` values.
    """
    merged = frame.merge(
        lookup,
        how='left',
        left_on=code_col,
        right_on='icao',
        validate='many_to_one',  # guarantees the join cannot add rows
    )
    return (
        merged
        .rename(columns={'latitude': lat_name, 'longitude': lon_name})
        .drop(columns=['icao'])
    )


# Only the two airport codes are needed from full_df; carrying all columns
# through two merges over millions of rows wastes memory for no benefit.
flights = full_df[['adep', 'ades']].dropna()
# Empty strings are missing values too (same rule as the df_cleaned filter)
flights = flights[(flights['adep'] != '') & (flights['ades'] != '')]

# Add coordinates for the departure airport (adep) and the destination (ades)
flights = _attach_airport_coords(flights, airport_df, 'adep', 'pt1_lat', 'pt1_lon')
flights = _attach_airport_coords(flights, airport_df, 'ades', 'pt2_lat', 'pt2_lon')

# Keep only the coordinate columns, and drop flights whose airport was not
# found in the lookup: NaN coordinates cannot be ordered or grouped meaningfully.
flights = (
    flights[['pt1_lat', 'pt1_lon', 'pt2_lat', 'pt2_lon']]
    .dropna()
    .reset_index(drop=True)
)

flights



Unnamed: 0,id,icao24,flt_id,dof,adep,ades,adep_p,ades_p,registration,model,typecode,icao_aircraft_class,icao_operator,first_seen,last_seen,version,unix_time
141,537b458a3878096893a30ffaf1d6e2e1cdc8024c407511...,485b2f,ZXP24,2022-01-01,EHDL,EHGR,,,PH-PXX,,A139,H2T,,2022-01-01 00:00:05,2022-01-01 00:59:00,v2.0.0,1640995205
279,5ec1fa2689f0321b983b1c33d2b96c1b574c5896283f0e...,3dd796,PBW13,2022-01-01,EDNY,ETHL,,,D-HBWW,MBB-BK 117 D-2 (H145),EC45,H2T,,2022-01-01 00:06:15,2022-01-01 01:59:55,v2.0.0,1640995575
284,32066031209fdc4c6ff83d3b7cc04ad4f34fd46a9ef7ac...,3ddc73,CHX11,2022-01-01,EDTD,LSZH,,,D-HDST,,EC45,H2T,,2022-01-01 00:07:05,2022-01-01 00:27:00,v2.0.0,1640995625
286,c02a6d221fb1a9c1040b23c4d89098bbcb352d520c0a5b...,3c458a,BOX195,2022-01-01,EGNX,EDDP,,,D-AALJ,,B77L,L2J,,2022-01-01 00:07:15,2022-01-01 01:22:55,v2.0.0,1640995635
289,9175d69e53c3835633b8b7967ca65e9b341369432648f5...,4baa92,THY8AH,2022-01-01,UUWW,LTFM,,,TC-JTR,A321 231SL,A321,L2J,THY,2022-01-01 00:08:00,2022-01-01 02:33:10,v2.0.0,1640995680
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13390584,110f7c854f92fa205ab61cf37cee435b3348daa68be490...,a528a5,UPS294,2022-12-27,EDDK,LKPR,,,N431UP,757-24APF,B752,L2J,UPS,2022-12-27 03:00:05,2022-12-27 03:52:50,v2.0.0,1672110005
13390596,eda1b260e63917bc4eaf580dae98e3bedc08957fd66cc0...,acd8ac,FDX6202,2022-12-27,LFPG,EDDV,,,N927FD,757-204,B752,L2J,FDX,2022-12-27 03:00:20,2022-12-27 03:58:20,v2.0.0,1672110020
13390636,9ad3544a23ba682c50ccc446bb02ba2bd270628d330ea7...,440bcb,BCS1730,2022-12-27,EBBR,LFXA,,,OE-LNR,,B752,L2J,,2022-12-27 03:05:10,2022-12-27 03:58:35,v2.0.0,1672110310
13390722,a07e3449171999ac56789aedb8941b23a0d17f160d936a...,3ddc6f,CHX88,2022-12-27,ETHN,EDDN,,,D-HDSP,MBB BK-117D2 (H145),EC45,H2T,CHX,2022-12-27 03:20:40,2022-12-27 03:47:30,v2.0.0,1672111240


In [None]:
# Direction-insensitive routes: put the two (lat, lon) endpoints in sorted order
def sort_coords(row):
    """Return the two endpoints of ``row`` as a sorted list of (lat, lon) tuples.

    ``row`` is anything indexable by the keys 'pt1_lat', 'pt1_lon', 'pt2_lat'
    and 'pt2_lon'. Tuples are compared lexicographically (latitude first, then
    longitude), so A->B and B->A give the same result.
    """
    endpoints = [
        (row['pt1_lat'], row['pt1_lon']),
        (row['pt2_lat'], row['pt2_lon']),
    ]
    endpoints.sort()
    return endpoints

# Count flights per undirected route (A->B and B->A are the same route).
#
# This is done column-wise instead of with a row-wise apply(): building a
# pd.Series per row is prohibitively slow on millions of flights. `flights`
# itself is left unmodified so the cell can be re-run safely.
coord_cols = ['pt1_lat', 'pt1_lon', 'pt2_lat', 'pt2_lon']

# Rows with a missing coordinate cannot be ordered consistently (every
# comparison with NaN is False), so they are excluded from the counts.
routes = flights[coord_cols].dropna()

# Lexicographic comparison of the (lat, lon) pairs: the endpoints must be
# swapped exactly when pt1 > pt2, which is what sorting the two tuples does.
swap = (routes['pt1_lat'] > routes['pt2_lat']) | (
    (routes['pt1_lat'] == routes['pt2_lat'])
    & (routes['pt1_lon'] > routes['pt2_lon'])
)

# Series.where(cond, other) keeps the value where cond is True, else takes `other`
ordered = pd.DataFrame({
    'pt1_lat': routes['pt1_lat'].where(~swap, routes['pt2_lat']),
    'pt1_lon': routes['pt1_lon'].where(~swap, routes['pt2_lon']),
    'pt2_lat': routes['pt2_lat'].where(~swap, routes['pt1_lat']),
    'pt2_lon': routes['pt2_lon'].where(~swap, routes['pt1_lon']),
})

# One row per distinct route with its number of flights
flight_counts = ordered.groupby(coord_cols).size().reset_index(name='count')

# Keep only the columns we need, count first
final_df = flight_counts[['count', 'pt1_lat', 'pt1_lon', 'pt2_lat', 'pt2_lon']]

# Show the result
print(final_df.head())
