In [17]:
import requests
import re
import pandas as pd
import configparser
import os

import ssl
# WARNING: swaps the default HTTPS context factory for the unverified one, so
# HTTPS requests made through urllib/http.client afterwards (including
# pd.read_csv on an https:// URL) skip TLS certificate verification.
# NOTE(review): presumably a workaround for a certificate problem on the data
# host — confirm it is still required before keeping it.
ssl._create_default_https_context = ssl._create_unverified_context   

# Traffic Data

In [2]:
def get_mrt_traffic_data_link():
    """Download the MRT traffic index CSV from the Taipei open-data portal.

    Side effect:
        Sets the pandas display option ``display.max_colwidth`` to 400 so
        that long cell strings are not truncated when a frame is rendered.

    Returns:
        The DataFrame parsed by ``pd.read_csv`` from the dataset's download URL.
    """
    # Widen the per-cell string length shown by pandas (default is 50).
    pd.set_option('display.max_colwidth', 400)

    dataset_id = '63f31c7e-7fc3-418b-bd82-b95158755b4d'
    resource_id = 'eb481f58-1238-4cff-8caa-fa7bb20cb4f4'
    download_url = (
        f'https://data.taipei/api/dataset/{dataset_id}'
        f'/resource/{resource_id}/download'
    )
    return pd.read_csv(download_url)

In [3]:
# Load the table returned by get_mrt_traffic_data_link() and preview its last 5 rows.
df_source = get_mrt_traffic_data_link()
df_source.tail(5)

Unnamed: 0,年月,資料路徑
63,202204,http://tcgmetro.blob.core.windows.net/stationod/%E8%87%BA%E5%8C%97%E6%8D%B7%E9%81%8B%E6%AF%8F%E6%97%A5%E5%88%86%E6%99%82%E5%90%84%E7%AB%99OD%E6%B5%81%E9%87%8F%E7%B5%B1%E8%A8%88%E8%B3%87%E6%96%99_202204.csv
64,202205,http://tcgmetro.blob.core.windows.net/stationod/%E8%87%BA%E5%8C%97%E6%8D%B7%E9%81%8B%E6%AF%8F%E6%97%A5%E5%88%86%E6%99%82%E5%90%84%E7%AB%99OD%E6%B5%81%E9%87%8F%E7%B5%B1%E8%A8%88%E8%B3%87%E6%96%99_202205.csv
65,202206,http://tcgmetro.blob.core.windows.net/stationod/%E8%87%BA%E5%8C%97%E6%8D%B7%E9%81%8B%E6%AF%8F%E6%97%A5%E5%88%86%E6%99%82%E5%90%84%E7%AB%99OD%E6%B5%81%E9%87%8F%E7%B5%B1%E8%A8%88%E8%B3%87%E6%96%99_202206.csv
66,202207,http://tcgmetro.blob.core.windows.net/stationod/%E8%87%BA%E5%8C%97%E6%8D%B7%E9%81%8B%E6%AF%8F%E6%97%A5%E5%88%86%E6%99%82%E5%90%84%E7%AB%99OD%E6%B5%81%E9%87%8F%E7%B5%B1%E8%A8%88%E8%B3%87%E6%96%99_202207.csv
67,202208,http://tcgmetro.blob.core.windows.net/stationod/%E8%87%BA%E5%8C%97%E6%8D%B7%E9%81%8B%E6%AF%8F%E6%97%A5%E5%88%86%E6%99%82%E5%90%84%E7%AB%99OD%E6%B5%81%E9%87%8F%E7%B5%B1%E8%A8%88%E8%B3%87%E6%96%99_202208.csv


## Create Data directory

In [18]:
# Local folder that receives the downloaded CSVs and the derived files.
directory = 'traffic_data'

# exist_ok=True keeps the cell safe to re-run and removes the gap between
# "check that it exists" and "create it" that a separate os.path.exists test has.
os.makedirs(directory, exist_ok=True)

## Download Data

In [11]:
%%time

# Download every file listed in df_source and store it locally as CSV.
for _, row in df_source.iterrows():
    # Read the two fields by position with .iloc: plain row[0] / row[1] on a
    # string-labelled Series relies on a positional fallback that pandas 2.x
    # deprecates. The first field is the year-month, the second the file URL.
    year_month = row.iloc[0]
    source_url = row.iloc[1]

    print(f"processing mrt traffic data {year_month}...")
    df_traffic = pd.read_csv(source_url)

    # The whole month is written unchanged; no rows are filtered here.
    df_traffic.to_csv(f'{directory}/臺北捷運每日分時各站OD流量統計資料_{year_month}.csv', index=False)


processing mrt traffic data 201701...
CPU times: user 20.6 s, sys: 4.6 s, total: 25.2 s
Wall time: 1min 59s


## Transform from CSV to Parquet

In [14]:
# Convert each downloaded monthly CSV into a Parquet file with English column names.
prefix = '臺北捷運每日分時各站OD流量統計資料'

# Source column -> output column. Only these columns are read from each CSV.
column_map = {'日期': 'dt',
              '時段': 'hour',
              '進站': 'entrance',
              '出站': 'exit',
              '人次': 'traffic'}
cols = list(column_map)

# os.listdir returns entries in arbitrary order; sort for a reproducible run.
files = sorted(os.listdir(directory))

for file in files:
    # Process only CSV files carrying the dataset prefix, so other files that
    # happen to contain the prefix (partial downloads, backups, non-CSV
    # exports) are never handed to read_csv.
    if prefix not in file or not file.endswith('.csv'):
        continue

    print(f'processing file {file}...')
    # '<prefix>_<YYYYMM>.csv' -> '<YYYYMM>'
    file_date = os.path.splitext(file)[0].split('_')[1]

    df = pd.read_csv(f'{directory}/{file}', usecols=cols).rename(columns=column_map)
    df['dt'] = pd.to_datetime(df['dt'])
    df.to_parquet(f'{directory}/mrt_q_{file_date}.parquet', index=False)



processing file 臺北捷運每日分時各站OD流量統計資料_201701.csv...


## Create testing data (optional)

In [None]:
# Build small test files: for each listed month keep only the first days.
test_months = ['202201', '202202', '202203']
# Inclusive cut-off, compared as a two-character string against the last two
# characters of the '日期' value.
# NOTE(review): assumes '日期' is a string ending in a zero-padded day — confirm.
last_test_day = "03"

for date in test_months:
    print(f"processing data of {date}...")
    df = pd.read_csv(f'{directory}/臺北捷運每日分時各站OD流量統計資料_{date}.csv')
    df_traffic_test = df[df["日期"].str[-2:] <= last_test_day].copy()
    # Drop the reference to the full-month frame so it can be freed before writing.
    df = None
    df_traffic_test.to_csv(f'{directory}/mrt_traffic_{date}.csv', index=False)