April

In [1]:
import os
import requests
import pandas as pd
from datetime import datetime, timedelta
import xml.etree.ElementTree as ET

# Download one traffic-detector snapshot per minute for every day in the date
# range, average the lane speeds over each day, and write one CSV per day.

# Define the start and end dates (both inclusive)
start_date = datetime.strptime('20231001', '%Y%m%d')
end_date = datetime.strptime('20231015', '%Y%m%d')

# Define the base URL; a 'YYYYMMDD-HHMM' timestamp is appended for each request
base_url = 'https://api.data.gov.hk/v1/historical-archive/get-file?url=https%3A%2F%2Fresource.data.one.gov.hk%2Ftd%2Ftraffic-detectors%2FrawSpeedVol-all.xml&time='

# Seconds to wait on a single request before giving up, so that one stalled
# connection cannot hang the whole run
request_timeout = 30


def accumulate_speeds(root, totals):
    """Add every lane speed found under *root* to the running *totals*.

    Args:
        root: Parsed XML root element of one snapshot.
        totals: Dict mapping ``(detector_id, lane_id)`` to a two-item list
            ``[sum_of_speeds, sample_count]``; it is updated in place.

    Detectors without a ``detector_id`` element, lanes without a ``lane_id``
    element, and lanes whose ``speed`` is missing or not a number are skipped
    rather than aborting the run.
    """
    for period in root.findall('.//periods/period'):
        for detector in period.findall('.//detectors/detector'):
            detector_id_element = detector.find('detector_id')
            if detector_id_element is None:
                print("detector_id element not found")
                continue
            detector_id = detector_id_element.text

            for lane in detector.findall('.//lanes/lane'):
                lane_id_element = lane.find('lane_id')
                if lane_id_element is None:
                    print("lane_id element not found")
                    continue
                lane_id = lane_id_element.text

                speed_element = lane.find('speed')
                if speed_element is None:
                    continue

                # An empty <speed/> element has text None, which makes float()
                # raise TypeError instead of ValueError, so catch both.
                try:
                    speed = float(speed_element.text)
                except (TypeError, ValueError):
                    print(f"Invalid speed: {speed_element.text}")
                    continue

                # Add the speed to the total for the current detector and lane id
                entry = totals.setdefault((detector_id, lane_id), [0.0, 0])
                entry[0] += speed
                entry[1] += 1


# One session for the whole run so the connection to the API host is reused
# across the thousands of per-minute requests.
with requests.Session() as session:
    # Loop over the date range
    current_date = start_date
    while current_date <= end_date:
        day_stamp = current_date.strftime('%Y%m%d')

        # A day whose CSV already exists is treated as done and skipped
        filename = f"traffic_{day_stamp}.csv"
        if os.path.isfile(filename):
            print(f"File {filename} already exists. Skipping this day.")
            current_date += timedelta(days=1)
            continue

        # Initialize a dictionary to store the total speed and count for each detector and lane
        totals = {}

        # Loop over each minute of the current day
        for minute in range(24 * 60):
            # Generate the timestamp for the current minute
            hour, minute_of_hour = divmod(minute, 60)
            timestamp = f'{day_stamp}-{hour:02d}{minute_of_hour:02d}'

            # Fetch the data for the current timestamp; a network error loses
            # only this minute instead of the whole day's accumulated totals
            try:
                response = session.get(base_url + timestamp, timeout=request_timeout)
            except requests.RequestException as error:
                print(f"Failed to fetch data for {timestamp}: {error}")
                continue

            # Check the status of the response
            if response.status_code != 200:
                print(f"Failed to fetch data for {timestamp}, status code: {response.status_code}")
                continue

            # A truncated or non-XML body is reported and skipped
            try:
                root = ET.fromstring(response.content)
            except ET.ParseError as error:
                print(f"Failed to parse data for {timestamp}: {error}")
                continue

            # Extract the data and add it to the dictionary
            accumulate_speeds(root, totals)

        if totals:
            # Calculate the average speed for each detector and lane id
            data = [{'detectorID': detector_id, 'laneType': lane_id, 'speed': total_speed / count}
                    for (detector_id, lane_id), (total_speed, count) in totals.items()]

            # Convert the list into a DataFrame and save it to a CSV file
            df = pd.DataFrame(data)
            df.to_csv(filename, index=False)
            print(f"File {filename} saved.")
        else:
            # Writing an empty CSV here would make the existence check above
            # skip this day on every later run, so leave it to be retried.
            print(f"No data collected for {day_stamp}; {filename} not written.")

        print(f"end of the day {current_date.strftime('%Y-%m-%d')}")

        # Move to the next day
        current_date += timedelta(days=1)

print("finished fetch")

File traffic_20231001.csv already exists. Skipping this day.
File traffic_20231002.csv already exists. Skipping this day.
File traffic_20231003.csv already exists. Skipping this day.
File traffic_20231004.csv already exists. Skipping this day.
File traffic_20231005.csv already exists. Skipping this day.
File traffic_20231006.csv already exists. Skipping this day.
File traffic_20231007.csv already exists. Skipping this day.
File traffic_20231008.csv already exists. Skipping this day.
File traffic_20231009.csv already exists. Skipping this day.
File traffic_20231010.csv already exists. Skipping this day.
File traffic_20231011.csv already exists. Skipping this day.
File traffic_20231012.csv saved.
end of the day 2023-10-12
File traffic_20231013.csv saved.
end of the day 2023-10-13
File traffic_20231014.csv saved.
end of the day 2023-10-14
File traffic_20231015.csv saved.
end of the day 2023-10-15
finished fetch


Do the analysis. For the daily_HKO_RF_ALL.csv file, I want to select only the data from 2023-10-01 to 2023-10-15.