#Q4  MLA && PCA - Traffic Density -> Traffic Duration / Traffic Clustering - GET DATA HOURLY

In [5]:
import os
import requests
import pandas as pd
from datetime import datetime, timedelta
import xml.etree.ElementTree as ET

# Define the start and end dates (both days are included in the download)
start_date = datetime.strptime('20230901', '%Y%m%d')
end_date = datetime.strptime('20230902', '%Y%m%d')

# Define the base URL; a 'YYYYMMDD-HHMM' timestamp is appended for each request
base_url = 'https://api.data.gov.hk/v1/historical-archive/get-file?url=https%3A%2F%2Fresource.data.one.gov.hk%2Ftd%2Ftraffic-detectors%2FrawSpeedVol-all.xml&time='

# Seconds to wait for the server before giving up on a single request.
# Without a timeout one stalled connection would block the whole run.
REQUEST_TIMEOUT = 30


def fetch_snapshot(session, timestamp):
    """Download and parse the XML snapshot for one 'YYYYMMDD-HHMM' timestamp.

    Args:
        session: object with a requests-style ``get(url, timeout=...)`` method.
        timestamp: minute to fetch, formatted as 'YYYYMMDD-HHMM'.

    Returns:
        The parsed XML root element, or None when the request fails, the
        status code is not 200, or the body is not well-formed XML. Every
        failure is reported with print() so the caller can simply skip it.
    """
    try:
        response = session.get(base_url + timestamp, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f"Failed to fetch data for {timestamp}: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch data for {timestamp}, status code: {response.status_code}")
        return None

    try:
        return ET.fromstring(response.content)
    except ET.ParseError as exc:
        print(f"Invalid XML for {timestamp}: {exc}")
        return None


def accumulate_lanes(root, totals):
    """Add every usable lane reading under *root* to *totals* in place.

    Args:
        root: parsed XML element searched with the paths
            './/periods/period', './/detectors/detector' and './/lanes/lane'.
        totals: dict mapping (detector_id, lane_id) to the running list
            [speed_sum, reading_count, occupancy_sum, volume_sum].

    A lane is skipped (with a message, except for a missing 'speed' element,
    which is skipped silently) when one of its fields is missing or not
    numeric, so a single bad record never aborts the download.
    """
    for period in root.findall('.//periods/period'):
        for detector in period.findall('.//detectors/detector'):
            detector_id = detector.findtext('detector_id')
            if detector_id is None:
                print("detector_id element not found")
                continue

            for lane in detector.findall('.//lanes/lane'):
                lane_id = lane.findtext('lane_id')
                if lane_id is None:
                    print("lane_id element not found")
                    continue

                # Lanes without a 'speed' element contribute nothing.
                speed_text = lane.findtext('speed')
                if speed_text is None:
                    continue

                try:
                    speed = float(speed_text)
                except ValueError:
                    print(f"Invalid speed: {speed_text}")
                    continue

                try:
                    # findtext() gives None for a missing element (TypeError)
                    # and '' for an empty one (ValueError).
                    occupancy = int(lane.findtext('occupancy'))
                    volume = int(lane.findtext('volume'))
                except (TypeError, ValueError):
                    print(f"Invalid occupancy/volume for {detector_id} {lane_id}")
                    continue

                # Add the reading to the totals for this detector and lane id
                entry = totals.setdefault((detector_id, lane_id), [0.0, 0, 0, 0])
                entry[0] += speed
                entry[1] += 1
                entry[2] += occupancy
                entry[3] += volume


def summarise_totals(totals):
    """Turn the accumulated totals into one row (dict) per detector and lane.

    'speed' is the mean of the accumulated speed readings; occupancy and
    volume are reported as sums. The reading count is always at least 1
    because entries are only created when a reading is added.
    """
    return [{'detectorID': detector_id, 'laneType': lane_id, 'speed': total_speed / count, 'totalOccupancy': total_occupancy, 'totalVolume': total_volume}
            for (detector_id, lane_id), (total_speed, count, total_occupancy, total_volume) in totals.items()]


# In a notebook cell or when run as a script __name__ is "__main__", so the
# download runs as before; the guard only keeps an import from starting it.
if __name__ == "__main__":
    # One session reuses the HTTP connection across the per-minute requests.
    with requests.Session() as session:
        # Loop over the date range
        current_date = start_date
        while current_date <= end_date:
            day = current_date.strftime('%Y%m%d')

            # Loop over each hour of the current day
            for hour in range(0, 24):
                # An existing output file means this hour was already fetched
                filename = f"traffic_{day}_hour_{hour}.csv"
                if os.path.isfile(filename):
                    print(f"File {filename} already exists. Skipping this hour.")
                    continue

                # Running totals for each detector and lane over this hour
                totals = {}

                # Loop over each minute of the current hour
                for minute in range(0, 60):
                    timestamp = f"{day}-{hour:02d}{minute:02d}"
                    root = fetch_snapshot(session, timestamp)
                    if root is not None:
                        accumulate_lanes(root, totals)

                if totals:
                    # Calculate the per-lane averages and convert to a DataFrame
                    data = summarise_totals(totals)
                    df = pd.DataFrame(data)

                    # Write to a temporary name and rename afterwards so an
                    # interrupted run never leaves a truncated CSV that a
                    # later run would mistake for a finished hour.
                    partial_filename = filename + '.part'
                    df.to_csv(partial_filename, index=False)
                    os.replace(partial_filename, filename)
                    print(f"File {filename} saved.")
                else:
                    # Saving an empty file would make reruns skip this hour.
                    print(f"No data collected for {filename}; nothing saved.")

                print(f"end of the hour {hour}")
            # Move to the next day
            current_date += timedelta(days=1)

    print("finished fetch")

File traffic_20230901_hour_0.csv saved.
end of the hour 0
File traffic_20230901_hour_1.csv saved.
end of the hour 1
File traffic_20230901_hour_2.csv saved.
end of the hour 2
File traffic_20230901_hour_3.csv saved.
end of the hour 3
File traffic_20230901_hour_4.csv saved.
end of the hour 4
File traffic_20230901_hour_5.csv saved.
end of the hour 5
File traffic_20230901_hour_6.csv saved.
end of the hour 6
File traffic_20230901_hour_7.csv saved.
end of the hour 7
File traffic_20230901_hour_8.csv saved.
end of the hour 8
File traffic_20230901_hour_9.csv saved.
end of the hour 9
File traffic_20230901_hour_10.csv saved.
end of the hour 10
File traffic_20230901_hour_11.csv saved.
end of the hour 11
File traffic_20230901_hour_12.csv saved.
end of the hour 12
File traffic_20230901_hour_13.csv saved.
end of the hour 13
File traffic_20230901_hour_14.csv saved.
end of the hour 14
File traffic_20230901_hour_15.csv saved.
end of the hour 15
File traffic_20230901_hour_16.csv saved.
end of the hour 16


KeyboardInterrupt: 