# Q4 MLA && PCA - Traffic Density -> Traffic Duration / Traffic Clustering - GET DATA HOURLY

## MULTITHREADING CODE

In [1]:
import os
import threading
import xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timedelta

import pandas as pd
import requests

# Inclusive date range to download; both bounds are parsed from YYYYMMDD text.
start_date, end_date = (
    datetime.strptime(day, '%Y%m%d') for day in ('20230903', '20230904')
)

# Historical-archive endpoint; a 'YYYYMMDD-HHMM' timestamp is appended to it
# to select the snapshot for one minute.
base_url = (
    'https://api.data.gov.hk/v1/historical-archive/get-file?url=https%3A%2F%2Fresource.data.one.gov.hk%2Ftd%2Ftraffic-detectors%2FrawSpeedVol-all.xml&time='
)

# Upper bound (seconds) on a single HTTP request, so a stalled connection
# cannot block a worker thread indefinitely.
_REQUEST_TIMEOUT_SECONDS = 30

# Serializes updates to the shared ``totals`` dict: fetch_data runs on several
# ThreadPoolExecutor worker threads at once, and the check-then-insert and
# ``+=`` updates it performs are not atomic.
_totals_lock = threading.Lock()


def _lane_int(lane, tag):
    """Return child *tag* of *lane* as an int, or None if missing or invalid."""
    try:
        return int(lane.findtext(tag))
    except (TypeError, ValueError):
        return None


def _parse_lane_readings(content):
    """Parse one XML payload into a list of per-lane readings.

    Walks ``periods/period`` -> ``detectors/detector`` -> ``lanes/lane`` and
    returns ``((detector_id, lane_id), speed, occupancy, volume)`` tuples.
    Lanes lacking a ``lane_id`` or ``speed`` element, or whose speed,
    occupancy or volume cannot be parsed, are skipped so that one bad lane
    does not discard the rest of the payload.

    Raises:
        xml.etree.ElementTree.ParseError: if *content* is not well-formed XML.
    """
    root = ET.fromstring(content)
    readings = []

    for period in root.findall('.//periods/period'):
        for detector in period.findall('.//detectors/detector'):
            detector_id_element = detector.find('detector_id')
            if detector_id_element is None:
                print("detector_id element not found")
                continue
            detector_id = detector_id_element.text

            for lane in detector.findall('.//lanes/lane'):
                lane_id_element = lane.find('lane_id')
                if lane_id_element is None:
                    print("lane_id element not found")
                    continue
                lane_id = lane_id_element.text

                # A lane without a speed element contributes nothing.
                speed_element = lane.find('speed')
                if speed_element is None:
                    continue

                try:
                    speed = float(speed_element.text)
                except (TypeError, ValueError):
                    # TypeError covers an empty <speed/> whose text is None.
                    print(f"Invalid speed: {speed_element.text}")
                    continue

                occupancy = _lane_int(lane, 'occupancy')
                volume = _lane_int(lane, 'volume')
                if occupancy is None or volume is None:
                    print(
                        "Invalid occupancy/volume for detector "
                        f"{detector_id} lane {lane_id}"
                    )
                    continue

                readings.append(((detector_id, lane_id), speed, occupancy, volume))

    return readings


# Function to fetch data for a given minute
def fetch_data(minute, current_date, hour, totals):
    """Fetch one minute of detector data and accumulate it into *totals*.

    Args:
        minute: Minute of the hour (formatted as two digits in the request).
        current_date: ``datetime`` whose date part selects the day.
        hour: Hour of the day (formatted as two digits in the request).
        totals: Shared dict mapping ``(detector_id, lane_id)`` to
            ``[speed_sum, sample_count, occupancy_sum, volume_sum]``.
            Updated in place; safe to share between worker threads.

    Returns:
        None. Request, HTTP-status and XML-parse failures are reported with
        ``print`` and leave *totals* untouched.
    """
    # Generate the timestamp for the current minute
    timestamp = current_date.strftime('%Y%m%d') + '-' + f'{hour:02d}{minute:02d}'

    # Fetch the data for the current timestamp
    try:
        response = requests.get(
            base_url + timestamp, timeout=_REQUEST_TIMEOUT_SECONDS
        )
    except requests.RequestException as exc:
        print(f"Failed to fetch data for {timestamp}: {exc}")
        return

    # Check the status of the response
    if response.status_code != 200:
        print(f"Failed to fetch data for {timestamp}, status code: {response.status_code}")
        return

    try:
        readings = _parse_lane_readings(response.content)
    except ET.ParseError as exc:
        print(f"Failed to parse data for {timestamp}: {exc}")
        return

    # Parsing happened without the lock; only the short merge is serialized.
    with _totals_lock:
        for key, speed, occupancy, volume in readings:
            entry = totals.get(key)
            if entry is None:
                totals[key] = [speed, 1, occupancy, volume]
            else:
                entry[0] += speed
                entry[1] += 1
                entry[2] += occupancy
                entry[3] += volume

# Driver: for every day in [start_date, end_date] and every hour of that day,
# fetch the 60 per-minute snapshots concurrently, aggregate them per
# (detector, lane) and write one CSV per hour. Hours whose CSV already exists
# are skipped, so an interrupted run can be resumed.
current_date = start_date
while current_date <= end_date:
    day_stamp = current_date.strftime('%Y%m%d')

    # Loop over each hour of the current day
    for hour in range(24):
        # Output file for this hour
        filename = f"traffic_{day_stamp}_hour_{hour}.csv"

        print('Getting', filename)

        if os.path.isfile(filename):
            print(f"File {filename} already exists. Skipping this hour.")
            continue

        # Maps (detector_id, lane_id) -> [speed_sum, count, occupancy_sum, volume_sum]
        totals = {}

        # Fetch all 60 minutes of the hour on a pool of worker threads; leaving
        # the ``with`` block waits for every submitted task to finish.
        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = {
                minute: executor.submit(fetch_data, minute, current_date, hour, totals)
                for minute in range(60)
            }

        # Exceptions raised inside a worker are stored on its future and would
        # otherwise vanish silently; report them so missing minutes are visible.
        for minute, future in futures.items():
            error = future.exception()
            if error is not None:
                print(f"Fetch for {day_stamp}-{hour:02d}{minute:02d} raised {error!r}")

        if not totals:
            # Writing an empty CSV would make the "already exists" check above
            # skip this hour forever; leave no file so a later run retries it.
            print(f"No data collected for {filename}. File not saved.")
        else:
            # Calculate the average speed for each detector and lane id
            data = [
                {
                    'detectorID': detector_id,
                    'laneType': lane_id,
                    'speed': total_speed / count,
                    'totalOccupancy': total_occupancy,
                    'totalVolume': total_volume,
                }
                for (detector_id, lane_id), (total_speed, count, total_occupancy, total_volume)
                in totals.items()
            ]

            # Convert the list into a DataFrame and save it to a CSV file
            df = pd.DataFrame(data)
            df.to_csv(filename, index=False)
            print(f"File {filename} saved.")

        print(f"end of the hour {hour}")

    # Move to the next day
    current_date += timedelta(days=1)

print("finished fetch")


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Getting traffic_20230903_hour_0.csv
File traffic_20230903_hour_0.csv saved.
end of the hour 0
Getting traffic_20230903_hour_1.csv
File traffic_20230903_hour_1.csv saved.
end of the hour 1
Getting traffic_20230903_hour_2.csv
File traffic_20230903_hour_2.csv saved.
end of the hour 2
Getting traffic_20230903_hour_3.csv
File traffic_20230903_hour_3.csv saved.
end of the hour 3
Getting traffic_20230903_hour_4.csv
File traffic_20230903_hour_4.csv saved.
end of the hour 4
Getting traffic_20230903_hour_5.csv
File traffic_20230903_hour_5.csv saved.
end of the hour 5
Getting traffic_20230903_hour_6.csv
File traffic_20230903_hour_6.csv saved.
end of the hour 6
Getting traffic_20230903_hour_7.csv


KeyboardInterrupt: 