# Q4 MLA && PCA - Traffic Density -> Traffic Duration / Traffic Clustering - GET DATA HOURLY

## MULTITHREADING CODE

In [3]:
import os
import threading
import xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timedelta

import pandas as pd
import requests

# Date window to download; the download loop treats both ends as inclusive
start_date, end_date = (
    datetime.strptime(day, '%Y%m%d') for day in ('20230901', '20230902')
)

# Historical-archive endpoint for the raw speed/volume XML feed.
# A 'YYYYMMDD-HHMM' timestamp is appended to this prefix for each request.
base_url = (
    'https://api.data.gov.hk/v1/historical-archive/get-file'
    '?url=https%3A%2F%2Fresource.data.one.gov.hk%2Ftd%2Ftraffic-detectors%2FrawSpeedVol-all.xml'
    '&time='
)

# Serialises merges into the shared ``totals`` dict. fetch_data runs on
# several worker threads at once and ``+=`` on a shared list slot is a
# read-modify-write that can lose updates without a lock.
_totals_lock = threading.Lock()

# Seconds to wait on the archive API before giving up on one request, so a
# stalled connection cannot block a worker thread forever.
_REQUEST_TIMEOUT = 30


# Function to fetch data for a given minute
def fetch_data(minute, current_date, hour, totals):
    """Download one minute of detector readings and fold them into ``totals``.

    Args:
        minute: Minute of the hour (formatted as two digits in the request).
        current_date: ``datetime`` whose date part selects the day.
        hour: Hour of the day (formatted as two digits in the request).
        totals: Shared dict mapping ``(detector_id, lane_id)`` to
            ``[speed_sum, sample_count, occupancy_sum, volume_sum]``.
            It is updated in place, under a lock, once per call.

    Returns:
        None. Network, HTTP and XML errors are reported with ``print`` and
        leave ``totals`` untouched; malformed lanes are reported and skipped.
    """
    # Generate the timestamp for the current minute
    timestamp = current_date.strftime('%Y%m%d') + '-' + f'{hour:02d}{minute:02d}'

    # Fetch the data for the current timestamp
    try:
        response = requests.get(base_url + timestamp, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f"Failed to fetch data for {timestamp}: {exc}")
        return

    # Check the status of the response
    if response.status_code != 200:
        print(f"Failed to fetch data for {timestamp}, status code: {response.status_code}")
        return

    try:
        root = ET.fromstring(response.content)
    except ET.ParseError as exc:
        print(f"Failed to parse data for {timestamp}: {exc}")
        return

    # Accumulate this minute's readings privately first, so the shared dict
    # is only touched once, inside the lock, at the end.
    minute_totals = {}
    for period in root.findall('.//periods/period'):
        for detector in period.findall('.//detectors/detector'):
            detector_id = detector.findtext('detector_id')
            if not detector_id:
                print("detector_id element not found")
                continue

            for lane in detector.findall('.//lanes/lane'):
                lane_id_element = lane.find('lane_id')
                if lane_id_element is None:
                    print("lane_id element not found")
                    continue
                lane_id = lane_id_element.text

                # int(None) raises TypeError when the element is missing.
                try:
                    occupancy = int(lane.findtext('occupancy'))
                    volume = int(lane.findtext('volume'))
                except (TypeError, ValueError):
                    print(f"Invalid occupancy/volume for {detector_id} {lane_id}")
                    continue

                # Lanes without a 'speed' element are skipped silently
                speed_element = lane.find('speed')
                if speed_element is None:
                    continue
                try:
                    speed = float(speed_element.text)
                except (TypeError, ValueError):
                    # TypeError covers an empty <speed/> whose text is None
                    print(f"Invalid speed: {speed_element.text}")
                    continue

                entry = minute_totals.setdefault((detector_id, lane_id), [0.0, 0, 0, 0])
                entry[0] += speed
                entry[1] += 1
                entry[2] += occupancy
                entry[3] += volume

    # Merge this minute's sums into the shared per-hour totals atomically
    with _totals_lock:
        for key, (speed_sum, count, occupancy_sum, volume_sum) in minute_totals.items():
            shared = totals.get(key)
            if shared is None:
                totals[key] = [speed_sum, count, occupancy_sum, volume_sum]
            else:
                shared[0] += speed_sum
                shared[1] += count
                shared[2] += occupancy_sum
                shared[3] += volume_sum

# Loop over the date range (start_date and end_date are both included)
current_date = start_date
while current_date <= end_date:
    # Loop over each hour of the current day
    for hour in range(0, 24):
        # Name of the CSV file that holds this hour's aggregated readings
        filename = f"traffic_{current_date.strftime('%Y%m%d')}_hour_{hour}.csv"

        print('Getting', filename)

        # An existing file is treated as a completed download for that hour
        if os.path.isfile(filename):
            print(f"File {filename} already exists. Skipping this hour.")
            continue

        # Initialize a dictionary to store the total speed and count for each detector and lane
        totals = {}

        # Fetch the 60 minutes of the hour concurrently. Leaving the
        # with-block waits for every submitted task to finish.
        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = {
                minute: executor.submit(fetch_data, minute, current_date, hour, totals)
                for minute in range(0, 60)
            }

        # An exception raised inside a worker is stored on its future and
        # would otherwise vanish silently; report each one.
        for minute, future in futures.items():
            error = future.exception()
            if error is not None:
                print(f"Fetching minute {minute:02d} of hour {hour} failed: {error!r}")

        # Do not write an empty file: it would make the "already exists"
        # check above skip this hour on every later run.
        if not totals:
            print(f"No data collected for {filename}. File not saved.")
            continue

        # Calculate the average speed for each detector and lane id and add it to the list
        data = [
            {
                'detectorID': detector_id,
                'laneType': lane_id,
                'speed': total_speed / count,
                'totalOccupancy': total_occupancy,
                'totalVolume': total_volume,
            }
            for (detector_id, lane_id), (total_speed, count, total_occupancy, total_volume) in totals.items()
        ]

        # Convert the list into a DataFrame
        df = pd.DataFrame(data)

        # Save the DataFrame to a CSV file
        df.to_csv(filename, index=False)
        print(f"File {filename} saved.")

        print(f"end of the hour {hour}")
    # Move to the next day
    current_date += timedelta(days=1)

print("finished fetch")


Getting traffic_20230901_hour_0.csv
File traffic_20230901_hour_0.csv already exists. Skipping this hour.
Getting traffic_20230901_hour_1.csv
File traffic_20230901_hour_1.csv already exists. Skipping this hour.
Getting traffic_20230901_hour_2.csv
File traffic_20230901_hour_2.csv already exists. Skipping this hour.
Getting traffic_20230901_hour_3.csv
File traffic_20230901_hour_3.csv already exists. Skipping this hour.
Getting traffic_20230901_hour_4.csv
File traffic_20230901_hour_4.csv already exists. Skipping this hour.
Getting traffic_20230901_hour_5.csv
File traffic_20230901_hour_5.csv already exists. Skipping this hour.
Getting traffic_20230901_hour_6.csv
File traffic_20230901_hour_6.csv already exists. Skipping this hour.
Getting traffic_20230901_hour_7.csv
File traffic_20230901_hour_7.csv already exists. Skipping this hour.
Getting traffic_20230901_hour_8.csv
File traffic_20230901_hour_8.csv already exists. Skipping this hour.
Getting traffic_20230901_hour_9.csv
File traffic_202309

# Append geographic info to the dataset

In [15]:
import pandas as pd
import os

# Define the path to the location data and load it into a DataFrame
location_data_path = "./traffic_speed_volume_occ_info.csv"
location_data = pd.read_csv(location_data_path)

# Only the join key and the coordinates are used below. Dropping the other
# columns keeps the merge from renaming columns that clash with the traffic data.
detector_locations = location_data[["AID_ID_Number", "Latitude", "Longitude"]]

# Define the path to the data folder
data_folder_path = "."

# Define the path to the output directory and create it if it does not exist
output_directory_path = "./geoMod"
os.makedirs(output_directory_path, exist_ok=True)

# Columns written to the geo-enriched output files
output_columns = ["detectorID", "laneType", "speed", "totalOccupancy", "totalVolume", "Latitude", "Longitude"]

# Iterate over 2023-09-01 .. 2023-09-04 using real calendar dates, so the
# range stays valid if it is later extended across a month boundary.
for day in pd.date_range("2023-09-01", "2023-09-04"):
    date = day.strftime("%Y%m%d")
    # Iterate over all the hours in a day
    for hour in range(24):
        # Hourly files may be named with a zero-padded hour ("hour_07") or an
        # unpadded one ("hour_7"); accept either, preferring the padded name.
        padded_name = f"traffic_{date}_hour_{str(hour).zfill(2)}.csv"
        unpadded_name = f"traffic_{date}_hour_{hour}.csv"
        file_path = None
        for candidate in (padded_name, unpadded_name):
            candidate_path = os.path.join(data_folder_path, candidate)
            if os.path.exists(candidate_path):
                file_path = candidate_path
                break

        # Nothing was downloaded for this hour
        if file_path is None:
            continue

        # Load the current file into a DataFrame; a file with no content
        # cannot be parsed and is skipped.
        try:
            data = pd.read_csv(file_path)
        except pd.errors.EmptyDataError:
            print(f"Skipping empty file {file_path}")
            continue

        # Merge the data with the location data
        merged_data = pd.merge(data, detector_locations, how="left", left_on="detectorID", right_on="AID_ID_Number")

        # Keep only the necessary columns
        merged_data = merged_data[output_columns]

        # Output files always use the zero-padded hour
        output_file_path = os.path.join(output_directory_path, padded_name)

        print(merged_data)

        # Save the merged data to the output file
        merged_data.to_csv(output_file_path, index=False)

       detectorID     laneType      speed  totalOccupancy  totalVolume  \
0        AID01101    Fast Lane  69.491667             290          106   
1        AID01101  Middle Lane  65.441667             460          175   
2        AID01101    Slow Lane  59.733333             313           78   
3        AID01102    Fast Lane  73.291667             217           93   
4        AID01102  Middle Lane  71.033333             583          217   
...           ...          ...        ...             ...          ...   
1836  TDSYSR20001    Slow Lane  62.441667             225           94   
1837  TDSYSR20002    Fast Lane  75.025000             251          154   
1838  TDSYSR20002    Slow Lane  70.625000             267          106   
1839  TDSYSR20003    Fast Lane  65.358333             181          127   
1840  TDSYSR20003    Slow Lane  65.825000             115           66   

       Latitude   Longitude  
0     22.248091  114.152525  
1     22.248091  114.152525  
2     22.248091  114.