#### Extract and merge the M20 traffic records from the MIDAS dataset

In [14]:
import io
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import os
import multiprocess
from tqdm import tqdm

In [15]:
# Function to process each file and filter data
def process_file(file_path):
    """Load one CSV file and keep only the rows whose site ID mentions "M20".

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file. The file must contain a "site_ID" column.

    Returns
    -------
    pandas.DataFrame
        The rows whose "site_ID" contains the substring "M20". If the file
        does not exist, a notice is printed and an empty DataFrame is
        returned so that one missing file does not abort a batch run.
    """
    try:
        data = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"File {file_path} is missing.")
        return pd.DataFrame()  # Return empty DataFrame in case of error

    # na=False: rows with a missing site_ID count as "no match"; without it
    # the mask holds NaN and boolean indexing raises ValueError.
    # regex=False: "M20" is a plain substring, so no regex engine is needed.
    is_m20 = data["site_ID"].str.contains("M20", na=False, regex=False)
    return data[is_m20]

# Function to generate the file paths
def generate_file_paths(
    data_dir='/Users/wjiang/OneDrive - The Alan Turing Institute/Full_History',
    num_files=700,
):
    """Build the list of CSV paths that make up the site-level export.

    Each file name is the fixed prefix
    "2_01_combined_15min_site_level_kent000000000" followed by a zero-padded
    three-digit index and ".csv".

    Parameters
    ----------
    data_dir : str, optional
        Directory holding the CSV files. The default is the original
        author's local folder; pass your own location to run elsewhere.
    num_files : int, optional
        Number of consecutive indices (starting at 0) to generate.
        Defaults to 700.

    Returns
    -------
    list of str
        One path per index, in ascending index order. The files are not
        checked for existence here.
    """
    return [
        f'{data_dir}/2_01_combined_15min_site_level_kent000000000{i:03}.csv'
        for i in range(num_files)
    ]

# Function to divide the file list into chunks
def chunk_file_list(file_list, num_chunks):
    """Split ``file_list`` into at most ``num_chunks`` contiguous chunks.

    Chunk sizes differ by at most one element, with the larger chunks first.
    When the length is an exact multiple of ``num_chunks`` every chunk has
    the same size. Empty chunks are never produced, so fewer than
    ``num_chunks`` chunks are returned when there are fewer items than
    chunks.

    Parameters
    ----------
    file_list : list
        Items to distribute; order is preserved.
    num_chunks : int
        Maximum number of chunks to produce. Must be at least 1.

    Returns
    -------
    list of list
        The chunks, whose concatenation equals ``file_list``.

    Raises
    ------
    ValueError
        If ``num_chunks`` is smaller than 1.
    """
    if num_chunks < 1:
        raise ValueError("num_chunks must be at least 1")

    # base: minimum size of every chunk; extra: how many chunks get one more.
    base, extra = divmod(len(file_list), num_chunks)

    chunks = []
    start = 0
    for idx in range(num_chunks):
        size = base + (1 if idx < extra else 0)
        if size == 0:
            # Fewer items than chunks: all remaining chunks would be empty.
            break
        chunks.append(file_list[start:start + size])
        start += size
    return chunks

# Main function that orchestrates the multiprocessing
def main():
    """Filter every source CSV for M20 rows in parallel and merge the results.

    Steps:
      1. Build the list of input paths with ``generate_file_paths``.
      2. Run ``process_file`` on each path in a pool of worker processes,
         showing a tqdm progress bar.
      3. Concatenate the per-file DataFrames and write them to
         'combined_M20.csv' in the current working directory.

    Returns
    -------
    None
        The merged data is written to disk, not returned.
    """
    # Generate file paths
    file_list = generate_file_paths()

    # Define number of worker processes
    num_workers = 8  # You can adjust based on your machine's CPU cores

    # Single source of truth for the output name, so the file written and the
    # confirmation message cannot disagree.
    output_path = 'combined_M20.csv'

    # Create a Pool of workers; each task is one file, handed out by imap.
    with multiprocess.Pool(processes=num_workers) as pool:
        # list(...) drains the lazy imap iterator inside the `with` block,
        # while the pool is still alive; tqdm wraps it to report progress.
        result = list(
            tqdm(
                pool.imap(process_file, file_list),
                total=len(file_list),
                desc="Processing files"
            )
        )

    # Concatenate all filtered data into a single DataFrame
    df_M20 = pd.concat(result, ignore_index=True)

    # Save the merged DataFrame to a CSV file
    df_M20.to_csv(output_path, index=False)

    print(f"M20 data merged and saved to '{output_path}'.")

In [16]:
# Run the merge only when this code executes as the top-level program
# (__name__ is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    main()

Processing files:  33%|███████▍               | 228/700 [00:09<00:19, 24.12it/s]

File /Users/wjiang/OneDrive - The Alan Turing Institute/Full_History/2_01_combined_15min_site_level_kent000000000235.csv is missing.
File /Users/wjiang/OneDrive - The Alan Turing Institute/Full_History/2_01_combined_15min_site_level_kent000000000238.csv is missing.


Processing files:  52%|███████████▉           | 362/700 [00:14<00:13, 25.41it/s]

File /Users/wjiang/OneDrive - The Alan Turing Institute/Full_History/2_01_combined_15min_site_level_kent000000000370.csv is missing.


Processing files:  69%|███████████████▊       | 481/700 [00:18<00:07, 28.08it/s]

File /Users/wjiang/OneDrive - The Alan Turing Institute/Full_History/2_01_combined_15min_site_level_kent000000000488.csv is missing.


Processing files:  69%|███████████████▉       | 484/700 [00:18<00:07, 27.37it/s]

File /Users/wjiang/OneDrive - The Alan Turing Institute/Full_History/2_01_combined_15min_site_level_kent000000000494.csv is missing.
File /Users/wjiang/OneDrive - The Alan Turing Institute/Full_History/2_01_combined_15min_site_level_kent000000000495.csv is missing.


Processing files: 100%|███████████████████████| 700/700 [00:27<00:00, 25.61it/s]


M20 data merged and saved to 'merged_M20.csv'.
