In [1]:
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor,as_completed
# Directory containing the input CSV files for this notebook.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
folder = Path(r"/home/rishi/ML Projects/Air Pollution/Datasets/cpcb_data")

def file_sort(string):
    """Build the sort key for a data file name.

    The name is split on underscores. The first field is kept as the year
    (left as a string) and the third field is parsed as the integer site
    number.

    Args:
        string: File name made of underscore-separated fields.

    Returns:
        tuple: ``(site_no, year)``, so names order by site number first and
        by year second.

    Raises:
        IndexError: If the name has fewer than three underscore-separated fields.
        ValueError: If the third field is not an integer.
    """
    fields = string.split('_')
    return int(fields[2]), fields[0]


def process_site(site, files, folder, output_dir):
    """Merge all files of one site into a single hourly CSV.

    Args:
        site: Name of the output file; it is joined onto ``output_dir``.
        files: Names of the site's input CSV files located in ``folder``.
        folder: Directory that holds the input files.
        output_dir: Destination directory; must support the ``/`` operator
            (e.g. a ``pathlib.Path``).

    Returns:
        The ``site`` value that was passed in.
    """
    # Read the inputs in (site number, year) order as defined by file_sort.
    frames = [
        pd.read_csv(Path(folder) / name)
        for name in sorted(files, key=file_sort)
    ]

    combined = pd.concat(frames, ignore_index=True)
    # Strict parsing: a value that cannot be converted raises instead of
    # silently becoming NaT.
    combined['Timestamp'] = pd.to_datetime(combined['Timestamp'], errors='raise')

    hourly = (
        combined
        .dropna(subset=['Timestamp'])  # rows without a timestamp cannot be binned
        .set_index('Timestamp')        # resample requires a datetime index
        .sort_index()                  # chronological order
        .resample('1h')
        .max()                         # per-hour maximum of every column
        .reset_index()                 # Timestamp back to a column for the CSV
    )

    hourly.to_csv(output_dir / site, index=False)
    return site

def combine_on_years(folder, output_dir=None, max_workers=24):
    """Group the files in ``folder`` by site and combine each site in parallel.

    Every regular file directly inside ``folder`` is assigned to a site. The
    site key is the part of the file name after the leading ``<year>_``
    field, so all yearly files of one site share a key. Each site is then
    handed to ``process_site`` in a worker process, which writes one combined
    CSV per site into ``output_dir``.

    Args:
        folder: Directory containing the input files. File names must be
            accepted by ``file_sort``.
        output_dir: Directory for the combined CSVs. Defaults to the
            original hard-coded location when omitted.
        max_workers: Maximum number of worker processes.

    Returns:
        dict: Site key -> list of that site's file names. Keys are in sorted
        order and each list is ordered by ``file_sort``.

    Notes:
        A failure while processing one site is printed and does not stop the
        remaining sites; failed sites still appear in the returned dict.
    """
    f_dir = Path(folder)
    if output_dir is None:
        # Original hard-coded destination, kept as the default so existing
        # callers behave exactly as before.
        output_dir = Path("/home/rishi/ML Projects/Air Pollution/sites_comb_max")
    output_dir = Path(output_dir)

    files = sorted(
        (f.name for f in f_dir.iterdir() if f.is_file()),
        key=file_sort,
    )

    # Single pass, exact-key grouping. The key is taken after the first
    # underscore (the same delimiter file_sort relies on) rather than by a
    # fixed-width slice plus substring matching, which was quadratic and
    # could attach a file to a site whose name merely occurs inside it.
    grouped = {}
    for name in files:
        site_key = name.partition('_')[2]
        grouped.setdefault(site_key, []).append(name)

    # Rebuild with keys in sorted order to keep the original dict ordering.
    file_dict = {site: grouped[site] for site in sorted(grouped)}

    output_dir.mkdir(parents=True, exist_ok=True)

    # One task per site, run in separate processes.
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(process_site, site, file_list, folder, output_dir): site
            for site, file_list in file_dict.items()
        }

        # Progress bar advances as each site finishes, in completion order.
        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing sites"):
            site = futures[future]
            try:
                future.result()
            except Exception as e:
                # Best effort: report the failing site and keep going.
                print(f"Error processing {site}: {e}")

    return file_dict

In [2]:
# Combine every site's yearly files. Reuses the `folder` path defined in the
# first cell instead of repeating the literal, so the input location is set
# in one place. `d` maps each site key to the list of its input file names.
d = combine_on_years(folder)

Processing sites: 100%|██████████| 564/564 [02:14<00:00,  4.20it/s]
