In [7]:
import pandas as pd
import geopandas as gpd
import warnings
from filelock import FileLock
from concurrent.futures import ProcessPoolExecutor
import logging
import os
import psutil

# Suppress specific FutureWarnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Global constants
buffer_radii = [2816, 1890, 965, 482]  # in meters
postcode_data_path = "ukpostcodes.csv"
chunk_size = 700  # Define a chunk size for processing

# Prepare and convert postcode data
postcode_data = pd.read_csv(postcode_data_path)
postcode_data = gpd.GeoDataFrame(
    postcode_data,
    geometry=gpd.points_from_xy(postcode_data.longitude, postcode_data.latitude),
    crs="EPSG:4326"
).to_crs(epsg=27700)

def log_memory_usage():
    process = psutil.Process(os.getpid())
    logging.info(f"Memory usage: {process.memory_info().rss / (1024 ** 2)} MB")


def process_task(city, year, radius, city_boundaries_path):
    try:
        log_memory_usage()
        logging.info(f"Processing {city}, {year}, {radius}")
        output_file = f"multiprocessing_shp{outputfile[city]}.csv"
        lock_file = f"multiprocessing_shp{outputfile[city]}.lock"
        # Load city boundaries
        city_boundaries = gpd.read_file(city_boundaries_path)
        city_postcodes = gpd.sjoin(postcode_data, city_boundaries, how='inner', op='intersects')

        # Load polygons data
        shp_path = f'urban_greenspace_classification/OSM/{city}_{year}_clipped.shp'
        polygons_gdf = gpd.read_file(shp_path).to_crs(postcode_data.crs)

        results = []

        # Process in chunks
        for start in range(0, len(city_postcodes), chunk_size):
            chunk = city_postcodes.iloc[start:start+chunk_size]
            # logging.info(f"Buffering postcodes for {radius} meters, chunk {start // chunk_size}")
            
            buffered_chunk = chunk.copy()
            buffered_chunk['geometry'] = buffered_chunk['geometry'].buffer(radius)
            buffered_chunk['total_area'] = buffered_chunk['geometry'].area

            # logging.info(f"Overlay operation for chunk {start // chunk_size}")
            intersected_polygons = gpd.overlay(buffered_chunk, polygons_gdf, how='intersection')
            grouped = intersected_polygons.groupby('postcode')

            for postcode, group in grouped:
                total_area = group['total_area'].iloc[0]
                for tag in range(5):
                    category_area = group[group['tag'] == tag]['geometry'].area.sum()
                    proportion = category_area / total_area if total_area else 0
                    results.append({
                        'postcode': postcode,
                        'latitude': chunk.loc[chunk['postcode'] == postcode, 'latitude'].iloc[0],
                        'longitude': chunk.loc[chunk['postcode'] == postcode, 'longitude'].iloc[0],
                        'tag': tag,
                        'tag_proportion': proportion,
                        'city': city,
                        'year': year,
                        'buffer_radius': radius
                    })

        result_df = pd.DataFrame(results)

        # Append results to the file using a lock to prevent write conflicts
        lock = FileLock(lock_file)
        with lock:
            with open(output_file, 'a') as f:
                result_df.to_csv(f, header=f.tell()==0, index=False)
        logging.info(f"DONE {city}, {year}, {radius}")

    except Exception as e:
        logging.error(f"Error processing {city}, {year}, {radius}: {e}")

def main():
    city_file_map = {
        'Greater_London': "urban_greenspace_classification/Boundaries/LA_London.shp",
        'Greater_Manchester': "urban_greenspace_classification/Boundaries/LA_Manchester.shp",
        'West_Midlands': "urban_greenspace_classification/Boundaries/LA_Westmindlands.shp"
    }

    tasks = [(city, year, radius) for city in city_file_map for year in range(2018, 2024) for radius in buffer_radii]


    completed_tasks= []
    outputfile = {
        "Greater_London":"_greater_london",
             "Greater_Manchester":"",
             "West_Midlands":"_westmidlands"
    }

    for city, item in outputfile.items():   
        output_file = f"multiprocessing_shp{outputfile[city]}.csv"
        if os.path.exists(output_file):
            csv_df = pd.read_csv(output_file, usecols=['city', 'year', 'buffer_radius'])
            csv_df = set(csv_df[['city', 'year', 'buffer_radius']].itertuples(index=False, name=None))
        else:
            csv_df = set()
        completed_tasks.extend(csv_df)  

    print(completed_tasks)
    tasks = [task for task in tasks if (task[0], task[1], task[2]) not in completed_tasks]
    # tasks.insert(0,('Greater_Manchester', 2021, 2816))
    max_workers = min(4, os.cpu_count())  # Adjusting max_workers further down

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = []
        for task in tasks:
            city, year, radius = task
            city_boundaries_path = city_file_map[city]
            futures.append(executor.submit(process_task, city, year, radius, city_boundaries_path))

        for future in futures:
            try:
                future.result()
            except Exception as e:
                logging.error(f"Error in future: {e}")

if __name__ == "__main__":
    main()

[('Greater_London', 2016, 965), ('Greater_London', 2020, 1890), ('Greater_London', 2018, 965), ('Greater_London', 2023, 482), ('Greater_London', 2022, 965), ('Greater_London', 2017, 1890), ('Greater_London', 2020, 965), ('Greater_London', 2015, 1890), ('Greater_London', 2016, 2816), ('Greater_London', 2018, 2816), ('Greater_London', 2018, 482), ('Greater_London', 2016, 482), ('Greater_London', 2017, 965), ('Greater_London', 2019, 1890), ('Greater_London', 2021, 1890), ('Greater_London', 2022, 482), ('Greater_London', 2015, 965), ('Greater_London', 2020, 2816), ('Greater_London', 2019, 965), ('Greater_London', 2020, 482), ('Greater_London', 2021, 965), ('Greater_London', 2023, 1890), ('Greater_London', 2017, 2816), ('Greater_London', 2017, 482), ('Greater_London', 2015, 2816), ('Greater_London', 2016, 1890), ('Greater_London', 2023, 965), ('Greater_London', 2015, 482), ('Greater_London', 2018, 1890), ('Greater_London', 2019, 482), ('Greater_London', 2019, 2816), ('Greater_London', 2021,

2024-08-01 10:20:14,883 - INFO - Memory usage: 1008.90625 MB
2024-08-01 10:20:14,883 - INFO - Memory usage: 1008.8828125 MB
2024-08-01 10:20:14,885 - INFO - Processing Greater_London, 2023, 2816
2024-08-01 10:20:14,885 - INFO - Processing Greater_London, 2022, 2816
2024-08-02 04:25:43,324 - INFO - DONE Greater_London, 2022, 2816
2024-08-02 06:26:59,244 - INFO - DONE Greater_London, 2023, 2816
