# Net migration
Data source: https://zenodo.org/records/7997134   
Paper: https://www.nature.com/articles/s41562-023-01689-4#data-availability


In [1]:
import os
import gc
import fiona
import rasterio
import numpy as np
import pandas as pd
from tqdm import tqdm
import geopandas as gpd
from pathlib import Path
from osgeo import gdal, osr
from rasterstats import zonal_stats
from rasterio.warp import calculate_default_transform, reproject, Resampling

# Project and data roots. The defaults are the original author's local paths;
# set the MAPINEQ_BASE_DIR / MAPINEQ_DATA_DIR environment variables to run the
# notebook on another machine without editing this cell.
BASE_DIR = Path(os.environ.get('MAPINEQ_BASE_DIR', '/Users/wenlanzhang/PycharmProjects/Mapineq/src/data-wrangling'))
DATA_DIR = Path(os.environ.get('MAPINEQ_DATA_DIR', '/Users/wenlanzhang/Downloads/PhD_UCL/Data/Oxford'))

# Reproject to EPSG:3035 (ETRS89-extended / LAEA Europe)

In [2]:
# One-off preprocessing step, kept commented out for reference.
# `reproject_raster` reprojects every band of `src_path` to `dst_crs` and writes
# the result to `dst_path`; the usage example at the bottom writes the
# `raster_netMgr_2000_2019_annual_3035.tif` file. Uncomment to regenerate that file.
# NOTE(review): nearest-neighbour resampling onto a different grid may not
# preserve per-cell totals — confirm this is acceptable if zonal sums are
# computed from the reprojected raster.
# def reproject_raster(src_path, dst_path, dst_crs='EPSG:3035'):
#     with rasterio.open(src_path) as src:
#         transform, width, height = calculate_default_transform(
#             src.crs, dst_crs, src.width, src.height, *src.bounds)
#         kwargs = src.meta.copy()
#         kwargs.update({
#             'crs': dst_crs,
#             'transform': transform,
#             'width': width,
#             'height': height
#         })

#         with rasterio.open(dst_path, 'w', **kwargs) as dst:
#             for i in range(1, src.count + 1):
#                 reproject(
#                     source=rasterio.band(src, i),
#                     destination=rasterio.band(dst, i),
#                     src_transform=src.transform,
#                     src_crs=src.crs,
#                     dst_transform=transform,
#                     dst_crs=dst_crs,
#                     resampling=Resampling.nearest)  # You can choose other resampling methods

# # Usage
# src_raster = DATA_DIR / "Migration/NetMigration/raster_netMgr_2000_2019_annual.tif"
# dst_raster = DATA_DIR / "Migration/NetMigration/raster_netMgr_2000_2019_annual_3035.tif"
# reproject_raster(src_raster, dst_raster)
# print("Reprojection complete.")

# Processing

In [3]:
# Compute zonal statistics of the annual net-migration raster for every NUTS
# region, for each NUTS boundary vintage and each raster band (year), and
# collect them into one long-format table `df_melt` with columns
# geo / obsTime / metric / obsValue / geo_source.

raster_fp = DATA_DIR / "Migration/NetMigration/raster_netMgr_2000_2019_annual_3035.tif"
years = list(range(2000, 2020))  # Raster bands correspond to these years (band 1 -> years[0])

# List of vector files and their corresponding geo_source labels
nuts_years = [2003, 2006, 2010, 2013, 2016, 2021, 2024]
# nuts_years = [2003, 2006]
vector_files = [DATA_DIR / f"NUTS/NUTS_RG_01M_{year}_3035.geojson" for year in nuts_years]

# Loop-invariant settings, hoisted out of the nested loops
zonal_metrics = ['mean', 'sum', 'max', 'min', 'median', 'std']
id_cols = ['NUTS_ID']

# Fail fast: the full run is long, so surface a missing input before any
# zonal statistics are computed rather than part-way through the loop.
missing_inputs = [str(fp) for fp in [raster_fp, *vector_files] if not fp.exists()]
if missing_inputs:
    raise FileNotFoundError("Missing input file(s): " + ", ".join(missing_inputs))

# The raster is opened here only to validate its band count; zonal_stats
# opens the file itself from `raster_fp` on each call.
with rasterio.open(raster_fp) as src:
    if src.count < len(years):
        raise ValueError(
            f"{raster_fp.name} has {src.count} band(s) but {len(years)} years are configured"
        )

results_list = []

# Outer tqdm for vector files
for vector_fp, nuts_year in tqdm(zip(vector_files, nuts_years), total=len(nuts_years), desc="Processing NUTS vectors"):
    gdf = gpd.read_file(vector_fp)
    gdf_3035 = gdf.to_crs(epsg=3035)

    # Positional index so the IDs line up row-for-row with the zonal_stats output
    gdf_ids = gdf_3035[id_cols].reset_index(drop=True)

    # Inner tqdm for bands/years; raster bands are 1-based
    for band_index, year in enumerate(tqdm(years, desc=f"Processing years for NUTS{nuts_year}", leave=False), start=1):
        stats = zonal_stats(
            gdf_3035.geometry,
            raster_fp,
            stats=zonal_metrics,
            band=band_index,
            geojson_out=False
        )

        # One row per region, one column per statistic
        year_df = pd.DataFrame(stats)
        year_df['obsTime'] = year
        year_df[id_cols] = gdf_ids

        # Wide -> long: one row per (region, year, metric)
        df_long = year_df.melt(
            id_vars=id_cols + ['obsTime'],
            var_name='metric',
            value_name='obsValue'
        )

        # Add the geo_source column
        df_long['geo_source'] = f"NUTS{nuts_year}"

        results_list.append(df_long)

# Combine all results
df_melt = pd.concat(results_list, ignore_index=True)

# Rename and reorder columns
df_melt = df_melt.rename(columns={'NUTS_ID': 'geo'})
df_melt = df_melt[['geo', 'obsTime', 'metric', 'obsValue', 'geo_source']]

df_melt

Processing NUTS vectors:   0%|                            | 0/7 [00:00<?, ?it/s]
Processing years for NUTS2003:   0%|                     | 0/20 [00:00<?, ?it/s][A
Processing years for NUTS2003:   5%|▋            | 1/20 [00:14<04:32, 14.36s/it][A
Processing years for NUTS2003:  10%|█▎           | 2/20 [00:28<04:21, 14.51s/it][A
Processing years for NUTS2003:  15%|█▉           | 3/20 [00:43<04:09, 14.68s/it][A
Processing years for NUTS2003:  20%|██▌          | 4/20 [00:58<03:53, 14.58s/it][A
Processing years for NUTS2003:  25%|███▎         | 5/20 [01:12<03:36, 14.43s/it][A
Processing years for NUTS2003:  30%|███▉         | 6/20 [01:27<03:23, 14.55s/it][A
Processing years for NUTS2003:  35%|████▌        | 7/20 [01:42<03:11, 14.72s/it][A
Processing years for NUTS2003:  40%|█████▏       | 8/20 [01:56<02:56, 14.68s/it][A
Processing years for NUTS2003:  45%|█████▊       | 9/20 [02:11<02:40, 14.58s/it][A
Processing years for NUTS2003:  50%|██████      | 10/20 [02:25<02:24, 14.46s/it

Unnamed: 0,geo,obsTime,metric,obsValue,geo_source
0,NL11,2000,min,-1211.751465,NUTS2003
1,IE01,2000,min,-231.577240,NUTS2003
2,NL12,2000,min,-1217.874512,NUTS2003
3,DE93,2000,min,-3657.027832,NUTS2003
4,NL13,2000,min,-2188.597900,NUTS2003
...,...,...,...,...,...
1622035,RO,2019,median,-12.950260,NUTS2024
1622036,NO,2019,median,-0.740463,NUTS2024
1622037,PL,2019,median,-2.050199,NUTS2024
1622038,PT,2019,median,-4.778418,NUTS2024


In [4]:
# Persist the long-format zonal statistics; the row index is written as an `id` column.
output_fp = DATA_DIR / "Migration/Output/Net_migration_20year.csv"
# to_csv does not create missing directories, so make sure the target folder
# exists instead of failing after the long computation above.
output_fp.parent.mkdir(parents=True, exist_ok=True)
df_melt.to_csv(output_fp, index=True, index_label="id")