# NUTS_In/Out_flow_map

# 做个全球的？

To generate the time series map .gif

Source: Meta  
Publisher: HDX (https://data.humdata.org/dataset/international-migration-flows)   
New York Times Article: https://www.nytimes.com/interactive/2025/04/17/opinion/global-migration-facebook-data.html   
Special countries: BA, UA, XK, UK/GB, EL/GR 

In [1]:
import os
import gc
import rasterio
import numpy as np
import pandas as pd
from tqdm import tqdm
import geopandas as gpd
from pathlib import Path
from osgeo import gdal, osr
import imageio.v2 as imageio
import dask_geopandas as dgpd
import matplotlib.pyplot as plt
from rasterstats import zonal_stats

BASE_DIR = Path('/Users/wenlanzhang/PycharmProjects/Mapineq/src/data-wrangling/')
DATA_DIR = Path('/Users/wenlanzhang/Downloads/PhD_UCL/Data/Oxford')

# Load NTUS

In [4]:
# Load a GeoJSON file
gdf_2024 = gpd.read_file(DATA_DIR/"NUTS/NUTS_RG_01M_2024_3035.geojson")

gdf_country_2024 = gdf_2024[gdf_2024['LEVL_CODE'] == 0]
gdf_country_2024 = gdf_country_2024[['CNTR_CODE', 'geometry']].reset_index(drop=True)
gdf_country_2024

Unnamed: 0,CNTR_CODE,geometry
0,EL,"MULTIPOLYGON (((6083881.558 1676236.011, 60838..."
1,ES,"MULTIPOLYGON (((3815056.85 1904971.998, 381575..."
2,FI,"MULTIPOLYGON (((4999905.463 5305310.537, 50031..."
3,FR,"MULTIPOLYGON (((9980485.23 -3029930.054, 99812..."
4,HR,"MULTIPOLYGON (((4809521.18 2624665.768, 480963..."
5,EE,"MULTIPOLYGON (((5200614.71 4159725.901, 520083..."
6,DE,"MULTIPOLYGON (((4355225.354 2715902.995, 43548..."
7,DK,"MULTIPOLYGON (((4650283.775 3591676.53, 465057..."
8,BA,"MULTIPOLYGON (((4866544.51 2485914.128, 486624..."
9,AT,"MULTIPOLYGON (((4354847.685 2714710.627, 43552..."


In [5]:
# Load a GeoJSON file
gdf_2021 = gpd.read_file(DATA_DIR/"NUTS/NUTS_RG_01M_2021_3035.geojson")

gdf_country_2021 = gdf_2021[gdf_2021['LEVL_CODE'] == 0]
gdf_country_2021 = gdf_country_2021[['CNTR_CODE', 'geometry']].reset_index(drop=True)
uk_2021 = gdf_country_2021[gdf_country_2021['CNTR_CODE'] == 'UK']
uk_2021

Unnamed: 0,CNTR_CODE,geometry
34,UK,"MULTIPOLYGON (((3546135.14 4022028.934, 354660..."


In [6]:
# Concatenate with 2024 data
gdf_country = pd.concat(
    [gdf_country_2024, uk_2021],
    ignore_index=True  # Reset index
)
gdf_country

Unnamed: 0,CNTR_CODE,geometry
0,EL,"MULTIPOLYGON (((6083881.558 1676236.011, 60838..."
1,ES,"MULTIPOLYGON (((3815056.85 1904971.998, 381575..."
2,FI,"MULTIPOLYGON (((4999905.463 5305310.537, 50031..."
3,FR,"MULTIPOLYGON (((9980485.23 -3029930.054, 99812..."
4,HR,"MULTIPOLYGON (((4809521.18 2624665.768, 480963..."
5,EE,"MULTIPOLYGON (((5200614.71 4159725.901, 520083..."
6,DE,"MULTIPOLYGON (((4355225.354 2715902.995, 43548..."
7,DK,"MULTIPOLYGON (((4650283.775 3591676.53, 465057..."
8,BA,"MULTIPOLYGON (((4866544.51 2485914.128, 486624..."
9,AT,"MULTIPOLYGON (((4354847.685 2714710.627, 43552..."


In [7]:
country_list = gdf_country['CNTR_CODE'].unique()
len(country_list)
country_list

array(['EL', 'ES', 'FI', 'FR', 'HR', 'EE', 'DE', 'DK', 'BA', 'AT', 'BG',
       'CH', 'CY', 'BE', 'CZ', 'AL', 'LU', 'LV', 'ME', 'IE', 'IS', 'IT',
       'MK', 'MT', 'LI', 'NL', 'LT', 'HU', 'RS', 'SE', 'SI', 'TR', 'UA',
       'SK', 'RO', 'NO', 'PL', 'PT', 'XK', 'UK'], dtype=object)

# Load Migration

In [8]:
df = pd.read_csv(DATA_DIR/f"Migration/international_migration_flow.csv") 
df['year'] = pd.to_datetime(df['migration_month']).dt.year
df['month'] = pd.to_datetime(df['migration_month']).dt.month

# Define the mapping of old codes to new codes
country_code_mapping = {
    'GR': 'EL',  # Greece (GR → EL)
    'GB': 'UK'   # United Kingdom (GB → UK)
}

# Apply the replacement to both columns
df['country_from'] = df['country_from'].replace(country_code_mapping)
df['country_to'] = df['country_to'].replace(country_code_mapping)

df
# len(df['country_from'].unique())

Unnamed: 0,country_from,country_to,migration_month,num_migrants,year,month
0,AD,AE,2019-01,12,2019,1
1,AD,AE,2019-02,2,2019,2
2,AD,AE,2019-03,1,2019,3
3,AD,AE,2019-04,7,2019,4
4,AD,AE,2019-05,0,2019,5
...,...,...,...,...,...,...
1563149,ZW,ZM,2022-08,138,2022,8
1563150,ZW,ZM,2022-09,162,2022,9
1563151,ZW,ZM,2022-10,149,2022,10
1563152,ZW,ZM,2022-11,104,2022,11


In [9]:
# len(df['country_from'].unique())
len(df['country_to'].unique())
# df['country_to'].unique()

181

In [10]:
df[df['country_to'].isna()].groupby('country_from').size().unique()
# df[df['country_from'].isna()].groupby('country_to').size().unique()

array([48])

# Country Aggregation

In [11]:
def prepare_flow_data(df_input, flow_type: str, country_scope: str, country_list: list):
    # Determine the column based on flow direction
    flow_col = 'country_to' if flow_type == 'inflow' else 'country_from'

    # Optional: filter for EU countries
    if country_scope == 'eu_countries':
        df_input = df_input[df_input[flow_col].isin(country_list)]

    # Ensure 'month' column exists
    if 'month' not in df_input.columns:
        df_input['month'] = pd.to_datetime(df_input['migration_month']).dt.month

    # Yearly aggregation
    yearly = (
        df_input.groupby([flow_col, 'year'])
        .agg(migration_count_total=('num_migrants', 'sum'))
        .reset_index()
        .assign(month='all')  # Placeholder month
        .rename(columns={flow_col: 'geo', 'year': 'obsTime'})
    )

    # Monthly aggregation
    monthly = (
        df_input.groupby([flow_col, 'year', 'month'])
        .agg(migration_count_total=('num_migrants', 'sum'))
        .reset_index()
        .rename(columns={flow_col: 'geo', 'year': 'obsTime'})
    )

    # Combine both
    combined = pd.concat([yearly, monthly], ignore_index=True)

    # Melt to long format
    melted = combined.melt(
        id_vars=['geo', 'obsTime', 'month'],
        value_vars=['migration_count_total'],
        var_name='time_granularity',
        value_name='obsValue'
    )

    # Clean and tag
    melted['time_granularity'] = melted['time_granularity'].map({
        'migration_count_total': 'Total Migration'
    })
    melted['flow_type'] = flow_type
    melted['country'] = country_scope

    return melted

In [12]:
# Inflow
inflow_all = prepare_flow_data(df, 'inflow', 'all_countries', country_list)
# inflow_eu = prepare_flow_data(df, 'inflow', 'eu_countries', country_list)
# inflow_combined = pd.concat([inflow_all, inflow_eu], ignore_index=True)

# Outflow
outflow_all = prepare_flow_data(df, 'outflow', 'all_countries', country_list)
# outflow_eu = prepare_flow_data(df, 'outflow', 'eu_countries', country_list)
# outflow_combined = pd.concat([outflow_all, outflow_eu], ignore_index=True)

# Combine inflow and outflow into one final dataset
# migration_combined = pd.concat([inflow_combined, outflow_combined], ignore_index=True)
migration_combined = pd.concat([inflow_all, outflow_all], ignore_index=True)
migration_combined

Unnamed: 0,geo,obsTime,month,time_granularity,obsValue,flow_type,country
0,AD,2019,all,Total Migration,8390,inflow,all_countries
1,AD,2020,all,Total Migration,7908,inflow,all_countries
2,AD,2021,all,Total Migration,6923,inflow,all_countries
3,AD,2022,all,Total Migration,9685,inflow,all_countries
4,AE,2019,all,Total Migration,1248145,inflow,all_countries
...,...,...,...,...,...,...,...
18715,ZW,2022,8,Total Migration,4339,outflow,all_countries
18716,ZW,2022,9,Total Migration,4867,outflow,all_countries
18717,ZW,2022,10,Total Migration,6280,outflow,all_countries
18718,ZW,2022,11,Total Migration,5907,outflow,all_countries


In [13]:
migration_combined['geo'].unique()

array(['AD', 'AE', 'AF', 'AL', 'AM', 'AO', 'AR', 'AT', 'AU', 'AZ', 'BA',
       'BB', 'BD', 'BE', 'BF', 'BG', 'BH', 'BI', 'BJ', 'BN', 'BO', 'BR',
       'BS', 'BT', 'BW', 'BY', 'BZ', 'CA', 'CD', 'CF', 'CG', 'CH', 'CI',
       'CL', 'CM', 'CO', 'CR', 'CV', 'CY', 'CZ', 'DE', 'DJ', 'DK', 'DO',
       'DZ', 'EC', 'EE', 'EG', 'EL', 'ER', 'ES', 'ET', 'FI', 'FJ', 'FM',
       'FR', 'GA', 'GD', 'GE', 'GH', 'GM', 'GN', 'GQ', 'GT', 'GW', 'GY',
       'HK', 'HN', 'HR', 'HT', 'HU', 'ID', 'IE', 'IL', 'IN', 'IQ', 'IS',
       'IT', 'JM', 'JO', 'JP', 'KE', 'KG', 'KH', 'KI', 'KM', 'KR', 'KW',
       'KZ', 'LA', 'LB', 'LC', 'LK', 'LR', 'LS', 'LT', 'LU', 'LV', 'LY',
       'MA', 'MD', 'ME', 'MG', 'MK', 'ML', 'MM', 'MN', 'MO', 'MR', 'MT',
       'MU', 'MV', 'MW', 'MX', 'MY', 'MZ', 'NE', 'NG', 'NI', 'NL', 'NO',
       'NP', 'NZ', 'OM', 'PA', 'PE', 'PG', 'PH', 'PK', 'PL', 'PT', 'PY',
       'QA', 'RO', 'RS', 'RU', 'RW', 'SA', 'SB', 'SD', 'SE', 'SG', 'SI',
       'SK', 'SL', 'SN', 'SR', 'SS', 'ST', 'SV', 'S

# Individual Country

In [14]:
# Create inflow DataFrame
inflow_df = df.rename(columns={
    'country_to': 'geo',
    'country_from': 'country'
})
inflow_df['flow_type'] = 'inflow'

# Create outflow DataFrame
outflow_df = df.rename(columns={
    'country_from': 'geo',
    'country_to': 'country',
})
outflow_df['flow_type'] = 'outflow'

# Concatenate both
individual_df = pd.concat([inflow_df, outflow_df], ignore_index=True)

# Optional: reorder columns
individual_df = individual_df.rename(columns={
    'num_migrants': 'obsValue',
    'year': 'obsTime',
})
individual_df['time_granularity'] = 'Monthly'
individual_df = individual_df[['geo', 'obsTime', 'month', 'time_granularity', 'obsValue', 'flow_type', 'country']]
individual_df

Unnamed: 0,geo,obsTime,month,time_granularity,obsValue,flow_type,country
0,AE,2019,1,Monthly,12,inflow,AD
1,AE,2019,2,Monthly,2,inflow,AD
2,AE,2019,3,Monthly,1,inflow,AD
3,AE,2019,4,Monthly,7,inflow,AD
4,AE,2019,5,Monthly,0,inflow,AD
...,...,...,...,...,...,...,...
3126303,ZW,2022,8,Monthly,138,outflow,ZM
3126304,ZW,2022,9,Monthly,162,outflow,ZM
3126305,ZW,2022,10,Monthly,149,outflow,ZM
3126306,ZW,2022,11,Monthly,104,outflow,ZM


# Check
Same country, Same year, Same flow direction should have same number

In [15]:
# individual_df[(individual_df['geo'] == 'AE') & (individual_df['obsTime'] == 2019) & (individual_df['flow_type'] == 'inflow') & (individual_df['month'] == 1)]['obsValue'].sum()
individual_df[(individual_df['geo'] == 'AE') & (individual_df['obsTime'] == 2019) & (individual_df['flow_type'] == 'inflow')]['obsValue'].sum()

np.int64(1248145)

In [16]:
migration_combined[(migration_combined['geo'] == 'AE') & (migration_combined['obsTime'] == 2019) & (migration_combined['flow_type'] == 'inflow') & (migration_combined['month'] == 'all')]

Unnamed: 0,geo,obsTime,month,time_granularity,obsValue,flow_type,country
4,AE,2019,all,Total Migration,1248145,inflow,all_countries


In [17]:
migration_combined[(migration_combined['geo'] == 'AE') & (migration_combined['obsTime'] == 2019) & (migration_combined['flow_type'] == 'inflow') & (migration_combined['month'] != 'all')]['obsValue'].sum()

np.int64(1248145)

# Final Combine

In [18]:
final = pd.concat([individual_df, migration_combined], ignore_index=True)
final

Unnamed: 0,geo,obsTime,month,time_granularity,obsValue,flow_type,country
0,AE,2019,1,Monthly,12,inflow,AD
1,AE,2019,2,Monthly,2,inflow,AD
2,AE,2019,3,Monthly,1,inflow,AD
3,AE,2019,4,Monthly,7,inflow,AD
4,AE,2019,5,Monthly,0,inflow,AD
...,...,...,...,...,...,...,...
3145023,ZW,2022,8,Total Migration,4339,outflow,all_countries
3145024,ZW,2022,9,Total Migration,4867,outflow,all_countries
3145025,ZW,2022,10,Total Migration,6280,outflow,all_countries
3145026,ZW,2022,11,Total Migration,5907,outflow,all_countries


# Merge

In [19]:
merged_migration = final.merge(gdf_country, left_on='geo', right_on='CNTR_CODE', how='inner').drop(columns='CNTR_CODE')

merged_migration['geo_source'] = np.where(
    merged_migration['geo'] == 'UK',  # Condition
    'NUTS2021',                             # Value if True (UK)
    'NUTS2024'                              # Value if False (all others)
)

merged_migration

Unnamed: 0,geo,obsTime,month,time_granularity,obsValue,flow_type,country,geometry,geo_source
0,AL,2019,1,Monthly,0,inflow,AD,"MULTIPOLYGON (((5120936.933 2221189.677, 51208...",NUTS2024
1,AL,2019,2,Monthly,5,inflow,AD,"MULTIPOLYGON (((5120936.933 2221189.677, 51208...",NUTS2024
2,AL,2019,3,Monthly,2,inflow,AD,"MULTIPOLYGON (((5120936.933 2221189.677, 51208...",NUTS2024
3,AL,2019,4,Monthly,0,inflow,AD,"MULTIPOLYGON (((5120936.933 2221189.677, 51208...",NUTS2024
4,AL,2019,5,Monthly,0,inflow,AD,"MULTIPOLYGON (((5120936.933 2221189.677, 51208...",NUTS2024
...,...,...,...,...,...,...,...,...,...
677297,XK,2022,8,Total Migration,2690,outflow,all_countries,"POLYGON ((5201301.166 2301673.012, 5202865.563...",NUTS2024
677298,XK,2022,9,Total Migration,4658,outflow,all_countries,"POLYGON ((5201301.166 2301673.012, 5202865.563...",NUTS2024
677299,XK,2022,10,Total Migration,6711,outflow,all_countries,"POLYGON ((5201301.166 2301673.012, 5202865.563...",NUTS2024
677300,XK,2022,11,Total Migration,3905,outflow,all_countries,"POLYGON ((5201301.166 2301673.012, 5202865.563...",NUTS2024


In [20]:
# Convert date columns
merged_migration = merged_migration[(merged_migration['month'] != 'all') & (merged_migration['time_granularity'] == 'Total Migration')].copy()
merged_migration['date'] = pd.to_datetime(merged_migration['obsTime'].astype(str) + '-' + merged_migration['month'].astype(str).str.zfill(2))
merged_migration

Unnamed: 0,geo,obsTime,month,time_granularity,obsValue,flow_type,country,geometry,geo_source,date
673402,AL,2019,1,Total Migration,2309,inflow,all_countries,"MULTIPOLYGON (((5120936.933 2221189.677, 51208...",NUTS2024,2019-01-01
673403,AL,2019,2,Total Migration,2438,inflow,all_countries,"MULTIPOLYGON (((5120936.933 2221189.677, 51208...",NUTS2024,2019-02-01
673404,AL,2019,3,Total Migration,2593,inflow,all_countries,"MULTIPOLYGON (((5120936.933 2221189.677, 51208...",NUTS2024,2019-03-01
673405,AL,2019,4,Total Migration,2566,inflow,all_countries,"MULTIPOLYGON (((5120936.933 2221189.677, 51208...",NUTS2024,2019-04-01
673406,AL,2019,5,Total Migration,2499,inflow,all_countries,"MULTIPOLYGON (((5120936.933 2221189.677, 51208...",NUTS2024,2019-05-01
...,...,...,...,...,...,...,...,...,...,...
677297,XK,2022,8,Total Migration,2690,outflow,all_countries,"POLYGON ((5201301.166 2301673.012, 5202865.563...",NUTS2024,2022-08-01
677298,XK,2022,9,Total Migration,4658,outflow,all_countries,"POLYGON ((5201301.166 2301673.012, 5202865.563...",NUTS2024,2022-09-01
677299,XK,2022,10,Total Migration,6711,outflow,all_countries,"POLYGON ((5201301.166 2301673.012, 5202865.563...",NUTS2024,2022-10-01
677300,XK,2022,11,Total Migration,3905,outflow,all_countries,"POLYGON ((5201301.166 2301673.012, 5202865.563...",NUTS2024,2022-11-01


In [21]:
# Make sure geometry is interpreted as Geo
gdf = gpd.GeoDataFrame(merged_migration, geometry='geometry')
gdf = gdf.set_crs("EPSG:3035")  # Use your projection or convert to EPSG:4326 for WGS84

# Filter date range
gdf = gdf[(gdf['date'] >= '2019-01-01') & (gdf['date'] <= '2022-12-31')]
gdf

Unnamed: 0,geo,obsTime,month,time_granularity,obsValue,flow_type,country,geometry,geo_source,date
673402,AL,2019,1,Total Migration,2309,inflow,all_countries,"MULTIPOLYGON (((5120936.933 2221189.677, 51208...",NUTS2024,2019-01-01
673403,AL,2019,2,Total Migration,2438,inflow,all_countries,"MULTIPOLYGON (((5120936.933 2221189.677, 51208...",NUTS2024,2019-02-01
673404,AL,2019,3,Total Migration,2593,inflow,all_countries,"MULTIPOLYGON (((5120936.933 2221189.677, 51208...",NUTS2024,2019-03-01
673405,AL,2019,4,Total Migration,2566,inflow,all_countries,"MULTIPOLYGON (((5120936.933 2221189.677, 51208...",NUTS2024,2019-04-01
673406,AL,2019,5,Total Migration,2499,inflow,all_countries,"MULTIPOLYGON (((5120936.933 2221189.677, 51208...",NUTS2024,2019-05-01
...,...,...,...,...,...,...,...,...,...,...
677297,XK,2022,8,Total Migration,2690,outflow,all_countries,"POLYGON ((5201301.166 2301673.012, 5202865.563...",NUTS2024,2022-08-01
677298,XK,2022,9,Total Migration,4658,outflow,all_countries,"POLYGON ((5201301.166 2301673.012, 5202865.563...",NUTS2024,2022-09-01
677299,XK,2022,10,Total Migration,6711,outflow,all_countries,"POLYGON ((5201301.166 2301673.012, 5202865.563...",NUTS2024,2022-10-01
677300,XK,2022,11,Total Migration,3905,outflow,all_countries,"POLYGON ((5201301.166 2301673.012, 5202865.563...",NUTS2024,2022-11-01


In [27]:
# Create folder to save images
output_dir = Path("gif_frames")
output_dir.mkdir(exist_ok=True)

# Set flow type
flow_ty = 'inflow'  # or 'outflow'

# Filter data for the selected flow type
gdf_plot = gdf[gdf['flow_type'] == flow_ty]

# Compute global color scale limits using quantiles
vmin = gdf_plot['obsValue'].quantile(0.05)
vmax = gdf_plot['obsValue'].quantile(0.95)

# Choose color map
cmap = 'Blues' if flow_ty == 'inflow' else 'Reds'

def plot_monthly_flows(date, gdf, save_path):
    fig, ax = plt.subplots(1, 1, figsize=(10, 6))

    # Filter by date
    monthly_data = gdf[gdf['date'] == date]

    # Pivot and merge
    pivot = monthly_data.pivot_table(index='geo', columns='flow_type', values='obsValue', aggfunc='sum', fill_value=0)
    merged = monthly_data.drop_duplicates('geo').set_index('geo').join(pivot)

    # Plot the selected flow type
    if flow_ty in merged.columns:
        merged.plot(
            ax=ax,
            column=flow_ty,
            cmap=cmap,
            legend=True,
            legend_kwds={'label': flow_ty.capitalize()},
            vmin=vmin,
            vmax=vmax,
            # alpha=0.6
        )

    ax.set_title(f"Migration Flows: {flow_ty.capitalize()} {date.strftime('%Y-%m')}")
    ax.axis("off")
    plt.savefig(save_path)
    plt.close()


In [28]:
# Create list of monthly dates
dates = pd.date_range(start="2019-01-01", end="2022-12-01", freq='MS')

# Generate and save each plot
for date in tqdm(dates):
    save_path = output_dir / f"{flow_ty}_{date.strftime('%Y-%m')}.png"
    plot_monthly_flows(date, gdf_plot, save_path)

100%|███████████████████████████████████████████| 48/48 [00:53<00:00,  1.12s/it]


In [29]:
# Read images in order
images = []
for date in dates:
    filepath = output_dir / f"{flow_ty}_{date.strftime('%Y-%m')}.png"
    images.append(imageio.imread(filepath))

# Save as GIF
imageio.mimsave(f"{flow_ty}_migration_flows.gif", images, fps=5)