# international_migration_flow

This note works to produce the dataset to be put in Mapineq database  
Source: Meta  
Publisher: HDX (https://data.humdata.org/dataset/international-migration-flows)   
New York Times Article: https://www.nytimes.com/interactive/2025/04/17/opinion/global-migration-facebook-data.html   
Special countries: BA, UA, XK, UK/GB, EL/GR 

In [2]:
import os
import gc
import rasterio
import numpy as np
import pandas as pd
from tqdm import tqdm
import geopandas as gpd
from pathlib import Path
import dask_geopandas as dgpd
from osgeo import gdal, osr
from rasterstats import zonal_stats

BASE_DIR = Path('/Users/wenlanzhang/PycharmProjects/Mapineq/src/data-wrangling/')
DATA_DIR = Path('/Users/wenlanzhang/Downloads/PhD_UCL/Data/Oxford')

# Load NTUS

In [3]:
# Load a GeoJSON file
gdf_2024 = gpd.read_file(DATA_DIR/"NUTS/NUTS_RG_01M_2024_3035.geojson")

gdf_country_2024 = gdf_2024[gdf_2024['LEVL_CODE'] == 0]
gdf_country_2024 = gdf_country_2024[['CNTR_CODE', 'geometry']].reset_index(drop=True)
# gdf_country_2024

# Load a GeoJSON file
gdf_2021 = gpd.read_file(DATA_DIR/"NUTS/NUTS_RG_01M_2021_3035.geojson")

gdf_country_2021 = gdf_2021[gdf_2021['LEVL_CODE'] == 0]
gdf_country_2021 = gdf_country_2021[['CNTR_CODE', 'geometry']].reset_index(drop=True)
uk_2021 = gdf_country_2021[gdf_country_2021['CNTR_CODE'] == 'UK']
# uk_2021

# Concatenate with 2024 data
gdf_country = pd.concat(
    [gdf_country_2024, uk_2021],
    ignore_index=True  # Reset index
)
gdf_country

Unnamed: 0,CNTR_CODE,geometry
0,EL,"MULTIPOLYGON (((6083881.558 1676236.011, 60838..."
1,ES,"MULTIPOLYGON (((3815056.85 1904971.998, 381575..."
2,FI,"MULTIPOLYGON (((4999905.463 5305310.537, 50031..."
3,FR,"MULTIPOLYGON (((9980485.23 -3029930.054, 99812..."
4,HR,"MULTIPOLYGON (((4809521.18 2624665.768, 480963..."
5,EE,"MULTIPOLYGON (((5200614.71 4159725.901, 520083..."
6,DE,"MULTIPOLYGON (((4355225.354 2715902.995, 43548..."
7,DK,"MULTIPOLYGON (((4650283.775 3591676.53, 465057..."
8,BA,"MULTIPOLYGON (((4866544.51 2485914.128, 486624..."
9,AT,"MULTIPOLYGON (((4354847.685 2714710.627, 43552..."


In [4]:
country_list = gdf_country['CNTR_CODE'].unique()
len(country_list)
country_list

array(['EL', 'ES', 'FI', 'FR', 'HR', 'EE', 'DE', 'DK', 'BA', 'AT', 'BG',
       'CH', 'CY', 'BE', 'CZ', 'AL', 'LU', 'LV', 'ME', 'IE', 'IS', 'IT',
       'MK', 'MT', 'LI', 'NL', 'LT', 'HU', 'RS', 'SE', 'SI', 'TR', 'UA',
       'SK', 'RO', 'NO', 'PL', 'PT', 'XK', 'UK'], dtype=object)

# Load Migration

In [6]:
df = pd.read_csv(DATA_DIR/f"Migration/international_migration_flow.csv") 
df['year'] = pd.to_datetime(df['migration_month']).dt.year
df['month'] = pd.to_datetime(df['migration_month']).dt.month

# Define the mapping of old codes to new codes
country_code_mapping = {
    'GR': 'EL',  # Greece (GR → EL)
    'GB': 'UK'   # United Kingdom (GB → UK)
}

# Apply the replacement to both columns
df['country_from'] = df['country_from'].replace(country_code_mapping)
df['country_to'] = df['country_to'].replace(country_code_mapping)

df
# len(df['country_from'].unique())

Unnamed: 0,country_from,country_to,migration_month,num_migrants,year,month
0,AD,AE,2019-01,12,2019,1
1,AD,AE,2019-02,2,2019,2
2,AD,AE,2019-03,1,2019,3
3,AD,AE,2019-04,7,2019,4
4,AD,AE,2019-05,0,2019,5
...,...,...,...,...,...,...
1563149,ZW,ZM,2022-08,138,2022,8
1563150,ZW,ZM,2022-09,162,2022,9
1563151,ZW,ZM,2022-10,149,2022,10
1563152,ZW,ZM,2022-11,104,2022,11


In [7]:
len(df['country_to'].unique())
# df['country_to'].unique()

# df[df['country_to'].isna()].groupby('country_from').size().unique()
# df[df['country_from'].isna()].groupby('country_to').size().unique()

181

In [8]:
def generate_migration_flow(df, direction='inflow'):
    import pandas as pd
    import calendar

    if direction not in ['inflow', 'outflow']:
        raise ValueError("Direction must be 'inflow' or 'outflow'")

    if direction == 'inflow':
        main_col = 'country_to'
        other_col = 'country_from'
        new_main = 'geo'
        new_other = 'origin'
    else:
        main_col = 'country_from'
        other_col = 'country_to'
        new_main = 'geo'
        new_other = 'destination'

    # Convert month numbers to full month names
    df = df.copy()
    df['month'] = df['month'].apply(lambda x: calendar.month_name[x] if pd.notna(x) and x != 'all' else 'all')

    # Monthly data
    monthly = df.rename(columns={main_col: new_main, other_col: new_other}).copy()
    monthly['time_granularity'] = 'Monthly'
    monthly = monthly.rename(columns={'num_migrants': 'obsValue', 'year': 'obsTime'})

    # Add "All countries"
    monthly_total = (
        monthly.groupby([new_main, 'obsTime', 'month'], as_index=False)
        .agg({'obsValue': 'sum'})
    )
    monthly_total[new_other] = 'All countries'
    monthly_total['time_granularity'] = 'Monthly'

    monthly_full = pd.concat([monthly, monthly_total], ignore_index=True)

    # Yearly data
    yearly = (
        df.groupby([main_col, other_col, 'year'], as_index=False)
        .agg({'num_migrants': 'sum'})
        .rename(columns={main_col: new_main, other_col: new_other, 'num_migrants': 'obsValue', 'year': 'obsTime'})
    )
    yearly['month'] = 'All'
    yearly['time_granularity'] = 'Annual'

    # Add "All countries" for yearly
    yearly_total = (
        yearly.groupby([new_main, 'obsTime'], as_index=False)
        .agg({'obsValue': 'sum'})
    )
    yearly_total[new_other] = 'All countries'
    yearly_total['month'] = 'All'
    yearly_total['time_granularity'] = 'Annual'

    yearly_full = pd.concat([yearly, yearly_total], ignore_index=True)

    # Final combined dataframe
    result_df = pd.concat([monthly_full, yearly_full], ignore_index=True)

    result_df['']
    
    # Reorder columns (no migration_month)
    columns_order = [new_main, new_other, 'obsValue', 'obsTime', 'month', 'time_granularity']
    return result_df[columns_order]


In [9]:
inflow_df = generate_migration_flow(df, direction='inflow')
inflow_df

Unnamed: 0,geo,origin,obsValue,obsTime,month,time_granularity
0,AE,AD,12,2019,January,Monthly
1,AE,AD,2,2019,February,Monthly
2,AE,AD,1,2019,March,Monthly
3,AE,AD,7,2019,April,Monthly
4,AE,AD,0,2019,May,Monthly
...,...,...,...,...,...,...
1701335,ZM,All countries,26761,2022,All,Annual
1701336,ZW,All countries,38420,2019,All,Annual
1701337,ZW,All countries,37373,2020,All,Annual
1701338,ZW,All countries,42824,2021,All,Annual


In [10]:
outflow_df = generate_migration_flow(df, direction='outflow')
outflow_df

Unnamed: 0,geo,destination,obsValue,obsTime,month,time_granularity
0,AD,AE,12,2019,January,Monthly
1,AD,AE,2,2019,February,Monthly
2,AD,AE,1,2019,March,Monthly
3,AD,AE,7,2019,April,Monthly
4,AD,AE,0,2019,May,Monthly
...,...,...,...,...,...,...
1701335,ZM,All countries,27535,2022,All,Annual
1701336,ZW,All countries,70887,2019,All,Annual
1701337,ZW,All countries,52552,2020,All,Annual
1701338,ZW,All countries,50344,2021,All,Annual


In [12]:
# outflow_df[(outflow_df['geo'] == 'UA') & (outflow_df['destination'] == 'All countries') & (outflow_df['month'] == 'All')]

Unnamed: 0,geo,destination,obsValue,obsTime,month,time_granularity
1701276,UA,All countries,205455,2019,All,Annual
1701277,UA,All countries,164486,2020,All,Annual
1701278,UA,All countries,277724,2021,All,Annual
1701279,UA,All countries,2402143,2022,All,Annual


In [30]:
merged_inflow_df

Unnamed: 0,geo,origin,obsValue,obsTime,month,time_granularity,geo_source
0,AL,AD,0,2019,January,Monthly,NUTS2024
1,AL,AD,5,2019,February,Monthly,NUTS2024
2,AL,AD,2,2019,March,Monthly,NUTS2024
3,AL,AD,0,2019,April,Monthly,NUTS2024
4,AL,AD,0,2019,May,Monthly,NUTS2024
...,...,...,...,...,...,...,...
366543,UK,All countries,1334991,2022,All,Annual,NUTS2024
366544,XK,All countries,19295,2019,All,Annual,NUTS2024
366545,XK,All countries,14507,2020,All,Annual,NUTS2024
366546,XK,All countries,14140,2021,All,Annual,NUTS2024


# Merge

In [24]:
merged_outflow_df = outflow_df.merge(gdf_country, left_on='geo', right_on='CNTR_CODE', how='inner').drop(columns='CNTR_CODE')
merged_inflow_df = inflow_df.merge(gdf_country, left_on='geo', right_on='CNTR_CODE', how='inner').drop(columns='CNTR_CODE')

# merged_outflow_df['geo_source'] = np.where(
#     merged_outflow_df['geo'] == 'UK',  # Condition
#     'NUTS2021',                             # Value if True (UK)
#     'NUTS2024'                              # Value if False (all others)
# )

merged_outflow_df['geo_source'] = 'NUTS2024' 
merged_inflow_df['geo_source'] = 'NUTS2024' 

merged_inflow_df = merged_inflow_df.drop(columns=['geometry'])
merged_outflow_df = merged_outflow_df.drop(columns=['geometry'])

merged_outflow_df

Unnamed: 0,geo,destination,obsValue,obsTime,month,time_granularity,geo_source
0,AL,AD,4,2019,January,Monthly,NUTS2024
1,AL,AD,11,2019,February,Monthly,NUTS2024
2,AL,AD,2,2019,March,Monthly,NUTS2024
3,AL,AD,4,2019,April,Monthly,NUTS2024
4,AL,AD,0,2019,May,Monthly,NUTS2024
...,...,...,...,...,...,...,...
366541,UK,All countries,442661,2022,All,Annual,NUTS2024
366542,XK,All countries,35133,2019,All,Annual,NUTS2024
366543,XK,All countries,31414,2020,All,Annual,NUTS2024
366544,XK,All countries,34475,2021,All,Annual,NUTS2024


In [25]:
# merged_inflow_df
len(merged_inflow_df['geo'].unique())

39

In [29]:
merged_inflow_df.to_csv(DATA_DIR/"Migration/Output/Meta_Migration_Inflow.csv", index=True, index_label="id")
merged_outflow_df.to_csv(DATA_DIR/"Migration/Output/Meta_Migration_Outflow.csv", index=True, index_label="id")