# international_migration Network Analysis

This note works with international migration with unemployment  

Meta Migration Data: https://data.humdata.org/dataset/international-migration-flows)    

Special countries: BA, UA, XK, UK/GB, EL/GR 

In [1]:
import os
import gc
import rasterio
import numpy as np
import pandas as pd
import networkx as nx
from tqdm import tqdm
import seaborn as sns
import geopandas as gpd
from pathlib import Path
from osgeo import gdal, osr
import dask_geopandas as dgpd
import matplotlib.pyplot as plt
from rasterstats import zonal_stats
from scipy.stats import linregress
from pyvis.network import Network

BASE_DIR = Path('/Users/wenlanzhang/PycharmProjects/Mapineq/src/data-wrangling/')
DATA_DIR = Path('/Users/wenlanzhang/Downloads/PhD_UCL/Data/Oxford')

# Load & Clean Migration Data

In [2]:
# Step 1: Load international migration data
def load_international_migration(filepath):
    df = pd.read_csv(filepath)
    df['year'] = pd.to_datetime(df['migration_month']).dt.year
    df['month'] = pd.to_datetime(df['migration_month']).dt.month

    # Country code mapping
    # country_code_mapping = {'GR': 'EL', 'GB': 'UK'}
    # df['country_from'] = df['country_from'].replace(country_code_mapping)
    # df['country_to'] = df['country_to'].replace(country_code_mapping)
    
    return df

migration_df = load_international_migration(DATA_DIR / "Migration/international_migration_flow.csv")
migration_df

Unnamed: 0,country_from,country_to,migration_month,num_migrants,year,month
0,AD,AE,2019-01,12,2019,1
1,AD,AE,2019-02,2,2019,2
2,AD,AE,2019-03,1,2019,3
3,AD,AE,2019-04,7,2019,4
4,AD,AE,2019-05,0,2019,5
...,...,...,...,...,...,...
1563149,ZW,ZM,2022-08,138,2022,8
1563150,ZW,ZM,2022-09,162,2022,9
1563151,ZW,ZM,2022-10,149,2022,10
1563152,ZW,ZM,2022-11,104,2022,11


In [3]:
# Check for missing values
print("\nMissing values per column (before cleaning):")
print(migration_df.isnull().sum())


Missing values per column (before cleaning):
country_from       8640
country_to         8640
migration_month       0
num_migrants          0
year                  0
month                 0
dtype: int64


In [4]:
migration_df_cleaned = migration_df.dropna()  # drop missing data
migration_df_cleaned = migration_df_cleaned[migration_df_cleaned['num_migrants'] > 0]  # filter num_migrants > 0

# Result preview after cleaning
print(f"\nNumber of rows after cleaning: {len(migration_df_cleaned)}")
print("Cleaned data preview:")
display(migration_df_cleaned.head())

# # Final missing value check
# print("\nMissing values per column (after cleaning):")
# print(migration_df_cleaned.isnull().sum())


Number of rows after cleaning: 939922
Cleaned data preview:


Unnamed: 0,country_from,country_to,migration_month,num_migrants,year,month
0,AD,AE,2019-01,12,2019,1
1,AD,AE,2019-02,2,2019,2
2,AD,AE,2019-03,1,2019,3
3,AD,AE,2019-04,7,2019,4
5,AD,AE,2019-06,14,2019,6


# Basic Flow Statistics

In [5]:
# Summary statistics of migration flows
print("Migration flow statistics:")
print(migration_df_cleaned['num_migrants'].describe())

# Number of unique countries
num_origin_countries = migration_df_cleaned['country_from'].nunique()
num_destination_countries = migration_df_cleaned['country_to'].nunique()

print(f"\nNumber of unique origin countries: {num_origin_countries}")
print(f"Number of unique destination countries: {num_destination_countries}")

Migration flow statistics:
count    939922.000000
mean        126.291461
std        1155.268020
min           1.000000
25%           4.000000
50%           9.000000
75%          24.000000
max      163937.000000
Name: num_migrants, dtype: float64

Number of unique origin countries: 180
Number of unique destination countries: 180


# Full Migration Network (Static Network)

- High density (~1), high reciprocity (1), means your data records nearly all possible migration relations.
- Some countries (US, IN, SA, AE) clearly dominate in inflow and outflow.
- Interesting that some small countries (FM, BN, VC) have high betweenness centrality — suggesting "bridging" roles.

## Build full graph and basic stats

In [6]:
# Aggregate total number of migrants per country pair
df_agg = migration_df_cleaned.groupby(['country_from', 'country_to'])['num_migrants'].sum().reset_index()

# Build full directed graph
G = nx.DiGraph()
for idx, row in df_agg.iterrows():
    G.add_edge(row['country_from'], row['country_to'], weight=row['num_migrants'])

# Basic graph stats
print(f"\nFull Network Statistics:")
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

# Network density
density = nx.density(G)
print(f"Network density: {density:.4f}")

# Reciprocity
reciprocity = nx.overall_reciprocity(G)
print(f"Network reciprocity: {reciprocity:.4f}")

# Flow range
print(f"Flow range: min={df_agg['num_migrants'].min()}, max={df_agg['num_migrants'].max()}")


Full Network Statistics:
Number of nodes: 180
Number of edges: 32208
Network density: 0.9996
Network reciprocity: 1.0000
Flow range: min=40, max=2266749


## Degree and Strength

In [7]:
# Degree
in_degree = dict(G.in_degree())
out_degree = dict(G.out_degree())

print(f"\nAverage in-degree: {sum(in_degree.values()) / len(in_degree):.2f}")
print(f"Average out-degree: {sum(out_degree.values()) / len(out_degree):.2f}")

# Strength
in_strength = dict(G.in_degree(weight='weight'))
out_strength = dict(G.out_degree(weight='weight'))

# Top 5 by inflow
print(f"\nTop 5 countries by migration inflow:")
for country, strength in sorted(in_strength.items(), key=lambda x: x[1], reverse=True)[:5]:
    print(f"{country}: {strength}")

# Top 5 by outflow
print(f"\nTop 5 countries by migration outflow:")
for country, strength in sorted(out_strength.items(), key=lambda x: x[1], reverse=True)[:5]:
    print(f"{country}: {strength}")



Average in-degree: 178.93
Average out-degree: 178.93

Top 5 countries by migration inflow:
US: 10248130
SA: 6445863
IN: 6254349
AE: 5187573
GB: 3694022

Top 5 countries by migration outflow:
IN: 8916761
SA: 7295844
US: 3869882
VE: 3791224
PH: 3321684


## Centrality Measures

In [8]:
# Betweenness centrality
bet_centrality = nx.betweenness_centrality(G, weight='weight', normalized=True)

# Eigenvector centrality
eigen_centrality = nx.eigenvector_centrality_numpy(G, weight='weight')

# Top 5 by betweenness
top_betweenness = sorted(bet_centrality.items(), key=lambda x: x[1], reverse=True)[:5]
print(f"\nTop 5 countries by betweenness centrality:")
for country, centrality in top_betweenness:
    print(f"{country}: {centrality:.4f}")


Top 5 countries by betweenness centrality:
FM: 0.0327
BN: 0.0247
VC: 0.0244
ST: 0.0181
BZ: 0.0160


### PageRank (importance in terms of flow passing through)

In [9]:
pagerank = nx.pagerank(G, weight='weight')
top_pagerank = sorted(pagerank.items(), key=lambda x: x[1], reverse=True)[:5]
print("Top 5 countries by PageRank:", top_pagerank)


Top 5 countries by PageRank: [('US', 0.06551843799150323), ('IN', 0.039828844069561574), ('GB', 0.03465947903440195), ('DE', 0.03075938063701663), ('SA', 0.030039197758895304)]


### Closeness Centrality (how close a node is to all others)

In [10]:
closeness = nx.closeness_centrality(G, distance='weight')
top_closeness = sorted(closeness.items(), key=lambda x: x[1], reverse=True)[:5]
print("Top 5 countries by closeness centrality:", top_closeness)


Top 5 countries by closeness centrality: [('FM', 0.0077368603042876904), ('WS', 0.007572552669430578), ('LC', 0.007503038940352936), ('VU', 0.007425229186543328), ('TO', 0.0074080205272524105)]


### Katz Centrality
(for strongly connected graphs with high density, Katz centrality can highlight influence propagation)

In [11]:
katz_centrality = nx.katz_centrality_numpy(G, weight='weight')
top_katz = sorted(katz_centrality.items(), key=lambda x: x[1], reverse=True)[:5]
print("Top 5 countries by Katz centrality:", top_katz)


Top 5 countries by Katz centrality: [('VC', 0.2717081946107067), ('ST', 0.2641578168051126), ('DJ', 0.24426649294956532), ('LC', 0.23443881059180458), ('SZ', 0.20663748484196998)]


## Community Detection
In this section, we detect communities in the migration network.  
This can reveal clusters of countries with higher migration flow between them.
We use the Louvain algorithm, which optimizes modularity to find good community partitions.

In [12]:
# Convert to undirected graph for Louvain (works better this way)
G_undirected = G.to_undirected()

# Run Louvain community detection
import community as community_louvain

partition = community_louvain.best_partition(G_undirected, weight='weight')

# Number of communities
num_communities = len(set(partition.values()))
print(f"Number of communities detected: {num_communities}")

# Size of each community
from collections import Counter
community_sizes = Counter(partition.values())
print("Community sizes:", community_sizes)

# Example: list countries in largest community
largest_community_id = community_sizes.most_common(1)[0][0]
largest_community_countries = [country for country, comm in partition.items() if comm == largest_community_id]

print(f"Countries in largest community ({len(largest_community_countries)} countries):")
print(largest_community_countries)


AttributeError: module 'community' has no attribute 'best_partition'

## Flow Imbalance Analysis
In this section, we compute **net migration flow** for each country:  
**Net flow = inflow - outflow**  
Countries with high net inflow are migration "magnets", while high net outflow countries are "sources" of migrants.

In [None]:
# Compute net flow: inflow - outflow
net_flow = {country: in_strength.get(country, 0) - out_strength.get(country, 0) for country in G.nodes()}

# Top net inflow countries
top_net_inflow = sorted(net_flow.items(), key=lambda x: x[1], reverse=True)[:5]
print("Top 5 net inflow countries:")
for country, value in top_net_inflow:
    print(f"{country}: {value}")

# Top net outflow countries
top_net_outflow = sorted(net_flow.items(), key=lambda x: x[1], reverse=False)[:5]
print("\nTop 5 net outflow countries:")
for country, value in top_net_outflow:
    print(f"{country}: {value}")

## Temporal Network Dynamics
### Nodes and Edges per Yea

In [None]:
# Analyze network per year
for year in migration_df_cleaned['year'].unique():
    df_year = migration_df_cleaned[migration_df_cleaned['year'] == year]
    df_agg_year = df_year.groupby(['country_from', 'country_to'])['num_migrants'].sum().reset_index()

    G_year = nx.DiGraph()
    for idx, row in df_agg_year.iterrows():
        G_year.add_edge(row['country_from'], row['country_to'], weight=row['num_migrants'])

    print(f"\nYear {year} Statistics:")
    print(f"Number of nodes: {G_year.number_of_nodes()}")
    print(f"Number of edges: {G_year.number_of_edges()}")

### Density per Year
We compute network density for each year to see if connectivity changes over time.

In [13]:
for year in sorted(migration_df_cleaned['year'].unique()):
    df_year = migration_df_cleaned[migration_df_cleaned['year'] == year]
    df_agg_year = df_year.groupby(['country_from', 'country_to'])['num_migrants'].sum().reset_index()
    
    G_year = nx.DiGraph()
    for idx, row in df_agg_year.iterrows():
        G_year.add_edge(row['country_from'], row['country_to'], weight=row['num_migrants'])
    
    density_year = nx.density(G_year)
    print(f"Year {year}: density = {density_year:.4f}")


Year 2019: density = 0.9993
Year 2020: density = 0.9995
Year 2021: density = 0.9988
Year 2022: density = 0.9992


### Top Inflow/Outflow per Year
We compute the top 5 countries by migration inflow and outflow for each year.

In [14]:
for year in sorted(migration_df_cleaned['year'].unique()):
    df_year = migration_df_cleaned[migration_df_cleaned['year'] == year]
    df_agg_year = df_year.groupby(['country_from', 'country_to'])['num_migrants'].sum().reset_index()
    
    G_year = nx.DiGraph()
    for idx, row in df_agg_year.iterrows():
        G_year.add_edge(row['country_from'], row['country_to'], weight=row['num_migrants'])
    
    in_strength_year = dict(G_year.in_degree(weight='weight'))
    out_strength_year = dict(G_year.out_degree(weight='weight'))
    
    top_inflow_year = sorted(in_strength_year.items(), key=lambda x: x[1], reverse=True)[:5]
    top_outflow_year = sorted(out_strength_year.items(), key=lambda x: x[1], reverse=True)[:5]
    
    print(f"\nYear {year} - Top 5 Inflow Countries:")
    for country, value in top_inflow_year:
        print(f"{country}: {value}")
    
    print(f"\nYear {year} - Top 5 Outflow Countries:")
    for country, value in top_outflow_year:
        print(f"{country}: {value}")



Year 2019 - Top 5 Inflow Countries:
US: 2283443
SA: 1599334
IN: 1544724
AE: 1248096
CO: 991173

Year 2019 - Top 5 Outflow Countries:
IN: 2520488
SA: 2032124
VE: 1701418
US: 1242349
PH: 1113302

Year 2020 - Top 5 Inflow Countries:
IN: 1654002
US: 1263920
SA: 840582
PH: 752719
GB: 657852

Year 2020 - Top 5 Outflow Countries:
SA: 1666458
IN: 1329305
US: 980398
MY: 832512
AE: 803513

Year 2021 - Top 5 Inflow Countries:
US: 2591779
IN: 1683903
SA: 1207846
AE: 1144569
GB: 845050

Year 2021 - Top 5 Outflow Countries:
SA: 1882098
IN: 1638009
VE: 809954
US: 806117
AE: 749804

Year 2022 - Top 5 Inflow Countries:
US: 4108988
SA: 2798101
AE: 2151213
IN: 1371720
DE: 1345548

Year 2022 - Top 5 Outflow Countries:
IN: 3428959
UA: 2402143
SA: 1715164
PK: 1413776
BD: 1391282


### Flow imbalance evolution over time (optional)
We compute net flow (inflow - outflow) per year.  
This shows if a country is becoming more of a migration source or destination over time.

In [15]:
# Example: track net flow for one country over time (e.g., US)
country_to_track = 'UA'

net_flow_over_time = {}

for year in sorted(migration_df_cleaned['year'].unique()):
    df_year = migration_df_cleaned[migration_df_cleaned['year'] == year]
    df_agg_year = df_year.groupby(['country_from', 'country_to'])['num_migrants'].sum().reset_index()
    
    G_year = nx.DiGraph()
    for idx, row in df_agg_year.iterrows():
        G_year.add_edge(row['country_from'], row['country_to'], weight=row['num_migrants'])
    
    in_strength_year = dict(G_year.in_degree(weight='weight'))
    out_strength_year = dict(G_year.out_degree(weight='weight'))
    
    net_flow_year = in_strength_year.get(country_to_track, 0) - out_strength_year.get(country_to_track, 0)
    net_flow_over_time[year] = net_flow_year

# Print result
print(f"\nNet flow over time for {country_to_track}:")
for year, value in net_flow_over_time.items():
    print(f"{year}: {value}")


Net flow over time for UA:
2019: -74422
2020: 6089
2021: -180810
2022: -2335585


## Filtered Network for Visualization (Strong Flows)

In [27]:
# Define threshold for "strong" flows
threshold = 10000

# Filter flows
df_agg_filtered = df_agg[df_agg['num_migrants'] >= threshold]

# Build filtered graph
G_filtered = nx.DiGraph()
for idx, row in df_agg_filtered.iterrows():
    G_filtered.add_edge(row['country_from'], row['country_to'], weight=row['num_migrants'])

# Compute in-strength
in_strength_filtered = dict(G_filtered.in_degree(weight='weight'))

# PyVis visualization
net = Network(height='800px', width='100%', bgcolor='#222222', font_color='white', notebook=True, directed=True, cdn_resources='in_line')
net.force_atlas_2based()

# Scaling node size
if in_strength_filtered:
    max_inflow = max(in_strength_filtered.values())
else:
    max_inflow = 1

for node in G_filtered.nodes():
    inflow = in_strength_filtered.get(node, 0)
    size = 10 + 40 * (inflow / max_inflow)
    net.add_node(node, label=node, size=size, title=f"Inflow: {inflow} migrants")

# Add edges
for u, v, data in G_filtered.edges(data=True):
    net.add_edge(u, v, value=data['weight'] / 100000, title=f"{u} → {v}: {data['weight']} migrants")

# Show network
net.show("migration_network.html")


migration_network.html


In [28]:
net = Network(height='800px', width='100%', bgcolor='#222222', font_color='white', notebook=True, directed=True, cdn_resources='in_line')
net.force_atlas_2based()

# Set options FIRST
options = """
{
  "nodes": {
    "font": {
      "size": 50,
      "color": "white"
    },
    "color": {
      "background": "#1f78b4",
      "border": "#ffffff"
    }
  },
  "edges": {
    "color": {
      "color": "#aaaaaa"
    },
    "arrows": {
      "to": {"enabled": true, "scaleFactor": 1.2}
    }
  },
  "physics": {
    "forceAtlas2Based": {
      "gravitationalConstant": -50,
      "centralGravity": 0.01,
      "springLength": 100,
      "springConstant": 0.08
    },
    "solver": "forceAtlas2Based"
  }
}
"""
net.set_options(options)


In [29]:
# Add nodes
for node in G_filtered.nodes():
    inflow = in_strength_filtered.get(node, 0)
    size = 10 + 40 * (inflow / max_inflow)
    net.add_node(node, label=node, size=size, title=f"Inflow: {inflow} migrants")

# Add edges
for u, v, data in G_filtered.edges(data=True):
    net.add_edge(u, v, value=data['weight'] / 100000, title=f"{u} → {v}: {data['weight']} migrants")

# Show
net.show("migration_network.html")


migration_network.html


# Group by region

In [30]:
all_df = pd.read_csv(DATA_DIR/'all.csv')
all_df

Unnamed: 0,name,alpha-2,alpha-3,country-code,iso_3166-2,region,sub-region,intermediate-region,region-code,sub-region-code,intermediate-region-code
0,Afghanistan,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,,142.0,34.0,
1,Åland Islands,AX,ALA,248,ISO 3166-2:AX,Europe,Northern Europe,,150.0,154.0,
2,Albania,AL,ALB,8,ISO 3166-2:AL,Europe,Southern Europe,,150.0,39.0,
3,Algeria,DZ,DZA,12,ISO 3166-2:DZ,Africa,Northern Africa,,2.0,15.0,
4,American Samoa,AS,ASM,16,ISO 3166-2:AS,Oceania,Polynesia,,9.0,61.0,
...,...,...,...,...,...,...,...,...,...,...,...
244,Wallis and Futuna,WF,WLF,876,ISO 3166-2:WF,Oceania,Polynesia,,9.0,61.0,
245,Western Sahara,EH,ESH,732,ISO 3166-2:EH,Africa,Northern Africa,,2.0,15.0,
246,Yemen,YE,YEM,887,ISO 3166-2:YE,Asia,Western Asia,,142.0,145.0,
247,Zambia,ZM,ZMB,894,ISO 3166-2:ZM,Africa,Sub-Saharan Africa,Eastern Africa,2.0,202.0,14.0


In [36]:
region_lookup = dict(zip(all_df['alpha-2'], all_df['region']))
subregion_lookup = dict(zip(all_df['alpha-2'], all_df['sub-region']))

# Now, map the values to df_agg_filtered
df_agg_filtered.loc[:, 'region_from'] = df_agg_filtered['country_from'].map(region_lookup)
df_agg_filtered.loc[:, 'sub_region_from'] = df_agg_filtered['country_from'].map(subregion_lookup)

df_agg_filtered.loc[:, 'region_to'] = df_agg_filtered['country_to'].map(region_lookup)
df_agg_filtered.loc[:, 'sub_region_to'] = df_agg_filtered['country_to'].map(subregion_lookup)
df_agg_filtered

Unnamed: 0,country_from,country_to,num_migrants,region_from,sub_region_from,region_to,sub_region_to
180,AE,AF,15481,Asia,Western Asia,Asia,Southern Asia
186,AE,AU,11857,Asia,Western Asia,Oceania,Australia and New Zealand
190,AE,BD,208064,Asia,Western Asia,Asia,Southern Asia
205,AE,CA,28115,Asia,Western Asia,Americas,Northern America
225,AE,EG,115237,Asia,Western Asia,Africa,Northern Africa
...,...,...,...,...,...,...,...
31848,ZA,ZM,11168,Africa,Sub-Saharan Africa,Africa,Sub-Saharan Africa
31849,ZA,ZW,117845,Africa,Sub-Saharan Africa,Africa,Sub-Saharan Africa
32053,ZW,BW,12916,Africa,Sub-Saharan Africa,Africa,Sub-Saharan Africa
32085,ZW,GB,22411,Africa,Sub-Saharan Africa,Europe,Northern Europe


In [32]:
# Example regions in your data
regions = df_agg_filtered['region_from'].dropna().unique().tolist() + df_agg_filtered['region_to'].dropna().unique().tolist()
regions = list(set(regions))  # remove duplicates

# Color palette
color_palette = [
    "#1f78b4",  # blue
    "#33a02c",  # green
    "#e31a1c",  # red
    "#ff7f00",  # orange
    "#6a3d9a",  # purple
    "#b15928",  # brown
]

# Map each region to a color
region_color_map = {region: color_palette[i % len(color_palette)] for i, region in enumerate(regions)}


In [33]:
# Build a lookup for country_code → region
country_region_lookup = {}

# Add country_from
for idx, row in df_agg_filtered.iterrows():
    country_region_lookup[row['country_from']] = row['region_from']

# Add country_to
for idx, row in df_agg_filtered.iterrows():
    country_region_lookup[row['country_to']] = row['region_to']


In [35]:
net = Network(height='800px', width='100%', bgcolor='#222222', font_color='white', notebook=True, directed=True, cdn_resources='in_line')
net.force_atlas_2based()

# Set options FIRST (your same options)
net.set_options(options)

# Add nodes with region-based color
for node in G_filtered.nodes():
    inflow = in_strength_filtered.get(node, 0)
    size = 60 + 100 * (inflow / max_inflow)
    
    region = country_region_lookup.get(node, "Unknown")
    color = region_color_map.get(region, "#999999")  # fallback to gray if unknown
    
    net.add_node(
        node,
        label=node,
        size=size,
        title=f"Inflow: {inflow} migrants<br>Region: {region}",
        color=color
    )

for u, v, data in G_filtered.edges(data=True):
    net.add_edge(u, v, value=data['weight'] / 100000, title=f"{u} → {v}: {data['weight']} migrants")

net.show("migration_network_region_grouped.html")

migration_network.html


If you want to make the plot even more informative:

- Add a legend explaining the color coding (regions) and node size.
- If feasible, show edge weights (thicker line = stronger flow) to visually differentiate strong corridors from weaker ones.
- Label a few key nodes (big hubs) explicitly — e.g. US, UK, DE, IN — to help the reader orient themselves.
- Consider using a slightly lighter background or lighter edges to improve visual contrast.