In [1]:
import pandas as pd
from pathlib import Path

path_to_icij_data = Path("~/Projects/BachelorThesisCode/datasets/icij/")
path_to_officers = path_to_icij_data / "FullDatasetLong" / "nodes-officers.csv"

df_officers = pd.read_csv(path_to_officers)

df_officers.head()

Unnamed: 0,node_id,name,countries,country_codes,sourceID,valid_until,note
0,12000001,KIM SOO IN,South Korea,KOR,Panama Papers,The Panama Papers data is current through 2015,
1,12000002,Tian Yuan,China,CHN,Panama Papers,The Panama Papers data is current through 2015,
2,12000003,GREGORY JOHN SOLOMON,Australia,AUS,Panama Papers,The Panama Papers data is current through 2015,
3,12000004,MATSUDA MASUMI,Japan,JPN,Panama Papers,The Panama Papers data is current through 2015,
4,12000005,HO THUY NGA,Viet Nam,VNM,Panama Papers,The Panama Papers data is current through 2015,


In [2]:
df_officers['countries'].value_counts()

countries
Malta                                    45042
Not identified                           39450
China                                    38275
Hong Kong                                30226
United States                            26958
                                         ...  
Australia;Germany;Netherlands                1
Jersey;Portugal                              1
Germany;Singapore;United Kingdom             1
Australia;South Africa;United Kingdom        1
Turkmenistan;Russian Federation              1
Name: count, Length: 4090, dtype: int64

In [3]:
import plotly.express as px
import numpy as np

countries_split = df_officers['countries'].str.split(';').explode()
country_counts = countries_split.value_counts()

fig = px.choropleth(
    locations=country_counts.index,
    locationmode='country names',
    color=np.log10(country_counts.values),  # Apply log transformation
    color_continuous_scale='Viridis',
    title='Number of Officers by Country (Log Scale)',
    labels={'color': 'Log10(Number of Officers)'}
)

# Update the layout
fig.update_layout(
    title_x=0.5,
    geo=dict(showframe=False, showcoastlines=True, projection_type='equirectangular'),
    width=1000,
    height=600
)

fig.show()

In [10]:
country_counts.index.unique()

Index(['Malta', 'China', 'Not identified', 'Hong Kong', 'United States',
       'United Kingdom', 'Taiwan', 'British Virgin Islands', 'Italy',
       'Switzerland',
       ...
       'Channel Islands', 'Tuvalu', 'Korea, Republic of', 'Mayotte',
       'French Southern Territories', 'Greenland', 'Bhutan',
       'Sao Tome and Principe', 'Eswatini', ' Croatia'],
      dtype='object', name='countries', length=268)

In [23]:
import wbdata
import pandas as pd
import plotly.express as px
import numpy as np
from datetime import datetime

# Split and count countries as before
countries_split = df_officers['countries'].str.split(';').explode()
country_counts = countries_split.value_counts()

# Get population data
population_indicator = 'SP.POP.TOTL'  # Total population indicator
date = datetime(2015, 1, 1)  # Using 2015 as reference year

# Fetch population data
pop_data = wbdata.get_dataframe({population_indicator: 'Population'})
pop_data = pop_data.reset_index()

# Create a mapping dictionary for country names (you might need to adjust some names)
country_name_fixes = {
    'United States': 'United States of America',
    'UK': 'United Kingdom',
    # Add more mappings if needed
}

# Apply name fixes
countries_fixed = countries_split.map(lambda x: country_name_fixes.get(x, x))
country_counts_fixed = countries_fixed.value_counts()

# Merge population data with officer counts
merged_data = pd.DataFrame({
    'country': country_counts_fixed.index,
    'officers': country_counts_fixed.values
})
merged_data = merged_data.merge(pop_data, left_on='country', 
                              right_on='country', how='left')

# Calculate officers per million
merged_data['officers_per_million'] = (merged_data['officers'] / 
                                     merged_data['Population']) * 1_000_000


merged_data_filtered = merged_data[merged_data['officers_per_million'] < 300]

fig = px.choropleth(
    merged_data_filtered,
    locations='country',
    locationmode='country names',
    color=merged_data_filtered['officers_per_million'].fillna(0),
    # color=np.log10(merged_data['officers_per_million'].fillna(0) + 1),  # Add 1 to handle zeros
    color_continuous_scale='Viridis_r',
    title='nodes_officers per Million (WB, 2015)',
    labels={'color': 'nodes_officers per Million'}
)

fig.update_layout(
    title_x=0.5,
    geo=dict(showframe=False, showcoastlines=True, projection_type='equirectangular'),
    width=1000,
    height=600
)

fig.show()

print("\nTop 15 countries by officers per million inhabitants:")
print(merged_data_filtered.sort_values('officers_per_million', ascending=False)
      .head(15)[['country', 'officers_per_million', 'officers', 'Population']]
      .round(2))


Top 15 countries by officers per million inhabitants:
          country  officers_per_million  officers  Population
4948      Estonia                299.94       411   1370286.0
4968      Estonia                299.84       411   1370720.0
9238        Tonga                299.82        29     96725.0
4628       Latvia                298.88       572   1913822.0
3437       Norway                298.85      1258   4209488.0
5000      Estonia                298.48       411   1376955.0
3282  New Zealand                298.46      1348   4516500.0
9237        Tonga                298.34        29     97204.0
1807       Sweden                298.09      2921   9799186.0
4969      Estonia                297.97       411   1379350.0
3436       Norway                297.62      1258   4226901.0
1331     Malaysia                297.44      3756  12627862.0
6742         Fiji                297.30       128    430536.0
9236        Tonga                297.30        29     97544.0
3223      Leban

In [24]:
# Define Global South countries (this is a simplified list, you might want to adjust it)
global_south = [
    'Afghanistan', 'Algeria', 'Angola', 'Argentina', 'Bangladesh', 'Benin', 
    'Bolivia', 'Botswana', 'Brazil', 'Burkina Faso', 'Burundi', 'Cambodia', 
    'Cameroon', 'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia', 
    'Congo', 'Costa Rica', 'Cuba', 'DR Congo', 'Dominican Republic', 'Ecuador', 
    'Egypt', 'El Salvador', 'Ethiopia', 'Gabon', 'Ghana', 'Guatemala', 'Guinea', 
    'Haiti', 'Honduras', 'India', 'Indonesia', 'Iran', 'Iraq', 'Ivory Coast', 
    'Jamaica', 'Jordan', 'Kenya', 'Laos', 'Lebanon', 'Lesotho', 'Liberia', 
    'Libya', 'Madagascar', 'Malawi', 'Malaysia', 'Mali', 'Mauritania', 'Mexico', 
    'Morocco', 'Mozambique', 'Myanmar', 'Namibia', 'Nepal', 'Nicaragua', 'Niger', 
    'Nigeria', 'Pakistan', 'Panama', 'Papua New Guinea', 'Paraguay', 'Peru', 
    'Philippines', 'Rwanda', 'Saudi Arabia', 'Senegal', 'Sierra Leone', 'Somalia', 
    'South Africa', 'South Sudan', 'Sri Lanka', 'Sudan', 'Syria', 'Tanzania', 
    'Thailand', 'Togo', 'Tunisia', 'Uganda', 'Uruguay', 'Venezuela', 'Vietnam', 
    'Yemen', 'Zambia', 'Zimbabwe'
]

# Filter the merged data for Global South countries
global_south_data = merged_data[merged_data['country'].isin(global_south)]

# Create choropleth map for Global South
fig = px.choropleth(
    global_south_data,
    locations='country',
    locationmode='country names',
    color=np.log10(global_south_data['officers_per_million'].fillna(0) + 1),
    color_continuous_scale='Viridis',
    title='Officers per Million Inhabitants in Global South Countries (Log Scale)',
    labels={'color': 'Log10(Officers per Million + 1)'}
)

# Update the layout to focus on Global South
fig.update_layout(
    title_x=0.5,
    geo=dict(
        showframe=False,
        showcoastlines=True,
        projection_type='equirectangular',
        # Adjust the map center and zoom to focus on Global South
        center=dict(lat=0, lon=20),
        projection_scale=1.5
    ),
    width=1000,
    height=600
)

fig.show()

# Print statistics for Global South countries
print("\nTop 15 Global South countries by officers per million inhabitants:")
print(global_south_data.sort_values('officers_per_million', ascending=False)
      .head(15)[['country', 'officers_per_million', 'officers', 'Population']]
      .round(2))

# Print some summary statistics
print("\nSummary statistics for Global South countries:")
print(f"Total number of countries with data: {len(global_south_data)}")
print(f"Total number of officers: {global_south_data['officers'].sum():,}")
print(f"Average officers per million: {global_south_data['officers_per_million'].mean():.2f}")


Top 15 Global South countries by officers per million inhabitants:
    country  officers_per_million  officers  Population
517  Panama               6215.78      6999   1126005.0
516  Panama               6034.30      6999   1159870.0
515  Panama               5855.43      6999   1195301.0
514  Panama               5680.43      6999   1232124.0
513  Panama               5510.74      6999   1270065.0
512  Panama               5347.50      6999   1308835.0
511  Panama               5190.64      6999   1348388.0
510  Panama               5040.23      6999   1388628.0
509  Panama               4896.30      6999   1429447.0
508  Panama               4758.36      6999   1470886.0
507  Panama               4625.69      6999   1513070.0
506  Panama               4497.84      6999   1556081.0
505  Panama               4375.13      6999   1599723.0
504  Panama               4258.10      6999   1643690.0
503  Panama               4146.51      6999   1687926.0

Summary statistics for Global South