In [1]:
import pandas as pd

# Load dataset
df = pd.read_csv("../data/raw/infrastructure/decadal-average-annual-number-of-deaths-from-disasters.csv")

# Preview data
df.head(), df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3094 entries, 0 to 3093
Data columns (total 3 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country name                     3094 non-null   object 
 1   Year                             3094 non-null   int64  
 2   Number of deaths from disasters  3094 non-null   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 72.6+ KB


(  Country name  Year  Number of deaths from disasters
 0  Afghanistan  1900                              0.0
 1  Afghanistan  1910                              0.0
 2  Afghanistan  1920                              0.0
 3  Afghanistan  1930                              0.0
 4  Afghanistan  1940                              0.0,
 None)

In [2]:
df = df.rename(columns={
    "Country name": "country",
    "Year": "year",
    "Number of deaths from disasters": "disaster_deaths"
})

df.head()


Unnamed: 0,country,year,disaster_deaths
0,Afghanistan,1900,0.0
1,Afghanistan,1910,0.0
2,Afghanistan,1920,0.0
3,Afghanistan,1930,0.0
4,Afghanistan,1940,0.0


In [3]:
!pip install pycountry





[notice] A new release of pip is available: 24.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import pycountry

def get_iso3(name):
    try:
        return pycountry.countries.lookup(name).alpha_3
    except:
        return None

df["iso_code"] = df["country"].apply(get_iso3)

df.head()


Unnamed: 0,country,year,disaster_deaths,iso_code
0,Afghanistan,1900,0.0,AFG
1,Afghanistan,1910,0.0,AFG
2,Afghanistan,1920,0.0,AFG
3,Afghanistan,1930,0.0,AFG
4,Afghanistan,1940,0.0,AFG


In [5]:
missing = df[df["iso_code"].isna()]["country"].unique()
missing


array(['Africa', 'Asia', 'Brunei', 'Cape Verde', "Cote d'Ivoire",
       'Czechoslovakia', 'Democratic Republic of Congo', 'East Germany',
       'East Timor', 'Europe', 'European Union (27)',
       'High-income countries', 'Low-income countries',
       'Lower-middle-income countries', 'Micronesia (country)',
       'Netherlands Antilles', 'North America', 'Oceania', 'Palestine',
       'Reunion', 'Russia', 'Saint Barthelemy', 'Saint Helena',
       'Serbia and Montenegro', 'South America', 'Turkey', 'USSR',
       'United States Virgin Islands', 'Upper-middle-income countries',
       'West Germany', 'World', 'Yemen Arab Republic',
       "Yemen People's Republic", 'Yugoslavia'], dtype=object)

In [6]:
# Manual ISO mappings
manual_iso = {
    "Brunei": "BRN",
    "Cape Verde": "CPV",
    "Cote d'Ivoire": "CIV",
    "Democratic Republic of Congo": "COD",
    "East Germany": "DEU",
    "West Germany": "DEU",
    "East Timor": "TLS",
    "Micronesia (country)": "FSM",
    "Netherlands Antilles": "ANT",
    "Palestine": "PSE",
    "Reunion": "REU",
    "Russia": "RUS",
    "Saint Barthelemy": "BLM",
    "Saint Helena": "SHN",
    "Serbia and Montenegro": "SRB",
    "Turkey": "TUR",
    "United States Virgin Islands": "VIR",
    "Yemen Arab Republic": "YEM",
    "Yemen People's Republic": "YEM",
}

# Apply manual ISO fixes
df['iso_code'] = df.apply(
    lambda row: manual_iso.get(row['country'], row['iso_code']),
    axis=1
)

# Countries to remove (regions + aggregates + unmappable historic countries)
remove_list = [
    "Africa", "Asia", "Europe", "European Union (27)", "High-income countries",
    "Low-income countries", "Lower-middle-income countries", "North America",
    "Oceania", "South America", "Upper-middle-income countries", "World",
    "Czechoslovakia", "Yugoslavia", "USSR"
]

df = df[~df['country'].isin(remove_list)].reset_index(drop=True)

# Check remaining missing ISO codes
df[df['iso_code'].isna()]['country'].unique()


array([], dtype=object)

In [8]:
print("Missing values (%):")
print(df.isna().mean() * 100)

print("\nSummary statistics:")
print(df.describe())


Missing values (%):
country            0.0
year               0.0
disaster_deaths    0.0
iso_code           0.0
dtype: float64

Summary statistics:
              year  disaster_deaths
count  2899.000000      2899.000000
mean   1960.000000       759.001150
std      37.423029     12752.205653
min    1900.000000         0.000000
25%    1930.000000         0.000000
50%    1960.000000         0.000000
75%    1990.000000         7.300000
max    2020.000000    438157.700000


In [9]:
unusual = df[df["disaster_deaths"] < 0]
unusual if len(unusual) else "No negative values found."


'No negative values found.'

In [11]:
output_path = "../data/interim/disaster_deaths_clean.csv"
df.to_csv(output_path, index=False)

print(f"Saved cleaned dataset to: {output_path}")


Saved cleaned dataset to: ../data/interim/disaster_deaths_clean.csv
