In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import os

In [2]:
# Get the current working directory (where your notebook is running)
NOTEBOOK_DIR = Path(os.getcwd())

# Navigate to your dataset subfolder
DATA_DIR = NOTEBOOK_DIR.parent /"raw" / "Disease_and_Mobidity_data"

FILE_NAME = "WHO_TOP_CAUSES_OF_DEATH_FEMALE.csv"
FILE_PATH = DATA_DIR / FILE_NAME

print("Looking for file at:", FILE_PATH)

Looking for file at: C:\Users\USER\Desktop\WORK\RenewedCare\data\raw\Disease_and_Mobidity_data\WHO_TOP_CAUSES_OF_DEATH_FEMALE.csv


In [3]:
print(DATA_DIR.exists())   # Should be True

True


In [4]:
# Load dataset
df = pd.read_csv(FILE_PATH)

# === 1. Check basic info ===
print("✅ Dataset loaded successfully!")
print(f"Number of rows: {df.shape[0]}")
print(f"Number of columns: {df.shape[1]}")
print("\n📋 Column names:")
print(df.columns.tolist())

✅ Dataset loaded successfully!
Number of rows: 24522
Number of columns: 5

📋 Column names:
['DIM_COUNTRY_CODE', 'DIM_YEAR_CODE', 'DIM_GHECAUSE_TITLE', 'DIM_SEX_CODE', 'VAL_DTHS_RATE100K_NUMERIC']


In [5]:
# === 2. Check for missing values ===
print("\n🔍 Missing values summary:")
missing_summary = df.isnull().sum().sort_values(ascending=False)
missing_percent = (df.isnull().mean() * 100).sort_values(ascending=False)

# Combine both into a single DataFrame for clarity
missing_report = pd.DataFrame({
    'Missing Values': missing_summary,
    'Percentage (%)': missing_percent.round(2)
})

# Show only columns with missing values
print(missing_report[missing_report['Missing Values'] > 0])

# === 3. (Optional) Show quick stats for numeric columns ===
print("\n📊 Quick numeric summary:")
display(df.describe())


🔍 Missing values summary:
Empty DataFrame
Index: []

📊 Quick numeric summary:


Unnamed: 0,DIM_YEAR_CODE,VAL_DTHS_RATE100K_NUMERIC
count,24522.0,24522.0
mean,2021.0,5.479543
std,0.0,22.383233
min,2021.0,0.0
25%,2021.0,0.0
50%,2021.0,0.17
75%,2021.0,2.39
max,2021.0,720.9


In [7]:
# Rename columns (matching lowercase names)
df = df.rename(columns={
    "DIM_GHECAUSE_TITLE": "diseases",
    "DIM_COUNTRY_CODE": "country_code",
    "DIM_YEAR_CODE": "year",
    "DIM_SEX_CODE": "sex",
    "VAL_DTHS_RATE100K_NUMERIC": "death_rate"
})

# Verify the rename
print(df.columns.tolist())


['country_code', 'year', 'diseases', 'sex', 'death_rate']


In [17]:
df["diseases"] = df["diseases"].str.strip().str.lower()
df["country_code"] = df["country_code"].str.strip().str.lower()
df["sex"] = df["sex"].str.strip().str.low()

In [23]:
print(df["year"].dtype)


int64


In [18]:
# List of diseases to look for
target_diseases = [
    "malaria", "tuberculosis", "cholera", "chlamydia", "gonorrhea",
    "syphilis", "trichomoniasis", "public lice", "crabs", "scabies",
    "mycoplasma genitalium", "hiv", "herpes", "human papillomavirus",
    "hpv", "hepatitis", "mpox"
]

# Create a mask to check if any disease name appears in the GHECAUSE title
mask = df['diseases'].apply(
    lambda x: any(disease in x for disease in target_diseases)
)

# Filter the dataset
disease_df = df[mask].copy()

# Group by disease and country
result = (
    disease_df.groupby(['diseases', 'country_code'])
    .size()
    .reset_index(name='Count')
    .sort_values(by=['diseases', 'Count'], ascending=[True, False])
)

# Show result
print("Diseases and countries they appear in:")
display(result)

Diseases and countries they appear in:


Unnamed: 0,diseases,country_code,Count
0,acute hepatitis a,afg,1
1,acute hepatitis a,ago,1
2,acute hepatitis a,alb,1
3,acute hepatitis a,are,1
4,acute hepatitis a,arg,1
...,...,...,...
2008,tuberculosis,wsm,1
2009,tuberculosis,yem,1
2010,tuberculosis,zaf,1
2011,tuberculosis,zmb,1


In [19]:
# Function to count distinct countries for each disease
def count_countries_per_disease(df, diseases):
    counts = []
    for disease in diseases:
        mask = df['diseases'].str.contains(disease, case=False, na=False)
        countries = df.loc[mask, 'country_code'].unique()
        counts.append({
            'Disease': disease,
            'Distinct Countries': len(countries),
            'Country List': list(countries)
        })
    return pd.DataFrame(counts).sort_values(by='Distinct Countries', ascending=False)

# Apply function
country_counts = count_countries_per_disease(df, target_diseases)

# Display results
print("✅ Distinct country counts per disease:")
display(country_counts)

✅ Distinct country counts per disease:


Unnamed: 0,Disease,Distinct Countries,Country List
0,malaria,183,"[afg, ago, alb, are, arg, arm, atg, aus, aut, ..."
11,hiv,183,"[afg, ago, alb, are, arg, arm, atg, aus, aut, ..."
15,hepatitis,183,"[afg, ago, alb, are, arg, arm, atg, aus, aut, ..."
3,chlamydia,183,"[afg, ago, alb, are, arg, arm, atg, aus, aut, ..."
5,syphilis,183,"[afg, ago, alb, are, arg, arm, atg, aus, aut, ..."
6,trichomoniasis,183,"[afg, ago, alb, are, arg, arm, atg, aus, aut, ..."
1,tuberculosis,183,"[afg, ago, alb, are, arg, arm, atg, aus, aut, ..."
12,herpes,183,"[afg, ago, alb, are, arg, arm, atg, aus, aut, ..."
14,hpv,0,[]
13,human papillomavirus,0,[]


In [20]:
# Filter to only rows that contain one of the target diseases
mask = df['diseases'].apply(
    lambda x: any(disease in x for disease in target_diseases)
)

# Keep only matching rows (drop all others)
filtered_df = df[mask].copy()

print(f"✅ Filtered dataset now contains {len(filtered_df)} rows (from {len(df)} originally).")
print(f"✅ Diseases included: {filtered_df['diseases'].nunique()} unique causes.")

# Optional: inspect what remains
display(filtered_df.head())

✅ Filtered dataset now contains 2013 rows (from 24522 originally).
✅ Diseases included: 11 unique causes.


Unnamed: 0,country_code,year,diseases,sex,death_rate
0,afg,2021,syphilis,fmle,1.4
1,afg,2021,genital herpes,fmle,0.0
19,afg,2021,tuberculosis,fmle,33.05
20,afg,2021,hiv/aids,fmle,1.17
21,afg,2021,acute hepatitis b,fmle,3.5


In [21]:
print("Number of rows left:", len(filtered_df))


Number of rows left: 2013


In [22]:
# Case-insensitive filtering for partial matches
mask = df["diseases"].str.lower().apply(
    lambda x: any(disease in x for disease in target_diseases)
)

filtered_df = df[mask].copy()

print(f"✅ Filtered dataset now contains {len(filtered_df)} rows (from {len(df)} originally).")
print(f"✅ Diseases included: {filtered_df['diseases'].nunique()} unique causes.")

# Define output file path
output_path = NOTEBOOK_DIR.parent / "processed" / "WHO_TOP_CAUSES_OF_DEATH_FEMALE.csv"

# Save to CSV
filtered_df.to_csv(output_path, index=False)

print(f"Filtered data saved to: {output_path}")
print(f"Number of rows left: {len(filtered_df)}")

✅ Filtered dataset now contains 2013 rows (from 24522 originally).
✅ Diseases included: 11 unique causes.
Filtered data saved to: C:\Users\USER\Desktop\WORK\RenewedCare\data\processed\WHO_TOP_CAUSES_OF_DEATH_FEMALE.csv
Number of rows left: 2013


In [24]:
print(df.dtypes)


country_code     object
year              int64
diseases         object
sex              object
death_rate      float64
dtype: object
