In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import os

In [2]:
# Anchor every path on the directory the notebook kernel is running from.
NOTEBOOK_DIR = Path(os.getcwd())

# The raw extract lives in the "raw" folder next to the notebook's directory.
DATA_DIR = NOTEBOOK_DIR.parent.joinpath("raw", "Disease_and_Mobidity_data")
FILE_NAME = "cholera_adm0_public.csv"
FILE_PATH = DATA_DIR.joinpath(FILE_NAME)

# Sanity check: prints True when the data folder is where we expect it.
print(DATA_DIR.exists())

# Read the CSV into a DataFrame.
df = pd.read_csv(FILE_PATH)

# Report the frame's dimensions and its column labels.
print("✅ Dataset loaded successfully!")
print(f"Number of rows: {df.shape[0]}")
print(f"Number of columns: {df.shape[1]}")
print("\n📋 Column names:")
print(list(df.columns))

True
✅ Dataset loaded successfully!
Number of rows: 34
Number of columns: 7

📋 Column names:
['adm0_name', 'who_region', 'iso_3_code', 'first_epiwk', 'last_epiwk', 'case_total', 'death_total']


In [3]:
# Preview the first rows of the loaded frame (head() defaults to five rows).
display(df.head())

Unnamed: 0,adm0_name,who_region,iso_3_code,first_epiwk,last_epiwk,case_total,death_total
0,AFGHANISTAN,Eastern Mediterranean Region,AFG,2024-12-30,2025-09-15,129325,64
1,ANGOLA,African Region,AGO,2025-01-06,2025-09-15,28773,799
2,BANGLADESH,South-East Asia Region,BGD,2024-12-30,2025-09-01,67,0
3,BURUNDI,African Region,BDI,2024-12-30,2025-09-15,1108,6
4,CHAD,African Region,TCD,2025-07-07,2025-09-15,2321,139


In [4]:
# 🧹 Rename columns to clear, Python-friendly names.
# An explicit old -> new mapping is used instead of assigning a positional
# list: each label is matched by name, so a source file whose columns arrive
# in a different order can no longer be silently mislabelled.
column_renames = {
    "adm0_name": "country_name",
    "who_region": "region",
    "iso_3_code": "country_code",
}
df = df.rename(columns=column_renames)

# rename() ignores mapping keys it cannot find, so confirm that every column
# the rest of the notebook relies on is actually present before continuing.
expected_columns = [
    "country_name", "region", "country_code", "first_epiwk",
    "last_epiwk", "case_total", "death_total"
]
missing_columns = [name for name in expected_columns if name not in df.columns]
if missing_columns:
    raise KeyError(f"Expected columns missing after rename: {missing_columns}")

# ✅ Verify the rename
print("✅ Columns renamed successfully:")
print(df.columns.tolist())

# Optional — quick look at first few rows
display(df.head())

✅ Columns renamed successfully:
['country_name', 'region', 'country_code', 'first_epiwk', 'last_epiwk', 'case_total', 'death_total']


Unnamed: 0,country_name,region,country_code,first_epiwk,last_epiwk,case_total,death_total
0,AFGHANISTAN,Eastern Mediterranean Region,AFG,2024-12-30,2025-09-15,129325,64
1,ANGOLA,African Region,AGO,2025-01-06,2025-09-15,28773,799
2,BANGLADESH,South-East Asia Region,BGD,2024-12-30,2025-09-01,67,0
3,BURUNDI,African Region,BDI,2024-12-30,2025-09-15,1108,6
4,CHAD,African Region,TCD,2025-07-07,2025-09-15,2321,139


In [5]:
# Count the distinct (country_code, country_name) pairs present in the data.
key_columns = ['country_code', 'country_name']
unique_countries = df.drop_duplicates(subset=key_columns)[key_columns]
print(f"Distinct countries found: {len(unique_countries)}")
print(unique_countries.head())

Distinct countries found: 34
  country_code country_name
0          AFG  AFGHANISTAN
1          AGO       ANGOLA
2          BGD   BANGLADESH
3          BDI      BURUNDI
4          TCD         CHAD


In [6]:
# Inspect each column's dtype; the bare expression is rendered as the cell output.
df.dtypes

country_name    object
region          object
country_code    object
first_epiwk     object
last_epiwk      object
case_total       int64
death_total      int64
dtype: object

In [7]:
# Convert 'first_epiwk' and 'last_epiwk' from text to datetime.
# errors='coerce' turns unparseable values into NaT instead of raising, so
# count how many values each conversion newly blanked out and report them;
# otherwise malformed dates would vanish without any trace.
for date_col in ('first_epiwk', 'last_epiwk'):
    missing_before = df[date_col].isna().sum()
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
    newly_missing = df[date_col].isna().sum() - missing_before
    if newly_missing:
        print(f"⚠️ {newly_missing} value(s) in '{date_col}' could not be parsed and are now NaT.")

# Verify conversion
print(df.dtypes)

country_name            object
region                  object
country_code            object
first_epiwk     datetime64[ns]
last_epiwk      datetime64[ns]
case_total               int64
death_total              int64
dtype: object


In [8]:
# Re-display the first rows to eyeball the frame after the datetime conversion.
display(df.head())

Unnamed: 0,country_name,region,country_code,first_epiwk,last_epiwk,case_total,death_total
0,AFGHANISTAN,Eastern Mediterranean Region,AFG,2024-12-30,2025-09-15,129325,64
1,ANGOLA,African Region,AGO,2025-01-06,2025-09-15,28773,799
2,BANGLADESH,South-East Asia Region,BGD,2024-12-30,2025-09-01,67,0
3,BURUNDI,African Region,BDI,2024-12-30,2025-09-15,1108,6
4,CHAD,African Region,TCD,2025-07-07,2025-09-15,2321,139


In [9]:
# List the current column labels as a plain Python list.
df.columns.tolist()

['country_name',
 'region',
 'country_code',
 'first_epiwk',
 'last_epiwk',
 'case_total',
 'death_total']

In [10]:
# Strip leading/trailing whitespace from every column label.
df.columns = df.columns.str.strip()

In [11]:
# Re-list the column labels to check them after stripping whitespace.
df.columns.tolist()

['country_name',
 'region',
 'country_code',
 'first_epiwk',
 'last_epiwk',
 'case_total',
 'death_total']

In [12]:
# Flag every object-dtype column that holds at least one value starting or
# ending with whitespace; missing values are treated as "no match".
edge_whitespace = r'^\s|\s$'
for col in df.select_dtypes(include='object').columns:
    if not df[col].str.contains(edge_whitespace, na=False).any():
        continue
    print(f"Column '{col}' has leading or trailing spaces.")


In [15]:
# Destination for the cleaned dataset: the "processed" folder next to the
# notebook's directory.
output_path = NOTEBOOK_DIR.parent / "processed" / "cholera_adm0_public.csv"

# DataFrame.to_csv does not create missing folders, so make sure the
# destination exists first; exist_ok=True keeps re-runs harmless.
output_path.parent.mkdir(parents=True, exist_ok=True)

# Save to CSV without writing the positional index as an extra column.
df.to_csv(output_path, index=False)

# NOTE(review): no rows are filtered in the cells shown here; the wording of
# these messages is kept unchanged so the recorded output still matches.
print(f"Filtered data saved to: {output_path}")
print(f"Number of rows left: {len(df)}")


Filtered data saved to: C:\Users\USER\Desktop\WORK\RenewedCare\data\processed\cholera_adm0_public.csv
Number of rows left: 34
