In [36]:
import pandas as pd

In [39]:
# Read every raw Excel workbook from the data folder (adjust the paths if the
# files live elsewhere). The names on the left are bound, in order, to the
# workbooks listed on the right.
(
    coverage_data,
    incidence_data,
    reported_cases_data,
    vaccine_intro_data,
    vaccine_schedule_data,
) = [
    pd.read_excel(workbook_path)
    for workbook_path in (
        "../data/coverage-data.xlsx",
        "../data/incidence-rate-data.xlsx",
        "../data/reported-cases-data.xlsx",
        "../data/vaccine-introduction-data.xlsx",
        "../data/vaccine-schedule-data.xlsx",
    )
]

In [40]:
# Step 3: Define a generic data cleaning function
def clean_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of ``df``; the input frame is left untouched.

    Cleaning steps:

    1. Drop fully duplicated rows.
    2. Drop columns in which every value is missing.
    3. Normalise column labels: cast to ``str``, strip surrounding
       whitespace, upper-case, and replace spaces with underscores.
    4. If a ``YEAR`` column exists after normalisation, convert it to the
       nullable ``Int64`` dtype; values that cannot be parsed as numbers
       become ``<NA>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw dataset to clean.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned DataFrame.
    """
    # The explicit copy() gives an independent frame, so the assignments
    # below can never touch the caller's data.
    cleaned = df.drop_duplicates().dropna(axis=1, how="all").copy()

    # Cast labels to str before using the .str accessor: numeric headers
    # (e.g. a sheet with one column per year) would otherwise raise, or be
    # silently replaced by NaN when mixed with string labels.
    cleaned.columns = (
        cleaned.columns.astype(str)
        .str.strip()
        .str.upper()
        .str.replace(" ", "_", regex=False)
    )

    # Nullable Int64 keeps years as integers even when some are missing.
    if "YEAR" in cleaned.columns:
        cleaned["YEAR"] = pd.to_numeric(cleaned["YEAR"], errors="coerce").astype("Int64")

    return cleaned

In [41]:
# Step 4: Run every raw dataset through clean_dataset; results are bound in
# the same order as the raw frames are listed.
(
    cleaned_coverage,
    cleaned_incidence,
    cleaned_reported_cases,
    cleaned_vaccine_intro,
    cleaned_vaccine_schedule,
) = map(
    clean_dataset,
    (
        coverage_data,
        incidence_data,
        reported_cases_data,
        vaccine_intro_data,
        vaccine_schedule_data,
    ),
)

In [44]:
# Step 5: Print a heading followed by the first rows of each cleaned dataset
for preview_heading, preview_frame in (
    ("Cleaned Coverage Data:\n", cleaned_coverage),
    ("Cleaned Incidence Rate Data:\n", cleaned_incidence),
    ("Cleaned Reported Cases Data:\n", cleaned_reported_cases),
    ("Cleaned Vaccine Introduction Data:\n", cleaned_vaccine_intro),
    ("Cleaned Vaccine Schedule Data:\n", cleaned_vaccine_schedule),
):
    print(preview_heading, preview_frame.head())

Cleaned Coverage Data:
        GROUP CODE   NAME  YEAR  ANTIGEN  \
0  COUNTRIES  ABW  Aruba  2023      BCG   
1  COUNTRIES  ABW  Aruba  2023      BCG   
2  COUNTRIES  ABW  Aruba  2023  DIPHCV4   
3  COUNTRIES  ABW  Aruba  2023  DIPHCV4   
4  COUNTRIES  ABW  Aruba  2023  DIPHCV5   

                                 ANTIGEN_DESCRIPTION COVERAGE_CATEGORY  \
0                                                BCG             ADMIN   
1                                                BCG          OFFICIAL   
2  Diphtheria-containing vaccine, 4th dose (1st b...             ADMIN   
3  Diphtheria-containing vaccine, 4th dose (1st b...          OFFICIAL   
4  Diphtheria-containing vaccine, 5th dose (2nd b...             ADMIN   

  COVERAGE_CATEGORY_DESCRIPTION  TARGET_NUMBER   DOSES  COVERAGE  
0       Administrative coverage            NaN     NaN       NaN  
1             Official coverage            NaN     NaN       NaN  
2       Administrative coverage         1044.0   945.0     90.52  
3   

In [45]:
# Rename the GROUP column to GROUP_NAME, in place, on the three datasets
# this cell has always targeted.
group_column_rename = {'GROUP': 'GROUP_NAME'}
for frame_with_group in (cleaned_coverage, cleaned_incidence, cleaned_reported_cases):
    frame_with_group.rename(columns=group_column_rename, inplace=True)

In [46]:
# Write each cleaned dataset to the 'data/' folder as a CSV file, leaving the
# row index out of the output.
for cleaned_frame, csv_path in (
    (cleaned_coverage, "../data/cleaned_coverage_data.csv"),
    (cleaned_incidence, "../data/cleaned_incidence_rate_data.csv"),
    (cleaned_reported_cases, "../data/cleaned_reported_cases_data.csv"),
    (cleaned_vaccine_intro, "../data/cleaned_vaccine_introduction_data.csv"),
    (cleaned_vaccine_schedule, "../data/cleaned_vaccine_schedule_data.csv"),
):
    cleaned_frame.to_csv(csv_path, index=False)

print("✅ All cleaned datasets have been saved to the data folder.")

✅ All cleaned datasets have been saved to the data folder.
