In [1]:
import pandas as pd
import numpy as np

# Read the CSV file
df = pd.read_csv('WHO-COVID-19-global-table-data.csv')

# Define countries of interest
countries_of_interest = ['Kenya', 'United States of America', 'India', 'South Africa', 'China']

# Filter for countries of interest
df_filtered = df[df['Name'].isin(countries_of_interest)]

# Convert numeric columns to float, replacing empty strings with NaN
numeric_columns = [
    'Cases - cumulative total',
    'Cases - cumulative total per 100000 population',
    'Cases - newly reported in last 7 days',
    'Cases - newly reported in last 24 hours',
    'Deaths - cumulative total',
    'Deaths - cumulative total per 100000 population',
    'Deaths - newly reported in last 7 days',
    'Deaths - newly reported in last 24 hours'
]

for col in numeric_columns:
    df_filtered[col] = pd.to_numeric(df_filtered[col], errors='coerce')

# Handle missing values
# For cumulative totals, forward fill
cumulative_cols = [col for col in numeric_columns if 'cumulative' in col]
df_filtered[cumulative_cols] = df_filtered[cumulative_cols].fillna(method='ffill')

# For new cases/deaths, fill with 0
new_cols = [col for col in numeric_columns if 'newly' in col]
df_filtered[new_cols] = df_filtered[new_cols].fillna(0)

# Display the cleaned data
print("\nCleaned COVID-19 Data for Selected Countries:")
print(df_filtered)

# Save the cleaned data
df_filtered.to_csv('cleaned_covid_data.csv', index=False)
print("\nCleaned data saved to 'cleaned_covid_data.csv'")


Cleaned COVID-19 Data for Selected Countries:
                         Name       WHO Region  Cases - cumulative total  \
1                       China  Western Pacific                99375079.0   
89                      India  South-East Asia                45042054.0   
95               South Africa           Africa                 4072765.0   
119                     Kenya           Africa                  344106.0   
233  United States of America         Americas               103436829.0   

     Cases - cumulative total per 100000 population  \
1                                            6754.0   
89                                           3264.0   
95                                           6867.0   
119                                           640.0   
233                                         31250.0   

     Cases - newly reported in last 7 days  \
1                                   1860.0   
89                                   306.0   
95                         

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[col] = pd.to_numeric(df_filtered[col], errors='coerce')
  df_filtered[cumulative_cols] = df_filtered[cumulative_cols].fillna(method='ffill')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[cumulative_cols] = df_filtered[cumulative_cols].fillna(method='ffill')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#re