In [1]:
import pandas as pd

In [5]:
# Load the raw dataset
df = pd.read_csv("covid_19_clean_complete.csv")

# Report how many missing values each column has
print(df.isnull().sum())

# Parse 'Date' into real datetimes so the data can later be plotted over time
# or grouped by week, month, etc.
df['Date'] = pd.to_datetime(df['Date'])

# Standardize country names by trimming surrounding whitespace only.
# Title-casing is deliberately NOT applied: str.title() upper-cases the first
# letter of every word and lower-cases the rest, so it would rewrite names
# that are already correct (e.g. "US" would become "Us", "Cote d'Ivoire"
# would become "Cote D'Ivoire", "Bosnia and Herzegovina" would become
# "Bosnia And Herzegovina") and break later joins against canonical names.
df['Country/Region'] = df['Country/Region'].str.strip()

# Duplicate rows repeat information and would distort totals or averages,
# so collect them and report how many there are
duplicates = df[df.duplicated()]
print(f"Duplicate rows found: {len(duplicates)}")

# Overview of the WHO regions present and the countries with the most rows
print("Unique WHO Regions:", df['WHO Region'].unique())
print("Top 10 Countries by row count:\n", df['Country/Region'].value_counts().head(10))

Province/State    34404
Country/Region        0
Lat                   0
Long                  0
Date                  0
Confirmed             0
Deaths                0
Recovered             0
Active                0
WHO Region            0
dtype: int64
Duplicate rows found: 0
Unique WHO Regions: ['Eastern Mediterranean' 'Europe' 'Africa' 'Americas' 'Western Pacific'
 'South-East Asia']
Top 10 Countries by row count:
 China             6204
Canada            2256
France            2068
United Kingdom    2068
Australia         1504
Netherlands        752
Denmark            376
Afghanistan        188
Saudi Arabia       188
Romania            188
Name: Country/Region, dtype: int64


In [3]:
# Flag the rows whose active-case count is below zero; counting them shows
# whether the data contains any wrong numbers
negative_mask = df['Active'].lt(0)
negative_count = negative_mask.sum()
print(f"Number of negative active case rows: {negative_count}")


# Active cases can't be negative in real life, so overwrite the flagged
# entries with 0
df.loc[negative_mask, 'Active'] = 0

# Re-count to confirm that no negative values remain
remaining_negatives = df['Active'].lt(0).sum()
print(f"Negative values after cleaning: {remaining_negatives}")


Number of negative active case rows: 18
Negative values after cleaning: 0


In [4]:
# Save the cleaned version of the dataset to a new CSV file
df.to_csv("covid_19_cleaned.csv", index=False)
print("Cleaned dataset saved successfully as 'covid_19_cleaned.csv'")

Cleaned dataset saved successfully as 'covid_19_cleaned.csv'
