In [1]:
# Import required libraries
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
# Function to clean the dataset
def clean_data(df):
    # Drop unnecessary columns
    df.drop(columns=['FIPS', 'Admin2', 'Province_State'], inplace=True, errors='ignore')

    # Drop rows with missing values in critical columns
    df.dropna(subset=['Lat', 'Long_', 'Incident_Rate', 'Case_Fatality_Ratio'], inplace=True)

    # Convert 'Last_Update' column to datetime format
    df['Last_Update'] = pd.to_datetime(df['Last_Update'], errors='coerce')

    # Rename columns for better understanding
    df.rename(columns={'Country_Region': 'Country', 'Lat': 'Latitude', 'Long_': 'Longitude'}, inplace=True)

    # Convert numeric columns to proper data types
    numeric_columns = ['Confirmed', 'Deaths', 'Recovered', 'Active', 'Incident_Rate', 'Case_Fatality_Ratio']
    df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

    return df


In [8]:
# Define the input directory containing the raw CSV files
input_directory = "C:\\Users\\nik24\\Downloads\\COVID-19-master\\COVID-19-master\\csse_covid_19_data\\csse_covid_19_daily_reports\\"

# Define the output directory to save cleaned files
output_directory = os.path.join(input_directory, 'cleaned')

# Create the output folder if it doesn't exist
if not os.path.exists(output_directory):
    os.makedirs(output_directory)


In [9]:
# Get a list of all CSV files in the input directory
csv_files = [file for file in os.listdir(input_directory) if file.endswith('.csv')]

# Print the number of files found
print(f"Found {len(csv_files)} CSV files.")

# List to store cleaned DataFrames
cleaned_dataframes = []

# Loop through each file
for file in csv_files:
    file_path = os.path.join(input_directory, file)  # Full file path
    df = pd.read_csv(file_path)                     # Load the data
    cleaned_df = clean_data(df)                     # Clean the data
    cleaned_dataframes.append(cleaned_df)           # Add to list

    # Save cleaned data to the output directory
    output_path = os.path.join(output_directory, f"cleaned_{file}")
    cleaned_df.to_csv(output_path, index=False)
    print(f"Cleaned file saved: {output_path}")


Found 1144 CSV files.
Cleaned file saved: C:\Users\nik24\Downloads\COVID-19-master\COVID-19-master\csse_covid_19_data\csse_covid_19_daily_reports\cleaned\cleaned_01-01-2021.csv
Cleaned file saved: C:\Users\nik24\Downloads\COVID-19-master\COVID-19-master\csse_covid_19_data\csse_covid_19_daily_reports\cleaned\cleaned_01-01-2022.csv
Cleaned file saved: C:\Users\nik24\Downloads\COVID-19-master\COVID-19-master\csse_covid_19_data\csse_covid_19_daily_reports\cleaned\cleaned_01-01-2023.csv
Cleaned file saved: C:\Users\nik24\Downloads\COVID-19-master\COVID-19-master\csse_covid_19_data\csse_covid_19_daily_reports\cleaned\cleaned_01-02-2021.csv
Cleaned file saved: C:\Users\nik24\Downloads\COVID-19-master\COVID-19-master\csse_covid_19_data\csse_covid_19_daily_reports\cleaned\cleaned_01-02-2022.csv
Cleaned file saved: C:\Users\nik24\Downloads\COVID-19-master\COVID-19-master\csse_covid_19_data\csse_covid_19_daily_reports\cleaned\cleaned_01-02-2023.csv
Cleaned file saved: C:\Users\nik24\Downloads\COV

KeyError: ['Lat', 'Long_', 'Incident_Rate', 'Case_Fatality_Ratio']

In [10]:
# Combine all cleaned DataFrames
all_cleaned_data = pd.concat(cleaned_dataframes, ignore_index=True)

# Save the combined DataFrame to a new CSV file
combined_output_path = os.path.join(output_directory, "combined_cleaned_data.csv")
all_cleaned_data.to_csv(combined_output_path, index=False)
print(f"All cleaned data combined and saved: {combined_output_path}")


All cleaned data combined and saved: C:\Users\nik24\Downloads\COVID-19-master\COVID-19-master\csse_covid_19_data\csse_covid_19_daily_reports\cleaned\combined_cleaned_data.csv


In [11]:
# Group data by country and calculate totals
country_aggregated_data = all_cleaned_data.groupby('Country', as_index=False).agg({
    'Confirmed': 'sum',
    'Deaths': 'sum',
    'Recovered': 'sum',
    'Active': 'sum'
})

# Calculate death rate
country_aggregated_data['Death_Rate'] = (country_aggregated_data['Deaths'] / country_aggregated_data['Confirmed']) * 100

# Replace NaN and infinite values
country_aggregated_data['Death_Rate'] = country_aggregated_data['Death_Rate'].replace([float('inf'), -float('inf')], 0)
country_aggregated_data['Death_Rate'] = country_aggregated_data['Death_Rate'].fillna(0)
