In [3]:
import os
import pandas as pd
import logging
from tqdm.notebook import tqdm

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Function to read CSV data in chunks
def read_csv_in_chunks(file_path, chunksize=10000):
    """Lazily yield successive DataFrame chunks read from a CSV file.

    Args:
        file_path: Location of the CSV file; passed straight to
            ``pd.read_csv``.
        chunksize: Maximum number of rows in each yielded chunk.

    Yields:
        One ``pandas.DataFrame`` per chunk, in file order.

    This is a generator, so nothing runs (not even the log message below)
    until the caller asks for the first chunk.
    """
    logging.info(f"Reading data in chunks from {file_path}")
    # The chunked reader keeps the file open between chunks; the context
    # manager closes it even if the consumer stops early or a chunk fails.
    with pd.read_csv(file_path, chunksize=chunksize) as reader:
        yield from reader

# Function to parse and verify date columns
def parse_date(data, column_name):
    """Convert one column of ``data`` to datetimes, modifying it in place.

    Args:
        data: DataFrame holding the column.
        column_name: Name of the column to convert.

    Entries that cannot be parsed are coerced to ``NaT`` instead of raising
    (``errors='coerce'``). Nothing is returned.
    """
    parsed = pd.to_datetime(data[column_name], errors='coerce')
    data[column_name] = parsed

# Function to check and convert columns to numeric, handling errors
def check_numeric(data, columns):
    """Convert the listed columns of ``data`` to numeric dtype, in place.

    Args:
        data: DataFrame to modify.
        columns: Iterable of column names to convert; each must exist in
            ``data``.

    Values that cannot be interpreted as numbers become ``NaN``
    (``errors='coerce'``). Nothing is returned.
    """
    for name in columns:
        converted = pd.to_numeric(data[name], errors='coerce')
        data[name] = converted

# Function to handle missing values
def handle_missing_values(data, fill_value=0, columns=None):
    """Replace missing values in ``data``, modifying it in place.

    Args:
        data: DataFrame to modify.
        fill_value: Replacement for missing entries. Defaults to 0, which
            matches the previous hard-coded behavior.
        columns: Optional iterable of column names to restrict the fill to.
            ``None`` (the default) fills every column of ``data``.

    Nothing is returned.
    """
    if columns is None:
        # NOTE(review): this fills *all* columns, not only the numeric ones;
        # pass ``columns`` to leave text/date columns untouched.
        data.fillna(fill_value, inplace=True)
        return

    selected = list(columns)
    if selected:
        data[selected] = data[selected].fillna(fill_value)

# Function to clean data chunk
def clean_data_chunk(chunk, date_columns, numeric_columns):
    """Apply the full cleaning sequence to one chunk and return it.

    Args:
        chunk: DataFrame to clean; it is modified in place.
        date_columns: Iterable of column names handed to ``parse_date``.
        numeric_columns: Column names handed to ``check_numeric``.

    Returns:
        The same ``chunk`` object that was passed in.
    """
    # Order matters: the coercions may introduce new missing values, which
    # the final fill step then replaces.
    for name in date_columns:
        parse_date(chunk, name)

    check_numeric(chunk, numeric_columns)

    handle_missing_values(chunk)

    return chunk

# Ensure the cleaned data directory exists
cleaned_data_dir = "datasets/cleaned"
if not os.path.isdir(cleaned_data_dir):
    # exist_ok=True keeps this safe if the directory appears between the
    # check above and the creation call; the check itself only decides
    # whether the "Created" message is logged.
    os.makedirs(cleaned_data_dir, exist_ok=True)
    logging.info(f"Created directory: {cleaned_data_dir}")

# Define columns to be cleaned for each dataset
# OWID: one date column plus the case/death counters.
owid_date_columns = ["date"]
owid_numeric_columns = [
    "new_cases", "new_deaths",
    "total_cases", "total_deaths",
    "weekly_cases", "weekly_deaths",
    "biweekly_cases", "biweekly_deaths",
]

# Google Cloud: one date column plus a long list of numeric columns,
# grouped below by name prefix/theme for readability only.
google_cloud_date_columns = ["date"]
google_cloud_numeric_columns = [
    # epidemiology
    "new_confirmed", "new_deceased", "new_recovered", "new_tested",
    "total_confirmed", "total_deceased", "total_recovered", "total_tested",
    # hospitalizations
    "new_hospitalized", "total_hospitalized", "current_hospitalized",
    "new_intensive_care", "total_intensive_care", "current_intensive_care",
    "new_ventilator", "total_ventilator", "current_ventilator",
    # population
    "population", "population_male", "population_female",
    "rural_population", "urban_population",
    "largest_city_population", "clustered_population",
    "population_density", "human_development_index",
    "population_age_00_09", "population_age_10_19",
    "population_age_20_29", "population_age_30_39",
    "population_age_40_49", "population_age_50_59",
    "population_age_60_69", "population_age_70_79",
    "population_age_80_89", "population_age_90_99",
    "population_age_80_and_older",
    # economy
    "gdp", "gdp_per_capita", "human_capital_index",
    # geography
    "latitude", "longitude", "elevation",
    "area", "rural_area", "urban_area",
    # health
    "life_expectancy", "smoking_prevalence", "diabetes_prevalence",
    "infant_mortality_rate",
    "adult_male_mortality_rate", "adult_female_mortality_rate",
    "pollution_mortality_rate", "comorbidity_mortality_rate",
    "hospital_beds", "nurses", "physicians",
    "health_expenditure", "out_of_pocket_health_expenditure",
    # mobility
    "mobility_retail_and_recreation", "mobility_grocery_and_pharmacy",
    "mobility_parks", "mobility_transit_stations",
    "mobility_workplaces", "mobility_residential",
    # government response
    "school_closing", "workplace_closing", "cancel_public_events",
    "restrictions_on_gatherings", "public_transport_closing",
    "stay_at_home_requirements", "restrictions_on_internal_movement",
    "international_travel_controls",
    "income_support", "debt_relief", "fiscal_measures",
    "international_support", "public_information_campaigns",
    "testing_policy", "contact_tracing",
    "emergency_investment_in_healthcare", "investment_in_vaccines",
    "facial_coverings", "vaccination_policy", "stringency_index",
    # weather
    "noaa_station", "noaa_distance",
    "average_temperature", "minimum_temperature", "maximum_temperature",
    "rainfall", "snowfall", "dew_point", "relative_humidity",
]

# CSSE files: no date columns are parsed; the numeric columns are named by
# day, as unpadded month/day plus two-digit year (e.g. "1/22/20").
# The labels are built from the date components rather than with the
# '%-m/%-d/%y' strftime pattern, because the '-' flag is a platform-specific
# extension that is not available everywhere (notably Windows).
csse_confirmed_date_columns = []
csse_confirmed_numeric_columns = [
    f"{day.month}/{day.day}/{day.year % 100:02d}"
    for day in pd.date_range(start='2020-01-22', end='2023-03-09')
]

# The deaths file uses the same daily column labels; keep a separate list
# object so the two definitions stay independent.
csse_deaths_date_columns = []
csse_deaths_numeric_columns = list(csse_confirmed_numeric_columns)

who_date_columns = ["Date_reported"]
who_numeric_columns = [
    "New_cases",
    "New_deaths",
    "Cumulative_cases",
    "Cumulative_deaths",
]

# Clean datasets in chunks and save cleaned chunks
# Maps a dataset label to (input CSV path, date columns, numeric columns).
# The label is also used to name the cleaned output file.
dataset_files = dict(
    owid=(
        "datasets/owid/full_data.csv",
        owid_date_columns,
        owid_numeric_columns,
    ),
    # Disabled entry, kept for reference:
    # google_cloud=("datasets/google_cloud/main.csv", google_cloud_date_columns, google_cloud_numeric_columns),
    csse_confirmed=(
        "datasets/csse/time_series_covid19_confirmed_global.csv",
        csse_confirmed_date_columns,
        csse_confirmed_numeric_columns,
    ),
    csse_deaths=(
        "datasets/csse/time_series_covid19_deaths_global.csv",
        csse_deaths_date_columns,
        csse_deaths_numeric_columns,
    ),
    who=(
        "datasets/who/WHO-COVID-19-global-data.csv",
        who_date_columns,
        who_numeric_columns,
    ),
)

# Clean every configured dataset and write one cleaned CSV per dataset.
logging.info("Starting cleaning of datasets in batches.")
for dataset, dataset_spec in dataset_files.items():
    file_path, date_columns, numeric_columns = dataset_spec
    logging.info(f"Processing dataset: {dataset}")

    chunk_iter = read_csv_in_chunks(file_path)
    progress = tqdm(chunk_iter, desc=f"Cleaning {dataset}")
    # Every cleaned chunk is kept in memory until the concat below.
    cleaned_chunks = [
        clean_data_chunk(chunk, date_columns, numeric_columns)
        for chunk in progress
    ]

    cleaned_data = pd.concat(cleaned_chunks, ignore_index=True)
    output_path = f"{cleaned_data_dir}/{dataset}_data_cleaned.csv"
    cleaned_data.to_csv(output_path, index=False)
    logging.info(f"Finished cleaning and saved {dataset} dataset.")
logging.info("Finished cleaning all datasets in batches.")


2024-08-04 09:15:35,998 - INFO - Starting cleaning of datasets in batches.
2024-08-04 09:15:36,001 - INFO - Processing dataset: owid


Cleaning owid: 0it [00:00, ?it/s]

2024-08-04 09:15:36,054 - INFO - Reading data in chunks from datasets/owid/full_data.csv
2024-08-04 09:15:42,300 - INFO - Finished cleaning and saved owid dataset.
2024-08-04 09:15:42,302 - INFO - Processing dataset: csse_confirmed


Cleaning csse_confirmed: 0it [00:00, ?it/s]

2024-08-04 09:15:42,358 - INFO - Reading data in chunks from datasets/csse/time_series_covid19_confirmed_global.csv
2024-08-04 09:15:43,138 - INFO - Finished cleaning and saved csse_confirmed dataset.
2024-08-04 09:15:43,141 - INFO - Processing dataset: csse_deaths


Cleaning csse_deaths: 0it [00:00, ?it/s]

2024-08-04 09:15:43,253 - INFO - Reading data in chunks from datasets/csse/time_series_covid19_deaths_global.csv
2024-08-04 09:15:43,924 - INFO - Finished cleaning and saved csse_deaths dataset.
2024-08-04 09:15:43,926 - INFO - Processing dataset: who


Cleaning who: 0it [00:00, ?it/s]

2024-08-04 09:15:43,983 - INFO - Reading data in chunks from datasets/who/WHO-COVID-19-global-data.csv
2024-08-04 09:15:45,320 - INFO - Finished cleaning and saved who dataset.
2024-08-04 09:15:45,322 - INFO - Finished cleaning all datasets in batches.
