In [7]:
import pandas as pd
import logging
from tqdm.notebook import tqdm

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Function to standardize column names
def standardize_columns(df, country_columns):
    for col in country_columns:
        if col in df.columns:
            df.rename(columns={col: 'Country'}, inplace=True)
            break
    return df

# Load cleaned datasets in chunks
def load_cleaned_data_in_chunks(file_path, chunksize=10000):
    logging.info(f"Loading cleaned data in chunks from {file_path}")
    country_columns = ["Country", "Country/Region", "location"]
    for chunk in pd.read_csv(file_path, chunksize=chunksize):
        standardize_columns(chunk, country_columns)
        yield chunk

# Merge datasets in chunks
def merge_datasets_in_batches(dataset_files, keys, output_file, chunksize=10000):
    merged_data = None
    for dataset, file_path in dataset_files.items():
        logging.info(f"Merging dataset: {dataset}")
        chunk_iter = load_cleaned_data_in_chunks(file_path, chunksize)
        for chunk in tqdm(chunk_iter, desc=f"Merging {dataset}"):
            if merged_data is None:
                merged_data = chunk
            else:
                merged_data = pd.merge(merged_data, chunk, on=keys, how='outer')
            merged_data.to_csv(output_file, index=False)
            logging.info(f"Saved merged data to {output_file}")
    return merged_data

# Cleaned datasets paths
cleaned_dataset_files = {
    "owid": "datasets/cleaned/owid_data_cleaned.csv",
    #"google_cloud": "datasets/cleaned/google_cloud_data_cleaned.csv",
    "csse_confirmed": "datasets/cleaned/csse_confirmed_data_cleaned.csv",
    "csse_deaths": "datasets/cleaned/csse_deaths_data_cleaned.csv",
    "who": "datasets/cleaned/who_data_cleaned.csv"
}


# Merge datasets
logging.info("Merging datasets in batches.")
keys = ["date", "Country"]
output_file = "datasets/merged_final_dataset.csv"
merged_data = merge_datasets_in_batches(cleaned_dataset_files, keys, output_file)
logging.info("Finished merging datasets in batches.")


2024-08-04 09:55:45,789 - INFO - Merging datasets in batches.
2024-08-04 09:55:45,791 - INFO - Merging dataset: owid


Merging owid: 0it [00:00, ?it/s]

2024-08-04 09:55:45,839 - INFO - Loading cleaned data in chunks from datasets/cleaned/owid_data_cleaned.csv
2024-08-04 09:55:46,037 - INFO - Saved merged data to datasets/merged_final_dataset.csv
2024-08-04 09:55:46,334 - INFO - Saved merged data to datasets/merged_final_dataset.csv
2024-08-04 09:55:46,834 - INFO - Saved merged data to datasets/merged_final_dataset.csv
2024-08-04 09:55:47,626 - INFO - Saved merged data to datasets/merged_final_dataset.csv
2024-08-04 09:55:48,778 - INFO - Saved merged data to datasets/merged_final_dataset.csv
2024-08-04 09:55:50,491 - INFO - Saved merged data to datasets/merged_final_dataset.csv
2024-08-04 09:55:52,763 - INFO - Saved merged data to datasets/merged_final_dataset.csv
2024-08-04 09:55:55,554 - INFO - Saved merged data to datasets/merged_final_dataset.csv
2024-08-04 09:55:58,961 - INFO - Saved merged data to datasets/merged_final_dataset.csv
2024-08-04 09:56:03,148 - INFO - Saved merged data to datasets/merged_final_dataset.csv
2024-08-04 0

Merging csse_confirmed: 0it [00:00, ?it/s]

2024-08-04 09:59:47,455 - INFO - Loading cleaned data in chunks from datasets/cleaned/csse_confirmed_data_cleaned.csv


KeyError: 'date'