In [8]:
import pandas as pd
import logging
from io import StringIO
import numpy as np

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Sample data for illustration purposes
owid_csv = """date,location,new_cases,new_deaths,total_cases,total_deaths,reproduction_rate,icu_patients,hosp_patients,weekly_icu_admissions,weekly_hosp_admissions,total_tests,new_tests,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,new_vaccinations,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
2020-01-22,Albania,0.0,0.0,0.0,0.0,,,,,,,,,,,,,2877797.0,104.6,38.2,14.8,4.8,11843.0,,,,,,,,,78.5,0.796
2020-01-22,Andorra,0.0,0.0,0.0,0.0,,,,,,,,,,,,,77265.0,,,,,,,,,,,,,,,
2020-01-22,Angola,0.0,0.0,0.0,0.0,,,,,,,,,,,,,32866272.0,26.3,16.7,2.5,0.7,6779.0,,,,,,,,,60.8,0.581
"""
google_cloud_csv = """date,location_key,new_confirmed,new_deceased,new_recovered,new_tested,total_confirmed,total_deceased,total_recovered,total_tested,population,population_density,aged_65_older,gdp_per_capita,smoking_prevalence
2020-01-22,AFG,0.0,0.0,,,0.0,0.0,,,,38928341.0,58.0,3.0,1803.0,7.1
2020-01-22,AGO,0.0,0.0,,,0.0,0.0,,,,32866272.0,26.0,2.5,6779.0,7.9
2020-01-22,ALB,0.0,0.0,,,0.0,0.0,,,,2877797.0,104.0,14.8,11843.0,
"""
csse_confirmed_csv = """Province/State,Country/Region,Lat,Long,1/22/20
NaN,Thailand,15.0000,101.0000,2
NaN,Japan,36.0000,138.0000,2
"""
csse_deaths_csv = """Province/State,Country/Region,Lat,Long,1/22/20
NaN,Thailand,15.0000,101.0000,0
NaN,Japan,36.0000,138.0000,0
"""
who_csv = """Date_reported,Country,New_cases,New_deaths,Cumulative_cases,Cumulative_deaths
2020-01-22,Thailand,0,0,3,0
2020-01-22,Japan,0,0,2,0
"""

# Read sample data into DataFrames
owid_data = pd.read_csv(StringIO(owid_csv))
google_cloud_data = pd.read_csv(StringIO(google_cloud_csv))
csse_confirmed = pd.read_csv(StringIO(csse_confirmed_csv))
csse_deaths = pd.read_csv(StringIO(csse_deaths_csv))
who_data = pd.read_csv(StringIO(who_csv))


In [9]:
# Function to clean OWID data
def clean_owid(data):
    data = data.rename(columns={"date": "Date", "location": "Country", "new_cases": "NewCases", "new_deaths": "NewDeaths"})
    data["Date"] = pd.to_datetime(data["Date"])
    return data

# Function to clean Google Cloud data
def clean_google_cloud(data):
    data = data.rename(columns={"date": "Date", "location_key": "Country", "new_confirmed": "NewCases", "new_deceased": "NewDeaths"})
    data["Date"] = pd.to_datetime(data["Date"], errors='coerce')
    data = data.dropna(subset=["Date"])  # Remove rows with invalid dates
    return data

# Function to clean CSSE Confirmed Cases data
def clean_csse_confirmed(data):
    data = data.melt(id_vars=["Province/State", "Country/Region", "Lat", "Long"], var_name="Date", value_name="ConfirmedCases")
    data["Date"] = pd.to_datetime(data["Date"], errors='coerce')
    data = data.dropna(subset=["Date"])  # Remove rows with invalid dates
    return data

# Function to clean CSSE Deaths data
def clean_csse_deaths(data):
    data = data.melt(id_vars=["Province/State", "Country/Region", "Lat", "Long"], var_name="Date", value_name="Deaths")
    data["Date"] = pd.to_datetime(data["Date"], errors='coerce')
    data = data.dropna(subset=["Date"])  # Remove rows with invalid dates
    return data

# Function to clean WHO data
def clean_who(data):
    data = data.rename(columns={"Date_reported": "Date", "Country": "Country", "New_cases": "NewCases", "New_deaths": "NewDeaths"})
    data["Date"] = pd.to_datetime(data["Date"])
    return data

# Clean datasets
owid_data_cleaned = clean_owid(owid_data)
google_cloud_data_cleaned = clean_google_cloud(google_cloud_data)
csse_confirmed_cleaned = clean_csse_confirmed(csse_confirmed)
csse_deaths_cleaned = clean_csse_deaths(csse_deaths)
who_data_cleaned = clean_who(who_data)


In [10]:
# Merge datasets on Country and Date
merged_data = pd.merge(owid_data_cleaned, google_cloud_data_cleaned, on=["Country", "Date"], suffixes=("_owid", "_gc"), how="outer")
merged_data = pd.merge(merged_data, who_data_cleaned, on=["Country", "Date"], suffixes=("", "_who"), how="outer")
merged_data = pd.merge(merged_data, csse_confirmed_cleaned, left_on=["Country", "Date"], right_on=["Country/Region", "Date"], suffixes=("", "_csse"), how="outer")
merged_data = pd.merge(merged_data, csse_deaths_cleaned, left_on=["Country", "Date"], right_on=["Country/Region", "Date"], suffixes=("", "_csse_deaths"), how="outer")

# Rename columns for consistency
merged_data = merged_data.rename(columns={"Country/Region": "Country"})

# Fill NaN values with 0 for numerical columns where appropriate
for col in merged_data.select_dtypes(include=[np.number]).columns:
    merged_data[col] = merged_data[col].fillna(0)

# Inspect merged data
logging.info("Merged Data:")
logging.info(merged_data.head())


2024-08-03 15:11:24,218 - INFO - Merged Data:
2024-08-03 15:11:24,219 - INFO -         Date   Country  NewCases_owid  NewDeaths_owid  total_cases  \
0 2020-01-22   Albania            0.0             0.0          0.0   
1 2020-01-22   Andorra            0.0             0.0          0.0   
2 2020-01-22    Angola            0.0             0.0          0.0   
3 2020-01-22  Thailand            0.0             0.0          0.0   
4 2020-01-22     Japan            0.0             0.0          0.0   

   total_deaths  reproduction_rate  icu_patients  hosp_patients  \
0           0.0                0.0           0.0            0.0   
1           0.0                0.0           0.0            0.0   
2           0.0                0.0           0.0            0.0   
3           0.0                0.0           0.0            0.0   
4           0.0                0.0           0.0            0.0   

   weekly_icu_admissions  ...  Province/State   Country   Lat   Long  \
0                    0.0 

In [11]:
# Save the final merged dataset
final_dataset_path = "final_covid19_data_sample.csv"
merged_data.to_csv(final_dataset_path, index=False)
logging.info(f"Final dataset saved to {final_dataset_path}")


2024-08-03 15:11:25,266 - INFO - Final dataset saved to final_covid19_data_sample.csv
