In [None]:
from kaggle.api.kaggle_api_extended import KaggleApi

# Kaggle source: authenticate, fetch the dataset archive, and load its CSV.
api = KaggleApi()
api.authenticate()

try:
    # unzip=True extracts the archive into kaggle_data/, which is where the
    # CSV is read from on the next line.
    api.dataset_download_files(
        'sudalairajkumar/novel-corona-virus-2019-dataset',
        path='kaggle_data/',
        unzip=True,
    )
    kaggle_df = pd.read_csv('kaggle_data/covid_19_data.csv')

    # Preview: header line followed by the first rows of the frame.
    print("\nKaggle DataFrame:", kaggle_df.head(), sep="\n")

except Exception as exc:
    # Best-effort cell: report the failure and let the notebook continue.
    # kaggle_df is left unbound (or stale) when this branch runs.
    print(f"Error downloading or reading Kaggle dataset: {exc}")

In [None]:
import requests

# NOTE(review): this address looks like a documentation landing page rather
# than a direct CSV export — confirm it actually serves CSV data.
url = "https://health.google.com/covid-19/open-data/raw-data"

try:
    # Bound the wait (seconds) so a stalled connection cannot hang the notebook.
    response = requests.get(url, timeout=30)
    # Stop on 4xx/5xx responses so an error page is never written to disk and
    # then parsed as if it were the dataset.
    response.raise_for_status()

    # Persist the raw payload, then load it back from the saved file.
    with open("google_health_data.csv", "wb") as file:
        file.write(response.content)
    google_health_df = pd.read_csv("google_health_data.csv")

    # Check the first few rows of the Google Health dataframe
    print("\nGoogle Health DataFrame:")
    print(google_health_df.head())

except Exception as e:
    # Best-effort cell: report the failure (including HTTP errors and
    # timeouts raised above) and let the notebook continue.
    print(f"Error downloading or reading Google Health dataset: {e}")


In [None]:
def normalize_data(df, column_mapping, source_name):
    """Project one source's frame onto the shared integration schema.

    Args:
        df: DataFrame holding one source's data. It is not modified; the
            work is done on the renamed copy returned by ``DataFrame.rename``.
        column_mapping: Dict of source column name -> schema column name.
            Keys that do not occur in ``df`` are ignored by ``rename``.
        source_name: Label written to every row of the ``source`` column.

    Returns:
        DataFrame with exactly these columns, in this order: date, location,
        cases, deaths, recovered, tests, hospitalized, icu, age_group,
        gender, source. ``date`` is converted with ``pd.to_datetime``.
        Schema columns the source does not provide are added as all-missing
        placeholders: NaN for the numeric measures, None for text fields.

    Raises:
        KeyError: If no ``date`` column exists after renaming. The message
            names the source and lists the columns that were available.
    """
    numeric_columns = ['cases', 'deaths', 'recovered', 'tests', 'hospitalized', 'icu']
    required_columns = ['date', 'location', *numeric_columns, 'age_group', 'gender', 'source']

    df = df.rename(columns=column_mapping)

    # Fail with context instead of a bare KeyError('date'): several sources go
    # through this function and the caller only prints the message.
    if 'date' not in df.columns:
        raise KeyError(
            f"{source_name}: no 'date' column after renaming; "
            f"available columns: {list(df.columns)}"
        )

    df['date'] = pd.to_datetime(df['date'])
    df['source'] = source_name

    for col in required_columns:
        if col not in df.columns:
            # A None-filled column is object dtype, which can leak into the
            # combined frame when sources are concatenated. NaN keeps absent
            # measures numeric; text fields keep None as before.
            df[col] = float('nan') if col in numeric_columns else None

    return df[required_columns]

# Example column mappings for different datasets
# Each tuple below lists one source's column headers; zipping it against the
# schema tuple pairs them position by position (header -> schema name).
# Entry order is significant: the normalization step looks mappings up by
# index, with the source labels noted on each entry.
column_mappings = [
    dict(zip(source_headers, (
        'date', 'location', 'cases', 'deaths', 'recovered',
        'tests', 'hospitalized', 'icu', 'age_group', 'gender',
    )))
    for source_headers in (
        # 0: applied to the 'GovEx' frame
        ('Date', 'Country/Region', 'Confirmed', 'Deaths', 'Recovered',
         'Tests', 'Hospitalized', 'ICU', 'Age Group', 'Gender'),
        # 1: applied to the 'Google' frame
        ('date', 'subregion1_name', 'total_confirmed', 'total_deceased', 'total_recovered',
         'total_tests', 'total_hospitalized', 'total_icu', 'age_group', 'gender'),
        # 2: applied to the 'OWID' frame
        ('date', 'location', 'total_cases', 'total_deaths', 'total_recovered',
         'total_tests', 'hospitalized', 'icu', 'age_group', 'gender'),
        # 3: applied to the 'CSSE' frame
        ('Date', 'Country_Region', 'Confirmed', 'Deaths', 'Recovered',
         'Tests', 'Hospitalized', 'ICU', 'Age Group', 'Gender'),
        # 4: applied to the 'Kaggle' frame
        ('ObservationDate', 'Country/Region', 'Confirmed', 'Deaths', 'Recovered',
         'Tests', 'Hospitalized', 'ICU', 'Age Group', 'Gender'),
        # 5: applied to the 'GoogleHealth' frame
        ('Date', 'Country/Region', 'Confirmed Cases', 'Deaths', 'Recovered',
         'Tests Conducted', 'Hospitalized Patients', 'ICU Patients', 'Age Group', 'Gender'),
    )
]

# Normalize datasets
# Each source frame is run through normalize_data with the mapping stored at
# its index in column_mappings. The list is built in one expression, so a
# failure for any single source leaves normalized_dfs unassigned by this cell.
try:
    normalized_dfs = [
        normalize_data(govex_df, column_mappings[0], 'GovEx'),
        normalize_data(google_df, column_mappings[1], 'Google'),
        normalize_data(owid_df, column_mappings[2], 'OWID'),
        normalize_data(csse_df, column_mappings[3], 'CSSE'),
        normalize_data(kaggle_df, column_mappings[4], 'Kaggle'),
        normalize_data(google_health_df, column_mappings[5], 'GoogleHealth')
    ]

    # Check the first few rows of the normalized dataframes
    for i, df in enumerate(normalized_dfs):
        # The label is read from the first row; a frame with no rows has none,
        # and .iloc[0] would raise IndexError and cut the remaining previews
        # short. Fall back to a placeholder label instead.
        label = df['source'].iloc[0] if len(df) else 'no rows'
        print(f"\nNormalized DataFrame {i + 1} ({label}):")
        print(df.head())

except Exception as e:
    # Best-effort cell: report the failure and let the notebook continue.
    print(f"Error normalizing datasets: {e}")


In [None]:
# Stack the per-source frames into a single table and fill the gaps.
try:
    final_df = pd.concat(normalized_dfs, ignore_index=True)

    # Missing values: numeric measures default to 0, descriptive fields to
    # 'Unknown'. Each listed column is replaced by its own fillna result;
    # assign overwrites existing columns in place, so column order is kept.
    final_df = final_df.assign(**{
        name: final_df[name].fillna(placeholder)
        for name, placeholder in {
            'cases': 0,
            'deaths': 0,
            'recovered': 0,
            'tests': 0,
            'hospitalized': 0,
            'icu': 0,
            'age_group': 'Unknown',
            'gender': 'Unknown',
        }.items()
    })

    # Preview the integrated result.
    print("\nFinal Integrated DataFrame:")
    print(final_df.head())

except Exception as exc:
    # Best-effort cell: report the failure and let the notebook continue.
    print(f"Error integrating datasets: {exc}")


In [None]:
# Write the integrated table to disk in two formats (CSV and Parquet),
# both without the index column.
try:
    final_df.to_csv('integrated_covid_data_complex.csv', index=False)
    final_df.to_parquet('integrated_covid_data_complex.parquet', index=False)
except Exception as exc:
    # Either write may have failed; if it was the Parquet write, the CSV
    # file has already been written by the line above it.
    print(f"Error saving the final dataset: {exc}")
else:
    print("\nFinal dataset saved successfully.")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Line chart of case counts over time, one hue per location.
try:
    plt.figure(figsize=(10, 6))
    # lineplot draws on the current figure and returns its Axes, so the title
    # can be set on that object directly.
    cases_ax = sns.lineplot(x='date', y='cases', hue='location', data=final_df)
    cases_ax.set_title('Covid-19 Cases Over Time by Location')
    plt.show()
    print("\nData analysis and visualization completed successfully.")

except Exception as exc:
    # Best-effort cell: report the failure and let the notebook continue.
    print(f"Error in data analysis and visualization: {exc}")
