In [None]:
pip install pandas numpy matplotlib seaborn plotly requests folium openpyxl

In [None]:
# Core libraries
import pandas as pd
import numpy as np
import requests # For fetching data if not downloaded
# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import folium # For maps

# Notebook display settings
%matplotlib inline
sns.set_style("whitegrid")
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.2f' % x)

print("Libraries imported successfully!")

In [None]:
# --- Data Acquisition ---
# Source: the Our World in Data (OWID) COVID-19 dataset, read straight from its public CSV URL.
owid_url = 'https://covid.ourworldindata.org/data/owid-covid-data.csv'

# Best-effort download: on any failure the notebook keeps running with an empty
# frame, and every later cell checks `.empty` before doing work.
try:
    print(f"Attempting to download data from: {owid_url}")
    df_covid_raw = pd.read_csv(owid_url)
except Exception as load_error:
    print(f"Error downloading or loading data: {load_error}")
    print("Please ensure you have an internet connection or download the file manually and provide the local path.")
    # To work from a local copy instead:
    #   local_path = 'path/to/your/owid-covid-data.csv'
    #   df_covid_raw = pd.read_csv(local_path)
    df_covid_raw = pd.DataFrame()  # empty placeholder so downstream guards work
else:
    print("Data downloaded and loaded successfully!")

# Quick look at what was loaded (skipped when the download failed).
if not df_covid_raw.empty:
    print(
        "\n--- Initial Data Overview ---",
        f"Shape of the dataset: {df_covid_raw.shape}",
        "\nFirst 5 rows:",
        df_covid_raw.head(),
        "\nData types and non-null counts:",
        sep="\n",
    )
    df_covid_raw.info()

In [None]:
# --- Data Cleaning and Preprocessing ---
# Builds `df_covid`, the cleaned frame that every later cell reads.
# `df_covid` is ALWAYS defined by this cell: when the download failed it is an empty
# DataFrame, so the `if not df_covid.empty` guards in later cells skip cleanly instead
# of raising NameError.
if not df_covid_raw.empty:
    print("\n--- Data Cleaning and Preprocessing ---")
    df_covid = df_covid_raw.copy()

    # 1. Convert 'date' column to datetime objects
    df_covid['date'] = pd.to_datetime(df_covid['date'])
    print("\n'date' column converted to datetime.")

    # 2. Handle Missing Values (NaNs)
    # Key numerical columns are filled with 0, i.e. "not reported" is treated as zero.
    # NOTE(review): this is a simplification. For cumulative columns (total_*, people_*)
    # it makes the series drop to 0 on days without a report; analyses that need the
    # last known value should forward-fill per location instead.
    cols_to_fill_zero = [
        'total_cases', 'new_cases', 'total_deaths', 'new_deaths',
        'total_vaccinations', 'people_vaccinated', 'people_fully_vaccinated', 'total_boosters',
        'new_vaccinations', 'icu_patients', 'hosp_patients'
    ]
    # Every per-capita metric (per_million / per_hundred) gets the same zero-fill.
    per_million_cols = [col for col in df_covid.columns if 'per_million' in col or 'per_hundred' in col]
    # Keep only columns that exist, de-duplicated, and fill them in a single pass.
    zero_fill_cols = list(dict.fromkeys(
        [col for col in cols_to_fill_zero if col in df_covid.columns] + per_million_cols
    ))
    if zero_fill_cols:
        df_covid[zero_fill_cols] = df_covid[zero_fill_cols].fillna(0)

    # Continent: rows where 'continent' is NaN are OWID aggregates (World, continents,
    # income groups, ...). They are kept and labelled so later cells can filter on them.
    print(f"\nMissing values in 'continent' before fill: {df_covid['continent'].isnull().sum()}")
    df_covid['continent'] = df_covid['continent'].fillna('Aggregate') # Label aggregates
    print(f"Missing values in 'continent' after fill: {df_covid['continent'].isnull().sum()}")

    # 3. Create Derived Features
    # Case Fatality Rate (CFR), in percent. 0/0 gives NaN and x/0 gives inf; both map to 0.
    df_covid['case_fatality_rate'] = (
        (df_covid['total_deaths'] / df_covid['total_cases'] * 100)
        .replace([np.inf, -np.inf], 0)
        .fillna(0)
    )

    # Vaccination rate (people fully vaccinated per hundred).
    # OWID normally ships this column, and it was already zero-filled above.
    # Derive it from the raw counts only when it is missing.
    if 'people_fully_vaccinated_per_hundred' not in df_covid.columns and 'population' in df_covid.columns:
        df_covid['people_fully_vaccinated_per_hundred'] = (
            (df_covid['people_fully_vaccinated'] / df_covid['population'] * 100)
            .replace([np.inf, -np.inf], 0)
            .fillna(0)
        )

    # Ensure 7-day smoothed versions exist (OWID provides many of them).
    # The rolling mean follows row order within each location, so it assumes rows are
    # chronological per location - TODO confirm for locally supplied files.
    for metric in ['new_cases', 'new_deaths', 'new_vaccinations']:
        smoothed_col = f'{metric}_smoothed'
        if smoothed_col not in df_covid.columns:
            df_covid[smoothed_col] = df_covid.groupby('location')[metric].transform(
                lambda series: series.rolling(window=7, min_periods=1).mean()
            )
            print(f"Calculated '{smoothed_col}'.")
        else:
            df_covid[smoothed_col] = df_covid[smoothed_col].fillna(0) # Fill NaNs in existing smoothed cols

    print("\n--- Cleaned Data Overview ---")
    print(f"Shape of the cleaned dataset: {df_covid.shape}")
    print(df_covid[['date', 'location', 'total_cases', 'new_cases_smoothed', 'total_deaths', 'new_deaths_smoothed', 'people_fully_vaccinated_per_hundred', 'case_fatality_rate']].tail())
    print("\nData cleaning and preprocessing complete.")
else:
    # Define the name anyway so downstream `df_covid.empty` checks work.
    df_covid = pd.DataFrame()
    print("Skipping cleaning as data failed to load.")

In [None]:
# --- Global time trends ---
# Plots the worldwide 7-day-average cases/deaths and the vaccination progress using the
# OWID 'World' aggregate rows of `df_covid`.
if not df_covid.empty:
    print("\n--- Objective: Analyze Global Time Trends ---")

    # OWID marks the world aggregate with iso_code 'OWID_WRL'; fall back to the
    # location name when that code is not present.
    world_iso_codes = ['OWID_WRL'] # Common ISO code for World aggregate
    df_global = df_covid[df_covid['iso_code'].isin(world_iso_codes)].copy()
    if df_global.empty:
        df_global = df_covid[df_covid['location'] == 'World'].copy()

    if not df_global.empty:
        # Line traces connect points in row order, so make the order chronological.
        df_global = df_global.sort_values('date')

        # 1. Global Cases and Deaths Over Time (two stacked panels sharing the date axis)
        fig_global_cases_deaths = make_subplots(rows=2, cols=1, shared_xaxes=True,
                                                subplot_titles=('Global Daily New Cases (7-day avg)',
                                                                'Global Daily New Deaths (7-day avg)'))

        fig_global_cases_deaths.add_trace(go.Scatter(x=df_global['date'], y=df_global['new_cases_smoothed'],
                                                     mode='lines', name='New Cases Smoothed'), row=1, col=1)
        fig_global_cases_deaths.add_trace(go.Scatter(x=df_global['date'], y=df_global['new_deaths_smoothed'],
                                                     mode='lines', name='New Deaths Smoothed', line=dict(color='red')), row=2, col=1)

        fig_global_cases_deaths.update_layout(height=600, title_text='Global COVID-19 Cases and Deaths Trends')
        fig_global_cases_deaths.show()

        # 2. Global Vaccinations Over Time
        # Each series is drawn only if its column exists, so a dataset lacking one of
        # them yields a partial chart rather than a KeyError.
        vaccination_series = [
            ('people_vaccinated_per_hundred', 'People Vaccinated (%)'),
            ('people_fully_vaccinated_per_hundred', 'People Fully Vaccinated (%)'),
            ('total_boosters_per_hundred', 'Total Boosters Administered (%)'),
        ]
        fig_global_vaccinations = go.Figure()
        for column, trace_name in vaccination_series:
            if column not in df_global.columns:
                print(f"Column '{column}' not found; skipping it in the vaccination chart.")
                continue
            fig_global_vaccinations.add_trace(go.Scatter(x=df_global['date'], y=df_global[column],
                                                         mode='lines', name=trace_name))

        fig_global_vaccinations.update_layout(title='Global COVID-19 Vaccination Progress (% of Population)',
                                              yaxis_title='Percentage of Population', height=500)
        fig_global_vaccinations.show()
    else:
        print("Could not find 'World' data in the dataset. Check 'iso_code' or 'location' for global aggregate.")
else:
    print("Skipping global trends analysis as data is not loaded.")

In [None]:
# --- Country and continent comparison ---
# Builds `df_latest_country_data` (one row per country, also used by the maps cell),
# draws three "top N" bar charts from it, and plots continent-level time series.
if not df_covid.empty:
    print("\n--- Objective: Compare Metrics Across Countries/Regions ---")

    # Countries only: aggregates were labelled 'Aggregate' during cleaning.
    df_countries_only = df_covid[df_covid['continent'] != 'Aggregate'].copy()
    latest_date = df_countries_only['date'].max()

    # Snapshot = the most recent row of EACH country, not only rows dated exactly
    # `latest_date`. Countries stop reporting on different days, and an exact-date
    # filter would silently drop every country whose last report is earlier.
    df_countries_sorted = df_countries_only.sort_values(['location', 'date'])

    # Cleaning zero-filled unreported days, so for cumulative metrics a 0 after a
    # positive value means "no report", not "back to zero". Carry the last reported
    # value forward within each country before taking the snapshot.
    cumulative_cols = [col for col in (
        'total_cases', 'total_deaths',
        'total_cases_per_million', 'total_deaths_per_million',
        'people_fully_vaccinated_per_hundred',
    ) if col in df_countries_sorted.columns]
    if cumulative_cols:
        reported_only = df_countries_sorted[cumulative_cols].where(df_countries_sorted[cumulative_cols] > 0)
        df_countries_sorted[cumulative_cols] = (
            reported_only.groupby(df_countries_sorted['location']).ffill().fillna(0)
        )

    df_latest_country_data = (
        df_countries_sorted.groupby('location').tail(1)
        .sort_values(by='total_cases', ascending=False)
    )

    # 1. Top N Countries by Metrics
    N = 20 # Number of top countries to show

    def _plot_top_n(metric, metric_title, axis_label, color_scale):
        """Show a bar chart of the N countries with the highest `metric`.

        Returns the (top-N frame, figure) pair so both stay inspectable afterwards.
        """
        top_n = df_latest_country_data.sort_values(by=metric, ascending=False).head(N)
        fig = px.bar(top_n, x='location', y=metric,
                     title=f'Top {N} Countries by {metric_title} (as of {latest_date.date()})',
                     labels={'location': 'Country', metric: axis_label},
                     color=metric, color_continuous_scale=color_scale)
        fig.show()
        return top_n, fig

    # Top N by Total Cases per Million
    top_n_cases_pm, fig_top_cases = _plot_top_n(
        'total_cases_per_million', 'Total Cases per Million',
        'Total Cases per Million', px.colors.sequential.Viridis)

    # Top N by Total Deaths per Million
    top_n_deaths_pm, fig_top_deaths = _plot_top_n(
        'total_deaths_per_million', 'Total Deaths per Million',
        'Total Deaths per Million', px.colors.sequential.Reds)

    # Top N by People Fully Vaccinated per Hundred
    top_n_vacc_pfh, fig_top_vacc = _plot_top_n(
        'people_fully_vaccinated_per_hundred', 'People Fully Vaccinated per Hundred',
        '% Fully Vaccinated', px.colors.sequential.Greens)

    # 2. Regional (Continental) Comparison
    # OWID ships ready-made continent rows (location 'Asia', 'Europe', ...) under
    # 'OWID_'-prefixed iso codes. The prefix alone is not enough: it is also used for
    # World, income groups, the European Union and some territories. So the rows are
    # additionally restricted to the six continent names.
    continent_names = ['Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America']
    is_owid_aggregate = df_covid['iso_code'].str.startswith('OWID_', na=False)
    df_continents = df_covid[is_owid_aggregate & df_covid['location'].isin(continent_names)].copy()
    # The location already is the human-readable continent name.
    df_continents['continent_name'] = df_continents['location']

    if not df_continents.empty:
        # New Cases Smoothed per Million by Continent
        fig_continent_cases = px.line(df_continents, x='date', y='new_cases_smoothed_per_million', color='continent_name',
                                      title='New Cases per Million (7-day smoothed) by Continent',
                                      labels={'new_cases_smoothed_per_million': 'New Cases/Million (Smoothed)', 'continent_name': 'Continent'})
        fig_continent_cases.show()

        # People Fully Vaccinated per Hundred by Continent
        fig_continent_vacc = px.line(df_continents, x='date', y='people_fully_vaccinated_per_hundred', color='continent_name',
                                     title='People Fully Vaccinated per Hundred by Continent',
                                     labels={'people_fully_vaccinated_per_hundred': '% Fully Vaccinated', 'continent_name': 'Continent'})
        fig_continent_vacc.show()
    else:
        print("Could not find specific continent aggregate data. You might need to group by 'continent' column and sum/average.")
        # A fallback would aggregate the country rows by their 'continent' column
        # (sum the counts, then divide by the summed populations). It is deliberately
        # not implemented here.

else:
    # Keep the name defined so the maps cell's `.empty` check cannot raise NameError.
    df_latest_country_data = pd.DataFrame()
    print("Skipping country/regional comparison as data is not loaded.")

In [None]:
# --- World maps ---
# One choropleth per metric, all drawn from the per-country snapshot
# `df_latest_country_data` and keyed on OWID's 'iso_code' column.
if not df_covid.empty and not df_latest_country_data.empty:
    print("\n--- Objective: Visualize Trends with Maps ---")

    # (metric column, colour scale, figure title) for each map, in display order.
    map_specs = (
        ("total_cases_per_million", px.colors.sequential.YlOrRd,
         f'Total COVID-19 Cases per Million by Country (as of {latest_date.date()})'),
        ("total_deaths_per_million", px.colors.sequential.OrRd,
         f'Total COVID-19 Deaths per Million by Country (as of {latest_date.date()})'),
        ("people_fully_vaccinated_per_hundred", px.colors.sequential.Greens,
         f'People Fully Vaccinated per Hundred by Country (as of {latest_date.date()})'),
    )

    rendered_maps = []
    for metric_col, colour_scale, map_title in map_specs:
        country_map = px.choropleth(
            df_latest_country_data,
            locations="iso_code",   # matched by Plotly against its built-in country shapes
            color=metric_col,
            hover_name="location",  # country name shown in the tooltip
            color_continuous_scale=colour_scale,
            title=map_title,
            projection="natural earth",
        )
        # Trim the margins, leaving room at the top for the title.
        country_map.update_layout(margin=dict(r=0, t=50, l=0, b=0))
        country_map.show()
        rendered_maps.append(country_map)

    # Keep each figure reachable under its own name.
    fig_map_cases, fig_map_deaths, fig_map_vacc = rendered_maps

    # Folium is an alternative when more map customisation is needed; Plotly Express is
    # simply the shorter route to a quick choropleth. Sketch of the Folium equivalent:
    # world_geo = 'https://raw.githubusercontent.com/python-visualization/folium/master/examples/data/world-countries.json'
    # m = folium.Map(location=[0, 0], zoom_start=2)
    # folium.Choropleth(
    #     geo_data=world_geo,
    #     name='choropleth',
    #     data=df_latest_country_data,
    #     columns=['iso_code', 'total_cases_per_million'], # iso_code must match the key in geo_data
    #     key_on='feature.id', # where the ISO code sits in each GeoJSON feature
    #     fill_color='YlOrRd',
    #     fill_opacity=0.7,
    #     line_opacity=0.2,
    #     legend_name='Total Cases per Million'
    # ).add_to(m)
    # # m # display the map (some environments need it saved to HTML and shown in an iframe)

else:
    print("Skipping geographical maps as data is not loaded or latest country data is unavailable.")

In [None]:
# COVID-19 Global Trends Analysis: Findings

This report analyzes global COVID-19 data, focusing on time trends, comparisons across regions, and visualizations.

## 1. Introduction
- **Objective:** To import, clean, analyze, and visualize global COVID-19 data regarding cases, deaths, and vaccinations.
- **Data Source:** Our World in Data (OWID) COVID-19 dataset (`https://covid.ourworldindata.org/data/owid-covid-data.csv`).
- **Key Metrics:** Cases, deaths (total and per million), vaccination rates (people fully vaccinated per hundred), case fatality rate.

## 2. Data Cleaning and Preparation
- The raw data was loaded and underwent several cleaning steps:
    - Conversion of `date` column to datetime objects.
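    - Handling of missing values, primarily by filling key numerical metrics with 0. This is a simplification: a day with no report is treated as zero, which also applies to cumulative totals and per-capita metrics.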
    - `continent` column NaN values (representing aggregates like 'World' or continents) were filled with 'Aggregate'.
    - Derived features like `case_fatality_rate` were calculated/ensured.
    - Smoothed 7-day averages for new cases and deaths were used for trend analysis.

## 3. Key Findings

### 3.1. Global Time Trends
- **Cases & Deaths:**
    - The global trend of new cases (7-day average) shows distinct waves of the pandemic. [Refer to "Global COVID-19 Cases and Deaths Trends" chart].
    - Similarly, global new deaths (7-day average) followed the case waves, often with a lag.
- **Vaccinations:**
    - Global vaccination rollout shows a steady increase in the percentage of the population vaccinated and fully vaccinated over time. [Refer to "Global COVID-19 Vaccination Progress" chart].
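    - Booster dose administration also shows an upward trend, though starting later and at a lower level. Note that boosters are reported as total doses per 100 people, not as a share of the population, so this series is not directly comparable to the two percentage series.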

### 3.2. Comparative Analysis (Countries & Regions)
- **Top Countries (as of latest data):**
    - **Cases per Million:** Countries like [mention 1-2 from chart, e.g., Cyprus, San Marino - this will vary] reported the highest cumulative cases per million. [Refer to "Top N Countries by Total Cases per Million" chart].
    - **Deaths per Million:** Countries like [mention 1-2 from chart, e.g., Peru, Bulgaria - this will vary] had the highest cumulative deaths per million. [Refer to "Top N Countries by Total Deaths per Million" chart].
    - **Vaccination Rates:** Nations such as [mention 1-2 from chart, e.g., UAE, Portugal - this will vary] achieved very high full vaccination rates. [Refer to "Top N Countries by People Fully Vaccinated per Hundred" chart].
- **Continental Trends:**
    - Different continents experienced pandemic waves at varying times and intensities. [Refer to "New Cases per Million by Continent" chart].
    - Vaccination progress also varied significantly by continent, with [mention leading/lagging continents from chart] showing distinct trajectories. [Refer to "People Fully Vaccinated per Hundred by Continent" chart].

### 3.3. Geographical Distribution (Maps)
- The choropleth maps provide a visual overview of the pandemic's impact globally:
    - **Cases per Million:** Highlights regions with higher overall case burdens relative to their population. [Refer to "Total COVID-19 Cases per Million by Country" map].
    - **Deaths per Million:** Shows countries with higher mortality rates per capita. [Refer to "Total COVID-19 Deaths per Million by Country" map].
    - **Vaccination Coverage:** Illustrates the disparities in full vaccination coverage across the globe. [Refer to "People Fully Vaccinated per Hundred by Country" map].

## 4. Limitations
- **Data Reporting:** Data accuracy and reporting frequency can vary significantly between countries and over time. This can affect comparisons.
- **Definitions:** Definitions (e.g., "fully vaccinated") might have changed or varied.
- **Attribution:** Deaths attributed to COVID-19 can be complex and subject to different national methodologies.
- **Aggregates:** Global and continental aggregates are based on available data and may not perfectly represent all constituent populations if some countries have reporting gaps.

## 5. Conclusion
The analysis of the OWID COVID-19 dataset reveals the dynamic nature of the pandemic, with distinct waves and varying impacts across different regions and countries. Vaccination campaigns have progressed globally, but significant disparities remain. Continuous monitoring and data-driven insights are crucial for understanding and responding to public health challenges like COVID-19.

---

*(This report was generated on [Insert Current Date Here])*

---