In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the datasets
# The WHO file stores Namibia's ISO code as the literal text "NA", which pandas
# treats as a missing value by default. Turning the default NA markers off and
# listing only the empty string keeps blank cells as NaN while preserving "NA".
who_df = pd.read_csv(
    '/content/WHO-COVID-19-global-data.csv',
    keep_default_na=False,
    na_values=[''],
)
expenditure_df = pd.read_excel('/content/COVID_expenditure.xlsx')
population_df = pd.read_csv('/content/Population, female (% of total population).csv')
# NOTE(review): the column listing printed later in this notebook shows
# gdp_df's labels as ('United States', 27360935000000, ...), i.e. the first data
# row is being consumed as the header. The sheet probably needs header=None and
# explicit column names -- confirm against the workbook before using gdp_df.
gdp_df = pd.read_excel('/content/GDP.xlsx')
area_df = pd.read_csv('/content/area.csv')
gender_df = pd.read_csv('/content/pop.csv')

In [None]:
# Standardize the name of the country key column across datasets.
# who_df already exposes 'Country'; gdp_df has no country header to rename.
# The source names below match what each file actually contains: the female
# population file uses 'Economy', while area.csv and pop.csv both use 'name'.
# rename() silently ignores keys that are absent, so a wrong source name would
# leave the frame unchanged without raising.
expenditure_df = expenditure_df.rename(columns={'Countries': 'Country'})
population_df = population_df.rename(columns={'Economy': 'Country'})
area_df = area_df.rename(columns={'name': 'Country'})
gender_df = gender_df.rename(columns={'name': 'Country'})

In [None]:
# Print the column index of every loaded frame so the names can be verified
column_report = [
    ("WHO Global Columns:", who_df),
    ("Expenditure Columns:", expenditure_df),
    ("Population Columns:", population_df),
    ("GDP Columns:", gdp_df),
    ("Area Columns:", area_df),
    ("Gender Columns:", gender_df),
]
for heading, frame in column_report:
    print(heading, frame.columns)

WHO Global Columns: Index(['Date_reported', 'Country_code', 'Country', 'WHO_region', 'New_cases',
       'Cumulative_cases', 'New_deaths', 'Cumulative_deaths'],
      dtype='object')
Expenditure Columns: Index(['Country', 'Indicators', 'Unnamed: 2', '2021'], dtype='object')
Population Columns: Index(['Economy', 'Year', 'Economy Code',
       'Population, female (% of total population)'],
      dtype='object')
GDP Columns: Index(['United States',  27360935000000,  25744108000000,  23594031000000,
        21322950000000,  21521395000000],
      dtype='object')
Area Columns: Index(['Country', 'slug', 'value', 'date_of_information', 'ranking', 'region'], dtype='object')
Gender Columns: Index(['name', 'slug', 'value', 'date_of_information', 'ranking', 'region'], dtype='object')


In [None]:
# Give the value columns descriptive names and make sure each lookup table
# exposes its country key as 'Country'.
# NOTE(review): in the merged output displayed in this notebook Afghanistan has
# Area=37466414 and Region_Size=652230, which look like a population and a land
# area respectively -- confirm that area.csv / pop.csv feed the intended columns.
expenditure_df = expenditure_df.rename(columns={'2021': 'Expenditure_Per_Capita_2021'})
population_df = population_df.rename(columns={
    'Economy': 'Country',
    'Population, female (% of total population)': 'Population_Female_Percent',
})
area_df = area_df.rename(columns={'name': 'Country', 'value': 'Area'})
gender_df = gender_df.rename(columns={'name': 'Country', 'value': 'Region_Size'})

In [None]:
# Merge datasets sequentially on 'Country'
def _one_row_per_country(frame, value_col):
    """Return the 'Country' key and ``value_col`` with repeated countries removed.

    who_df holds many rows per country, so a lookup table that lists a country
    more than once (the source files carry extra dimensions such as 'Year' and
    'Indicators') would multiply those rows in a left merge. Only the first
    occurrence of each country is kept.
    """
    return frame[['Country', value_col]].drop_duplicates(subset='Country')


merged_df = who_df
for lookup_df, value_col in [
    (expenditure_df, 'Expenditure_Per_Capita_2021'),
    (population_df, 'Population_Female_Percent'),
    (area_df, 'Area'),
    (gender_df, 'Region_Size'),
]:
    merged_df = merged_df.merge(
        _one_row_per_country(lookup_df, value_col),
        on='Country',
        how='left',
        validate='many_to_one',  # each country may match at most one lookup row
    )

# A left merge against de-duplicated lookups must leave the WHO row count intact.
assert len(merged_df) == len(who_df), "left merges changed the number of WHO rows"

In [None]:
# Display the merged frame (the rendering is truncated to the leading and trailing rows)
merged_df

Unnamed: 0,Date_reported,Country_code,Country,WHO_region,New_cases,Cumulative_cases,New_deaths,Cumulative_deaths,Expenditure_Per_Capita_2021,Population_Female_Percent,Area,Region_Size
0,2020-01-05,AF,Afghanistan,EMRO,,0,,0,2.324045,49.488,37466414,652230
1,2020-01-12,AF,Afghanistan,EMRO,,0,,0,2.324045,49.488,37466414,652230
2,2020-01-19,AF,Afghanistan,EMRO,,0,,0,2.324045,49.488,37466414,652230
3,2020-01-26,AF,Afghanistan,EMRO,,0,,0,2.324045,49.488,37466414,652230
4,2020-02-02,AF,Afghanistan,EMRO,,0,,0,2.324045,49.488,37466414,652230
...,...,...,...,...,...,...,...,...,...,...,...,...
60235,2024-09-22,ZW,Zimbabwe,AFRO,,266393,,5740,,52.833,14829988,390757
60236,2024-09-29,ZW,Zimbabwe,AFRO,0.0,266393,0.0,5740,,52.833,14829988,390757
60237,2024-10-06,ZW,Zimbabwe,AFRO,3.0,266396,0.0,5740,,52.833,14829988,390757
60238,2024-10-13,ZW,Zimbabwe,AFRO,,266396,,5740,,52.833,14829988,390757


In [None]:
# Count the missing values in every column of the merged frame
merged_df.isna().sum()

Unnamed: 0,0
Date_reported,0
Country_code,251
Country,0
WHO_region,4518
New_cases,15213
Cumulative_cases,0
New_deaths,15075
Cumulative_deaths,0
Expenditure_Per_Capita_2021,47941
Population_Female_Percent,14307


In [None]:
# pd.read_csv treats the literal text "NA" as a missing value by default, which
# blanks out Namibia's ISO code. Restore it first so that the dropna below does
# not silently remove Namibia from the analysis.
namibia_without_code = merged_df['Country'].eq('Namibia') & merged_df['Country_code'].isna()
merged_df.loc[namibia_without_code, 'Country_code'] = 'NA'

# Drop any remaining rows that genuinely lack a country code; .copy() makes the
# result an independent frame for the column assignments in the following cells.
merged_df = merged_df.dropna(subset=['Country_code']).copy()

In [None]:
# Label rows that have no WHO region with an explicit placeholder category.
# Imputing the most frequent region instead would attribute those countries'
# cases and deaths to an unrelated region in the per-region aggregations below.
merged_df['WHO_region'] = merged_df['WHO_region'].fillna('OTHER')

In [None]:
# Missing new-case / new-death counts are treated as zero
count_cols = ['New_cases', 'New_deaths']
merged_df[count_cols] = merged_df[count_cols].fillna(0)

In [None]:
# Rows without an expenditure figure get 0.
# NOTE(review): the null summary shown earlier reports 47941 missing values in
# this column, so most rows end up as 0 -- confirm that a zero fill is intended
# rather than a symptom of country names failing to match in the merge.
merged_df = merged_df.fillna({'Expenditure_Per_Capita_2021': 0})

In [None]:
# Impute missing female-population shares with the column median
female_pct_median = merged_df['Population_Female_Percent'].median()
merged_df['Population_Female_Percent'] = merged_df['Population_Female_Percent'].fillna(female_pct_median)

In [None]:
# Strip comma separators from both columns, then cast them to float
for numeric_col in ('Area', 'Region_Size'):
    without_commas = merged_df[numeric_col].replace(',', '', regex=True)
    merged_df[numeric_col] = without_commas.astype(float)

In [None]:
# Fill the remaining gaps in each column with that column's own median
median_fill = {col: merged_df[col].median() for col in ('Area', 'Region_Size')}
merged_df = merged_df.fillna(value=median_fill)

In [None]:
# Print the per-column missing-value counts again after imputation
print(merged_df.isna().sum())

Date_reported                  0
Country_code                   0
Country                        0
WHO_region                     0
New_cases                      0
Cumulative_cases               0
New_deaths                     0
Cumulative_deaths              0
Expenditure_Per_Capita_2021    0
Population_Female_Percent      0
Area                           0
Region_Size                    0
dtype: int64


In [None]:
# List the column labels of the cleaned frame
merged_df.columns

Index(['Date_reported', 'Country_code', 'Country', 'WHO_region', 'New_cases',
       'Cumulative_cases', 'New_deaths', 'Cumulative_deaths',
       'Expenditure_Per_Capita_2021', 'Population_Female_Percent', 'Area',
       'Region_Size'],
      dtype='object')

# Line Charts

In [None]:
# Take each country's maximum Cumulative_deaths value and keep the ten largest
peak_deaths_by_country = merged_df.groupby("Country")["Cumulative_deaths"].max()
top_countries = peak_deaths_by_country.nlargest(10).index

In [None]:
# Keep only the rows that belong to one of the top countries
is_top_country = merged_df["Country"].isin(top_countries)
merged_df_top = merged_df[is_top_country]

In [None]:
# Create the line chart for New_cases over time for the top 10 countries
full_title = "New Cases Over Time for Top 10 Countries with Highest Cumulative Deaths"
fig = px.line(merged_df_top, x="Date_reported", y="New_cases", color="Country",
              title=full_title,
              labels={"New_cases": "New Cases", "Date_reported": "Date"})

# Size the visibility lists from the traces that px.line actually produced.
n_traces = len(fig.data)

# Dropdown with two presets. A Plotly dropdown is single-select, so picking
# individual countries is done through the legend: "Select Countries" switches
# every trace to 'legendonly', after which legend entries can be clicked back on.
# Both buttons use the "update" method, whose args are [trace update, layout update].
fig.update_layout(
    updatemenus=[{
        "buttons": [
            {
                "method": "update",
                "args": [{"visible": [True] * n_traces},
                         {"title.text": full_title}],
                "label": "Show All Countries",
            },
            {
                "method": "update",
                "args": [{"visible": ["legendonly"] * n_traces},
                         {"title.text": "New Cases Over Time"}],
                "label": "Select Countries",
            },
        ],
        "direction": "down",
        "showactive": True,
        "active": 0,  # the chart starts with every country visible
        "x": 1.05,
        "xanchor": "left",
        "y": 1,
        "yanchor": "top",
    }]
)

# Show the figure
fig.show()

In [None]:
# Get the top 10 countries with the highest Cumulative_deaths
top_countries = merged_df.groupby("Country")["Cumulative_deaths"].max().nlargest(10).index

# Filter the merged_df to include only these top countries
merged_df_top = merged_df[merged_df["Country"].isin(top_countries)]

# Create the line chart for New_deaths over time for the top 10 countries
full_title = "New Deaths Over Time for Top 10 Countries with Highest Cumulative Deaths"
fig = px.line(merged_df_top, x="Date_reported", y="New_deaths", color="Country",
              title=full_title,
              labels={"New_deaths": "New Deaths", "Date_reported": "Date"})

# Size the visibility lists from the traces that px.line actually produced.
n_traces = len(fig.data)

# Dropdown with two presets. A Plotly dropdown is single-select, so picking
# individual countries is done through the legend: "Select Countries" switches
# every trace to 'legendonly', after which legend entries can be clicked back on.
# Both buttons use the "update" method, whose args are [trace update, layout update].
fig.update_layout(
    updatemenus=[{
        "buttons": [
            {
                "method": "update",
                "args": [{"visible": [True] * n_traces},
                         {"title.text": full_title}],
                "label": "Show All Countries",
            },
            {
                "method": "update",
                "args": [{"visible": ["legendonly"] * n_traces},
                         {"title.text": "New Deaths Over Time"}],
                "label": "Select Countries",
            },
        ],
        "direction": "down",
        "showactive": True,
        "active": 0,  # the chart starts with every country visible
        "x": 1.05,  # place the dropdown to the right of the plot area
        "xanchor": "left",
        "y": 1.1,
        "yanchor": "top",
    }],

    # Range slider for date selection on the x-axis
    xaxis=dict(
        type="date",
        rangeslider=dict(visible=True, thickness=0.1),
    )
)

# Show the figure
fig.show()

In [None]:
# Get the list of all countries in the dataset
all_countries = merged_df["Country"].unique()

# Every row belongs to one of those countries, so the "all countries" frame is
# simply an independent copy of merged_df.
merged_df_all = merged_df.copy()

# Create the line chart for New_cases over time for all countries
fig = px.line(merged_df_all, x="Date_reported", y="New_cases", color="Country",
              title="New Cases Over Time for All Countries",
              labels={"New_cases": "New Cases", "Date_reported": "Date"})

# Build the visibility masks from the traces actually present in the figure, so
# each mask lines up with the trace order regardless of how it was derived.
trace_names = [trace.name for trace in fig.data]

# Single-select dropdown: "Show All Countries" comes first so that the initially
# highlighted entry ("active": 0) matches the initial state of the chart, where
# every trace is visible. Each remaining entry isolates one country.
country_buttons = [
    {
        "method": "restyle",
        "args": [{"visible": [True] * len(trace_names)}],
        "label": "Show All Countries",
    }
]
country_buttons.extend(
    {
        "method": "restyle",
        "args": [{"visible": [name == country for name in trace_names]}],
        "label": country,
    }
    for country in trace_names
)

# Update Layout to add the Country Dropdown and Date Range Selection
fig.update_layout(
    updatemenus=[{
        "buttons": country_buttons,
        "direction": "down",
        "showactive": True,
        "active": 0,
        "x": 1.05,
        "xanchor": "left",
        "y": 1.1,
        "yanchor": "top",
    }],

    # Range slider for date selection on the x-axis
    xaxis=dict(
        type="date",
        rangeslider=dict(visible=True, thickness=0.1),
    )
)

# Show the figure
fig.show()

In [None]:
# Get the list of all countries in the dataset
all_countries = merged_df["Country"].unique()

# Every row belongs to one of those countries, so the "all countries" frame is
# simply an independent copy of merged_df.
merged_df_all = merged_df.copy()

# Create the line chart for New_deaths over time for all countries
fig = px.line(merged_df_all, x="Date_reported", y="New_deaths", color="Country",
              title="New Deaths Over Time for All Countries",
              labels={"New_deaths": "New Deaths", "Date_reported": "Date"})

# Build the visibility masks from the traces actually present in the figure, so
# each mask lines up with the trace order regardless of how it was derived.
trace_names = [trace.name for trace in fig.data]

# Single-select dropdown: "Show All Countries" comes first so that the initially
# highlighted entry ("active": 0) matches the initial state of the chart, where
# every trace is visible. Each remaining entry isolates one country.
country_buttons = [
    {
        "method": "restyle",
        "args": [{"visible": [True] * len(trace_names)}],
        "label": "Show All Countries",
    }
]
country_buttons.extend(
    {
        "method": "restyle",
        "args": [{"visible": [name == country for name in trace_names]}],
        "label": country,
    }
    for country in trace_names
)

# Update Layout to add the Country Dropdown and Date Range Selection
fig.update_layout(
    updatemenus=[{
        "buttons": country_buttons,
        "direction": "down",
        "showactive": True,
        "active": 0,
        "x": 1.05,
        "xanchor": "left",
        "y": 1.1,
        "yanchor": "top",
    }],

    # Range slider for date selection on the x-axis
    xaxis=dict(
        type="date",
        rangeslider=dict(visible=True, thickness=0.1),
    )
)

# Show the figure
fig.show()

In [None]:
# Sum New_cases within every (Date_reported, WHO_region) pair
cases_by_date_region = merged_df.groupby(['Date_reported', 'WHO_region'])
region_df = cases_by_date_region[['New_cases']].sum().reset_index()

# Draw one line per WHO region
fig = px.line(
    region_df,
    x="Date_reported",
    y="New_cases",
    color="WHO_region",
    labels={"New_cases": "New Cases", "Date_reported": "Date"},
    title="COVID-19 New Cases Over Time by WHO Region",
)

# Render the chart
fig.show()

In [None]:
# Sum New_deaths within every (Date_reported, WHO_region) pair
deaths_by_date_region = merged_df.groupby(['Date_reported', 'WHO_region'])
region_df_deaths = deaths_by_date_region[['New_deaths']].sum().reset_index()

# Draw one line per WHO region
fig_deaths = px.line(
    region_df_deaths,
    x="Date_reported",
    y="New_deaths",
    color="WHO_region",
    labels={"New_deaths": "New Deaths", "Date_reported": "Date"},
    title="COVID-19 New Deaths Over Time by WHO Region",
)

# Render the chart
fig_deaths.show()