In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the datasets
who_df = pd.read_csv('/content/WHO-COVID-19-global-data.csv')
expenditure_df = pd.read_excel('/content/COVID_expenditure.xlsx')
population_df = pd.read_csv('/content/Population, female (% of total population).csv')
gdp_df = pd.read_excel('/content/GDP.xlsx')
area_df = pd.read_csv('/content/area.csv')
gender_df = pd.read_csv('/content/pop.csv')

In [3]:
# Standardize country names across datasets
who_df.rename(columns={'Country': 'Country'}, inplace=True)
expenditure_df.rename(columns={'Countries': 'Country'}, inplace=True)
population_df.rename(columns={'name': 'Country'}, inplace=True)
gdp_df.rename(columns={'Country': 'Country'}, inplace=True)
area_df.rename(columns={'name': 'Country'}, inplace=True)
gender_df.rename(columns={'Economy': 'Country'}, inplace=True)

In [4]:
# Check columns in each DataFrame to confirm column names
print("WHO Global Columns:", who_df.columns)
print("Expenditure Columns:", expenditure_df.columns)
print("Population Columns:", population_df.columns)
print("GDP Columns:", gdp_df.columns)
print("Area Columns:", area_df.columns)
print("Gender Columns:", gender_df.columns)

WHO Global Columns: Index(['Date_reported', 'Country_code', 'Country', 'WHO_region', 'New_cases',
       'Cumulative_cases', 'New_deaths', 'Cumulative_deaths'],
      dtype='object')
Expenditure Columns: Index(['Country', 'Indicators', 'Unnamed: 2', '2021'], dtype='object')
Population Columns: Index(['Economy', 'Year', 'Economy Code',
       'Population, female (% of total population)'],
      dtype='object')
GDP Columns: Index(['United States',  27360935000000,  25744108000000,  23594031000000,
        21322950000000,  21521395000000],
      dtype='object')
Area Columns: Index(['Country', 'slug', 'value', 'date_of_information', 'ranking', 'region'], dtype='object')
Gender Columns: Index(['name', 'slug', 'value', 'date_of_information', 'ranking', 'region'], dtype='object')


In [5]:
# Check columns in each DataFrame to confirm column names
who_df.rename(columns={'Country': 'Country'}, inplace=True)
expenditure_df.rename(columns={'Country': 'Country', '2021': 'Expenditure_Per_Capita_2021'}, inplace=True)
population_df.rename(columns={'Economy': 'Country', 'Population, female (% of total population)': 'Population_Female_Percent'}, inplace=True)
area_df.rename(columns={'name': 'Country', 'value': 'Area'}, inplace=True)
gender_df.rename(columns={'name': 'Country', 'value': 'Region_Size'}, inplace=True)

In [6]:
# Merge datasets sequentially on 'Country'
merged_df = who_df.merge(expenditure_df[['Country', 'Expenditure_Per_Capita_2021']], on='Country', how='left')
merged_df = merged_df.merge(population_df[['Country', 'Population_Female_Percent']], on='Country', how='left')
merged_df = merged_df.merge(area_df[['Country', 'Area']], on='Country', how='left')
merged_df = merged_df.merge(gender_df[['Country', 'Region_Size']], on='Country', how='left')

In [7]:
merged_df

Unnamed: 0,Date_reported,Country_code,Country,WHO_region,New_cases,Cumulative_cases,New_deaths,Cumulative_deaths,Expenditure_Per_Capita_2021,Population_Female_Percent,Area,Region_Size
0,2020-01-05,AF,Afghanistan,EMRO,,0,,0,2.324045,49.488,37466414,652230
1,2020-01-12,AF,Afghanistan,EMRO,,0,,0,2.324045,49.488,37466414,652230
2,2020-01-19,AF,Afghanistan,EMRO,,0,,0,2.324045,49.488,37466414,652230
3,2020-01-26,AF,Afghanistan,EMRO,,0,,0,2.324045,49.488,37466414,652230
4,2020-02-02,AF,Afghanistan,EMRO,,0,,0,2.324045,49.488,37466414,652230
...,...,...,...,...,...,...,...,...,...,...,...,...
60235,2024-09-22,ZW,Zimbabwe,AFRO,,266393,,5740,,52.833,14829988,390757
60236,2024-09-29,ZW,Zimbabwe,AFRO,0.0,266393,0.0,5740,,52.833,14829988,390757
60237,2024-10-06,ZW,Zimbabwe,AFRO,3.0,266396,0.0,5740,,52.833,14829988,390757
60238,2024-10-13,ZW,Zimbabwe,AFRO,,266396,,5740,,52.833,14829988,390757


In [8]:
merged_df.isnull().sum()

Unnamed: 0,0
Date_reported,0
Country_code,251
Country,0
WHO_region,4518
New_cases,15213
Cumulative_cases,0
New_deaths,15075
Cumulative_deaths,0
Expenditure_Per_Capita_2021,47941
Population_Female_Percent,14307


In [9]:
merged_df = merged_df.dropna(subset=['Country_code'])

In [10]:
merged_df['WHO_region'] = merged_df['WHO_region'].fillna(merged_df['WHO_region'].mode()[0])

In [11]:
merged_df['New_cases'] = merged_df['New_cases'].fillna(0)
merged_df['New_deaths'] = merged_df['New_deaths'].fillna(0)

In [12]:
merged_df['Expenditure_Per_Capita_2021'] = merged_df['Expenditure_Per_Capita_2021'].fillna(0)

In [13]:
merged_df['Population_Female_Percent'] = merged_df['Population_Female_Percent'].fillna(merged_df['Population_Female_Percent'].median())

In [14]:
# Remove commas from the columns and convert to numeric
merged_df['Area'] = merged_df['Area'].replace({',': ''}, regex=True).astype(float)
merged_df['Region_Size'] = merged_df['Region_Size'].replace({',': ''}, regex=True).astype(float)

In [15]:
# Now fill missing values with median
merged_df['Area'] = merged_df['Area'].fillna(merged_df['Area'].median())
merged_df['Region_Size'] = merged_df['Region_Size'].fillna(merged_df['Region_Size'].median())

In [16]:
print(merged_df.isnull().sum())

Date_reported                  0
Country_code                   0
Country                        0
WHO_region                     0
New_cases                      0
Cumulative_cases               0
New_deaths                     0
Cumulative_deaths              0
Expenditure_Per_Capita_2021    0
Population_Female_Percent      0
Area                           0
Region_Size                    0
dtype: int64


In [17]:
merged_df.columns

Index(['Date_reported', 'Country_code', 'Country', 'WHO_region', 'New_cases',
       'Cumulative_cases', 'New_deaths', 'Cumulative_deaths',
       'Expenditure_Per_Capita_2021', 'Population_Female_Percent', 'Area',
       'Region_Size'],
      dtype='object')

# Choropleth

In [23]:
merged_df['Cumulative_deaths'] = pd.to_numeric(merged_df['Cumulative_deaths'], errors='coerce')

# Group the data by 'Country' and aggregate the total cumulative deaths
choropleth_data = merged_df.groupby('Country').agg({
    'Cumulative_deaths': 'max'
}).reset_index()

# Plot choropleth map
fig = px.choropleth(
    choropleth_data,
    locations='Country',
    locationmode='country names',
    color='Cumulative_deaths',
    color_continuous_scale="Viridis",
    title="Choropleth Map of COVID-19 Deaths by Country"
)

fig.show()

In [24]:
merged_df['Cumulative_deaths'] = pd.to_numeric(merged_df['Cumulative_deaths'], errors='coerce')
merged_df['Expenditure_Per_Capita_2021'] = pd.to_numeric(merged_df['Expenditure_Per_Capita_2021'], errors='coerce')
merged_df['Population_Female_Percent'] = pd.to_numeric(merged_df['Population_Female_Percent'], errors='coerce')
merged_df['Area'] = pd.to_numeric(merged_df['Area'], errors='coerce')
merged_df['Region_Size'] = pd.to_numeric(merged_df['Region_Size'], errors='coerce')

# Group the data by 'Country' and aggregate the total cumulative deaths
choropleth_data = merged_df.groupby('Country').agg({
    'Cumulative_deaths': 'max',
    'Expenditure_Per_Capita_2021': 'mean',
    'Population_Female_Percent': 'mean',
    'Area': 'mean',
    'Region_Size': 'mean'
}).reset_index()

# Plot choropleth map
fig = px.choropleth(
    choropleth_data,
    locations='Country',
    locationmode='country names',
    color='Cumulative_deaths',
    color_continuous_scale="Viridis",
    title="Choropleth Map of COVID-19 Deaths by Country",
    hover_data={
        'Cumulative_deaths': True,
        'Expenditure_Per_Capita_2021': True,
        'Population_Female_Percent': True,
        'Area': True,
        'Region_Size': True
    }
)

fig.show()

In [25]:
pip install folium



In [26]:
merged_df['Cumulative_deaths'] = pd.to_numeric(merged_df['Cumulative_deaths'], errors='coerce')

# Aggregate data by country
choropleth_data = merged_df.groupby('Country').agg({
    'Cumulative_deaths': 'max'
}).reset_index()

# Create the choropleth map w
fig = go.Figure(go.Choropleth(
    locations=choropleth_data['Country'],
    locationmode='country names',
    z=choropleth_data['Cumulative_deaths'],
    hoverinfo='location+z',
    colorscale='Viridis',
    colorbar_title='Cumulative Deaths',
))

# Update layout for the Earth-like shape
fig.update_geos(
    projection_type="orthographic",
    showcoastlines=True,
    coastlinecolor="Black",
    showland=True,
    landcolor="lightgray",
    showocean=True,
    oceancolor="lightblue"
)

fig.update_layout(
    title_text="Choropleth Map of COVID-19 Deaths by Country",
    geo=dict(showframe=False, showcoastlines=True, projection_type="orthographic"),
)

fig.show()
