# Global COVID-19 Death Data Analysis by Wil Oshoke

## Step 1: Installing & Importing Necessary Python Libraries 

### Step 1.1: Installing All Required Libraries

In [4]:
!pip install pandas -q
!pip install numpy -q
!pip install matplotlib -q
!pip install seaborn -q
!pip install plotly -q
!pip install folium -q
!pip install scikit-learn -q
!pip install statsmodels -q

### Step 1.2: Importing Libraries with Aliases

In [5]:
import pandas as pd  # Importing pandas for data manipulation
import numpy as np  # Importing numpy for numerical operations
import matplotlib.pyplot as plt   # Importing pyplot from matplotlib for plotting
from matplotlib.ticker import FuncFormatter   # Importing FuncFormatter for custom axis tick labels
import seaborn as sns   # Importing seaborn for statistical data visualization
import plotly.express as px   # Importing plotly.express for interactive plots
import folium   # Importing folium for interactive maps
import sklearn   # Importing scikit-learn for machine learning
import statsmodels.api as sm   # Importing statsmodels for statistical modeling

## Step 2: Data Analysis Guide

### Step 2.1: Data Loading and Cleaning

#### Explore the dataset with .info(), .head() and .describe()

In [191]:
# Raw CSV of global COVID-19 deaths, read straight from the project's GitHub repository.
url = "https://raw.githubusercontent.com/Wil-Oshoke/Covid-19-Global-Death-Analysis/refs/heads/main/RAW_global_deaths.csv"
df = pd.read_csv(url)

# Structural overview (row/column counts, dtypes, memory usage); info() prints directly.
df.info()

# First rows, rendered as a table.
display(df.head())

# Summary statistics of the numeric columns, shown as the cell's output.
df.describe()

### Step 2.2: Exploratory Data Analysis (EDA)

#### Viewing All Columns and Renaming the Cumulative Deaths Per Country Column

In [100]:
# Give the last column a stable, descriptive label. rename() returns a new frame that is
# bound back to `df`; re-running the cell is harmless because the already-renamed column
# simply maps onto itself.
# NOTE(review): assumes the last column holds the latest cumulative death count and that
# its original label is not needed afterwards — confirm.
df = df.rename(columns={df.columns[-1]: "Total Deaths"})

# Show the column labels as the cell's output (only the last expression of a cell is
# displayed, so this has to come after the rename).
df.columns

#### Selecting the Relevant Columns & Aggregating the 'Country/Region' Column to Sum the Total Deaths for Each Country

In [198]:
# Group by 'Country/Region' and sum the death counts for all dates
aggregated_data = df.groupby('Country/Region').sum(numeric_only=True).reset_index()

#aggregated_data.head(50)

#### Checking & Handling Duplicates, Missing Data and Empty Strings

In [108]:
# Number of rows that are exact duplicates of an earlier row.
Duplicated_Values = aggregated_data.duplicated().sum()

# Per-column count of missing (NaN/None) cells.
missing_values = aggregated_data.isnull().sum()

# Per-column count of cells holding an empty string.
Empty_Strings = (aggregated_data == '').sum()

# Show the three checks together as the cell's output, so the conclusion is backed by
# visible numbers. If every count is zero, no drop_duplicates()/dropna() step is needed.
pd.Series(
    {
        "duplicated rows": int(Duplicated_Values),
        "missing values": int(missing_values.sum()),
        "empty strings": int(Empty_Strings.sum()),
    },
    name="count",
)

In [222]:
# Label used both as the summary row's index and as its 'Country/Region' value, so the
# row can be found (or excluded) either way.
total_label = 'Total'

# Start from the country rows only, so re-running the cell never stacks a second
# summary row on top of an existing one.
country_rows = aggregated_data.drop(index=total_label, errors='ignore')

# Column-wise sum of everything after the first three columns (the country name and,
# presumably, two coordinate columns — TODO confirm). The last column is included so the
# summary row also carries the global 'Total Deaths' figure.
sum_row = country_rows.iloc[:, 3:].sum()

# One-row frame holding the sums. The text label is inserted as its own column
# afterwards so the numeric columns keep a numeric dtype through the concat.
sum_row_df = sum_row.to_frame().T
sum_row_df.insert(0, 'Country/Region', total_label)
sum_row_df.index = [total_label]

# Append the summary row; columns it does not provide are filled with NaN.
aggregated_data = pd.concat([country_rows, sum_row_df], ignore_index=False)

# Show the end of the table, including the new summary row.
aggregated_data.tail()

#### Time Series Analysis: Total deaths over time

In [None]:
def comma_formatter(x, _):
    """Render a tick value as an integer with thousands separators, e.g. 1,234,567."""
    return f'{int(x):,}'


def parse_date_labels(labels):
    """Parse column labels as month/day/year dates; labels that are not dates become NaT.

    Two-digit ('1/22/20') and four-digit ('1/22/2020') years are both accepted, because a
    single strptime format matches only one of the two spellings.
    """
    labels = pd.Index(labels).astype(str)
    four_digit_year = pd.to_datetime(labels, format='%m/%d/%Y', errors='coerce')
    two_digit_year = pd.to_datetime(labels, format='%m/%d/%y', errors='coerce')
    return four_digit_year.where(four_digit_year.notna(), two_digit_year)


# Use the country rows only: a 'Total' summary row, if one has been appended, must not be
# counted a second time. `aggregated_data` itself is left untouched by this cell.
country_rows = aggregated_data.drop(index='Total', errors='ignore')

# Keep the columns whose label is a date. Other columns (including 'Total Deaths', whose
# label is not a date) cannot be placed on the time axis and are left out.
column_dates = parse_date_labels(country_rows.columns)
is_date_column = column_dates.notna()

# Global deaths per date: sum over all countries, indexed by the parsed dates.
total_deaths = country_rows.loc[:, is_date_column].sum().astype(float)
total_deaths.index = column_dates[is_date_column]

# Filter from January 2020 onwards and make sure the series is in chronological order.
total_deaths = total_deaths[total_deaths.index >= '2020-01-01'].sort_index()

# Fail with a clear message instead of an obscure plotting error when no column parsed.
if total_deaths.empty:
    raise ValueError("No date-labelled columns found in aggregated_data; nothing to plot.")

# Plot the global total over time on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(12, 6))
total_deaths.plot(ax=ax, marker='o', linestyle='-', color='blue')
ax.set_title("Total Global Covid-19 Deaths Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Total Deaths")

# Format y-axis with commas
ax.yaxis.set_major_formatter(FuncFormatter(comma_formatter))

# Show grid
ax.grid(True)

# Show the plot
plt.show()

#### Bar Chart: Top 10 countries by total deaths

In [None]:
# Make sure the last column is labelled 'Total Deaths' (a no-op when it already is).
aggregated_data.columns = [*aggregated_data.columns[:-1], 'Total Deaths']

# Keep country rows only. A summary row may be identified by the index label 'Total', by
# the name 'Total', or by a missing country name, so all three are excluded; comparing
# 'Country/Region' with 'Total' alone does not remove a row labelled only in the index.
aggregated_data_no_total = aggregated_data.drop(index='Total', errors='ignore')
country_names = aggregated_data_no_total['Country/Region']
aggregated_data_no_total = aggregated_data_no_total[
    country_names.notna() & (country_names != 'Total')
]

# The ten countries with the highest 'Total Deaths', largest first.
top_countries = aggregated_data_no_total.nlargest(10, 'Total Deaths')

# Horizontal bar chart on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(12, 6))
bars = ax.barh(top_countries['Country/Region'], top_countries['Total Deaths'], color='grey')
ax.set_xlabel("Total Deaths")
ax.set_title("Top 10 Countries by Total Deaths (Excluding Total)")
ax.invert_yaxis()  # Put the country with the highest deaths at the top

# Format x-axis with commas for readability
ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f'{int(x):,}'))

# Annotate each bar with its value, placed just past the end of the bar.
for bar in bars:
    ax.text(bar.get_width(), bar.get_y() + bar.get_height() / 2,
            f'{int(bar.get_width()):,}', va='center', ha='left')

ax.grid(axis='x')
plt.show()