In [None]:
import pandas as pd
# Load the dataset from the local CSV file and show which columns it provides.
df = pd.read_csv(filepath_or_buffer='owid-covid-data.csv')
print(df.columns)


In [None]:
# Preview the first rows of the frame loaded by the previous cell.
# The CSV is not read again here: `df` still holds the unmodified raw data at
# this point, so a second read only repeats the same disk I/O.
print(df.head())

In [None]:
# Count the missing values in every column of the already-loaded frame.
# `df` is reused rather than re-read from disk; nothing has modified it yet.
missing_values = df.isna().sum()
print(missing_values)



In [None]:

# Countries this analysis focuses on
countries_of_interest = [
    'Nigeria',
    'Kenya',
    'United States',
    'India',
    'South Africa',
    'China',
]

# Keep only the rows whose 'location' is one of those countries
is_selected = df['location'].isin(countries_of_interest)
filtered_df = df.loc[is_selected]

# List the selected countries, one bullet per line
print("Countries of Interest:")
print(*(f"- {name}" for name in countries_of_interest), sep="\n")




In [None]:
# Define countries of interest
countries_of_interest = ['Nigeria', 'Kenya', 'United States', 'India', 'South Africa', 'China']

# Filter for those countries. The explicit .copy() makes df_filtered an
# independent frame, so later cells can assign into it without pandas
# raising SettingWithCopyWarning (assignment into a slice of `df`).
df_filtered = df[df['location'].isin(countries_of_interest)].copy()

# Columns that must all be present for a row to be kept
critical_columns = ['date', 'new_cases', 'new_deaths', 'total_cases', 'total_deaths', 'total_vaccinations','population']

# Drop rows with a missing value in any of the critical columns
df_cleaned = df_filtered.dropna(subset=critical_columns)

# Display first few cleaned rows
print(df_cleaned.head())


In [None]:
# Replace the textual 'date' column of df with parsed datetime values.
# Frames already derived from df (df_filtered, df_cleaned) hold their own
# data and are not affected by this conversion.
df = df.assign(date=pd.to_datetime(df['date']))


In [None]:
# Numeric columns in which a missing value is replaced by 0
fill_zero_columns = [
    'new_cases', 'total_cases', 'new_deaths', 'total_deaths',
    'total_vaccinations', 'new_cases_smoothed', 'total_deaths_per_million',
]

# Work on an explicit copy: df_filtered was produced by boolean indexing on
# `df`, and assigning into such a slice triggers SettingWithCopyWarning.
df_filtered = df_filtered.copy()
df_filtered[fill_zero_columns] = df_filtered[fill_zero_columns].fillna(0)

# Remaining missing values per column (the filled columns should now show 0)
print(df_filtered.isnull().sum())

In [None]:
import matplotlib.pyplot as plt

# Reload the dataset and parse the date strings
df = pd.read_csv('owid-covid-data.csv')
df['date'] = pd.to_datetime(df['date'])

# Countries to chart and the line colour assigned to each
countries_of_interest = ['Nigeria', 'Kenya', 'United States', 'India', 'South Africa', 'China']
colors = {
    'Nigeria': "#0545F5",        # Blue
    'Kenya': '#ff7f0e',          # Orange
    'United States': '#2ca02c',  # Green
    'India': '#d62728',          # Red
    'South Africa': '#9467bd',   # Purple
    'China': '#8c564b',          # Brown
}

# Restrict to the selected countries, ordered by country and then by date
in_selection = df['location'].isin(countries_of_interest)
df_filtered = df[in_selection].copy().sort_values(by=['location', 'date'])

# Draw one line per country: cumulative cases against date
fig, ax = plt.subplots(figsize=(12, 6))

for country in countries_of_interest:
    country_data = df_filtered[df_filtered['location'] == country]
    ax.plot(
        country_data['date'],
        country_data['total_cases'],
        label=country,
        color=colors[country],
    )

ax.set_title('Total COVID-19 Cases Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Total Cases')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
# Countries to chart (the `colors` mapping comes from the earlier plotting cell)
countries_of_interest = ['Nigeria', 'Kenya', 'United States', 'India', 'South Africa', 'China']

# Restrict to the selected countries, ordered by country and then by date
in_selection = df['location'].isin(countries_of_interest)
df_filtered = df[in_selection].copy().sort_values(by=['location', 'date'])

# Draw one line per country: cumulative deaths against date
fig, ax = plt.subplots(figsize=(12, 6))

for country in countries_of_interest:
    country_data = df_filtered[df_filtered['location'] == country]
    ax.plot(
        country_data['date'],
        country_data['total_deaths'],
        label=country,
        color=colors[country],
    )

ax.set_title('Total COVID-19 Deaths Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Total Deaths')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()



In [None]:

# Load and prepare data
df = pd.read_csv('owid-covid-data.csv')
df['date'] = pd.to_datetime(df['date'])

# Define countries of interest and the line colour used for each
countries_of_interest = ['Nigeria', 'Kenya', 'United States', 'India', 'South Africa', 'China']
colors = {
    'Nigeria': '#1f77b4',
    'Kenya': '#ff7f0e',
    'United States': '#2ca02c',
    'India': '#d62728',
    'South Africa': '#9467bd',
    'China': '#8c564b'
}

# Filter relevant data, ordered by country and then by date
df_filtered = df[df['location'].isin(countries_of_interest)].copy()
df_filtered = df_filtered.sort_values(['location', 'date'])

# Fill missing new_cases with 0
df_filtered['new_cases'] = df_filtered['new_cases'].fillna(0)

# Plot one line per country
plt.figure(figsize=(14, 7))

for country in countries_of_interest:
    country_data = df_filtered[df_filtered['location'] == country]
    plt.plot(country_data['date'], country_data['new_cases'], label=country, color=colors[country])

plt.title('Daily New COVID-19 Cases Over Time')
plt.xlabel('Date')
plt.ylabel('New Cases')
plt.legend()
plt.grid(True)
# The series contains zeros (missing days were just filled with 0), which a
# pure 'log' axis cannot represent. 'symlog' is linear around zero and
# logarithmic beyond it, so those points are drawn instead of being clipped.
# The scale is set before tight_layout() so the layout accounts for the final
# tick labels.
plt.yscale('symlog')
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the data
df = pd.read_csv('owid-covid-data.csv')
df['date'] = pd.to_datetime(df['date'])

# Define countries of interest and the colour used for each.
# This is the same country-to-colour mapping the neighbouring chart cells
# define, so a country keeps its colour from one figure to the next.
countries_of_interest = ['Nigeria', 'Kenya', 'United States', 'India', 'South Africa', 'China']
colors = {
    'Nigeria': '#1f77b4',
    'Kenya': '#ff7f0e',
    'United States': '#2ca02c',
    'India': '#d62728',
    'South Africa': '#9467bd',
    'China': '#8c564b'
}
df_filtered = df[df['location'].isin(countries_of_interest)].copy()

# Fill missing total_cases and total_deaths with 0
df_filtered['total_cases'] = df_filtered['total_cases'].fillna(0)
df_filtered['total_deaths'] = df_filtered['total_deaths'].fillna(0)

# Calculate death rate as a percentage and add it as a column.
# The default is the float 0.0 (not the int 0) so the column already has a
# float dtype when the computed ratios are written into it.
df_filtered['death_rate'] = 0.0
non_zero_cases = df_filtered['total_cases'] > 0  # avoids division by zero
df_filtered.loc[non_zero_cases, 'death_rate'] = (
    df_filtered.loc[non_zero_cases, 'total_deaths'] / df_filtered.loc[non_zero_cases, 'total_cases']
) * 100

# Get the most recent row with a positive case count for each country
latest_data = df_filtered[df_filtered['total_cases'] > 0]
latest_data = latest_data.sort_values('date').groupby('location').tail(1)

# Plot bar chart; colours are looked up per country rather than taken from a
# positional list, because the row order of latest_data is not fixed.
plt.figure(figsize=(10, 6))
bar_colors = [colors[location] for location in latest_data['location']]
bars = plt.bar(latest_data['location'], latest_data['death_rate'], color=bar_colors)

# Add a percentage label just above each bar
for bar in bars:
    height = bar.get_height()
    plt.annotate(f'{height:.2f}%',
                 xy=(bar.get_x() + bar.get_width() / 2, height),
                 xytext=(0, 5),
                 textcoords="offset points",
                 ha='center', va='bottom')

# Final formatting
plt.title('COVID-19 Death Rate (%) by Country (Most Recent Valid Data)')
plt.ylabel('Death Rate (%)')
plt.xlabel('Country')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# 1) Load and prepare data
df = pd.read_csv('owid-covid-data.csv')
df['date'] = pd.to_datetime(df['date'])

# Define your countries and colors
countries = ['Nigeria', 'Kenya', 'United States', 'India', 'South Africa', 'China']
colors = {
    'Nigeria': '#1f77b4',
    'Kenya': '#ff7f0e',
    'United States': '#2ca02c',
    'India': '#d62728',
    'South Africa': '#9467bd',
    'China': '#8c564b'
}

# Filter to those countries, ordered by country and then by date
df_filtered = df[df['location'].isin(countries)].copy()
df_filtered = df_filtered.sort_values(['location','date'])

# 2) Plot cumulative vaccinations over time
plt.figure(figsize=(12,6))
for country in countries:
    data = df_filtered[df_filtered['location']==country]
    # Forward-fill gaps so each line is continuous. Series.ffill() is used
    # because fillna(method='ffill') is deprecated in recent pandas releases.
    plt.plot(data['date'], data['total_vaccinations'].ffill(),
             label=country, color=colors[country])
plt.title('Cumulative COVID-19 Vaccinations Over Time')
plt.xlabel('Date')
plt.ylabel('Total Vaccinations')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# 3) Prepare latest % vaccinated data
# Use people_vaccinated_per_hundred (people with ≥1 dose per 100):
# the most recent row per country that has this value.
latest = (
    df_filtered
    .dropna(subset=['people_vaccinated_per_hundred'])
    .sort_values('date')
    .groupby('location')
    .tail(1)
)

# 4) Pie chart of % vaccinated population
values = latest['people_vaccinated_per_hundred']
labels = latest['location']
pie_colors = [colors[c] for c in labels]
values_total = values.sum()


def _per_hundred_label(wedge_pct):
    """Turn a wedge's share of the pie back into the country's own value.

    Matplotlib passes autopct the wedge size as a percentage of the summed
    values; printing that directly would show each country's share of the
    total rather than its people-vaccinated-per-hundred figure.
    """
    return f'{wedge_pct * values_total / 100:.1f}%'


plt.figure(figsize=(8,8))
plt.pie(values, labels=labels, autopct=_per_hundred_label, colors=pie_colors, startangle=140)
plt.title('Share of Population Vaccinated (≥1 Dose)')
plt.axis('equal')  # keep pie circular
plt.show()


In [None]:
import numpy as np
import pandas as pd
import plotly.express as px

# Load & prepare data
df = pd.read_csv('owid-covid-data.csv')
df['date'] = pd.to_datetime(df['date'])

# Filter out non-country entities (codes prefixed with 'OWID_').
# na=False makes rows with a missing iso_code evaluate to False instead of
# NaN, so the boolean negation cannot fail; such rows are dropped below.
df = df[~df['iso_code'].str.startswith('OWID_', na=False)]

# Get latest values: last row per iso_code that has a total_cases figure
latest = (
    df.dropna(subset=['iso_code', 'total_cases'])
    .sort_values('date')
    .groupby('iso_code', as_index=False)
    .last()[['iso_code', 'location', 'total_cases']]
)

# Clean and transform data
latest['total_cases'] = pd.to_numeric(latest['total_cases'], errors='coerce')
latest = latest.dropna(subset=['total_cases'])
# log10 is undefined for zero (it yields -inf), so keep positive totals only
latest = latest[latest['total_cases'] > 0].copy()
latest['log_cases'] = np.log10(latest['total_cases'])

# Custom color scale configuration (values are log10 of total cases)
min_value = 4  # 10^4 = 10,000 (10K)
max_value = 8   # 10^8 = 100,000,000 (100M)

# Stop positions are fractions of the [min_value, max_value] range:
# (log10(cases) - 4) / (8 - 4), so 100K sits at 0.25 and 1M at 0.5.
color_scale = [
    [0.0, 'blue'],    # 10K
    [0.25, 'cyan'],   # 100K
    [0.5, 'yellow'],  # 1M
    [1.0, 'red']      # 100M
]

fig = px.choropleth(
    latest,
    locations='iso_code',
    color='log_cases',
    hover_name='location',
    # Show the raw count with thousands separators; hide the log helper column
    hover_data={'total_cases': ':,', 'log_cases': False},
    color_continuous_scale=color_scale,
    range_color=(min_value, max_value),
    title='COVID-19 Total Cases (10K = Blue → 100M = Red)'
)

fig.update_layout(
    geo=dict(showframe=False, showcoastlines=False),
    coloraxis_colorbar=dict(
        title='Total Cases',
        ticks="outside",
        # Tick positions are log10 values; labels show the matching counts
        tickvals=[4, 5, 6, 7, 8],
        ticktext=['10K', '100K', '1M', '10M', '100M']
    )
)
fig.show()

Key Insights from COVID-19 Total Cases Chart:

1. Extreme Disparity in Case Loads: The United States and India likely dominate the chart with cases in the tens of millions (aligning with their large populations and widespread transmission), while African nations like Nigeria and Kenya show significantly lower reported cases.

2. China's Low Cases: China's reported case count appears disproportionately low compared to other populous nations despite its 1.4 billion population, reflecting its controversial "zero-COVID" strategy.

3. South Africa’s Regional Prominence: South Africa may show higher cases than other African countries, consistent with its role as an early hotspot for variants (Beta, Omicron) and more robust testing infrastructure relative to its neighbors.

Notable Anomalies/Patterns:

Data Transparency Questions: Large gaps between countries with similar populations but vastly different case numbers (e.g., India vs. China) suggest inconsistencies in testing, reporting, or transparency.
