In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from datetime import datetime
import uuid

In [2]:
# Global plotting defaults for every figure produced below.
# Order matters: the stylesheet is applied first, then the palette replaces its colour cycle.
plt.style.use('seaborn-v0_8')  # stylesheet name chosen for compatibility (per the original note)
sns.set_palette(palette="viridis")

In [None]:
# 1. Data Collection
# Read the Our World in Data COVID-19 CSV straight from its URL into a DataFrame.
# Requires network access at run time.
url = "https://covid.ourworldindata.org/data/owid-covid-data.csv"
df = pd.read_csv(filepath_or_buffer=url)

In [None]:
# 2. Data Exploration
# Gather three quick summaries of the raw frame, then print them in the same order:
# the column names, the first five rows, and the number of nulls in each column.
column_names = df.columns.tolist()
preview_rows = df.head()
null_counts = df.isnull().sum()

print("Dataset Columns:", column_names)
print("\nFirst 5 Rows:\n", preview_rows)
print("\nMissing Values:\n", null_counts)

In [None]:
# 3. Data Cleaning
# Turn the 'date' column into pandas datetimes so later cells can take its max
# and plot it on a time axis.
df['date'] = df['date'].pipe(pd.to_datetime)

In [None]:
# Select countries of interest
# `countries` is reused by the plotting cells to decide which lines to draw.
countries = ['Kenya', 'United States', 'India']
# Keep only rows for those countries. The explicit .copy() makes the result an
# independent frame, so the column assignments in later cells write to it
# directly instead of raising SettingWithCopyWarning on a slice of the full data.
df = df[df['location'].isin(countries)].copy()

In [None]:
# Handle missing values
# Daily counts: a missing report is treated as zero new events.
for daily_column in ('new_cases', 'new_deaths'):
    df[daily_column] = df[daily_column].fillna(0)

# Cumulative counts: carry the last reported value forward within each country so a
# missing report does not make the running total drop to zero; anything still missing
# (gaps before a country's first report) becomes 0.
for cumulative_column in ('total_cases', 'total_deaths'):
    df[cumulative_column] = (
        df.groupby('location')[cumulative_column].ffill().fillna(0)
    )

# Vaccinations: interpolate linearly between reported values, one country at a time.
# The frame stacks several countries, so interpolating the whole column at once would
# blend the end of one country's series into the start of the next. Leading gaps
# (before a country's first reported value) are left as NaN.
# NOTE(review): assumes rows are in chronological order within each country — confirm.
df['total_vaccinations'] = df.groupby('location')['total_vaccinations'].transform(
    lambda series: series.interpolate(method='linear', limit_direction='forward')
)

In [None]:
# 4. Exploratory Data Analysis
# Line chart of cumulative cases against date, one line per entry in `countries`.
# The figure is written to 'total_cases_over_time.png' and then closed.
cases_fig, cases_ax = plt.subplots(figsize=(12, 6))
for name in countries:
    rows = df.loc[df['location'] == name]
    cases_ax.plot(rows['date'], rows['total_cases'], label=name)
cases_ax.set_title('Total COVID-19 Cases Over Time')
cases_ax.set_xlabel('Date')
cases_ax.set_ylabel('Total Cases')
cases_ax.legend()
plt.xticks(rotation=45)  # tilt the date labels so they do not overlap
cases_fig.tight_layout()
cases_fig.savefig('total_cases_over_time.png')
plt.close(cases_fig)

In [None]:
# Calculate death rate
# death_rate = cumulative deaths / cumulative cases, expressed as a percentage.
# The rate is undefined where no cases are recorded, so the denominator is masked to
# NaN for rows with zero cases; those rows get NaN instead of inf from a division by zero.
confirmed_cases = df['total_cases'].where(df['total_cases'] > 0)
df['death_rate'] = df['total_deaths'] / confirmed_cases * 100


In [None]:
# Bar chart for total cases by country (latest date)
# `latest_date` is the most recent date present in the filtered frame; it is also
# read by the choropleth cell further down, so it stays a module-level name.
latest_date = df['date'].max()
is_latest = df['date'] == latest_date
latest_data = df[is_latest]

# One bar per country found on that date; saved to 'total_cases_bar.png' and closed.
bar_fig, bar_ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=latest_data, x='location', y='total_cases', ax=bar_ax)
bar_ax.set_title(f'Total Cases by Country on {latest_date.date()}')
bar_ax.set_xlabel('Country')
bar_ax.set_ylabel('Total Cases')
bar_fig.tight_layout()
bar_fig.savefig('total_cases_bar.png')
plt.close(bar_fig)


In [None]:
# 5. Vaccination Progress
# Line chart of cumulative vaccinations against date, one line per entry in `countries`.
# The figure is written to 'vaccinations_over_time.png' and then closed.
vax_fig, vax_ax = plt.subplots(figsize=(12, 6))
for name in countries:
    rows = df.loc[df['location'] == name]
    vax_ax.plot(rows['date'], rows['total_vaccinations'], label=name)
vax_ax.set_title('Cumulative Vaccinations Over Time')
vax_ax.set_xlabel('Date')
vax_ax.set_ylabel('Total Vaccinations')
vax_ax.legend()
plt.xticks(rotation=45)  # tilt the date labels so they do not overlap
vax_fig.tight_layout()
vax_fig.savefig('vaccinations_over_time.png')
plt.close(vax_fig)

In [None]:
# 6. Choropleth Map
# Prepare data for latest date: one row per country present on `latest_date`
# (defined in the bar-chart cell), keeping only the columns the map needs.
choropleth_data = df[df['date'] == latest_date][['iso_code', 'location', 'total_cases', 'total_vaccinations']]
# Colour each country by its cumulative case count; regions are matched on 'iso_code'.
fig = px.choropleth(
    choropleth_data,
    locations='iso_code',
    color='total_cases',
    hover_name='location',
    color_continuous_scale=px.colors.sequential.Plasma,
    title=f'Global COVID-19 Cases on {latest_date.date()}'
)
# Export as a standalone interactive HTML page. plotly figures expose write_html();
# the previous write_to_file() call is not a Figure method and raised AttributeError.
fig.write_html('choropleth_map.html')

In [None]:
# 7. Insights
# Markdown text that a later cell writes verbatim to 'insights.md'.
# NOTE(review): this is a static, hand-written string; nothing in this cell derives
# these statements from `df`, so they should be re-checked against the generated
# charts whenever the underlying data changes.
insights = """
# Key Insights
1. **Case Trends**: The United States shows the highest cumulative cases among the selected countries, with a steady increase over time.
2. **Vaccination Rollout**: India has accelerated its vaccination campaign, surpassing Kenya in total vaccinations by mid-2021.
3. **Death Rates**: Kenya exhibits a lower death rate compared to the United States, possibly due to differences in healthcare systems or reporting.
4. **Anomalies**: Sudden spikes in new cases in India during early 2021 suggest a significant wave, likely the Delta variant.
5. **Geographic Distribution**: The choropleth map highlights high case density in densely populated regions.
"""

In [None]:
# Write the insights text to 'insights.md' (overwriting any existing file),
# then print a one-line summary of the artefacts this notebook produced.
with open('insights.md', mode='w') as markdown_file:
    markdown_file.write(insights)

print("Analysis complete. Visualizations saved as PNGs, choropleth map as HTML, and insights as Markdown.")