In [None]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Optional for maps
import plotly.express as px

# Step 1 Load the dataset
# Read the OWID COVID-19 CSV from the current working directory into `df`.
try:
    df = pd.read_csv('owid-covid-data.csv')
except FileNotFoundError:
    print(" File not found. Ensure 'owid-covid-data.csv' is in the working directory.")
    # Every later step reads `df`. Continuing without it would only fail one
    # statement later with a confusing NameError (or, in a notebook, silently
    # reuse a stale `df` from an earlier run), so stop here instead.
    raise
else:
    print(" Data loaded successfully!")

# Step 2: Explore the dataset
# Print three quick summaries, each under its own heading: the column index,
# the first rows, and the ten columns with the most missing values.
missing_counts = df.isna().sum().sort_values(ascending=False)

for heading, summary in (
    ("\n Columns in dataset:", df.columns),
    ("\n Preview:", df.head()),
    ("\n❗ Missing values:", missing_counts.head(10)),
):
    print(heading)
    print(summary)

# Step 3: Clean the data
# Parse the date strings so plots get a real time axis and date comparisons work.
df['date'] = pd.to_datetime(df['date'])

countries = ['Kenya', 'India', 'United States']

# Columns used by the analysis. iso_code is kept because the Step 7
# choropleth uses it to place each country on the map.
keep_columns = ['date', 'location', 'iso_code', 'total_cases', 'total_deaths',
                'new_cases', 'new_deaths', 'total_vaccinations']

# .copy() yields an independent frame, so the column assignments below (and in
# later steps) never write into a slice of the raw data.
df = df.loc[df['location'].isin(countries), keep_columns].copy()

# Order rows chronologically within each country so forward-filling carries
# values forward in time.
df = df.sort_values(['location', 'date'])

# Forward-fill gaps per country. Filling the whole frame at once would copy
# the last values of one country into the leading gaps of the next, and
# fillna(method='ffill') is deprecated in recent pandas releases.
fill_columns = [col for col in keep_columns if col != 'location']
df[fill_columns] = df.groupby('location')[fill_columns].ffill()

# Step 4: EDA - Total Cases Over Time
# One line per country on a single 10x6 figure, using the Axes API.
fig, ax = plt.subplots(figsize=(10, 6))

for country in countries:
    # Rows belonging to this country only.
    country_data = df.loc[df['location'] == country]
    ax.plot(country_data['date'], country_data['total_cases'], label=country)

ax.set_title(" Total COVID-19 Cases Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Total Cases")
ax.legend()
fig.tight_layout()
plt.show()

# Total Deaths Over Time
# Cumulative deaths per country, one labelled line each, on a new 10x6 figure.
ax = plt.figure(figsize=(10, 6)).gca()

for country in countries:
    is_country = df['location'].eq(country)
    country_data = df[is_country]
    ax.plot(country_data['date'], country_data['total_deaths'], label=country)

# Title and axis labels applied in one call.
ax.set(
    title=" Total COVID-19 Deaths Over Time",
    xlabel="Date",
    ylabel="Total Deaths",
)
ax.legend()
plt.tight_layout()
plt.show()

# Step 5: Death Rate Comparison
# Death rate = cumulative deaths / cumulative cases. Rows with zero recorded
# cases are masked to NaN first so the division can never produce inf.
df['death_rate'] = df['total_deaths'] / df['total_cases'].where(df['total_cases'] > 0)

# Pick each country's most recent row by date rather than by position, so the
# result does not depend on how the rows happen to be ordered.
latest = df.loc[df.groupby('location')['date'].idxmax()]

# Start a fresh figure (same size as the other charts). Without it, backends
# that keep figures open after plt.show() would draw the bars on top of the
# previous line chart.
plt.figure(figsize=(10, 6))
sns.barplot(data=latest, x='location', y='death_rate')
plt.title(" Death Rate by Country (latest date)")
plt.ylabel("Death Rate")
plt.tight_layout()
plt.show()

# Step 6: Vaccination Progress
# Cumulative vaccinations over time, one labelled line per selected country.
fig, ax = plt.subplots(figsize=(10, 6))

for country in countries:
    country_data = df[df['location'].isin([country])]
    dates = country_data['date']
    doses = country_data['total_vaccinations']
    ax.plot(dates, doses, label=country)

ax.set_title(" Total Vaccinations Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Total Vaccinations")
ax.legend()
fig.tight_layout()
plt.show()

# Step 7 Choropleth Map
# NOTE: df was narrowed to `countries` in Step 3, so only those countries are
# shaded on the map.

# Take each country's own most recent row. Filtering on the single global
# maximum date would drop any country whose series ends earlier.
latest_global = df.loc[df.groupby('location')['date'].idxmax()]

# Prefer ISO-3 codes (plotly's default location mode). If that column is not
# present in df, fall back to matching on the country names in `location`.
if 'iso_code' in latest_global.columns:
    location_args = {'locations': 'iso_code'}
else:
    location_args = {'locations': 'location', 'locationmode': 'country names'}

fig = px.choropleth(
    latest_global,
    color="total_cases",
    hover_name="location",
    color_continuous_scale="Reds",
    **location_args,
)
fig.update_layout(title=" Global COVID-19 Cases (Latest)")
fig.show()

### Key Insights:
1. The United States had the highest number of total cases and total vaccinations of the three countries.
2. India showed a sharp increase in vaccinations after mid-2021.
3. Kenya's case numbers are significantly lower, but its curve follows a similar trend.
4. The death rate varies by country — Kenya's is higher than India's.