In [None]:
import pandas as pd

# Read the raw OWID COVID-19 export into a DataFrame.
df = pd.read_csv('owid-covid-data.csv')

# Print three quick diagnostics, each under its own heading:
# the column index, the first rows, and the count of missing values per column.
for heading, summary in (
    ("Columns in dataset:", df.columns),
    ("\nPreview of data:", df.head()),
    ("\nMissing values per column:", df.isna().sum()),
):
    print(heading)
    print(summary)


In [None]:
# Filtering for countries of interest.
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the full dataset (avoids SettingWithCopyWarning).
countries = ['Kenya', 'United States', 'India']
df = df[df['location'].isin(countries)].copy()

# Dropping rows with missing date or total_cases
df = df.dropna(subset=['date', 'total_cases'])

# Converting 'date' column to datetime
df['date'] = pd.to_datetime(df['date'])

# Order rows chronologically within each country so interpolation follows
# each country's own timeline.
df = df.sort_values(['location', 'date'])

# Handling missing values in key numeric columns
numeric_cols = [
    'total_cases', 'new_cases', 'total_deaths', 'new_deaths',
    'total_vaccinations'
]

# Interpolate per country. Interpolating the stacked frame in one pass would
# fill a country's leading gaps from the previous country's last values.
df[numeric_cols] = df.groupby('location')[numeric_cols].transform(
    lambda series: series.interpolate()
)

# Previewing cleaned data
print(df[['location', 'date', 'total_cases', 'total_deaths', 'total_vaccinations']].head())


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw data and parse the date column
df = pd.read_csv('owid-covid-data.csv')
df['date'] = pd.to_datetime(df['date'])

# Filter countries of interest (Kenya, India, United States).
# .copy() keeps later column assignments off a slice of the full frame.
countries = ['Kenya', 'India', 'United States']
df = df[df['location'].isin(countries)].copy()

# Drop rows with missing total_cases
df = df.dropna(subset=['total_cases'])

# Order rows chronologically within each country so interpolation follows
# each country's own timeline.
df = df.sort_values(['location', 'date'])

# Interpolate missing values in key numeric columns, one country at a time.
# A single pass over the stacked frame would carry one country's values into
# the next country's leading gaps.
numeric_cols = ['total_cases', 'new_cases', 'total_deaths', 'new_deaths', 'total_vaccinations']
df[numeric_cols] = df.groupby('location')[numeric_cols].transform(
    lambda series: series.interpolate()
)

# Shared look for every chart: seaborn whitegrid theme, 12x6 default figure.
sns.set(style='whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)


def _plot_metric_by_country(metric, chart_title, y_label):
    """Show a line chart of column ``metric`` of ``df`` against date.

    One line is drawn per entry of the module-level ``countries`` list,
    reading rows from the module-level ``df`` at call time.
    """
    plt.figure()
    for name in countries:
        rows = df[df['location'] == name]
        plt.plot(rows['date'], rows[metric], label=name)
    plt.title(chart_title)
    plt.xlabel('Date')
    plt.ylabel(y_label)
    plt.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


# Total cases over time
_plot_metric_by_country('total_cases', 'Total COVID-19 Cases Over Time', 'Total Cases')

# Total deaths over time
_plot_metric_by_country('total_deaths', 'Total COVID-19 Deaths Over Time', 'Total Deaths')

# Daily new cases comparison (seaborn colours the lines by location)
plt.figure()
sns.lineplot(x='date', y='new_cases', hue='location', data=df)
plt.title('Daily New COVID-19 Cases')
plt.xlabel('Date')
plt.ylabel('New Cases')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Death rate over time: cumulative deaths divided by cumulative cases
df['death_rate'] = df['total_deaths'].div(df['total_cases'])
_plot_metric_by_country('death_rate', 'COVID-19 Death Rate Over Time', 'Death Rate')

# Bar chart: Top 10 countries by total cases
full_df = pd.read_csv('owid-covid-data.csv')
full_df['date'] = pd.to_datetime(full_df['date'])

# Get the latest entry for each country that actually reports total_cases.
# Taking the last row first and dropping NaN afterwards would silently remove
# any country whose most recent row has no total_cases value.
reported_df = full_df.dropna(subset=['total_cases'])
latest_df = reported_df.sort_values('date').groupby('location').tail(1)

# Remove aggregate regions like "World", "Africa", etc.
# This keeps only rows with a 3-character iso_code; it assumes aggregates
# carry longer codes in this dataset (TODO confirm against the file).
latest_df = latest_df[latest_df['iso_code'].str.len() == 3]

# Get top 10 by total cases
top_countries = latest_df[['location', 'total_cases']].dropna().sort_values(
    by='total_cases', ascending=False).head(10)

# Plot horizontal bar chart, largest total at the top.
# NOTE(review): recent seaborn releases may warn when `palette` is passed
# without `hue` -- confirm against the installed version.
plt.figure(figsize=(10, 6))
sns.barplot(data=top_countries, x='total_cases', y='location', palette='Reds_r')
plt.title('Top 10 Countries by Total COVID-19 Cases (Latest)')
plt.xlabel('Total Cases')
plt.ylabel('Country')
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load data and parse the date column
df = pd.read_csv('owid-covid-data.csv')
df['date'] = pd.to_datetime(df['date'])

# Filter countries of interest
countries = ['Kenya', 'India', 'United States']
df = df[df['location'].isin(countries)]

# Keep only necessary columns.
# .copy() keeps the assignment below off a slice of the full frame.
vax_cols = ['location', 'date', 'total_vaccinations', 'total_vaccinations_per_hundred']
df = df[vax_cols].copy()

# Order rows chronologically within each country so interpolation follows
# each country's own timeline.
df = df.sort_values(['location', 'date'])

# Interpolate missing values one country at a time. A single pass over the
# stacked frame would fill a country's leading gaps (rows before its first
# reported value) from the previous country's last value.
vax_value_cols = ['total_vaccinations', 'total_vaccinations_per_hundred']
df[vax_value_cols] = df.groupby('location')[vax_value_cols].transform(
    lambda series: series.interpolate()
)

# Set plotting style
sns.set(style='whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Two charts with identical layout: (column, title, y-axis label).
# The second title used to read "% of Population Vaccinated", but the column
# plotted is total_vaccinations_per_hundred, i.e. doses per 100 people. That
# is not a share of the population and can exceed 100, so the title now
# matches the y-axis label.
vaccination_charts = [
    ('total_vaccinations',
     'Cumulative COVID-19 Vaccinations Over Time',
     'Total Vaccinations'),
    ('total_vaccinations_per_hundred',
     'COVID-19 Vaccine Doses per 100 People Over Time',
     'Vaccinations per 100 People'),
]

for column, chart_title, y_label in vaccination_charts:
    plt.figure()
    for country in countries:
        country_data = df[df['location'] == country]
        plt.plot(country_data['date'], country_data[column], label=country)

    plt.title(chart_title)
    plt.xlabel('Date')
    plt.ylabel(y_label)
    plt.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


# COVID-19 Data Analysis & Insights

### Key Insights:
1. **COVID-19 Cases and Deaths Over Time:**
   - Of the three countries compared, the **United States** has the highest number of total cases and deaths, reflecting its large population and high case rates, especially during early waves.
   - **India** showed a rapid increase in cases starting in mid-2020, but the overall death rate remains lower compared to other countries like the U.S.
   - **Kenya** had a slower rise in cases but faced significant challenges during the 2021-2022 waves, especially with the emergence of new variants.

2. **Daily New Cases Comparison:**
   - While the **United States** had a peak of new daily cases in mid-2021, **India** and **Kenya** experienced dramatic surges during later phases of the pandemic.
   - The **daily new cases** in **India** outpaced the other two countries in the summer of 2021, coinciding with the Delta variant's impact.

3. **Death Rate Analysis:**
   - The **United States** had a higher **death rate** (total deaths divided by total cases) than **India**, particularly in the early stages of the pandemic. This might be due to higher case numbers and strain on healthcare systems.
   - Both **India** and **Kenya** saw lower death rates, possibly indicating a younger population and lower rates of underlying health conditions in some areas.

4. **Vaccination Progress:**
   - The **United States** led the world in vaccinations, with rapid rollout and large proportions of the population vaccinated by mid-2021. However, vaccine uptake slowed towards the end of the year.
   - **India** and **Kenya** showed slower vaccine rollouts, but **India** made significant progress in 2021, vaccinating millions of people daily by the end of the year.
   - **Kenya** faced logistical challenges but managed to vaccinate a significant portion of the population, particularly after global donations and partnerships with COVAX.

5. **Anomalies & Interesting Patterns:**
   - The rapid vaccination rollout in **India** from mid-2021 shows a highly coordinated effort to scale vaccinations across the country, which contrasts with slower vaccine adoption in **Kenya**.
   - **Kenya**'s vaccination data had gaps during the early phases, which may indicate challenges in data reporting and vaccine distribution.
