In [None]:
# Initial setup: import required libraries
import pandas as pd
from datetime import datetime

# Load the COVID-19 dataset into the raw DataFrame `df`.
# The CSV is looked up relative to the notebook's working directory; it is
# read as UTF-8, comma-separated, with the first row supplying column names.
df = pd.read_csv('owid-covid-data.csv', sep=',', header=0, encoding='utf-8')

# Countries kept by the East Africa filter; values are matched against the
# dataset's `location` column.
east_african_countries = [
    'Kenya',
    'Uganda',
    'Tanzania',
    'Rwanda',
    'Burundi',
    'South Sudan',
    'Ethiopia',
    'Somalia',
    'Eritrea',
]

# Filter the main DataFrame to only include East African countries
east_africa_data = df[df['location'].isin(east_african_countries)].copy()

# Columns that must be present for a row to be usable in the analysis
critical_cols = ['date', 'location', 'total_cases']

# Drop rows with a missing value in any critical column
east_africa_data_cleaned = east_africa_data.dropna(subset=critical_cols).copy()

# Convert 'date' to datetime. Plain column assignment replaces the column, so
# the result is a real datetime column; `.loc[:, 'date'] = ...` writes into the
# existing column and, on recent pandas versions, can leave it as object dtype.
east_africa_data_cleaned['date'] = pd.to_datetime(east_africa_data_cleaned['date'])

# Sort chronologically (rows of different countries are interleaved by date)
east_africa_data_cleaned = east_africa_data_cleaned.sort_values(by='date')

# Convert object columns to their inferred types where possible.
# The `copy` keyword is omitted because it is not accepted by every pandas version.
east_africa_data_cleaned = east_africa_data_cleaned.infer_objects()

# Only numeric columns are filled below; text columns (e.g. 'location') are
# left untouched so they never receive a numeric placeholder.
numeric_cols = east_africa_data_cleaned.select_dtypes(include='number').columns.tolist()

# Alternative 1: replace missing numeric values with zero
east_africa_data_filled_zero = east_africa_data_cleaned.copy()
east_africa_data_filled_zero[numeric_cols] = east_africa_data_cleaned[numeric_cols].fillna(0)

# Alternative 2: linear interpolation of missing numeric values.
# Interpolation is done per country: the frame is ordered by date only, so
# neighbouring rows usually belong to different countries and interpolating
# over the whole frame would blend one country's figures into another's gaps.
# `transform` returns rows aligned to the original index, so row order is kept.
east_africa_data_filled_interp = east_africa_data_cleaned.copy()
east_africa_data_filled_interp[numeric_cols] = (
    east_africa_data_cleaned
    .groupby('location')[numeric_cols]
    .transform(lambda series: series.interpolate(method='linear'))
)

# Inspect results: shape of the DataFrame after each cleaning step
print("After dropna & datetime conversion:", east_africa_data_cleaned.shape)
print("After fillna(0):", east_africa_data_filled_zero.shape)
print("After interpolate():", east_africa_data_filled_interp.shape)





In [None]:
# Export the result frames that exist at this point in the notebook.
#
# This cell used to be repeated six times, each copy referencing `vax_data`
# and `correlation_data` directly. Those frames are only created by later
# cells, so on a fresh kernel (Restart & Run All) every copy raised NameError.
# The copies are collapsed into this single cell, which skips any frame that
# is not defined yet; the export cell at the end of the notebook writes the
# complete set.

# Timestamp suffix so repeated runs do not overwrite earlier exports
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# Output file for each result
output_files = {
    'cleaned_data': f'east_africa_covid_cleaned_{timestamp}.csv',
    'vaccination_data': f'east_africa_vaccination_{timestamp}.csv',
    'correlation_data': f'east_africa_correlation_{timestamp}.csv'
}

# Name of the notebook variable that holds each result
frame_names = {
    'cleaned_data': 'east_africa_data_filled_interp',
    'vaccination_data': 'vax_data',
    'correlation_data': 'correlation_data'
}

# Write every frame that is available; remember which files were produced
saved_files = {}
for key, frame_name in frame_names.items():
    frame = globals().get(frame_name)
    if frame is None:
        print(f"Skipping {key}: '{frame_name}' is not defined yet")
        continue
    frame.to_csv(output_files[key], index=False)
    saved_files[key] = output_files[key]

print("\nFiles saved successfully:")
for key, filepath in saved_files.items():
    print(f"{key}: {filepath}")

In [None]:
# Line chart of cumulative cases over time, one line per East African country,
# drawn from the interpolated frame.
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(14, 8))

for country in east_african_countries:
    is_country = east_africa_data_filled_interp['location'] == country
    country_data = east_africa_data_filled_interp.loc[is_country]
    ax.plot(country_data['date'], country_data['total_cases'], label=country)

ax.set_title("COVID-19 Total Cases Over Time (East Africa)")
ax.set_xlabel("Date")
ax.set_ylabel("Total Cases")
ax.legend()
# plt.show() is not called in this cell; the figure is left open after layout.
fig.tight_layout()


In [None]:
# Line chart of cumulative deaths over time, one line per East African country,
# drawn from the interpolated frame.
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(14, 8))

for country in east_african_countries:
    is_country = east_africa_data_filled_interp['location'] == country
    country_data = east_africa_data_filled_interp.loc[is_country]
    ax.plot(country_data['date'], country_data['total_deaths'], label=country)

ax.set_title("COVID-19 Total Deaths Over Time (East Africa)")
ax.set_xlabel("Date")
ax.set_ylabel("Total Deaths")
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Plot total cases over time for each country (same chart as the earlier
# total-cases cell, this time followed by an explicit plt.show()).
fig, ax = plt.subplots(figsize=(14, 8))

for country in east_african_countries:
    is_country = east_africa_data_filled_interp['location'] == country
    country_data = east_africa_data_filled_interp.loc[is_country]
    ax.plot(country_data['date'], country_data['total_cases'], label=country)

ax.set_title("COVID-19 Total Cases Over Time (East Africa)")
ax.set_xlabel("Date")
ax.set_ylabel("Total Cases")
ax.legend()
fig.tight_layout()
plt.show()


In [None]:
# Daily new cases: one line per East African country, read from the raw frame `df`.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid')

plt.figure(figsize=(14, 8))
for country in east_african_countries:
    country_data = df[df['location'] == country]
    # `df['date']` is never converted after read_csv (only the cleaned frame
    # is), so it is presumably still text. Plotting text makes matplotlib treat
    # every distinct date as a category: one tick per date, and dates missing
    # from the first country are appended after its last one instead of being
    # placed in time order. Parsing here gives a real time axis; it is a no-op
    # if the column is already datetime.
    plot_dates = pd.to_datetime(country_data['date'])
    plt.plot(plot_dates, country_data['new_cases'], label=country)

plt.title('Daily New COVID-19 Cases in East Africa')
plt.xlabel('Date')
plt.ylabel('New Cases')
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Total deaths: one line per East African country, read from the raw frame `df`.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Adds a `death_rate` column to the raw frame (deaths divided by cases).
# NOTE(review): the plot below does not use this column, and the assignment
# mutates `df` in place for every later cell — confirm it is still wanted here.
df['death_rate'] = df['total_deaths'] / df['total_cases']

plt.figure(figsize=(14, 8))
for country in east_african_countries:
    country_data = df[df['location'] == country]
    # `df['date']` is never converted after read_csv, so it is presumably still
    # text; plotting text yields a categorical axis (one tick per distinct
    # string, ordered by first appearance). Parse it so the x-axis is a real
    # time axis; this is a no-op if the column is already datetime.
    plot_dates = pd.to_datetime(country_data['date'])
    plt.plot(plot_dates, country_data['total_deaths'], label=country)

plt.title('Total COVID-19 Deaths in East Africa Over Time')
plt.xlabel('Date')
plt.ylabel('Total Deaths')
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Bar chart: East African countries ranked by their latest reported total cases
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Same country list as `east_african_countries`; copied rather than retyped so
# the two names cannot drift apart.
east_africa_countries = list(east_african_countries)

east_africa_df = df[df['location'].isin(east_africa_countries)]

# Latest row per country that actually carries a `total_cases` value.
# Taking the very last row regardless would silently drop a country's bar
# whenever its most recent record has no case count.
latest_data = (
    east_africa_df
    .dropna(subset=['total_cases'])
    .sort_values('date')
    .groupby('location')
    .tail(1)
)

# Highest totals first so the chart reads as a ranking
top_countries = latest_data.sort_values('total_cases', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(data=top_countries, x='total_cases', y='location', palette='Blues_d')
plt.title('Total COVID-19 Cases by East African Country')
plt.xlabel('Total Cases')
plt.ylabel('Country')
plt.tight_layout()
plt.show()



In [None]:
# Restrict the raw frame to the East African countries (independent copy)
east_africa_countries = [
    'Kenya', 'Uganda', 'Tanzania', 'Rwanda', 'Burundi',
    'South Sudan', 'Ethiopia', 'Somalia', 'Eritrea'
]
in_east_africa = df['location'].isin(east_africa_countries)
east_africa_df = df.loc[in_east_africa].copy()

# Indicator columns plus a derived death rate (total deaths / total cases),
# built as a new frame so the filtered data is left unchanged
indicator_cols = ['total_cases', 'total_deaths', 'new_cases', 'new_deaths']
correlation_data = east_africa_df[indicator_cols].assign(
    death_rate=east_africa_df['total_deaths'] / east_africa_df['total_cases']
)

# Pairwise correlation between the indicators
correlation_matrix = correlation_data.corr()

# Annotated heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Heatmap (East Africa - COVID-19 Indicators)')
fig.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Vaccination time series: keep only rows where both vaccination counts are present
vax_data = east_africa_data_cleaned[['date', 'location', 'people_vaccinated', 'people_fully_vaccinated']].dropna()

# Make sure 'date' is datetime (no-op if it already is)
vax_data['date'] = pd.to_datetime(vax_data['date'])

# Cumulative people vaccinated over time, one line per country
plt.figure(figsize=(12, 6))
sns.lineplot(data=vax_data, x='date', y='people_vaccinated', hue='location')
plt.title('People Vaccinated Over Time (East Africa)')
plt.xlabel('Date')
plt.ylabel('Cumulative People Vaccinated')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Latest vaccination figure per country.
# Rows without `people_vaccinated_per_hundred` are dropped BEFORE picking the
# last row per country: taking each country's last row regardless selects a
# missing value whenever the most recent record has no vaccination figure,
# which leaves that country's bar empty. Countries with no vaccination figure
# at all do not appear in `latest_vax`.
latest_vax = (
    east_africa_data_cleaned
    .dropna(subset=['people_vaccinated_per_hundred'])
    .sort_values('date')
    .groupby('location')
    .tail(1)
)

# Share of population vaccinated, one bar per country
plt.figure(figsize=(10, 5))
sns.barplot(data=latest_vax, x='location', y='people_vaccinated_per_hundred')
plt.title('Percentage of Population Vaccinated (East Africa)')
plt.ylabel('% Vaccinated')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()



In [None]:
# Timestamp suffix so each run writes to its own set of files
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# Destination file for each exported result
output_files = {
    'cleaned_data': f'east_africa_covid_cleaned_{timestamp}.csv',
    'vaccination_data': f'east_africa_vaccination_{timestamp}.csv',
    'correlation_data': f'east_africa_correlation_{timestamp}.csv'
}

# Write each result frame to its CSV, without the index column
frames_to_save = (
    ('cleaned_data', east_africa_data_filled_interp),
    ('vaccination_data', vax_data),
    ('correlation_data', correlation_data),
)
for key, frame in frames_to_save:
    frame.to_csv(output_files[key], index=False)

# Report where everything was written
print("\nFiles saved successfully:")
for key in output_files:
    filepath = output_files[key]
    print(f"{key}: {filepath}")