### *--DATA LOADING--*

1. Import the required libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

2. Reading the CSV file into a DataFrame

In [None]:
# Location of the raw dataset, relative to the notebook's working directory.
RAW_CSV_PATH = 'Covid_Dataset.csv'

# Load the raw data into the DataFrame that every later cell works on.
df = pd.read_csv(RAW_CSV_PATH)

3. Display the first five rows of the DataFrame to confirm it's loaded

In [None]:
# Preview the first five rows (5 is the default; spelled out here for clarity).
df.head(n=5)

4. Print a summary of the DataFrame to check data types and missing values


In [None]:
# Print column dtypes and non-null counts for the raw frame, before any cleaning.
df.info()

### *--DATA CLEANING--*

In [None]:
# Numeric columns that may arrive as text with thousands separators (e.g. "1,234").
cols_to_clean = ['Total Cases', 'Total Deaths', 'Total Recovered', 'Active Cases', 'Total Test', 'Population']

for col in cols_to_clean:
    # Skip names that are absent (e.g. when this cell is re-run after the rename below).
    if col in df.columns:
        # Strip commas, then parse; anything that still is not a number becomes NaN.
        without_commas = df[col].astype(str).str.replace(',', '', regex=False)
        df[col] = pd.to_numeric(without_commas, errors='coerce')

# Fill missing values with 0 only AFTER the numeric conversion. Doing it first (as this cell
# used to) left every value that failed to parse as NaN, which then leaked into later cells.
# NOTE(review): this treats "not reported" as 0, so zero-count statistics downstream include
# countries with missing data - confirm that is intended.
df = df.fillna(0)

# Normalise column names to snake_case ('Total Cases' -> 'total_cases'); later cells rely on
# these lowercase names.
df.columns = df.columns.str.replace(' ', '_').str.lower()
print("Cleaned Column Names:\n")
print(df.columns)
print("\n" + "="*50 + "\n")

### *--LOADING CLEAN DATA--*

6. Display the first five rows of the Cleaned DataFrame

In [None]:
# Preview the first five rows of the cleaned frame (5 is the default row count).
df.head(n=5)

7. Print a summary of the Cleaned DataFrame

In [None]:
# Print column dtypes and non-null counts again to check the result of the cleaning step.
df.info()

8. Save the cleaned dataset to a new CSV file

In [None]:
# Destination for the cleaned dataset, relative to the notebook's working directory.
CLEANED_CSV_PATH = 'Cleaned_Covid_Dataset.csv'

# Persist the cleaned frame; index=False keeps the row index out of the file.
df.to_csv(CLEANED_CSV_PATH, index=False)

### *--DATA ANALYSIS--*

1. Define the colour palette for all plots

In [None]:
# Shared greyscale palette used by the styling and plotting cells.
white_color = '#FFFFFF'  # axes edges, grid lines, histogram bar edges
light_grey = '#F5F5F5'   # figure, axes and legend backgrounds
medium_grey = '#778899'  # regression line and box-plot fill
dark_grey = '#36454F'    # bars, scatter markers and histogram fill

2. Identify and display the top 10 countries with the highest total cases ?

In [None]:
# Keep just the identifying column and the metric, then take the ten largest case counts.
top_10_cases = df[['country', 'total_cases']].nlargest(10, 'total_cases')

print("\n" + "=" * 50 + "\n")
print("Top 10 Countries by Total Cases:\n")
print(top_10_cases)

3. Calculate Mortality Rate for each country and add as a new column ?

In [None]:
# Deaths as a fraction of reported cases (a ratio, not a percentage).
raw_mortality = df['total_deaths'] / df['total_cases']

# Zero-case countries divide by zero: turn the resulting +/-inf into NaN, then NaN into 0.
df['mortality_rate'] = raw_mortality.replace([np.inf, -np.inf], np.nan).fillna(0)

print("\n" + "=" * 50 + "\n")
print("Added 'mortality_rate' column to the DataFrame.")
print(df[['country', 'mortality_rate']].head())

4. Find the country with the highest Mortality Rate (among countries with more than 100,000 cases)

In [None]:
# Only consider countries above the 100,000-case threshold.
filtered_df = df.loc[df['total_cases'] > 100_000]

# Take the single row with the largest mortality rate among them.
highest_mortality_country = filtered_df[['country', 'mortality_rate']].nlargest(1, 'mortality_rate')

print("\n" + "=" * 50 + "\n")
print("Country with highest Mortality Rate (for > 100,000 cases):")
print(highest_mortality_country)


5. Create a new column for tests per 1000 people ?

In [None]:
# Tests performed per 1,000 inhabitants.
df['tests_per_1000_people'] = (df['total_test'] / df['population']) * 1000

# A zero population yields +/-inf when tests are non-zero and NaN when tests are also zero
# (0 / 0). Map both to 0, the same way the mortality-rate cell does; replacing only the
# infinities left NaN behind for the 0 / 0 rows.
df['tests_per_1000_people'] = (
    df['tests_per_1000_people'].replace([np.inf, -np.inf], np.nan).fillna(0)
)

print("\n" + "="*50 + "\n")
print("Added 'tests_per_1000_people' column.")
print(df[['country', 'tests_per_1000_people']].head())

6. Find the total number of tests in the top 10 countries by total cases ?

In [None]:
# The ten countries are ranked by total cases; their test counts are then summed.
top_10_by_cases = df.nlargest(10, 'total_cases')
total_tests_top_10 = top_10_by_cases['total_test'].sum()

print("\n" + "=" * 50 + "\n")
# int() drops the float representation so the thousands separator formats cleanly.
print(f"Total tests in top 10 countries by cases: {int(total_tests_top_10):,}")


7. Find the number of countries with zero total cases ?

In [None]:
# Count the rows whose total_cases value is exactly 0.
zero_case_rows = df[df['total_cases'] == 0]
countries_with_zero_cases = len(zero_case_rows)

print("\n" + "=" * 50 + "\n")
print(f"Number of countries with 0 total cases: {countries_with_zero_cases}")

8. Identify the top 5 countries with the highest total recovered cases ?

In [None]:
# Keep the country name and the metric, then take the five largest recovered counts.
top_5_recovered = df[['country', 'total_recovered']].nlargest(5, 'total_recovered')

print("\n" + "=" * 50 + "\n")
print("Top 5 Countries by Total Recovered Cases:\n")
print(top_5_recovered)


### *--VISUALIZATIONS--*

In [None]:
# Start from matplotlib's defaults so the overrides below are applied to a known baseline.
plt.style.use('default')

# Notebook-wide theme: light-grey backgrounds, white edges and grid, black text.
theme_overrides = {
    # Backgrounds
    'figure.facecolor': light_grey,
    'axes.facecolor': light_grey,
    # Frame and grid
    'axes.edgecolor': white_color,
    'grid.color': white_color,
    'grid.alpha': 0.5,
    # Text, labels and ticks
    'text.color': 'black',
    'axes.labelcolor': 'black',
    'xtick.color': 'black',
    'ytick.color': 'black',
    'font.family': 'sans-serif',
}
plt.rcParams.update(theme_overrides)

9. Bar chart showing the top 15 countries by total cases ?

In [None]:
# The 15 countries with the most reported cases, largest first.
top_15_cases = df.nlargest(15, 'total_cases')

# Scale explicitly so the axis label names the unit that is actually drawn. The raw counts
# used to be plotted under a "(in billions)" label, which nothing in the code enforced.
cases_in_millions = top_15_cases['total_cases'] / 1e6

plt.figure(figsize=(12, 8))
plt.bar(top_15_cases['country'], cases_in_millions, color=dark_grey)
plt.title('Top 15 Countries by Total Cases', color='black', fontsize=16, fontweight='bold')
plt.xlabel('Country', color='black', fontsize=12)
plt.ylabel('Total Cases (in millions)', color='black', fontsize=12)
# Rotate and right-align the country names so long labels do not overlap.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

10. Horizontal bar chart for Mortality Rate of top 10 countries by total deaths ?

In [None]:
# Countries are selected by total deaths, but the bars show each one's mortality rate.
top_10_deaths = df.nlargest(10, 'total_deaths')

fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(top_10_deaths['country'], top_10_deaths['mortality_rate'], color=dark_grey)
ax.set_title('Mortality Rate for Top 10 Countries by Total Deaths', color='black', fontsize=16, fontweight='bold')
ax.set_xlabel('Mortality Rate', color='black', fontsize=12)
ax.set_ylabel('Country', color='black', fontsize=12)
# barh draws the first row at the bottom; flip so the country with most deaths is on top.
ax.invert_yaxis()
fig.tight_layout()
plt.show()

11. Scatter plot to visualize relationship between Total Cases and Total Deaths ?

In [None]:
# Scale both series explicitly so each axis label names the unit that is actually drawn.
# The raw counts used to be plotted under "(in billions)" / "(in millions)" labels.
cases_in_millions = df['total_cases'] / 1e6
deaths_in_thousands = df['total_deaths'] / 1e3

plt.figure(figsize=(10, 8))
plt.scatter(cases_in_millions, deaths_in_thousands, color=dark_grey, alpha=0.7)
plt.title('Relationship between Total Cases and Total Deaths', color='black', fontsize=16, fontweight='bold')
plt.xlabel('Total Cases (in millions)', color='black', fontsize=12)
plt.ylabel('Total Deaths (in thousands)', color='black', fontsize=12)
plt.tight_layout()
plt.show()

12. Overlay a regression line on the scatter plot ?

In [None]:
# Fit on countries that report at least one case. Rows with a missing death count are
# dropped as well: NaN values would break or poison the least-squares fit.
non_zero_cases_df = df[df['total_cases'] > 0].dropna(subset=['total_deaths'])
x = non_zero_cases_df['total_cases']
y = non_zero_cases_df['total_deaths']

# Fit a first-degree polynomial (straight line) on the raw counts.
z = np.polyfit(x, y, 1)
p = np.poly1d(z)

# Scale for display only, so each axis label names the unit that is actually drawn. The raw
# counts used to be plotted under "(in billions)" / "(in millions)" labels.
cases_unit = 1e6
deaths_unit = 1e3

plt.figure(figsize=(10, 8))
plt.scatter(x / cases_unit, y / deaths_unit, color=dark_grey, alpha=0.7, label='Data Points')
plt.plot(x / cases_unit, p(x) / deaths_unit, color=medium_grey, linewidth=2, label='Regression Line')
plt.title('Total Cases vs. Total Deaths with Regression Line', color='black', fontsize=16, fontweight='bold')
plt.xlabel('Total Cases (in millions)', color='black', fontsize=12)
plt.ylabel('Total Deaths (in thousands)', color='black', fontsize=12)
plt.legend(facecolor=light_grey)
plt.tight_layout()
plt.show()

13. Histogram of the Population column ?

In [None]:
# Drop non-positive populations; a 0 here most likely stands for a missing value.
filtered_population = df[df['population'] > 0]['population']

# Scale explicitly so the axis label names the unit that is actually drawn. The raw counts
# used to be plotted under a "(in billions)" label.
population_in_millions = filtered_population / 1e6

plt.figure(figsize=(10, 6))
plt.hist(population_in_millions, bins=50, color=dark_grey, edgecolor=white_color)
plt.title('Distribution of Population Worldwide', color='black', fontsize=16, fontweight='bold')
plt.xlabel('Population (in millions)', color='black', fontsize=12)
plt.ylabel('Number of Countries', color='black', fontsize=12)
plt.tight_layout()
plt.show()

14. Scatter plot of tests per 1000 people vs. Total Cases (population > 50M)

In [None]:
# Restrict to countries with more than 50 million inhabitants.
filtered_df_pop = df[df['population'] > 50000000]

# Scale explicitly so the axis label names the unit that is actually drawn. The raw counts
# used to be plotted under a "(in billions)" label.
cases_in_millions = filtered_df_pop['total_cases'] / 1e6

plt.figure(figsize=(10, 8))
plt.scatter(
    filtered_df_pop['tests_per_1000_people'],
    cases_in_millions,
    color=dark_grey,
    alpha=0.7
)
plt.title('Tests per 1000 people vs. Total Cases (Population > 50M)', color='black', fontsize=16, fontweight='bold')
plt.xlabel('Tests per 1000 people', color='black', fontsize=12)
plt.ylabel('Total Cases (in millions)', color='black', fontsize=12)
plt.tight_layout()
plt.show()

15. Create a box plot to show the distribution of total_cases ?

In [None]:
# Leave out zero-case rows so they do not pile up at the left edge of the box plot.
has_cases = df['total_cases'] > 0
filtered_cases = df.loc[has_cases, 'total_cases']

fig, ax = plt.subplots(figsize=(10, 6))
# patch_artist=True draws the box as a filled patch so facecolor takes effect.
ax.boxplot(filtered_cases, vert=False, patch_artist=True, boxprops=dict(facecolor=medium_grey))
ax.set_title('Distribution of Total Cases Worldwide', color='black', fontsize=16, fontweight='bold')
ax.set_xlabel('Total Cases', color='black', fontsize=12)
# Single box at position 1: keep the tick but blank its label.
ax.set_yticks([1])
ax.set_yticklabels([''])
ax.grid(True, linestyle='--', alpha=0.6)
fig.tight_layout()
plt.show()

16. Scatter plot with a log scale for Population vs Total Cases ?

In [None]:
# A log axis cannot show zero or negative values, so keep strictly positive rows only.
positive_rows = (df['population'] > 0) & (df['total_cases'] > 0)
log_df = df[positive_rows]

fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(log_df['population'], log_df['total_cases'], color=dark_grey, alpha=0.7)
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_title('Log-Log Scatter Plot: Population vs. Total Cases', color='black', fontsize=16, fontweight='bold')
ax.set_xlabel('Population (Log Scale)', color='black', fontsize=12)
ax.set_ylabel('Total Cases (Log Scale)', color='black', fontsize=12)
# which="both" draws grid lines at minor ticks too, which helps reading log scales.
ax.grid(True, which="both", ls="--")
fig.tight_layout()
plt.show()

17. Horizontal bar chart for the top 10 countries by active cases ?

In [None]:
# The ten countries with the most active cases, largest first.
top_10_active = df.nlargest(10, 'active_cases')

fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(top_10_active['country'], top_10_active['active_cases'], color=dark_grey)
ax.set_title('Top 10 Countries by Active Cases', color='black', fontsize=16, fontweight='bold')
ax.set_xlabel('Active Cases', color='black', fontsize=12)
ax.set_ylabel('Country', color='black', fontsize=12)
# barh draws the first row at the bottom; flip so the largest value is on top.
ax.invert_yaxis()
fig.tight_layout()
plt.show()

### *--THE END--*