In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# import glob
# import gc

import seaborn as sns
from seaborn import objects
sns.set_theme()
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Import Dataset

In [None]:
# Read the cleaned COVID-19 table from the working directory into `df`
df = pd.read_csv(filepath_or_buffer="covid_19_clean_complete.csv").copy()

## Data Description and Summary

In [None]:
# Preview the first rows of the freshly loaded frame
df.head()

In [None]:
# Preview the last rows of the frame
df.tail()

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Parse the "Date" column into datetime values so the .dt accessor can be used on it
df = df.assign(Date=pd.to_datetime(df["Date"]))

In [None]:
# Column dtypes and non-null counts; run after the Date conversion above
df.info()

In [None]:
# Summary statistics for the object-dtype (text) columns, with "Date" left out explicitly
df_obj = df.drop(columns=["Date"]).select_dtypes(include=object)
df_obj.describe()

In [None]:
# Summary statistics for the integer columns; df_int is reused for the correlation matrix
df_int = df.select_dtypes(include=int)
df_int.describe()

In [None]:
# Number of rows that are exact duplicates of an earlier row
df.duplicated().sum()

In [None]:
# Missing-value count per column
df.isna().sum()

In [None]:
# Calendar features derived from the parsed Date column.
# Note that "Day" stores the weekday name (e.g. "Monday"), not the day of the month.
df["Year"] = df["Date"].dt.year
df["Month"] = df["Date"].dt.month_name()
df["Day"] = df["Date"].dt.day_name()

In [None]:
# Preview again to check the newly added Year / Month / Day columns
df.head()

In [None]:
# Distinct month names present in the data
df.Month.unique()

In [None]:
# Distinct weekday names present in the data ("Day" was filled from day_name())
df.Day.unique()

In [None]:
# Distinct years covered by the data
df.Year.unique()

## Data Exploration and Visualization

In [None]:
# Pairwise correlation between the integer count columns collected in df_int
corr_df = df_int.corr()

# savefig does not create missing folders, so make sure the output directory
# exists before the first figure of the notebook is written to it.
os.makedirs("plots", exist_ok=True)

fig, ax = plt.subplots()
sns.heatmap(corr_df, ax=ax)
ax.set_title("CORRELATION OF FOUR IMPORTANT COVID-19 DATA COLUMNS")
fig.savefig("plots/correlation viz.png")

In [None]:
# Bubble map of confirmed cases: marker size and colour both encode "Confirmed"
fig = px.scatter_geo(
    df,
    lat="Lat",
    lon="Long",
    size="Confirmed",
    color="Confirmed",
    hover_name="Country/Region",
    title="Confirmed Cases Globally",
)

# Export a static 1000x600 PNG copy, then render the interactive figure
pio.write_image(fig, "plots/confirmed_cases.png", format="png", width=1000, height=600)
fig.show()

In [None]:
# Bubble map of deaths: marker size and colour both encode "Deaths"
fig = px.scatter_geo(
    df,
    lat="Lat",
    lon="Long",
    size="Deaths",
    color="Deaths",
    hover_name="Country/Region",
    title="Deaths Globally",
)

# Export a static 1000x600 PNG copy, then render the interactive figure
pio.write_image(fig, "plots/death_cases.png", format="png", width=1000, height=600)
fig.show()

In [None]:
# Bubble map of recoveries: marker size and colour both encode "Recovered"
fig = px.scatter_geo(
    df,
    lat="Lat",
    lon="Long",
    size="Recovered",
    color="Recovered",
    hover_name="Country/Region",
    title="Recovery Globally",
)

# Export a static 1000x600 PNG copy, then render the interactive figure
pio.write_image(fig, "plots/recovered_cases.png", format="png", width=1000, height=600)
fig.show()

In [None]:
# Build a per-country totals table (df_grouped) and keep the ten countries with
# the most confirmed cases (top_10_countries).
#
# The count columns are treated here as running totals reported once per date.
# NOTE(review): confirm this against the dataset description.
# Summing running totals over every date would add each day's total to the next
# and inflate the result, so only the rows of the most recent date are
# aggregated; the sum then just combines the rows that share a country.
count_cols = ['Confirmed', 'Active', 'Recovered', 'Deaths']
latest_rows = df[df['Date'] == df['Date'].max()]

# One row per country, largest confirmed total first
df_grouped = (
    latest_rows
    .groupby('Country/Region')[count_cols]
    .sum()
    .reset_index()
    .sort_values(by='Confirmed', ascending=False)
)

# Select the top 10 countries
top_10_countries = df_grouped.head(10)

# Display the result
top_10_countries

In [None]:
# Line plot of "Confirmed" against "Month" over the whole frame, saved as a PNG
fig, ax = plt.subplots()
sns.lineplot(data=df, x="Month", y="Confirmed", ax=ax)
ax.set_title("GLOBAL SPREAD OF COVID-19")
fig.savefig("plots/GLOBAL SPREAD OF COVID-19.png")

In [None]:
# Same monthly line plot, with one line (colour and dash style) per WHO region
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(
    data=df,
    x="Month",
    y="Confirmed",
    hue="WHO Region",
    style="WHO Region",
    ax=ax,
)
ax.set_title("TREND OF VIRAL CASES ACROSS CONTINENTS")
fig.savefig("plots/TREND OF VIRAL CASES ACROSS CONTINENTS.png")

In [None]:
# Rows are ordered by descending "Confirmed" before being handed to the bar plot
df_sorted = df.sort_values("Confirmed", ascending=False)

# Horizontal bars of "Confirmed" per WHO region, drawn without error bars
fig, ax = plt.subplots()
sns.barplot(
    data=df_sorted,
    x="Confirmed",
    y="WHO Region",
    hue="WHO Region",
    errorbar=None,
    ax=ax,
)
ax.set_title("Confirmed Cases by Continent")
ax.set_xlabel("Confirmed Cases")
fig.savefig("plots/Confirmed Cases by Continent.png")

In [None]:
# 2x2 grid of kernel density estimates, one panel per count column of df_grouped
fig, ax = plt.subplots(2, 2, figsize=(10, 10))

# ax.flat walks the grid row by row: Confirmed, Deaths on top; Recovered, Active below
for panel, column in zip(ax.flat, ["Confirmed", "Deaths", "Recovered", "Active"]):
    sns.kdeplot(data=df_grouped, x=column, ax=panel)
    # A KDE curve is a probability-density estimate, not a count of observations,
    # so the axis is labelled "Density" rather than "Frequency".
    panel.set_ylabel("Density")

fig.suptitle("DISTRIBUTION OF CASES")
fig.savefig("plots/DISTRIBUTION OF COVID-19 CASES.png")

The distributions of the case counts are concentrated at the left (low values) with a long right tail (that is, they are right-skewed) and contain outliers.

In [None]:
# Horizontal bar chart of the confirmed totals held in top_10_countries
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=top_10_countries, x='Confirmed', y='Country/Region', ax=ax)

ax.set_title('Top 10 Countries with Highest Confirmed Cases')
ax.set_xlabel('Number of Confirmed Cases')
ax.set_ylabel('Country/Region')

# Write the PNG first, then render the figure
fig.savefig('plots/Top_10_Countries_Confirmed_Cases.png')
plt.show()

In [None]:
# Line plot of "Deaths" against "Month" over the whole frame.
# NOTE: unlike the confirmed-cases plot, this figure is left untitled and is not written to disk.
fig, ax = plt.subplots()
sns.lineplot(data=df, x="Month", y="Deaths", ax=ax)

## Feature Engineering

In [None]:
# Mortality ratio: deaths expressed as a percentage of confirmed cases.
# Rows with zero confirmed cases divide by zero and come out as NaN/inf.
df['Mortality Ratio'] = df['Deaths'].div(df['Confirmed']).mul(100)

In [None]:
# Daily growth rate: percentage change in confirmed cases from one date to the
# next, computed separately for every reporting location.
#
# pct_change compares each row with the row directly before it, so the rows have
# to be in ascending date order and must never be compared across locations;
# otherwise "growth" is measured backwards in time and between unrelated places.
# NOTE(review): a location is identified here by country plus coordinates; if the
# data has a dedicated sub-region column, group by that instead.
location_keys = ['Country/Region', 'Lat', 'Long']
df = df.sort_values(by='Date', kind='stable')
growth = df.groupby(location_keys, dropna=False)['Confirmed'].pct_change() * 100

# Going from 0 to a positive count divides by zero and yields inf; store it as missing
df['Daily Growth Rate'] = growth.replace([np.inf, -np.inf], np.nan)

In [None]:
# World population table. The CSV has to be fetched beforehand, e.g. with
#   kaggle datasets download aungdev/world-population-dataset-world-bank

# Only the country identifiers and the "2020" column are read from the file
country_pop = pd.read_csv(
    "world_population_worldbank.csv",
    usecols=["Country Name", "Country Code", "2020"],
)
country_pop.head()

In [None]:
# Attach the country code and the "2020" population column to every row by
# matching "Country/Region" against the population table's "Country Name",
# then give the joined columns their final names.
df_new = (
    df.join(country_pop.set_index('Country Name'), on='Country/Region')
      .rename(columns={"Country/Region": "Country Name", "2020": "Population"})
)



In [None]:
# Confirmed cases scaled to a population of one million
df_new['Cases Per Million'] = df_new['Confirmed'].div(df_new['Population']).mul(1_000_000)

In [None]:
# Preview the first rows of the population-enriched frame
df_new.head()

In [None]:
# Preview the last rows of the population-enriched frame
df_new.tail()

In [None]:
# Missing-value count per column of the enriched frame.
# Rows whose country name found no match in the population table are presumably
# the ones with missing Country Code / Population (verify the name spellings).
df_new.isna().sum()