In [1]:
!pip install jovian opendatasets --upgrade --quiet
!pip install plotly --quiet
!pip install folium --quiet


In [2]:
# Data manipulation
import pandas as pd
import numpy as np

# Data visualization
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns 
import folium

# Statistics 
import scipy
import statsmodels as sms
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf


In [3]:
covid19_df = pd.read_csv("COVID19.csv")

In [4]:
covid19_df

Unnamed: 0,date,location,new_cases,new_deaths,total_cases,total_deaths,weekly_cases,weekly_deaths,biweekly_cases,biweekly_deaths
0,2020-02-24,Afghanistan,1.0,,1.0,,,,,
1,2020-02-25,Afghanistan,0.0,,1.0,,,,,
2,2020-02-26,Afghanistan,0.0,,1.0,,,,,
3,2020-02-27,Afghanistan,0.0,,1.0,,,,,
4,2020-02-28,Afghanistan,0.0,,1.0,,,,,
...,...,...,...,...,...,...,...,...,...,...
73721,2021-03-10,Zimbabwe,20.0,0.0,36341.0,1489.0,162.0,11.0,381.0,33.0
73722,2021-03-11,Zimbabwe,36.0,3.0,36377.0,1492.0,154.0,9.0,383.0,34.0
73723,2021-03-12,Zimbabwe,46.0,4.0,36423.0,1496.0,175.0,12.0,379.0,33.0
73724,2021-03-13,Zimbabwe,48.0,5.0,36471.0,1501.0,211.0,16.0,413.0,38.0


In [5]:
covid19_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73726 entries, 0 to 73725
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   date             73726 non-null  object 
 1   location         73726 non-null  object 
 2   new_cases        73717 non-null  float64
 3   new_deaths       64648 non-null  float64
 4   total_cases      73719 non-null  float64
 5   total_deaths     64490 non-null  float64
 6   weekly_cases     72716 non-null  float64
 7   weekly_deaths    72716 non-null  float64
 8   biweekly_cases   71323 non-null  float64
 9   biweekly_deaths  71323 non-null  float64
dtypes: float64(8), object(2)
memory usage: 5.6+ MB


In [6]:
covid19_df.describe()

Unnamed: 0,new_cases,new_deaths,total_cases,total_deaths,weekly_cases,weekly_deaths,biweekly_cases,biweekly_deaths
count,73717.0,64648.0,73719.0,64490.0,72716.0,72716.0,71323.0,71323.0
mean,5172.018286,132.052082,655254.6,19376.8,36284.25,814.08122,73052.11,1640.785665
std,32304.463913,723.035537,4603780.0,113441.2,224089.9,4627.067406,449233.2,9270.537149
min,-74347.0,-1918.0,1.0,1.0,-43561.0,-1625.0,-4849.0,-1616.0
25%,2.0,0.0,721.0,38.0,42.0,0.0,97.0,1.0
50%,59.0,2.0,8135.0,253.0,506.0,7.0,1067.0,16.0
75%,675.0,16.0,87482.5,2409.75,4908.0,86.0,9917.0,177.0
max,880902.0,17895.0,119875200.0,2653652.0,5176951.0,100968.0,9880398.0,197872.0


In [7]:
# Checking null values
covid19_df.isnull().sum()

date                  0
location              0
new_cases             9
new_deaths         9078
total_cases           7
total_deaths       9236
weekly_cases       1010
weekly_deaths      1010
biweekly_cases     2403
biweekly_deaths    2403
dtype: int64

In [8]:
# Removing null values
covid19_df["new_cases"].fillna(0, inplace=True)
covid19_df["new_deaths"].fillna(0, inplace=True)
covid19_df["total_cases"].fillna(0, inplace=True)
covid19_df["total_deaths"].fillna(0, inplace=True)
covid19_df["weekly_cases"].fillna(0, inplace=True)
covid19_df["weekly_deaths"].fillna(0, inplace=True)
covid19_df["biweekly_cases"].fillna(0, inplace=True)
covid19_df["biweekly_deaths"].fillna(0, inplace=True)

In [9]:
# Checking again
covid19_df.isnull().sum()

date               0
location           0
new_cases          0
new_deaths         0
total_cases        0
total_deaths       0
weekly_cases       0
weekly_deaths      0
biweekly_cases     0
biweekly_deaths    0
dtype: int64

In [10]:
# Type Check
covid19_df["new_cases"].dtypes
covid19_df["new_deaths"].dtypes
covid19_df["total_cases"].dtypes
covid19_df["total_deaths"].dtypes
covid19_df["weekly_cases"].dtypes
covid19_df["weekly_deaths"].dtypes
covid19_df["biweekly_cases"].dtypes
covid19_df["biweekly_deaths"].dtypes

dtype('float64')

In [11]:
# Set types into integer
covid19_df["new_cases"] = covid19_df["new_cases"].fillna(0).astype(np.int64)
covid19_df["new_deaths"] = covid19_df["new_deaths"].fillna(0).astype(np.int64)
covid19_df["total_cases"] = covid19_df["total_cases"].fillna(0).astype(np.int64)
covid19_df["total_deaths"] = covid19_df["total_deaths"].fillna(0).astype(np.int64)
covid19_df["weekly_cases"] = covid19_df["weekly_cases"].fillna(0).astype(np.int64)
covid19_df["weekly_deaths"] = covid19_df["weekly_deaths"].fillna(0).astype(np.int64)
covid19_df["biweekly_cases"] = covid19_df["biweekly_cases"].fillna(0).astype(np.int64)
covid19_df["biweekly_deaths"] = covid19_df["biweekly_deaths"].fillna(0).astype(np.int64)

In [12]:
# Type Check Agin
covid19_df["new_cases"].dtypes
covid19_df["new_deaths"].dtypes
covid19_df["total_cases"].dtypes
covid19_df["total_deaths"].dtypes
covid19_df["weekly_cases"].dtypes
covid19_df["weekly_deaths"].dtypes
covid19_df["biweekly_cases"].dtypes
covid19_df["biweekly_deaths"].dtypes

dtype('int64')

In [13]:
covid19_df["date"] = pd.to_datetime(covid19_df["date"], format = '%Y-%m-%d')
covid19_df = covid19_df.replace([np.inf, -np.inf], np.nan)
covid19_df.fillna(0)

Unnamed: 0,date,location,new_cases,new_deaths,total_cases,total_deaths,weekly_cases,weekly_deaths,biweekly_cases,biweekly_deaths
0,2020-02-24,Afghanistan,1,0,1,0,0,0,0,0
1,2020-02-25,Afghanistan,0,0,1,0,0,0,0,0
2,2020-02-26,Afghanistan,0,0,1,0,0,0,0,0
3,2020-02-27,Afghanistan,0,0,1,0,0,0,0,0
4,2020-02-28,Afghanistan,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
73721,2021-03-10,Zimbabwe,20,0,36341,1489,162,11,381,33
73722,2021-03-11,Zimbabwe,36,3,36377,1492,154,9,383,34
73723,2021-03-12,Zimbabwe,46,4,36423,1496,175,12,379,33
73724,2021-03-13,Zimbabwe,48,5,36471,1501,211,16,413,38


In [14]:
covid19_df.location.unique()

array(['Afghanistan', 'Africa', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Asia', 'Australia',
       'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh',
       'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan',
       'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil',
       'Brunei', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia',
       'Cameroon', 'Canada', 'Cape Verde', 'Central African Republic',
       'Chad', 'Chile', 'China', 'Colombia', 'Comoros', 'Congo',
       'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cuba', 'Cyprus',
       'Czechia', 'Democratic Republic of Congo', 'Denmark', 'Djibouti',
       'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt',
       'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia',
       'Eswatini', 'Ethiopia', 'Europe', 'European Union', 'Fiji',
       'Finland', 'France', 'Gabon', 'Gambia', 'Georgia', 'Germany',
       'Ghana', 'Greece', 'Grenada', 'Gu

<h3><strong>Separating the columns with total values</strong></h3>

In [15]:
countries_total_df = covid19_df.groupby("location").sum().reset_index()[["location","total_cases", "total_deaths"]]
countries_total_df

Unnamed: 0,location,total_cases,total_deaths
0,Afghanistan,12488780,471323
1,Africa,553411806,13668942
2,Albania,10008196,199035
3,Algeria,18492555,590044
4,Andorra,1410351,22848
...,...,...,...
194,Vietnam,364377,7463
195,World,15247572791,390024883
196,Yemen,543773,154447
197,Zambia,6621342,112247


<h3><strong>Separating the columns with average values</strong></h3>

In [16]:
countries_mean_df = covid19_df.groupby("location").mean().reset_index()[["location", "new_cases", "new_deaths", "total_cases", "weekly_cases", "weekly_deaths", "biweekly_cases","biweekly_deaths"]]
countries_mean_df

Unnamed: 0,location,new_cases,new_deaths,total_cases,weekly_cases,weekly_deaths,biweekly_cases,biweekly_deaths
0,Afghanistan,145.415584,6.381818,3.243839e+04,1.016782e+03,44.594805,2.031156e+03,89.085714
1,Africa,10191.181818,272.166667,1.397505e+06,7.078908e+04,1890.654040,1.403530e+05,3745.391414
2,Albania,316.641509,5.512129,2.697627e+04,2.179000e+03,37.743935,4.262143e+03,73.126685
3,Algeria,300.169271,7.906250,4.815770e+04,2.093406e+03,55.195312,4.166995e+03,109.898438
4,Andorra,29.804233,0.298942,3.731087e+03,2.066296e+02,2.076720,4.097328e+02,4.140212
...,...,...,...,...,...,...,...,...
194,Vietnam,6.124700,0.083933,8.738058e+02,4.258753e+01,0.587530,8.424700e+01,1.175060
195,World,284812.011962,6348.409091,3.647745e+07,1.971734e+06,44023.354067,3.895070e+06,87040.303828
196,Yemen,8.365782,2.032448,1.604050e+03,5.532743e+01,13.823009,1.046224e+02,27.129794
197,Zambia,234.245856,3.198895,1.829100e+04,1.619608e+03,22.146409,3.180370e+03,43.616022


<h3><strong>Removing the non-country in dataset</strong></h3>

In [17]:
# Removing the location "World" in the dataset
countries_total_df = countries_total_df.drop(195)
# Removing the location "North America" in the dataset
countries_total_df = countries_total_df.drop(130)
# Removing the location "Europe" in the dataset
countries_total_df = countries_total_df.drop(60)
# Removing the location "Asia" in the dataset
countries_total_df = countries_total_df.drop(9)
# Removing the location "South America" in the dataset
countries_total_df = countries_total_df.drop(165)
# Removing the location "European Union" in the dataset
countries_total_df = countries_total_df.drop(61)
# Removing the location "Africa" in the dataset
countries_total_df = countries_total_df.drop(1)

countries_total_df

Unnamed: 0,location,total_cases,total_deaths
0,Afghanistan,12488780,471323
2,Albania,10008196,199035
3,Algeria,18492555,590044
4,Andorra,1410351,22848
5,Angola,2788570,71196
...,...,...,...
193,Venezuela,21740308,195892
194,Vietnam,364377,7463
196,Yemen,543773,154447
197,Zambia,6621342,112247


In [18]:
# Removing the location "World" in the dataset
countries_mean_df = countries_mean_df.drop(195)
# Removing the location "North America" in the dataset
countries_mean_df = countries_mean_df.drop(130)
# Removing the location "Europe" in the dataset
countries_mean_df = countries_mean_df.drop(60)
# Removing the location "Asia" in the dataset
countries_mean_df = countries_mean_df.drop(9)
# Removing the location "South America" in the dataset
countries_mean_df = countries_mean_df.drop(165)
# Removing the location "European Union" in the dataset
countries_df = countries_df.drop(61)
# Removing the location "Africa" in the dataset
countries_df = countries_df.drop(1)

countries_df

NameError: name 'countries_df' is not defined

In [None]:
covid19_df.head()

<h3><strong>Weekly and biweekly column should express in mean</strong></h3>

In [None]:
countries_weekly_datas = countries_df.groupby("location").mean().reset_index()[["location", "weekly_cases", "weekly_deaths", "biweekly_cases","biweekly_deaths"]]
countries_weekly_datas

In [None]:
countries_daily_datas = covid19_df.groupby("location").mean().reset_index()[["location", "new_cases", "new_deaths",  ]]
countries_daily_datas

<h3><strong>Q: How many countries have no total tests result in the dataset?</strong></h3>

In [None]:
total_new_cases_missing = covid19_df.isna().sum()["location"]

In [None]:
print("The data for total tests is missing for {} countries.".format(int(total_new_cases_missing)))

In [None]:
countries_total_datas.describe()

<h3><strong>Q: What is the total cases of all the countries listed in this dataset?</strong></h3>

In [None]:
total_world_cases = countries_total_datas["total_cases"].sum()

In [None]:
print("The total world cases is {}.".format(int(total_world_cases)))

<h3><strong>Q: What are the 10 countries with the highest cases?</strong></h3>

In [None]:
most_cases_countries = countries_total_datas.sort_values(by="total_cases", ascending = False)

In [None]:
most_cases_countries.head(10)

<h3><strong>Q: What are the 10 countries with the least total cases?</strong></h3>

In [None]:
most_cases_countries.tail(10)