# **Connecting to drive**

In [35]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Importing libraries**

In [36]:
import pandas as pd
import numpy as np
import os
from geopy.geocoders import Nominatim # Longutude and Latitude
!pip install pycountry_convert
import pycountry_convert as pc
import warnings
warnings.filterwarnings('ignore')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# **Data Aggregation**

### 1. Automated importing and basic cleaning of the 25 dataframes from the website "Our World in Data", preparing them for merging
source: https://ourworldindata.org/

In [37]:
def Our_World_in_Data_CLEANING(csv_address, file_name):
  df = pd.read_csv(csv_address)
  df.rename(columns={'Entity':'Country','Day':'Date'},inplace=True)
  df.drop('Code', axis=1, inplace=True)

  return df

dir = '/content/drive/MyDrive/COVID 19 Data/Our_World_in_Data'

flag = 0
merged_df = pd.DataFrame()
DFcounter = 0
for file in os.listdir(dir):
  
  if file.endswith(".csv"):
    file_name = file
    csv_address = '/content/drive/MyDrive/COVID 19 Data/Our_World_in_Data/'+file_name

    df = Our_World_in_Data_CLEANING(csv_address, file_name)
    #if min(df['Date']) >= '2020-02-17':
    if min(df['Date']) > '2020-02-17':
      continue
    DFcounter+=1
    print(file_name,'countries:', len(df['Country'].unique()))
    print(file_name,'min date',min(df['Date']))
    print(file_name,'max date',max(df['Date']))
    print(' ')
    if flag == 0:
      merged_df =df
      flag = 1
    else:
      merged_df = pd.merge(df, merged_df, on=['Country','Date'])
print(DFcounter)

merged_df['Date'] = pd.to_datetime(merged_df['Date'])
print('[merged_df]')
print('Countries:', len(merged_df['Country'].unique()))
print('Min date',min(merged_df['Date']))
print('Max date',max(merged_df['Date']))
print('Shape',merged_df.shape)

covid-vaccination-policy.csv countries: 187
covid-vaccination-policy.csv min date 2020-01-01
covid-vaccination-policy.csv max date 2022-09-15
 
covid-vaccine-age.csv countries: 186
covid-vaccine-age.csv min date 2020-01-01
covid-vaccine-age.csv max date 2022-09-15
 
public-campaigns-covid.csv countries: 186
public-campaigns-covid.csv min date 2020-01-01
public-campaigns-covid.csv max date 2022-09-15
 
covid-containment-and-health-index.csv countries: 181
covid-containment-and-health-index.csv min date 2020-01-22
covid-containment-and-health-index.csv max date 2022-09-13
 
school-closures-covid.csv countries: 186
school-closures-covid.csv min date 2020-01-21
school-closures-covid.csv max date 2022-09-15
 
workplace-closures-covid.csv countries: 186
workplace-closures-covid.csv min date 2020-01-01
workplace-closures-covid.csv max date 2022-09-15
 
public-events-covid.csv countries: 186
public-events-covid.csv min date 2020-01-01
public-events-covid.csv max date 2022-09-15
 
public-gather

### 2. Retrieving and cleaning of 3 dataframes 

  
*   Renaming the columns
*   Converting 'Date' to date-time
*   Prepared them for merging 

Source: https://covid19.who.int/data

In [38]:
CumulativeDF = pd.read_csv('/content/drive/MyDrive/COVID 19 Data/WHO/WHO-COVID-19-global-data.csv')
CumulativeDF.rename(columns={'Date_reported':'Date','Country_code':'country code'},inplace=True)
CumulativeDF['Date'] = pd.to_datetime(CumulativeDF['Date'])

LatestDF = pd.read_csv('/content/drive/MyDrive/COVID 19 Data/WHO/WHO-COVID-19-global-table-data.csv')
LatestDF.reset_index(inplace=True)
LatestDF = LatestDF.rename(columns={'index':'Country'})
LatestDF.drop('Deaths - newly reported in last 24 hours',axis=1,inplace=True)
LatestDF.dropna(inplace=True)

VaccineDF = pd.read_csv('/content/drive/MyDrive/COVID 19 Data/WHO/vaccination-data.csv')
VaccineDF = VaccineDF.rename(columns={'COUNTRY':'Country'})
VaccineDF.dropna(inplace=True)

CumulativeDF = CumulativeDF[CumulativeDF['Country'].isin(list(VaccineDF['Country']))]
LatestDF = LatestDF[LatestDF['Country'].isin(list(VaccineDF['Country']))]
VaccineDF = VaccineDF[VaccineDF['Country'].isin(list(CumulativeDF['Country']))] # to assure that they all have the same countries which is 190

CumulativeDF.reset_index(inplace=True, drop=True)
LatestDF.reset_index(inplace=True, drop=True)
VaccineDF.reset_index(inplace=True, drop=True)

print('DF1:',str(len(CumulativeDF['Country'].unique()))+', DF2:',str(len(LatestDF['Country']))+', DF3:',len(VaccineDF['Country']))

DF1: 189, DF2: 189, DF3: 189


In [39]:
merged_df2 = pd.merge(LatestDF, CumulativeDF, on=['Country'])
merged_df2 = pd.merge(merged_df2, VaccineDF, on=['Country'])

merged_df2['Date'] = pd.to_datetime(merged_df2['Date'])
print('merged_df2 countries:', len(merged_df2['Country'].unique()))
print('merged_df2 min date',min(merged_df2['Date']))
print('merged_df2 max date',max(merged_df2['Date']))
print(' ')

print(type(merged_df2.Date[0]))

merged_df2 countries: 189
merged_df2 min date 2020-01-03 00:00:00
merged_df2 max date 2022-09-15 00:00:00
 
<class 'pandas._libs.tslibs.timestamps.Timestamp'>


### Merging the 3 dataframes with the 24 dataframes

In [40]:
merged_df = pd.merge(merged_df2, merged_df, on=['Country','Date'])
merged_df = merged_df.sort_values(by='Date')
print('[merged_df]')
print('Countries:', len(merged_df['Country'].unique()))
print('Min date',min(merged_df['Date']))
print('Max date',max(merged_df['Date']))
print('Shape',merged_df.shape)
print(' ')
merged_df['Date']

[merged_df]
Countries: 102
Min date 2020-02-17 00:00:00
Max date 2022-09-11 00:00:00
Shape (93768, 55)
 


0       2020-02-17
72578   2020-02-17
71654   2020-02-17
70737   2020-02-17
6362    2020-02-17
           ...    
58747   2022-09-11
52289   2022-09-11
85560   2022-09-11
25799   2022-09-11
93767   2022-09-11
Name: Date, Length: 93768, dtype: datetime64[ns]

In [41]:
print('Number of rows =', len(merged_df))
print('Number of columns =', len(merged_df.columns))
print('Number of countries =', len(merged_df['Country'].unique()))
print('Number of rows for each country =',len(merged_df)/len(merged_df['Country'].unique()))
print('With',merged_df.isna().sum().sum(),'Nulls')
print('Number of missing dates in the range between the min and max Dates =',
      len(pd.date_range(start=min(merged_df['Date']), end=max(merged_df['Date'])).difference(merged_df.Date)), '(No gaps in the date column)')

Number of rows = 93768
Number of columns = 55
Number of countries = 102
Number of rows for each country = 919.2941176470588
With 5241 Nulls
Number of missing dates in the range between the min and max Dates = 0 (No gaps in the date column)


In [42]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 93768 entries, 0 to 93767
Data columns (total 55 columns):
 #   Column                                                        Non-Null Count  Dtype         
---  ------                                                        --------------  -----         
 0   Country                                                       93768 non-null  object        
 1   Name                                                          93768 non-null  object        
 2   WHO Region                                                    93768 non-null  int64         
 3   Cases - cumulative total                                      93768 non-null  float64       
 4   Cases - cumulative total per 100000 population                93768 non-null  int64         
 5   Cases - newly reported in last 7 days                         93768 non-null  float64       
 6   Cases - newly reported in last 7 days per 100000 population   93768 non-null  int64         
 7   Case

In [43]:
merged_df.reset_index(inplace=True)
merged_df.drop('index', inplace=True, axis=1)

# **Feature Engineering**

### Added Day of the week


*   Monday = 0 
*   Sunday = 6






In [44]:
merged_df['Day of the week'] = list(merged_df['Date'].dt.day_of_week)
merged_df['Day of the week string'] = list(merged_df['Date'].dt.day_name())
merged_df

Unnamed: 0,Country,Name,WHO Region,Cases - cumulative total,Cases - cumulative total per 100000 population,Cases - newly reported in last 7 days,Cases - newly reported in last 7 days per 100000 population,Cases - newly reported in last 24 hours,Deaths - cumulative total,Deaths - cumulative total per 100000 population,...,restriction_gatherings,cancel_public_events,workplace_closures,school_closures,containment_index,public_information_campaigns,vaccine_availability,vaccination_policy,Day of the week,Day of the week string
0,India,South-East Asia,44516479,3225.822,37843,2.742,6422,528250,38.279,160,...,0,0,0,0,17.26,1,0,0,0,Monday
1,Kyrgyzstan,Europe,205920,3156.253,85,1.303,0,2991,45.845,0,...,0,0,0,0,11.31,2,0,0,0,Monday
2,Mozambique,Africa,230184,736.461,39,0.125,4,2222,7.109,0,...,0,0,0,0,5.36,1,0,0,0,Monday
3,Zimbabwe,Africa,256939,1728.724,105,0.706,0,5596,37.651,0,...,0,0,0,0,5.36,1,0,0,0,Monday
4,Australia,Western Pacific,10112229,39655.980,17318,67.914,0,14421,56.553,207,...,0,0,0,0,22.02,2,0,0,0,Monday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93763,Kuwait,Eastern Mediterranean,657745,15401.805,0,0.000,0,2563,60.015,0,...,0,0,0,0,19.64,1,3,5,6,Sunday
93764,Uruguay,Americas,982846,28293.679,1660,47.787,0,7462,214.812,7,...,0,0,0,0,32.14,2,3,5,6,Sunday
93765,Senegal,Africa,88230,526.937,63,0.376,0,1968,11.754,0,...,0,1,1,1,46.19,2,2,4,6,Sunday
93766,Philippines,Western Pacific,3911487,3569.491,15009,13.697,1709,62416,56.959,249,...,0,1,1,1,50.43,2,3,5,6,Sunday


### Parsed the "Date" column into three more features 


*   Day
*   Month
*   Year






In [45]:
merged_df['Day'] = pd.to_datetime(merged_df['Date']).dt.day
merged_df['Month'] = pd.to_datetime(merged_df['Date']).dt.month
merged_df['Year'] = pd.to_datetime(merged_df['Date']).dt.year
merged_df

Unnamed: 0,Country,Name,WHO Region,Cases - cumulative total,Cases - cumulative total per 100000 population,Cases - newly reported in last 7 days,Cases - newly reported in last 7 days per 100000 population,Cases - newly reported in last 24 hours,Deaths - cumulative total,Deaths - cumulative total per 100000 population,...,school_closures,containment_index,public_information_campaigns,vaccine_availability,vaccination_policy,Day of the week,Day of the week string,Day,Month,Year
0,India,South-East Asia,44516479,3225.822,37843,2.742,6422,528250,38.279,160,...,0,17.26,1,0,0,0,Monday,17,2,2020
1,Kyrgyzstan,Europe,205920,3156.253,85,1.303,0,2991,45.845,0,...,0,11.31,2,0,0,0,Monday,17,2,2020
2,Mozambique,Africa,230184,736.461,39,0.125,4,2222,7.109,0,...,0,5.36,1,0,0,0,Monday,17,2,2020
3,Zimbabwe,Africa,256939,1728.724,105,0.706,0,5596,37.651,0,...,0,5.36,1,0,0,0,Monday,17,2,2020
4,Australia,Western Pacific,10112229,39655.980,17318,67.914,0,14421,56.553,207,...,0,22.02,2,0,0,0,Monday,17,2,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93763,Kuwait,Eastern Mediterranean,657745,15401.805,0,0.000,0,2563,60.015,0,...,0,19.64,1,3,5,6,Sunday,11,9,2022
93764,Uruguay,Americas,982846,28293.679,1660,47.787,0,7462,214.812,7,...,0,32.14,2,3,5,6,Sunday,11,9,2022
93765,Senegal,Africa,88230,526.937,63,0.376,0,1968,11.754,0,...,1,46.19,2,2,4,6,Sunday,11,9,2022
93766,Philippines,Western Pacific,3911487,3569.491,15009,13.697,1709,62416,56.959,249,...,1,50.43,2,3,5,6,Sunday,11,9,2022


### Added weekend column

In [46]:
names = merged_df['Day of the week']
lst = []
for i in range(len(names)):
  if((names[i] == 5) or (names[i] == 6)):
    lst.append(1)
  else:
    lst.append(0)
    
merged_df['Weekend'] = lst
merged_df[merged_df['Weekend'] == 1]

Unnamed: 0,Country,Name,WHO Region,Cases - cumulative total,Cases - cumulative total per 100000 population,Cases - newly reported in last 7 days,Cases - newly reported in last 7 days per 100000 population,Cases - newly reported in last 24 hours,Deaths - cumulative total,Deaths - cumulative total per 100000 population,...,containment_index,public_information_campaigns,vaccine_availability,vaccination_policy,Day of the week,Day of the week string,Day,Month,Year,Weekend
510,Greece,Europe,4838811,45144.205,33829,315.611,0,32894,306.888,137,...,0.00,0,0,0,5,Saturday,22,2,2020,1
511,Kazakhstan,Europe,1482094,7893.258,1232,6.561,181,19047,101.440,4,...,9.52,0,0,0,5,Saturday,22,2,2020,1
512,Bulgaria,Europe,1250250,17985.373,3585,51.572,544,37663,541.798,31,...,5.36,1,0,0,5,Saturday,22,2,2020,1
513,Senegal,Africa,88230,526.937,63,0.376,0,1968,11.754,0,...,0.00,0,0,0,5,Saturday,22,2,2020,1
514,Portugal,Europe,5451460,52947.826,16778,162.958,3085,24944,242.271,42,...,5.95,1,0,0,5,Saturday,22,2,2020,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93763,Kuwait,Eastern Mediterranean,657745,15401.805,0,0.000,0,2563,60.015,0,...,19.64,1,3,5,6,Sunday,11,9,2022,1
93764,Uruguay,Americas,982846,28293.679,1660,47.787,0,7462,214.812,7,...,32.14,2,3,5,6,Sunday,11,9,2022,1
93765,Senegal,Africa,88230,526.937,63,0.376,0,1968,11.754,0,...,46.19,2,2,4,6,Sunday,11,9,2022,1
93766,Philippines,Western Pacific,3911487,3569.491,15009,13.697,1709,62416,56.959,249,...,50.43,2,3,5,6,Sunday,11,9,2022,1


### Adding death ratio per country


> Death Ratio = total number of deaths / total number of cases

In [47]:
DeathRatioDF = (merged_df.groupby('Country')['Deaths - cumulative total'].max()/merged_df.groupby('Country')['Cases - cumulative total'].max())*100
DeathRatioDF = DeathRatioDF.to_frame()
DeathRatioDF.reset_index(inplace=True)
DeathRatioDF = DeathRatioDF.rename(columns={0:'Death Ratio'})

merged_df = pd.merge(merged_df, DeathRatioDF, on=['Country'])
merged_df

Unnamed: 0,Country,Name,WHO Region,Cases - cumulative total,Cases - cumulative total per 100000 population,Cases - newly reported in last 7 days,Cases - newly reported in last 7 days per 100000 population,Cases - newly reported in last 24 hours,Deaths - cumulative total,Deaths - cumulative total per 100000 population,...,public_information_campaigns,vaccine_availability,vaccination_policy,Day of the week,Day of the week string,Day,Month,Year,Weekend,Death Ratio
0,India,South-East Asia,44516479,3225.822,37843,2.742,6422,528250,38.279,160,...,1,0,0,0,Monday,17,2,2020,0,1.186643
1,India,South-East Asia,44516479,3225.822,37843,2.742,6422,528250,38.279,160,...,1,0,0,1,Tuesday,18,2,2020,0,1.186643
2,India,South-East Asia,44516479,3225.822,37843,2.742,6422,528250,38.279,160,...,1,0,0,2,Wednesday,19,2,2020,0,1.186643
3,India,South-East Asia,44516479,3225.822,37843,2.742,6422,528250,38.279,160,...,1,0,0,3,Thursday,20,2,2020,0,1.186643
4,India,South-East Asia,44516479,3225.822,37843,2.742,6422,528250,38.279,160,...,1,0,0,4,Friday,21,2,2020,0,1.186643
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93763,Canada,Americas,4197701,11122.049,18365,48.659,0,44347,117.500,237,...,2,3,5,2,Wednesday,7,9,2022,0,1.056460
93764,Canada,Americas,4197701,11122.049,18365,48.659,0,44347,117.500,237,...,2,3,5,3,Thursday,8,9,2022,0,1.056460
93765,Canada,Americas,4197701,11122.049,18365,48.659,0,44347,117.500,237,...,2,3,5,4,Friday,9,9,2022,0,1.056460
93766,Canada,Americas,4197701,11122.049,18365,48.659,0,44347,117.500,237,...,2,3,5,5,Saturday,10,9,2022,1,1.056460


### Adding continents and droping the name column

In [48]:
continents = []
countries = merged_df['Country'].unique()

for country in countries:
  country_code = pc.country_name_to_country_alpha2(country, cn_name_format="default")
  continent_name = pc.country_alpha2_to_continent_code(country_code)
  continents.append(continent_name)

continentsDic = {
    'NA': 'North America',
    'SA': 'South America', 
    'AS': 'Asia',
    'OC': 'Australia',
    'AF': 'Africa',
    'EU': 'Europe'
}

ContinentDF = pd.DataFrame(list(zip(countries,continents)), columns=['Country','Continent'])

for i in range(len(ContinentDF['Continent'])):
  ContinentDF['Continent'][i] = continentsDic[ContinentDF['Continent'][i]]
  

ContinentDF

Unnamed: 0,Country,Continent
0,India,Asia
1,Kyrgyzstan,Asia
2,Mozambique,Africa
3,Zimbabwe,Africa
4,Australia,Australia
...,...,...
97,Norway,Europe
98,Greece,Europe
99,Guatemala,North America
100,Yemen,Asia


In [49]:
merged_df = pd.merge(merged_df, ContinentDF, on=['Country'])
merged_df.drop('Name', axis=1,inplace=True)
merged_df

Unnamed: 0,Country,WHO Region,Cases - cumulative total,Cases - cumulative total per 100000 population,Cases - newly reported in last 7 days,Cases - newly reported in last 7 days per 100000 population,Cases - newly reported in last 24 hours,Deaths - cumulative total,Deaths - cumulative total per 100000 population,Deaths - newly reported in last 7 days,...,vaccine_availability,vaccination_policy,Day of the week,Day of the week string,Day,Month,Year,Weekend,Death Ratio,Continent
0,India,44516479,3225.822,37843,2.742,6422,528250,38.279,160,0.012,...,0,0,0,Monday,17,2,2020,0,1.186643,Asia
1,India,44516479,3225.822,37843,2.742,6422,528250,38.279,160,0.012,...,0,0,1,Tuesday,18,2,2020,0,1.186643,Asia
2,India,44516479,3225.822,37843,2.742,6422,528250,38.279,160,0.012,...,0,0,2,Wednesday,19,2,2020,0,1.186643,Asia
3,India,44516479,3225.822,37843,2.742,6422,528250,38.279,160,0.012,...,0,0,3,Thursday,20,2,2020,0,1.186643,Asia
4,India,44516479,3225.822,37843,2.742,6422,528250,38.279,160,0.012,...,0,0,4,Friday,21,2,2020,0,1.186643,Asia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93763,Canada,4197701,11122.049,18365,48.659,0,44347,117.500,237,0.628,...,3,5,2,Wednesday,7,9,2022,0,1.056460,North America
93764,Canada,4197701,11122.049,18365,48.659,0,44347,117.500,237,0.628,...,3,5,3,Thursday,8,9,2022,0,1.056460,North America
93765,Canada,4197701,11122.049,18365,48.659,0,44347,117.500,237,0.628,...,3,5,4,Friday,9,9,2022,0,1.056460,North America
93766,Canada,4197701,11122.049,18365,48.659,0,44347,117.500,237,0.628,...,3,5,5,Saturday,10,9,2022,1,1.056460,North America


### Added Longutude and Latitude

In [50]:
geolocator = Nominatim()
def geolocate(country):
    try:
        # Geolocate the center of the country
        loc = geolocator.geocode(country)
        # And return latitude and longitude
        return (loc.latitude, loc.longitude)
    except:
        # Return missing value
        return np.nan

countries = merged_df['Country'].unique()
lat=[]
lng=[]
for country in countries:
  lat.append(geolocate(country)[0])
  lng.append(geolocate(country)[1])

CountryDF = pd.DataFrame(list(zip(countries,lat,lng)), columns=['Country','latitude','longitude'])
CountryDF

Unnamed: 0,Country,latitude,longitude
0,India,22.351115,78.667743
1,Kyrgyzstan,41.508932,74.724091
2,Mozambique,-19.302233,34.914498
3,Zimbabwe,-18.455496,29.746841
4,Australia,-24.776109,134.755000
...,...,...,...
97,Norway,60.500021,9.099972
98,Greece,38.995368,21.987713
99,Guatemala,15.585555,-90.345759
100,Yemen,16.347124,47.891527


In [51]:
merged_df = pd.merge(merged_df, CountryDF, on=['Country'])
print('With',merged_df.isna().sum().sum(),'Nulls')
merged_df

With 5241 Nulls


Unnamed: 0,Country,WHO Region,Cases - cumulative total,Cases - cumulative total per 100000 population,Cases - newly reported in last 7 days,Cases - newly reported in last 7 days per 100000 population,Cases - newly reported in last 24 hours,Deaths - cumulative total,Deaths - cumulative total per 100000 population,Deaths - newly reported in last 7 days,...,Day of the week,Day of the week string,Day,Month,Year,Weekend,Death Ratio,Continent,latitude,longitude
0,India,44516479,3225.822,37843,2.742,6422,528250,38.279,160,0.012,...,0,Monday,17,2,2020,0,1.186643,Asia,22.351115,78.667743
1,India,44516479,3225.822,37843,2.742,6422,528250,38.279,160,0.012,...,1,Tuesday,18,2,2020,0,1.186643,Asia,22.351115,78.667743
2,India,44516479,3225.822,37843,2.742,6422,528250,38.279,160,0.012,...,2,Wednesday,19,2,2020,0,1.186643,Asia,22.351115,78.667743
3,India,44516479,3225.822,37843,2.742,6422,528250,38.279,160,0.012,...,3,Thursday,20,2,2020,0,1.186643,Asia,22.351115,78.667743
4,India,44516479,3225.822,37843,2.742,6422,528250,38.279,160,0.012,...,4,Friday,21,2,2020,0,1.186643,Asia,22.351115,78.667743
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93763,Canada,4197701,11122.049,18365,48.659,0,44347,117.500,237,0.628,...,2,Wednesday,7,9,2022,0,1.056460,North America,61.066692,-107.991707
93764,Canada,4197701,11122.049,18365,48.659,0,44347,117.500,237,0.628,...,3,Thursday,8,9,2022,0,1.056460,North America,61.066692,-107.991707
93765,Canada,4197701,11122.049,18365,48.659,0,44347,117.500,237,0.628,...,4,Friday,9,9,2022,0,1.056460,North America,61.066692,-107.991707
93766,Canada,4197701,11122.049,18365,48.659,0,44347,117.500,237,0.628,...,5,Saturday,10,9,2022,1,1.056460,North America,61.066692,-107.991707


In [52]:
merged_df.dropna(inplace=True)
merged_df.reset_index(inplace=True)
merged_df.drop('index',axis=1,inplace=True)
merged_df

Unnamed: 0,Country,WHO Region,Cases - cumulative total,Cases - cumulative total per 100000 population,Cases - newly reported in last 7 days,Cases - newly reported in last 7 days per 100000 population,Cases - newly reported in last 24 hours,Deaths - cumulative total,Deaths - cumulative total per 100000 population,Deaths - newly reported in last 7 days,...,Day of the week,Day of the week string,Day,Month,Year,Weekend,Death Ratio,Continent,latitude,longitude
0,India,44516479,3225.822,37843,2.742,6422,528250,38.279,160,0.012,...,0,Monday,17,2,2020,0,1.186643,Asia,22.351115,78.667743
1,India,44516479,3225.822,37843,2.742,6422,528250,38.279,160,0.012,...,1,Tuesday,18,2,2020,0,1.186643,Asia,22.351115,78.667743
2,India,44516479,3225.822,37843,2.742,6422,528250,38.279,160,0.012,...,2,Wednesday,19,2,2020,0,1.186643,Asia,22.351115,78.667743
3,India,44516479,3225.822,37843,2.742,6422,528250,38.279,160,0.012,...,3,Thursday,20,2,2020,0,1.186643,Asia,22.351115,78.667743
4,India,44516479,3225.822,37843,2.742,6422,528250,38.279,160,0.012,...,4,Friday,21,2,2020,0,1.186643,Asia,22.351115,78.667743
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91479,Canada,4197701,11122.049,18365,48.659,0,44347,117.500,237,0.628,...,2,Wednesday,7,9,2022,0,1.056460,North America,61.066692,-107.991707
91480,Canada,4197701,11122.049,18365,48.659,0,44347,117.500,237,0.628,...,3,Thursday,8,9,2022,0,1.056460,North America,61.066692,-107.991707
91481,Canada,4197701,11122.049,18365,48.659,0,44347,117.500,237,0.628,...,4,Friday,9,9,2022,0,1.056460,North America,61.066692,-107.991707
91482,Canada,4197701,11122.049,18365,48.659,0,44347,117.500,237,0.628,...,5,Saturday,10,9,2022,1,1.056460,North America,61.066692,-107.991707


### Adding population, area, and density of each country.

In [53]:
PopulationDF = pd.read_csv('/content/drive/MyDrive/COVID 19 Data/World_Population_Review/population-area-density.csv')
PopulationDF.rename(columns={'name':'Country'}, inplace=True)
PopulationDF

Unnamed: 0,Rank,Country,pop2022,pop2021,GrowthRate,area,Density
0,1,China,1425887.337,1425893.465,1.0000,9706961,146.8933
1,2,India,1417173.173,1407563.842,1.0068,3287590,431.0675
2,3,United States,338289.857,336997.624,1.0038,9372610,36.0935
3,4,Indonesia,275501.339,273753.191,1.0064,1904569,144.6529
4,5,Pakistan,235824.862,231402.117,1.0191,881912,267.4018
...,...,...,...,...,...,...,...
205,206,San Marino,33.660,33.745,0.9975,61,551.8033
206,207,Palau,18.055,18.024,1.0017,459,39.3355
207,208,Nauru,12.668,12.511,1.0125,21,603.2381
208,209,Tuvalu,11.312,11.204,1.0096,26,435.0769


In [54]:
merged_df = pd.merge(merged_df, PopulationDF, on=['Country'])
merged_df

Unnamed: 0,Country,WHO Region,Cases - cumulative total,Cases - cumulative total per 100000 population,Cases - newly reported in last 7 days,Cases - newly reported in last 7 days per 100000 population,Cases - newly reported in last 24 hours,Deaths - cumulative total,Deaths - cumulative total per 100000 population,Deaths - newly reported in last 7 days,...,Death Ratio,Continent,latitude,longitude,Rank,pop2022,pop2021,GrowthRate,area,Density
0,India,44516479,3225.822,37843,2.742,6422,528250,38.279,160,0.012,...,1.186643,Asia,22.351115,78.667743,2,1417173.173,1407563.842,1.0068,3287590,431.0675
1,India,44516479,3225.822,37843,2.742,6422,528250,38.279,160,0.012,...,1.186643,Asia,22.351115,78.667743,2,1417173.173,1407563.842,1.0068,3287590,431.0675
2,India,44516479,3225.822,37843,2.742,6422,528250,38.279,160,0.012,...,1.186643,Asia,22.351115,78.667743,2,1417173.173,1407563.842,1.0068,3287590,431.0675
3,India,44516479,3225.822,37843,2.742,6422,528250,38.279,160,0.012,...,1.186643,Asia,22.351115,78.667743,2,1417173.173,1407563.842,1.0068,3287590,431.0675
4,India,44516479,3225.822,37843,2.742,6422,528250,38.279,160,0.012,...,1.186643,Asia,22.351115,78.667743,2,1417173.173,1407563.842,1.0068,3287590,431.0675
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90602,Canada,4197701,11122.049,18365,48.659,0,44347,117.500,237,0.628,...,1.056460,North America,61.066692,-107.991707,40,38454.327,38155.012,1.0078,9984670,3.8513
90603,Canada,4197701,11122.049,18365,48.659,0,44347,117.500,237,0.628,...,1.056460,North America,61.066692,-107.991707,40,38454.327,38155.012,1.0078,9984670,3.8513
90604,Canada,4197701,11122.049,18365,48.659,0,44347,117.500,237,0.628,...,1.056460,North America,61.066692,-107.991707,40,38454.327,38155.012,1.0078,9984670,3.8513
90605,Canada,4197701,11122.049,18365,48.659,0,44347,117.500,237,0.628,...,1.056460,North America,61.066692,-107.991707,40,38454.327,38155.012,1.0078,9984670,3.8513


In [55]:
print('Number of rows =', len(merged_df))
print('Number of columns =', len(merged_df.columns))
print('Number of countries =', len(merged_df['Country'].unique()))
print('Number of rows for each country =',len(merged_df)/len(merged_df['Country'].unique()))
print('With',merged_df.isna().sum().sum(),'Nulls')
print('Number of missing dates in the range between the min and max Dates =',
      len(pd.date_range(start=min(merged_df['Date']), end=max(merged_df['Date'])).difference(merged_df.Date)), '(No gaps in the date column)')

Number of rows = 90607
Number of columns = 70
Number of countries = 99
Number of rows for each country = 915.2222222222222
With 0 Nulls
Number of missing dates in the range between the min and max Dates = 0 (No gaps in the date column)


### Adding
*   Life expectancy
*   Sub-region
*   Income group
*   Rural percentage
*   Temperature
*   Time zone
*   Poverty ratio
*   Health care index
*   Pollution index

**Important: Some column names and formating were changed using Excel before making the csv! Make sure to change the column names in Excel after downloading the data from the sources we've listed. If you struggle to obtain the data just email us or contact us on LinkedIn and we will send all the data.**

In [56]:
life = pd.read_csv('/content/drive/MyDrive/COVID 19 Data/Worldometers/life_expectancy.csv')
sub_region = pd.read_csv('/content/drive/MyDrive/COVID 19 Data/GitHub/sub-region.csv')
income = pd.read_csv('/content/drive/MyDrive/COVID 19 Data/The_World_Bank/income.csv')
rural = pd.read_csv('/content/drive/MyDrive/COVID 19 Data/The_World_Bank/rural-percentage.csv')
temp = pd.read_csv('/content/drive/MyDrive/COVID 19 Data/listfist/temperature.csv')


time_zone = pd.read_excel('/content/drive/MyDrive/COVID 19 Data/Wikipedia/time-zone.xlsx')
time_zone.dropna(inplace=True)
time_zone.reset_index(inplace=True)
time_zone.drop('index', axis=1, inplace=True)
time_zone = time_zone.astype({"No. of time zones": int})
time_zone['country'] = time_zone['country'].str.strip()

poverty = pd.read_csv('/content/drive/MyDrive/COVID 19 Data/World_Population_Review/poverty.csv')
poverty.rename(columns={'percPoverty':'poverty percentage '}, inplace=True)

health = pd.read_csv('/content/drive/MyDrive/COVID 19 Data/World_Population_Review/health-care.csv')

**column names after changing them using Excel**

In [57]:
print('life expectancy columns: ',list(life.columns))
print('sub_region columns: ',list(sub_region.columns))
print('income columns: ',list(income.columns))
print('rural columns: ',list(rural.columns))
print('temperature columns: ',list(temp.columns))
print('time_zone columns: ',list(time_zone.columns))
print('poverty columns: ',list(poverty.columns))
print('health care columns: ',list(health.columns))

life expectancy columns:  ['country', 'Life Expectancy(both sexes) ', 'Life Expectancy(Females)', 'Life Expectancy(Males)']
sub_region columns:  ['country', 'sub-region']
income columns:  ['country', 'Income group']
rural columns:  ['country', 'country code', 'rural percentage(2021)']
temperature columns:  ['country', 'Average Temperature(1991-2020) C', 'Coldest Month(1991-2020) C', 'Hottest Month(1991-2020) C', 'Variation(1991-2020) C']
time_zone columns:  ['country', 'No. of time zones', 'Time zone']
poverty columns:  ['country', 'poverty percentage ']
health care columns:  ['country', 'legatumRank2020', 'legatumRank2019', 'ceoworldRank']


In [58]:
rural.drop('country code', axis=1, inplace=True)
merged_df.rename(columns={'Country':'country'}, inplace=True)

In [59]:
merged_df = pd.merge(merged_df, life, on=['country'])
merged_df = pd.merge(merged_df, sub_region, on=['country'])
merged_df = pd.merge(merged_df, income, on=['country'])
merged_df = pd.merge(merged_df, rural, on=['country'])
merged_df = pd.merge(merged_df, temp, on=['country'])
merged_df = pd.merge(merged_df, time_zone, on=['country'])
merged_df = pd.merge(merged_df, poverty, on=['country'])
merged_df = pd.merge(merged_df, health, on=['country'])

In [60]:
print('Number of rows =', len(merged_df))
print('Number of columns =', len(merged_df.columns))
print('Number of countries =', len(merged_df['country'].unique()))
print('Number of rows for each country =',len(merged_df)/len(merged_df['country'].unique()))
print('With',merged_df.isna().sum().sum(),'Nulls')
print('Number of missing dates in the range between the min and max Dates =',
      len(pd.date_range(start=min(merged_df['Date']), end=max(merged_df['Date'])).difference(merged_df.Date)), '(No gaps in the date column)')

Number of rows = 76849
Number of columns = 86
Number of countries = 84
Number of rows for each country = 914.8690476190476
With 26363 Nulls
Number of missing dates in the range between the min and max Dates = 0 (No gaps in the date column)


# Cleaning


*   Renamed some Countries 
*   Droppped a country with null data "Turkmenistan"
*   Dropped some redundant features ( "country code, ”   who_region.1", “who region”, "rank", "data_source") 
*   Dropped missing values <1k
*   Dropped “ceoworldRank” feature because it has 24k missing values around 32% of the entire dataset
*   Converted all feature names into lower case 








In [61]:
merged_df['country'] = merged_df['country'].replace('Israel', 'Palestine')

In [62]:
merged_df = merged_df[merged_df.country != 'Turkmenistan']
merged_df = merged_df.sort_values(by='Date')

In [63]:
for column in merged_df:
  merged_df.rename(columns={column:column.lower()}, inplace=True)
merged_df

Unnamed: 0,country,who region,cases - cumulative total,cases - cumulative total per 100000 population,cases - newly reported in last 7 days,cases - newly reported in last 7 days per 100000 population,cases - newly reported in last 24 hours,deaths - cumulative total,deaths - cumulative total per 100000 population,deaths - newly reported in last 7 days,...,average temperature(1991-2020) c,coldest month(1991-2020) c,hottest month(1991-2020) c,variation(1991-2020) c,no. of time zones,time zone,poverty percentage,legatumrank2020,legatumrank2019,ceoworldrank
0,India,44516479,3225.822,37843,2.742,6422,528250,38.279,160,0.012,...,24.68,16.3,31.1,14.8,1,UTC+05:30 (IST),21.9,101,98,19.0
58398,Mongolia,981618,29942.989,981,29.924,0,2130,64.973,1,0.031,...,0.67,-25.7,20.7,46.4,2,"UTC+07:00 — the provinces of Khovd, Uvs and Ba...",28.4,85,88,
57460,Colombia,6304317,12389.856,1508,2.964,0,141708,278.498,62,0.122,...,24.79,23.5,26.6,3.1,1,UTC−05:00,35.7,74,71,35.0
56522,Latvia,912002,47806.990,7019,367.935,1419,5965,312.684,9,0.472,...,6.86,-10.9,21.0,31.9,1,UTC+02:00 (EET),22.9,32,32,48.0
55615,Kazakhstan,1482094,7893.258,1232,6.561,181,19047,101.440,4,0.021,...,6.91,-18.0,25.5,43.5,2,"UTC+05:00 — western Kazakhstan (Aktobe, Atyrau...",4.3,62,66,82.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63023,Poland,6233117,16421.029,30681,80.829,5858,117316,309.067,97,0.256,...,8.85,-6.7,21.3,28.0,1,UTC+01:00 (CET),15.4,36,36,51.0
6425,Luxembourg,288527,46082.625,815,130.169,198,1126,179.841,2,0.319,...,10.02,-2.7,22.1,24.8,1,UTC+01:00 (CET),17.5,9,9,
63961,Nepal,999121,3429.068,689,2.365,75,12015,41.237,5,0.017,...,13.21,2.7,20.8,18.1,1,UTC+05:45 (Nepal Time),25.2,114,114,61.0
20186,Togo,38678,467.198,97,1.172,8,284,3.430,0,0.000,...,27.38,24.4,30.9,6.5,1,UTC±00:00,55.1,140,140,


In [64]:
merged_df.ceoworldrank.isna().sum()

24531

In [65]:
merged_df.drop(['country code','rank', 'data_source','vaccine_availability','ceoworldrank','who region'],axis=1,inplace=True)

In [66]:
print('Missing Vlaues: ',merged_df.isna().sum().sum())
merged_df.dropna(inplace=True)

Missing Vlaues:  1832


In [67]:
print('Number of rows =', len(merged_df))
print('Number of columns =', len(merged_df.columns))
print('Number of countries =', len(merged_df['country'].unique()))
print('Number of rows for each country =',len(merged_df)/len(merged_df['country'].unique()))
print('With',merged_df.isna().sum().sum(),'Nulls')
print('Number of missing dates in the range between the min and max Dates =',
      len(pd.date_range(start=min(merged_df['date']), end=max(merged_df['date'])).difference(merged_df.date)), '(No gaps in the date column)')

Number of rows = 75017
Number of columns = 80
Number of countries = 82
Number of rows for each country = 914.8414634146342
With 0 Nulls
Number of missing dates in the range between the min and max Dates = 0 (No gaps in the date column)


# Saving the DataFrame as csv.

In [68]:
merged_df = merged_df.sort_values(by='date')
merged_df.to_csv('COVID19_Data.csv')

In [69]:
merged_df.isnull().sum().sum()

0

In [70]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 75017 entries, 0 to 76848
Data columns (total 80 columns):
 #   Column                                                        Non-Null Count  Dtype         
---  ------                                                        --------------  -----         
 0   country                                                       75017 non-null  object        
 1   cases - cumulative total                                      75017 non-null  float64       
 2   cases - cumulative total per 100000 population                75017 non-null  int64         
 3   cases - newly reported in last 7 days                         75017 non-null  float64       
 4   cases - newly reported in last 7 days per 100000 population   75017 non-null  int64         
 5   cases - newly reported in last 24 hours                       75017 non-null  int64         
 6   deaths - cumulative total                                     75017 non-null  float64       
 7   deat

In [71]:
print('Number of rows =', len(merged_df))
print('Number of columns =', len(merged_df.columns))
print('Number of countries =', len(merged_df['country'].unique()))
print('Number of rows for each country =',len(merged_df)/len(merged_df['country'].unique()))
print('With',merged_df.isna().sum().sum(),'Nulls')
print('Number of missing dates in the range between the min and max Dates =',
      len(pd.date_range(start=min(merged_df['date']), end=max(merged_df['date'])).difference(merged_df.date)), '(No gaps in the date column)')

Number of rows = 75017
Number of columns = 80
Number of countries = 82
Number of rows for each country = 914.8414634146342
With 0 Nulls
Number of missing dates in the range between the min and max Dates = 0 (No gaps in the date column)
