In [1]:
import pandas as pd

In [3]:
covid_data = pd.read_csv('data/covid_data.csv', sep=',', decimal='.')
covid_data.head()

Unnamed: 0,date,province/state,country,confirmed,deaths,recovered
0,01/22/2020,Anhui,China,1.0,0.0,0.0
1,01/22/2020,Beijing,China,14.0,0.0,0.0
2,01/22/2020,Chongqing,China,6.0,0.0,0.0
3,01/22/2020,Fujian,China,1.0,0.0,0.0
4,01/22/2020,Gansu,China,0.0,0.0,0.0


In [7]:
vaccinations_data = pd.read_csv('data/country_vaccinations.csv', sep=',', decimal='.')
vaccinations_data = vaccinations_data[
    ['country', 'date', 'total_vaccinations', 
     'people_vaccinated', 'people_vaccinated_per_hundred',
     'people_fully_vaccinated', 'people_fully_vaccinated_per_hundred',
     'daily_vaccinations', 'vaccines']
]
vaccinations_data.head()

Unnamed: 0,country,date,total_vaccinations,people_vaccinated,people_vaccinated_per_hundred,people_fully_vaccinated,people_fully_vaccinated_per_hundred,daily_vaccinations,vaccines
0,Afghanistan,2021-02-22,0.0,0.0,0.0,,,,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi..."
1,Afghanistan,2021-02-23,,,,,,1367.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi..."
2,Afghanistan,2021-02-24,,,,,,1367.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi..."
3,Afghanistan,2021-02-25,,,,,,1367.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi..."
4,Afghanistan,2021-02-26,,,,,,1367.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi..."


In [11]:
#Группируем таблицу по дате и названию страны и рассчитываем суммарные показатели по всем регионам. 
# Тем самым переходим от данных по регионам к данным по странам:

covid_data.groupby(['date', 'country'], as_index=False)[['confirmed', 'deaths', 'recovered']].sum()

Unnamed: 0,date,country,confirmed,deaths,recovered
0,01/01/2021,Afghanistan,51526.0,2191.0,41727.0
1,01/01/2021,Albania,58316.0,1181.0,33634.0
2,01/01/2021,Algeria,99897.0,2762.0,67395.0
3,01/01/2021,Andorra,8117.0,84.0,7463.0
4,01/01/2021,Angola,17568.0,405.0,11146.0
...,...,...,...,...,...
86780,12/31/2020,Vietnam,1465.0,35.0,1325.0
86781,12/31/2020,West Bank and Gaza,138004.0,1400.0,117183.0
86782,12/31/2020,Yemen,2099.0,610.0,1394.0
86783,12/31/2020,Zambia,20725.0,388.0,18660.0


In [14]:
# Преобразуем даты в формат datetime с помощью функции pd.to_datetime():

covid_data['date'] = pd.to_datetime(covid_data['date'], dayfirst=True)
covid_data['date'].head()

0   2020-01-22
1   2020-01-22
2   2020-01-22
3   2020-01-22
4   2020-01-22
Name: date, dtype: datetime64[ns]

In [16]:
# Создадим признак больных на данный момент (active). 
# Для этого вычтем из общего числа зафиксированных случаев число смертей и число выздоровевших пациентов

covid_data['active'] = covid_data['confirmed'] - covid_data['deaths'] - covid_data['recovered']
covid_data.head()

Unnamed: 0,date,province/state,country,confirmed,deaths,recovered,active
0,2020-01-22,Anhui,China,1.0,0.0,0.0,1.0
1,2020-01-22,Beijing,China,14.0,0.0,0.0,14.0
2,2020-01-22,Chongqing,China,6.0,0.0,0.0,6.0
3,2020-01-22,Fujian,China,1.0,0.0,0.0,1.0
4,2020-01-22,Gansu,China,0.0,0.0,0.0,0.0


In [24]:
# Создадим признак ежедневного прироста числа заболевших, умерших и выздоровевших людей. 
# Для этого отсортируем данные по названиям стран, а затем по датам. 
# После этого произведём группировку по странам и рассчитаем разницу между «вчера и сегодня» с помощью метода diff():

covid_data.sort_values(by=['country', 'date'])
covid_data['daily_confirmed'] = covid_data.groupby('country')['confirmed'].diff()
covid_data['daily_deaths'] = covid_data.groupby('country')['deaths'].diff()
covid_data['daily_recovered'] = covid_data.groupby('country')['recovered'].diff()

In [25]:
covid_data.head()

Unnamed: 0,date,province/state,country,confirmed,deaths,recovered,active,daily_confirmed,daily_deaths,daily_recovered
0,2020-01-22,Anhui,China,1.0,0.0,0.0,1.0,,,
1,2020-01-22,Beijing,China,14.0,0.0,0.0,14.0,13.0,0.0,0.0
2,2020-01-22,Chongqing,China,6.0,0.0,0.0,6.0,-8.0,0.0,0.0
3,2020-01-22,Fujian,China,1.0,0.0,0.0,1.0,-5.0,0.0,0.0
4,2020-01-22,Gansu,China,0.0,0.0,0.0,0.0,-1.0,0.0,0.0


In [26]:
# 3.1

covid_data['date'].describe()
# 2020-01-02-2021-12-05

  covid_data['date'].describe()


count                  306429
unique                    494
top       2021-05-29 00:00:00
freq                      765
first     2020-01-02 00:00:00
last      2021-12-05 00:00:00
Name: date, dtype: object

In [32]:
# 3.2

vaccinations_data['date'] = pd.to_datetime(vaccinations_data['date'], dayfirst=True)
vaccinations_data = vaccinations_data.sort_values(by='date')
vaccinations_data['date'].describe()

# 2020-12-02-2021-09-06

  vaccinations_data['date'].describe()


count                   42795
unique                    279
top       2021-06-22 00:00:00
freq                      216
first     2020-12-02 00:00:00
last      2021-09-06 00:00:00
Name: date, dtype: object