# Анализ данных по COVID-19

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# загрузка данных
data = pd.read_csv("covid_19_data.csv")
data.head()

Unnamed: 0,SNo,ObservationDate,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,1,01/22/2020,Anhui,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
1,2,01/22/2020,Beijing,Mainland China,1/22/2020 17:00,14.0,0.0,0.0
2,3,01/22/2020,Chongqing,Mainland China,1/22/2020 17:00,6.0,0.0,0.0
3,4,01/22/2020,Fujian,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
4,5,01/22/2020,Gansu,Mainland China,1/22/2020 17:00,0.0,0.0,0.0


In [4]:
data.shape

(236017, 8)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 236017 entries, 0 to 236016
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   SNo              236017 non-null  int64  
 1   ObservationDate  236017 non-null  object 
 2   Province/State   173972 non-null  object 
 3   Country/Region   236017 non-null  object 
 4   Last Update      236017 non-null  object 
 5   Confirmed        236017 non-null  float64
 6   Deaths           236017 non-null  float64
 7   Recovered        236017 non-null  float64
dtypes: float64(3), int64(1), object(4)
memory usage: 14.4+ MB


In [6]:
# проверка названий индексов столбцов
data.columns

Index(['SNo', 'ObservationDate', 'Province/State', 'Country/Region',
       'Last Update', 'Confirmed', 'Deaths', 'Recovered'],
      dtype='object')

In [7]:
# удаление ненужных столбцов по ключам, индексам
data_2 = data.drop(["SNo", "Last Update"], axis=1)
data_2.head()

Unnamed: 0,ObservationDate,Province/State,Country/Region,Confirmed,Deaths,Recovered
0,01/22/2020,Anhui,Mainland China,1.0,0.0,0.0
1,01/22/2020,Beijing,Mainland China,14.0,0.0,0.0
2,01/22/2020,Chongqing,Mainland China,6.0,0.0,0.0
3,01/22/2020,Fujian,Mainland China,1.0,0.0,0.0
4,01/22/2020,Gansu,Mainland China,0.0,0.0,0.0


In [9]:
# проверка на дубликаты
data_2.duplicated(['ObservationDate', 'Province/State', 'Country/Region']).sum()

5

In [11]:
# удаление дубликатов
data_3 = data_2.drop_duplicates(['ObservationDate', 'Province/State', 'Country/Region'])

In [12]:
# повторная проверка на дубликаты
data_3.duplicated(['ObservationDate', 'Province/State', 'Country/Region']).sum()

0

In [21]:
# изучаем страны (стобец с индексом 'Country/Region')
countries = data_3['Country/Region'].str.strip(" (,)'").unique()

for c in np.sort(countries):
    print(c)

Afghanistan
Albania
Algeria
Andorra
Angola
Antigua and Barbuda
Argentina
Armenia
Aruba
Australia
Austria
Azerbaijan
Bahamas
Bahamas, The
Bahrain
Bangladesh
Barbados
Belarus
Belgium
Belize
Benin
Bhutan
Bolivia
Bosnia and Herzegovina
Botswana
Brazil
Brunei
Bulgaria
Burkina Faso
Burma
Burundi
Cabo Verde
Cambodia
Cameroon
Canada
Cape Verde
Cayman Islands
Central African Republic
Chad
Channel Islands
Chile
Colombia
Comoros
Congo (Brazzaville
Congo (Kinshasa
Costa Rica
Croatia
Cuba
Curacao
Cyprus
Czech Republic
Denmark
Diamond Princess
Djibouti
Dominica
Dominican Republic
East Timor
Ecuador
Egypt
El Salvador
Equatorial Guinea
Eritrea
Estonia
Eswatini
Ethiopia
Faroe Islands
Fiji
Finland
France
French Guiana
Gabon
Gambia
Gambia, The
Georgia
Germany
Ghana
Gibraltar
Greece
Greenland
Grenada
Guadeloupe
Guam
Guatemala
Guernsey
Guinea
Guinea-Bissau
Guyana
Haiti
Holy See
Honduras
Hong Kong
Hungary
Iceland
India
Indonesia
Iran
Iraq
Ireland
Israel
Italy
Ivory Coast
Jamaica
Japan
Jersey
Jordan
Kazakhst

In [22]:
# смотрим Others
data_3.loc[data_3['Country/Region'] == "Others", :]

Unnamed: 0,ObservationDate,Province/State,Country/Region,Confirmed,Deaths,Recovered
933,02/07/2020,Cruise Ship,Others,61.0,0.0,0.0
1005,02/08/2020,Cruise Ship,Others,61.0,0.0,0.0
1077,02/09/2020,Diamond Princess cruise ship,Others,64.0,0.0,0.0
1143,02/10/2020,Diamond Princess cruise ship,Others,135.0,0.0,0.0
1215,02/11/2020,Diamond Princess cruise ship,Others,135.0,0.0,0.0
1286,02/12/2020,Diamond Princess cruise ship,Others,175.0,0.0,0.0
1359,02/13/2020,Diamond Princess cruise ship,Others,175.0,0.0,0.0
1433,02/14/2020,Diamond Princess cruise ship,Others,218.0,0.0,0.0
1506,02/15/2020,Diamond Princess cruise ship,Others,285.0,0.0,0.0
1578,02/16/2020,Diamond Princess cruise ship,Others,355.0,0.0,0.0


In [23]:
# проверяем даты
data_3["ObservationDate"].head()

0    01/22/2020
1    01/22/2020
2    01/22/2020
3    01/22/2020
4    01/22/2020
Name: ObservationDate, dtype: object

In [25]:
# конвертация временных данных в спец тип данных класса TimeStamp
data_3["ObservationDate"] = pd.to_datetime(data_3["ObservationDate"])
# метод apply принимает объект функции, который нужно применить ко всем методам
data_3["ObservationDate"] = data_3["ObservationDate"].apply(lambda el: el.date())
data_3["ObservationDate"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_3["ObservationDate"] = pd.to_datetime(data_3["ObservationDate"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_3["ObservationDate"] = data_3["ObservationDate"].apply(lambda el: el.date())


0         2020-01-22
1         2020-01-22
2         2020-01-22
3         2020-01-22
4         2020-01-22
             ...    
236012    2021-02-27
236013    2021-02-27
236014    2021-02-27
236015    2021-02-27
236016    2021-02-27
Name: ObservationDate, Length: 236012, dtype: object

In [26]:
# группировка данных по дате и вывод суммарных значений по всему миру
sum_data = data_3.groupby(["ObservationDate"]).sum()
sum_data

Unnamed: 0_level_0,Confirmed,Deaths,Recovered
ObservationDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-01-22,557.0,17.0,30.0
2020-01-23,653.0,17.0,32.0
2020-01-24,941.0,26.0,39.0
2020-01-25,1437.0,42.0,42.0
2020-01-26,2118.0,56.0,56.0
...,...,...,...
2021-02-23,112109754.0,2485434.0,63295144.0
2021-02-24,112554301.0,2497488.0,63504603.0
2021-02-25,113001412.0,2507624.0,63732295.0
2021-02-26,113415604.0,2517422.0,63992563.0
