In [25]:
import numpy as np
import pandas as pd

import scipy.stats as sts
import matplotlib.pyplot as plt
import seaborn as sns

# Загрузка датасета №1 и его предобработка

In [26]:
df = pd.read_excel('datasets/coronavirus_Mar31.xlsx')
df.set_index("EVENT_ID_CNTY", inplace = True)
df['EVENT_DATE'] = pd.to_datetime(df['EVENT_DATE'])
df.head()

Unnamed: 0_level_0,EVENT_DATE,YEAR,TIME_PRECISION,DISORDER_TYPE,EVENT_TYPE,SUB_EVENT_TYPE,ACTOR1,ASSOC_ACTOR_1,INTER1,ACTOR2,...,LOCATION,LATITUDE,LONGITUDE,GEO_PRECISION,SOURCE,SOURCE_SCALE,NOTES,FATALITIES,TAGS,TIMESTAMP
EVENT_ID_CNTY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
XKX325,2020-03-13,2020,1,Strategic developments,Strategic developments,Change to group/activity,Government of Kosovo (2020-),,1,,...,Pristina,42.667,21.172,3,Prishtina Insight,National,"On 13 March 2020, the government of Kosovo ena...",0,,1585075531
XKX326,2020-03-19,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (Kosovo),,6,,...,Pristina,42.667,21.172,1,RFE/RL,International,"On 19 March 2020, residents of Pristina, Kosov...",0,crowd size=no report,1585673085
XKX327,2020-03-20,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (Kosovo),,6,,...,Pristina,42.667,21.172,1,Kosovo Online,National,"On 20 March 2020, residents of Pristina, Kosov...",0,crowd size=no report,1585673085
XKX328,2020-03-21,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (Kosovo),,6,,...,Pristina,42.667,21.172,1,Sinjali,National,"On 21 March 2020, citizens took to their balco...",0,crowd size=no report,1585673076
XKX329,2020-03-21,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (Kosovo),,6,,...,Peja,42.659,20.288,1,Sinjali,National,"On 21 March 2020, citizens took to their balco...",0,crowd size=no report,1585673076


Датасет содержит наблюдения о беспорядках, столкновениях и иных конфликтах, непросредственно связанных с эпидемией COVID-19.
## Предобработка 1

In [27]:
df.value_counts(subset = ["DISORDER_TYPE", "EVENT_TYPE", "SUB_EVENT_TYPE"]) #Посмотрим на типы событий

DISORDER_TYPE                       EVENT_TYPE                  SUB_EVENT_TYPE                    
Demonstrations                      Protests                    Peaceful protest                      56930
                                    Riots                       Violent demonstration                  2658
                                    Protests                    Protest with intervention              2046
Political violence                  Riots                       Mob violence                           1529
Strategic developments              Strategic developments      Change to group/activity                882
Political violence                  Violence against civilians  Attack                                  851
Strategic developments              Strategic developments      Other                                   254
                                                                Looting/property destruction            176
                                     

Видим, что подавляющее большинство событий - это мирные протесты. Рассмотрим данные по локациям и действующим лицам.

In [28]:
df[['COUNTRY', 'ADMIN1', 'ADMIN2', 'ADMIN3', 'LOCATION']].nunique()

COUNTRY       220
ADMIN1       2022
ADMIN2       7112
ADMIN3       3669
LOCATION    13999
dtype: int64

In [29]:
df[['ACTOR1', 'ASSOC_ACTOR_1', 'ACTOR2', 'ASSOC_ACTOR_2']].nunique()

ACTOR1            946
ASSOC_ACTOR_1    6639
ACTOR2            611
ASSOC_ACTOR_2    1037
dtype: int64

In [30]:
df[['DISORDER_TYPE', 'EVENT_TYPE', 'SUB_EVENT_TYPE']].nunique()

DISORDER_TYPE      4
EVENT_TYPE         6
SUB_EVENT_TYPE    19
dtype: int64

Становится ясно, что никакого OneHotEncoding по геоданным и акторам не будет, да вообще эти данные кроме визуализаций и присоединения других показателей не очень то нужны.
А вот типы событий имеет смысл OneHot'нуть.

In [31]:
df.value_counts(subset = ["REGION"]) #Посмотрим на распределение наблюдений по регионам

REGION                   
Europe                       25071
North America                10796
South America                 8482
South Asia                    7807
Middle East                   3121
East Asia                     2922
Northern Africa               1574
Southeast Asia                1293
Central America               1062
Caucasus and Central Asia      696
Oceania                        637
Eastern Africa                 624
Caribbean                      584
Southern Africa                440
Western Africa                 413
Middle Africa                  194
dtype: int64

Видим, что больше половины наблюдений приходится на Европу и Северную Америку. Сравним Китай и США

In [32]:
print(df[df["COUNTRY"] == "China"].value_counts(subset = ["COUNTRY"]))
print(df[df["COUNTRY"] == "United States"].value_counts(subset = ["COUNTRY"]))

COUNTRY
China      629
dtype: int64
COUNTRY      
United States    7966
dtype: int64


Есть некоторые подозрения по поводу количества наблюдений в полутрамиллиардном Китае. Посмотрим на источники.

In [33]:
print("КИТАЙ: \n", df[df["COUNTRY"] == "China"].value_counts(subset = ["SOURCE_SCALE", "SOURCE"]), "\n")
print("США: \n", df[df["COUNTRY"] == "United States"].value_counts(subset = ["SOURCE_SCALE", "SOURCE"]))

КИТАЙ: 
 SOURCE_SCALE            SOURCE                          
Subnational             Inmediahk.net                       79
                        HK01                                65
                        Wen Wei Po                          55
International           Radio Free Asia                     55
Other                   CLB (China)                         40
                                                            ..
Other-Subnational       China Aid; Inmediahk.net             1
                        Apple Daily Hong Kong; China Aid     1
Other-New media         Twitter; CLB (China)                 1
National-International  Deutsche Welle; HK01                 1
Other-National          Weiquanwang; RTHK                    1
Length: 134, dtype: int64 

США: 
 SOURCE_SCALE          SOURCE                                                                  
New media             Twitter                                                                     127
Other    

Ожидаемо, данные для США берут из открытых внутренних источников, данные для Китая - из внешних, до которых доходит гораздо меньше событий.

In [34]:
df.value_counts(subset = ['TIMESTAMP'])

TIMESTAMP 
1643149412    361
1677617817    303
1631575313    275
1680633575    263
1642536651    254
             ... 
1618498097      1
1618498094      1
1618498091      1
1618498090      1
1618499525      1
Length: 9665, dtype: int64

Не очень то нужные значения, их даже как идентификатор события не выйдет использовать.

In [35]:
df["TAGS"].value_counts()

crowd size=no report                  36035
crowd size=dozens                      1711
crowd size=hundreds                    1455
crowd size=around 100                  1013
crowd size=around 50                    771
                                      ...  
crowd size=around 2,100                   1
crowd size=around 4,700                   1
crowd size=around 1,412                   1
crowd size=hundreds-more than 1000        1
crowd size=unkown                         1
Name: TAGS, Length: 3128, dtype: int64

Интерпретировать теги очень сложно, поэтому тоже отброшу.

In [36]:
df["ASSOC_ACTOR_1"].value_counts()

Labor Group (Italy)                                                                                      1036
Students (United States)                                                                                  836
Labor Group (Spain)                                                                                       831
Labor Group (United States)                                                                               777
Labor Group (India)                                                                                       776
                                                                                                         ... 
Students (India); AISA: All India Students Association; SFI: Students Federation of India                   1
NSUI: National Students Union of India; JKNC: Jammu and Kashmir National Conference; Students (India)       1
Government of Pakistan (2018-2022); SP: Samajwadi Party                                                     1
Farmers (I

Видим очевидную проблему - к "неорганизованным" объединениям присоединены названия стран, да еще и другие участники. Разделю их, чтобы можно было оценивать событие по участникам.

In [37]:
df['ACTOR1_TYPES'] = df["ASSOC_ACTOR_1"].fillna(df["ACTOR1"])
def process_string(row):
    new_string = row['ACTOR1_TYPES'].replace(' (' + row['COUNTRY'] + ')', '')
    return [new_string]
df['ACTOR1_TYPES'] = df.apply(lambda row: pd.Series(process_string(row)), axis=1)

А теперь сделаем признак "неорганизованность" - если событие не организовано формальным объединением, то оно не-политическое, признак равен 1, и наоборот.

In [38]:
df_actorcheck = df["ACTOR1_TYPES"].str.split(",", expand = True)
#Ниже список акторов, не являющихся организациями
my_list = ["Protesters", "Labor Group", "Health Workers", "Students", "Rioters", "Teachers", "Farmers", "Women", "Prisoners", "Taxi/Bus Drivers", "Lawyers", "Taxi Drivers", "No Vax", "Muslim Group", "Fishers", "Journalists", "Street Traders", "Orthodox Christian Group", "Protestant Christian Group", "Haredi Jewish Group", "Judges", "Refugees/IDPs", None]
def check_list(row):
    return int(all(elem in my_list for elem in row))

df['UNORGANIZED'] = df_actorcheck.apply(lambda row: check_list(row), axis=1)
df["UNORGANIZED"].value_counts()

1    43124
0    22592
Name: UNORGANIZED, dtype: int64

Примерно две трети событий оказались неорганизованными, это не очень хорошо - у нас дисбаланс классов.

In [39]:
df

Unnamed: 0_level_0,EVENT_DATE,YEAR,TIME_PRECISION,DISORDER_TYPE,EVENT_TYPE,SUB_EVENT_TYPE,ACTOR1,ASSOC_ACTOR_1,INTER1,ACTOR2,...,LONGITUDE,GEO_PRECISION,SOURCE,SOURCE_SCALE,NOTES,FATALITIES,TAGS,TIMESTAMP,ACTOR1_TYPES,UNORGANIZED
EVENT_ID_CNTY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
XKX325,2020-03-13,2020,1,Strategic developments,Strategic developments,Change to group/activity,Government of Kosovo (2020-),,1,,...,21.172,3,Prishtina Insight,National,"On 13 March 2020, the government of Kosovo ena...",0,,1585075531,Government of Kosovo (2020-),0
XKX326,2020-03-19,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (Kosovo),,6,,...,21.172,1,RFE/RL,International,"On 19 March 2020, residents of Pristina, Kosov...",0,crowd size=no report,1585673085,Protesters,1
XKX327,2020-03-20,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (Kosovo),,6,,...,21.172,1,Kosovo Online,National,"On 20 March 2020, residents of Pristina, Kosov...",0,crowd size=no report,1585673085,Protesters,1
XKX328,2020-03-21,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (Kosovo),,6,,...,21.172,1,Sinjali,National,"On 21 March 2020, citizens took to their balco...",0,crowd size=no report,1585673076,Protesters,1
XKX329,2020-03-21,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (Kosovo),,6,,...,20.288,1,Sinjali,National,"On 21 March 2020, citizens took to their balco...",0,crowd size=no report,1585673076,Protesters,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZAM1292,2020-04-12,2020,1,Political violence,Violence against civilians,Attack,Police Forces of Zambia (2011-2021),,1,Civilians (Zambia),...,29.497,1,Zambia Watchdog,National,"On 12 April 2020, Zambian police attacked peop...",0,,1631575315,Police Forces of Zambia (2011-2021),0
ZAM1300,2020-07-24,2020,1,Political violence,Violence against civilians,Attack,Police Forces of Zambia (2011-2021),,1,Civilians (Zambia),...,31.325,1,Zambia Watchdog,National,"On 24 July 2020, a man was hit and injured by ...",0,,1631575315,Police Forces of Zambia (2011-2021),0
ZAM1315,2020-09-26,2020,1,Political violence,Violence against civilians,Attack,Police Forces of Zambia (2011-2021),,1,Civilians (Zambia),...,28.283,1,Mwebantu; Daily Nation (Zambia),National,"On 26 September 2020, a teenage boy was shot a...",1,,1631575311,Police Forces of Zambia (2011-2021),0
ZAM1323,2020-10-08,2020,2,Political violence,Riots,Mob violence,Rioters (Zambia),,5,Police Forces of Zambia (2011-2021),...,28.283,1,Lusaka Times,Subnational,"Around 8 October 2020 (as reported), a mob att...",1,crowd size=no report,1631575314,Rioters,1


# Чистка 1
Удалю ненужные столбцы, которые не помогут при анализе

In [40]:
useless = [
    'NOTES', #Текстовое описание новости
    'SOURCE', #Источник
    'SOURCE_SCALE', #Охват источника
    'TIME_PRECISION', #Точность определения времени события
    'TIMESTAMP', #Как оказалось, это время внесения наблюдения в таблицу
    'GEO_PRECISION', #Точность географической оценки
    'TAGS', #Плохо сделанные теги
    'ACTOR1', 
    'ASSOC_ACTOR_1', #Действующие лица
    'ACTOR2', 
    'ASSOC_ACTOR_2', 
    'CIVILIAN_TARGETING', #Было ли направленное именно на гражданские лица насилие
    'ADMIN1', 
    'ADMIN2',
    'ADMIN3', #Место действия
    'LOCATION',
    'INTERACTION',
    'ACTOR1_TYPES' #Служебная колонка из пункта выше
]

In [41]:
df_data = df.drop(useless, axis = 1)

Сохраним все события в табличку для визуализаций

In [42]:
df_data.to_excel("datasets/disorders.xlsx", encoding='utf-8')

И отдельно сохраним именно протесты

In [43]:
df_protests = df_data[df_data["DISORDER_TYPE"] == "Demonstrations"]
df_protests.to_excel("datasets/protests.xlsx", encoding='utf-8')
df_protests

Unnamed: 0_level_0,EVENT_DATE,YEAR,DISORDER_TYPE,EVENT_TYPE,SUB_EVENT_TYPE,INTER1,INTER2,ISO,REGION,COUNTRY,LATITUDE,LONGITUDE,FATALITIES,UNORGANIZED
EVENT_ID_CNTY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
XKX326,2020-03-19,2020,Demonstrations,Protests,Peaceful protest,6,0,0,Europe,Kosovo,42.667,21.172,0,1
XKX327,2020-03-20,2020,Demonstrations,Protests,Peaceful protest,6,0,0,Europe,Kosovo,42.667,21.172,0,1
XKX328,2020-03-21,2020,Demonstrations,Protests,Peaceful protest,6,0,0,Europe,Kosovo,42.667,21.172,0,1
XKX329,2020-03-21,2020,Demonstrations,Protests,Peaceful protest,6,0,0,Europe,Kosovo,42.659,20.288,0,1
XKX330,2020-03-21,2020,Demonstrations,Protests,Peaceful protest,6,0,0,Europe,Kosovo,42.638,21.093,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YEM54985,2020-06-18,2020,Demonstrations,Protests,Peaceful protest,6,0,887,Middle East,Yemen,15.943,48.793,0,1
YEM55677,2020-07-09,2020,Demonstrations,Protests,Peaceful protest,6,0,887,Middle East,Yemen,12.779,45.037,0,0
YEM56030,2020-07-19,2020,Demonstrations,Protests,Peaceful protest,6,0,887,Middle East,Yemen,14.602,49.239,0,1
YEM60731,2020-10-18,2020,Demonstrations,Protests,Peaceful protest,6,0,887,Middle East,Yemen,14.543,49.124,0,1


## Предобработка 2
Теперь подгружаем второй датасет, в нем ежедневная информация о заболеваемости по странам (и куча лишнего мусора).

In [44]:
df_cov = pd.read_csv("datasets/owid-covid-data.csv")
df_cov['date'] = pd.to_datetime(df_cov['date'])
df_cov

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-02-24,5.0,5.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
1,AFG,Asia,Afghanistan,2020-02-25,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
2,AFG,Asia,Afghanistan,2020-02-26,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
3,AFG,Asia,Afghanistan,2020-02-27,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
4,AFG,Asia,Afghanistan,2020-02-28,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166321,ZWE,Africa,Zimbabwe,2022-03-01,236871.0,491.0,413.000,5395.0,0.0,1.000,...,1.6,30.7,36.791,1.7,61.49,0.571,,,,
166322,ZWE,Africa,Zimbabwe,2022-03-02,237503.0,632.0,416.286,5396.0,1.0,1.143,...,1.6,30.7,36.791,1.7,61.49,0.571,,,,
166323,ZWE,Africa,Zimbabwe,2022-03-03,237503.0,0.0,362.286,5396.0,0.0,0.857,...,1.6,30.7,36.791,1.7,61.49,0.571,,,,
166324,ZWE,Africa,Zimbabwe,2022-03-04,238739.0,1236.0,467.429,5397.0,1.0,0.714,...,1.6,30.7,36.791,1.7,61.49,0.571,,,,


Во - первых, в датасетах наверняка есть несовпадения по странам. Посмотрим на них

In [46]:
unknown = set(df_data["COUNTRY"].tolist()) - set(df_cov["location"].tolist())
print(unknown)

{'Saint-Barthelemy', 'North Korea', 'French Guiana', 'East Timor', 'Northern Mariana Islands', 'Bailiwick of Guernsey', 'eSwatini', 'Sint Maarten', 'Guam', 'Ivory Coast', 'Virgin Islands, U.S.', 'Saint-Martin', 'Martinique', 'Caribbean Netherlands', 'Saint Helena, Ascension and Tristan da Cunha', 'American Samoa', 'Czech Republic', 'Bailiwick of Jersey', 'Reunion', 'Guadeloupe', 'Puerto Rico', 'Republic of Congo'}


In [47]:
droplist = list(set(df_cov["location"].tolist()) - set(df_data["COUNTRY"].tolist()))
set(df_cov["location"].tolist()) - set(df_data["COUNTRY"].tolist())#Это несовпавшие страны из датасета по заболеваемости

{'Africa',
 'Asia',
 'Bonaire Sint Eustatius and Saba',
 'British Virgin Islands',
 'Comoros',
 'Congo',
 "Cote d'Ivoire",
 'Czechia',
 'Eswatini',
 'Europe',
 'European Union',
 'Faeroe Islands',
 'Greenland',
 'Guernsey',
 'High income',
 'Hong Kong',
 'International',
 'Jersey',
 'Kiribati',
 'Low income',
 'Lower middle income',
 'Macao',
 'Marshall Islands',
 'Micronesia (country)',
 'Niue',
 'North America',
 'Northern Cyprus',
 'Oceania',
 'Palau',
 'Pitcairn',
 'Saint Helena',
 'Sao Tome and Principe',
 'Sint Maarten (Dutch part)',
 'South America',
 'Timor',
 'Tokelau',
 'Tuvalu',
 'Upper middle income',
 'Vatican',
 'World'}

Несостыкованных стран много, и наверняка когда мы начнем добавлять дополнительные данные, с ними возникнет такая же проблема, поэтому применим грязный трюк:

In [48]:
df_data[["COUNTRY","ISO"]][df_data["COUNTRY"].isin(unknown)].groupby("COUNTRY").count().sort_values(by = "ISO", ascending = False)

Unnamed: 0_level_0,ISO
COUNTRY,Unnamed: 1_level_1
Czech Republic,246
Guadeloupe,241
Martinique,95
Reunion,69
North Korea,55
French Guiana,49
Saint-Martin,29
Ivory Coast,23
Puerto Rico,20
eSwatini,14


Выкинем все страны кроме первых (по остальным мало наблюдений), причем у нас нет данных по заболеваемости в Guadeloupe, Martinique, Reunion и тем более в North Korea, так что оставляем только Чехию.

In [86]:
df_cov["location"].replace(to_replace = "Czechia", value = "Czech Republic", inplace = True)
df_cov = df_cov.drop(df_cov[df_cov["location"].isin(droplist)].index)
df_cov

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,3D_CASES_PER_MIL,7D_CASES_PER_MIL,30D_CASES_PER_MIL,3D_DEATHS,7D_DEATHS,30D_DEATHS,3D_DEATHS_PER_MIL,7D_DEATHS_PER_MIL,30D_DEATHS_PER_MIL,LOCDATE
0,AFG,Asia,Afghanistan,2020-02-24,5.0,5.0,,,,,...,0.126,0.126,0.126,,,,,,,Afghanistan2020-02-24
1,AFG,Asia,Afghanistan,2020-02-25,5.0,0.0,,,,,...,0.126,0.126,0.126,,,,,,,Afghanistan2020-02-25
2,AFG,Asia,Afghanistan,2020-02-26,5.0,0.0,,,,,...,0.126,0.126,0.126,,,,,,,Afghanistan2020-02-26
3,AFG,Asia,Afghanistan,2020-02-27,5.0,0.0,,,,,...,0.000,0.126,0.126,,,,,,,Afghanistan2020-02-27
4,AFG,Asia,Afghanistan,2020-02-28,5.0,0.0,,,,,...,0.000,0.126,0.126,,,,,,,Afghanistan2020-02-28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166321,ZWE,Africa,Zimbabwe,2022-03-01,236871.0,491.0,413.000,5395.0,0.0,1.000,...,70.765,191.556,491.048,2.0,7.0,58.0,0.133,0.465,3.844,Zimbabwe2022-03-01
166322,ZWE,Africa,Zimbabwe,2022-03-02,237503.0,632.0,416.286,5396.0,1.0,1.143,...,112.641,193.080,519.275,3.0,8.0,58.0,0.199,0.531,3.844,Zimbabwe2022-03-02
166323,ZWE,Africa,Zimbabwe,2022-03-03,237503.0,0.0,362.286,5396.0,0.0,0.857,...,74.409,168.034,507.017,1.0,6.0,46.0,0.066,0.398,3.049,Zimbabwe2022-03-03
166324,ZWE,Africa,Zimbabwe,2022-03-04,238739.0,1236.0,467.429,5397.0,1.0,0.714,...,123.773,216.801,578.246,2.0,5.0,45.0,0.132,0.331,2.982,Zimbabwe2022-03-04


In [None]:
Соединим датасеты

In [None]:
#создание служебных колонок для присоединения
df_cov["LOCDATE"] = df_cov["location"] + df_cov["date"].astype("str")
df["LOCDATE"] = df["COUNTRY"] + df["EVENT_DATE"].astype("str")

In [None]:
df_data = pd.merge(df, df_cov, on = "LOCDATE", how = "left")

В этом датасете очень много колонок, в которых преимущественно пропуски. Найдем те колонки, которые будут полезны

# ПЕРЕПРОВЕРЬ
Создадим два списка признаков, которые возьмем для присоединения - кумулятивные и некумулятивные

In [None]:
clist = ["location", "date", "population", "total_cases", "total_cases_per_million", "life_expectancy", "population_density", 
         "total_deaths",
         "total_deaths_per_million",
         "gdp_per_capita"
        ]
nclist = ["new_cases", "new_cases_per_million", "new_deaths", "new_deaths_per_million"]

In [102]:
#Проверка на несклеивание
## df_datafull.isna().sum().sort_values()
df_datafull = df_datafull.drop(df_datafull[df_datafull["location"].isna()].index)
df_datafull

Unnamed: 0,EVENT_DATE,YEAR,TIME_PRECISION,DISORDER_TYPE,EVENT_TYPE,SUB_EVENT_TYPE,ACTOR1,ASSOC_ACTOR_1,INTER1,ACTOR2,...,30D_CASES,3D_CASES_PER_MIL,7D_CASES_PER_MIL,30D_CASES_PER_MIL,3D_DEATHS,7D_DEATHS,30D_DEATHS,3D_DEATHS_PER_MIL,7D_DEATHS_PER_MIL,30D_DEATHS_PER_MIL
1,2020-03-19,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (Kosovo),,6,,...,817.0,3.927,11.476,25.714,,0.0,18.0,,-5.773160e-14,0.327
2,2020-03-20,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (Kosovo),,6,,...,758.0,2.805,11.782,25.184,,,14.0,,,0.254
3,2020-03-21,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (Kosovo),,6,,...,707.0,2.805,12.343,25.885,,,14.0,,,0.254
4,2020-03-21,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (Kosovo),,6,,...,707.0,2.805,12.343,25.885,,,14.0,,,0.254
5,2020-03-21,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (Kosovo),,6,,...,707.0,2.805,12.343,25.885,,,14.0,,,0.254
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65711,2020-04-12,2020,1,Political violence,Violence against civilians,Attack,Police Forces of Zambia (2011-2021),,1,Civilians (Zambia),...,47.0,0.212,0.212,2.406,1.0,1.0,2.0,5.300000e-02,5.300000e-02,0.106
65712,2020-07-24,2020,1,Political violence,Violence against civilians,Attack,Police Forces of Zambia (2011-2021),,1,Civilians (Zambia),...,2367.0,24.841,55.284,125.101,8.0,27.0,118.0,4.230000e-01,1.427000e+00,6.237
65713,2020-09-26,2020,1,Political violence,Violence against civilians,Attack,Police Forces of Zambia (2011-2021),,1,Civilians (Zambia),...,3011.0,8.932,28.646,159.138,0.0,2.0,50.0,5.767609e-14,1.060000e-01,2.646
65714,2020-10-08,2020,2,Political violence,Riots,Mob violence,Rioters (Zambia),,5,Police Forces of Zambia (2011-2021),...,2349.0,11.205,26.374,124.151,1.0,2.0,38.0,5.300000e-02,1.060000e-01,2.010


In [103]:
df_datafull[df_datafull["location"].isna()]

Unnamed: 0,EVENT_DATE,YEAR,TIME_PRECISION,DISORDER_TYPE,EVENT_TYPE,SUB_EVENT_TYPE,ACTOR1,ASSOC_ACTOR_1,INTER1,ACTOR2,...,30D_CASES,3D_CASES_PER_MIL,7D_CASES_PER_MIL,30D_CASES_PER_MIL,3D_DEATHS,7D_DEATHS,30D_DEATHS,3D_DEATHS_PER_MIL,7D_DEATHS_PER_MIL,30D_DEATHS_PER_MIL


In [109]:
df_datafull[df_datafull["7D_DEATHS_PER_MIL"] < 0]

Unnamed: 0,EVENT_DATE,YEAR,TIME_PRECISION,DISORDER_TYPE,EVENT_TYPE,SUB_EVENT_TYPE,ACTOR1,ASSOC_ACTOR_1,INTER1,ACTOR2,...,30D_CASES,3D_CASES_PER_MIL,7D_CASES_PER_MIL,30D_CASES_PER_MIL,3D_DEATHS,7D_DEATHS,30D_DEATHS,3D_DEATHS_PER_MIL,7D_DEATHS_PER_MIL,30D_DEATHS_PER_MIL
1,2020-03-19,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (Kosovo),,6,,...,817.0,3.927,11.476,25.714,,0.0,18.0,,-5.773160e-14,0.327
8,2020-04-03,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (Kosovo),,6,,...,375.0,11.223,22.445,75.228,0.0,0.0,3.0,0.000000e+00,-5.773160e-14,0.597
11,2020-06-04,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (Kosovo),Labor Group (Kosovo),6,,...,287.0,33.107,52.748,161.045,0.0,0.0,4.0,0.000000e+00,-5.773160e-14,2.244
492,2020-11-13,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (Andorra),Labor Group (Andorra),6,,...,2535.0,3206.039,7627.271,32771.414,0.0,0.0,16.0,1.421085e-14,-1.421085e-14,206.842
496,2020-04-17,2020,1,Demonstrations,Riots,Violent demonstration,Rioters (Angola),Street Traders (Angola),5,Police Forces of Angola (1975-),...,19.0,0.000,0.000,0.557,0.0,0.0,2.0,1.421085e-14,-1.421085e-14,0.059
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64847,2020-05-20,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (Venezuela),Lawyers (Venezuela),6,,...,568.0,9.859,13.970,19.789,0.0,0.0,7.0,5.684342e-14,-7.105427e-15,0.244
64848,2020-05-22,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (Venezuela),,6,,...,656.0,6.794,16.897,22.855,0.0,0.0,6.0,5.684342e-14,-7.105427e-15,0.209
64849,2020-05-23,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (Venezuela),Lawyers (Venezuela),6,,...,699.0,6.480,17.628,24.353,0.0,0.0,6.0,5.684342e-14,-7.105427e-15,0.209
64850,2020-05-23,2020,1,Strategic developments,Strategic developments,Arrests,Police Forces of Venezuela (1999-),,1,Civilians (Venezuela),...,699.0,6.480,17.628,24.353,0.0,0.0,6.0,5.684342e-14,-7.105427e-15,0.209


In [108]:
df_datafull.isna().sum().sort_values()[50:]

30D_DEATHS                                   374
30D_DEATHS_PER_MIL                           374
stringency_index                             635
hospital_beds_per_thousand                   742
7D_DEATHS_PER_MIL                            750
                                           ...  
excess_mortality_cumulative_per_million    59381
excess_mortality_cumulative_absolute       59381
excess_mortality_cumulative                59381
ASSOC_ACTOR_2                              60619
CIVILIAN_TARGETING                         61320
Length: 62, dtype: int64

In [None]:
df_corrtest = pd.get_dummies(df["SOURCE_SCALE"])
df_corrtest = df_corrtest.join(df["UNORGANIZED"])
df_corrtest.corr()

In [None]:
# Проверка корреляции масштаба источника и организованности события, выданная ChatGPT
from scipy.stats import chi2_contingency

# create contingency table
cont_table = pd.crosstab(df['SOURCE_SCALE'], df['UNORGANIZED'])

# perform chi-square test of independence
chi2, pval, dof, expected = chi2_contingency(cont_table)

print('Chi-square statistic:', chi2)
print('P-value:', pval)

Запрос: I have a pandas dataframe where "SOURCE_SCALE" column contains categorial feature, and another column named "UNORGANIZED", with boolean values. I want to know if there is a correlation between categorial feature and bool value.

In [None]:
from scipy.stats import chi2_contingency

# Assuming your DataFrame is called "df"
contingency_table = pd.crosstab(df['SOURCE_SCALE'], df['UNORGANIZED'])
chi2, p_value, _, _ = chi2_contingency(contingency_table)

# Print the chi-square statistic and p-value
print("Chi-square statistic:", chi2)
print("p-value:", p_value)

In [None]:
import numpy as np

# Assuming your DataFrame is called "df"
contingency_table = pd.crosstab(df['SOURCE_SCALE'], df['UNORGANIZED'])
chi2, _, _, _ = chi2_contingency(contingency_table)
n = contingency_table.sum().sum()
phi = np.sqrt(chi2 / n)
rows, cols = contingency_table.shape
cramers_v = phi / np.sqrt(min(rows - 1, cols - 1))

# Print Cramér's V
print("Cramér's V:", cramers_v)
