In [26]:
import numpy as np
import pandas as pd

import scipy.stats as sts
import matplotlib.pyplot as plt
import seaborn as sns

# Загрузка датасета №1 и его предобработка

In [27]:
df = pd.read_excel('coronavirus_Mar31.xlsx')
df.set_index("EVENT_ID_CNTY", inplace = True)
df['EVENT_DATE'] = pd.to_datetime(df['EVENT_DATE'])
df.head()

Unnamed: 0_level_0,EVENT_DATE,YEAR,TIME_PRECISION,DISORDER_TYPE,EVENT_TYPE,SUB_EVENT_TYPE,ACTOR1,ASSOC_ACTOR_1,INTER1,ACTOR2,...,LOCATION,LATITUDE,LONGITUDE,GEO_PRECISION,SOURCE,SOURCE_SCALE,NOTES,FATALITIES,TAGS,TIMESTAMP
EVENT_ID_CNTY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
XKX325,2020-03-13,2020,1,Strategic developments,Strategic developments,Change to group/activity,Government of Kosovo (2020-),,1,,...,Pristina,42.667,21.172,3,Prishtina Insight,National,"On 13 March 2020, the government of Kosovo ena...",0,,1585075531
XKX326,2020-03-19,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (Kosovo),,6,,...,Pristina,42.667,21.172,1,RFE/RL,International,"On 19 March 2020, residents of Pristina, Kosov...",0,crowd size=no report,1585673085
XKX327,2020-03-20,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (Kosovo),,6,,...,Pristina,42.667,21.172,1,Kosovo Online,National,"On 20 March 2020, residents of Pristina, Kosov...",0,crowd size=no report,1585673085
XKX328,2020-03-21,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (Kosovo),,6,,...,Pristina,42.667,21.172,1,Sinjali,National,"On 21 March 2020, citizens took to their balco...",0,crowd size=no report,1585673076
XKX329,2020-03-21,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (Kosovo),,6,,...,Peja,42.659,20.288,1,Sinjali,National,"On 21 March 2020, citizens took to their balco...",0,crowd size=no report,1585673076


## Подготовка к чистке

In [28]:
len(df[df["TIME_PRECISION"] == 1]) / df.shape[0] #Доля точных по времени данных

0.9670552072554629

Можно убрать столбец TIME_PRECISION, потому что он ни на что не повлияет

In [29]:
df.value_counts(subset = ["DISORDER_TYPE", "EVENT_TYPE", "SUB_EVENT_TYPE"]) #Посмотрим на типы конфликтов

DISORDER_TYPE                       EVENT_TYPE                  SUB_EVENT_TYPE                    
Demonstrations                      Protests                    Peaceful protest                      56930
                                    Riots                       Violent demonstration                  2658
                                    Protests                    Protest with intervention              2046
Political violence                  Riots                       Mob violence                           1529
Strategic developments              Strategic developments      Change to group/activity                882
Political violence                  Violence against civilians  Attack                                  851
Strategic developments              Strategic developments      Other                                   254
                                                                Looting/property destruction            176
                                     

In [30]:
df[['COUNTRY', 'ADMIN1', 'ADMIN2', 'ADMIN3', 'LOCATION']].nunique()

COUNTRY       220
ADMIN1       2022
ADMIN2       7112
ADMIN3       3669
LOCATION    13999
dtype: int64

In [31]:
df[['ACTOR1', 'ASSOC_ACTOR_1', 'ACTOR2', 'ASSOC_ACTOR_2']].nunique()

ACTOR1            946
ASSOC_ACTOR_1    6639
ACTOR2            611
ASSOC_ACTOR_2    1037
dtype: int64

In [32]:
df[['DISORDER_TYPE', 'EVENT_TYPE', 'SUB_EVENT_TYPE']].nunique()

DISORDER_TYPE      4
EVENT_TYPE         6
SUB_EVENT_TYPE    19
dtype: int64

Становится ясно, что никакого OneHotEncoding по геоданным и акторам не будет, да вообще эти данные кроме визуализаций и присоединения других показателей не очень то нужны.
А вот типы событий имеет смысл OneHot'нуть.

In [33]:
df.value_counts(subset = ["REGION"]) #Посмотрим на распределение наблюдений по регионам

REGION                   
Europe                       25071
North America                10796
South America                 8482
South Asia                    7807
Middle East                   3121
East Asia                     2922
Northern Africa               1574
Southeast Asia                1293
Central America               1062
Caucasus and Central Asia      696
Oceania                        637
Eastern Africa                 624
Caribbean                      584
Southern Africa                440
Western Africa                 413
Middle Africa                  194
dtype: int64

Видим, что больше половины наблюдений приходится на Европу и Северную Америку. Сравним Китай и США

In [34]:
print(df[df["COUNTRY"] == "China"].value_counts(subset = ["COUNTRY"]))
print(df[df["COUNTRY"] == "United States"].value_counts(subset = ["COUNTRY"]))

COUNTRY
China      629
dtype: int64
COUNTRY      
United States    7966
dtype: int64


Есть некоторые подозрения по поводу количества наблюдений в полутрамиллиардном Китае. Посмотрим на источники.

In [35]:
print("КИТАЙ: \n", df[df["COUNTRY"] == "China"].value_counts(subset = ["SOURCE_SCALE", "SOURCE"]), "\n")
print("США: \n", df[df["COUNTRY"] == "United States"].value_counts(subset = ["SOURCE_SCALE", "SOURCE"]))

КИТАЙ: 
 SOURCE_SCALE            SOURCE                          
Subnational             Inmediahk.net                       79
                        HK01                                65
                        Wen Wei Po                          55
International           Radio Free Asia                     55
Other                   CLB (China)                         40
                                                            ..
Other-Subnational       China Aid; Inmediahk.net             1
                        Apple Daily Hong Kong; China Aid     1
Other-New media         Twitter; CLB (China)                 1
National-International  Deutsche Welle; HK01                 1
Other-National          Weiquanwang; RTHK                    1
Length: 134, dtype: int64 

США: 
 SOURCE_SCALE          SOURCE                                                                  
New media             Twitter                                                                     127
Other    

Ожидаемо, данные для США берут из открытых внутренних источников, данные для Китая - из внешних, до которых доходит гораздо меньше событий. Запомним это для проверки гипотез.

In [36]:
df.value_counts(subset = ['TIMESTAMP'])

TIMESTAMP 
1643149412    361
1677617817    303
1631575313    275
1680633575    263
1642536651    254
             ... 
1618498097      1
1618498094      1
1618498091      1
1618498090      1
1618499525      1
Length: 9665, dtype: int64

Не очень то нужные значения, их даже как идентификатор события не выйдет использовать.

In [37]:
df["TAGS"].value_counts()

crowd size=no report                  36035
crowd size=dozens                      1711
crowd size=hundreds                    1455
crowd size=around 100                  1013
crowd size=around 50                    771
                                      ...  
crowd size=around 2,100                   1
crowd size=around 4,700                   1
crowd size=around 1,412                   1
crowd size=hundreds-more than 1000        1
crowd size=unkown                         1
Name: TAGS, Length: 3128, dtype: int64

Интерпретировать теги очень сложно, поэтому тоже отброшу.

In [38]:
df["ASSOC_ACTOR_1"].value_counts()

Labor Group (Italy)                                                                                      1036
Students (United States)                                                                                  836
Labor Group (Spain)                                                                                       831
Labor Group (United States)                                                                               777
Labor Group (India)                                                                                       776
                                                                                                         ... 
Students (India); AISA: All India Students Association; SFI: Students Federation of India                   1
NSUI: National Students Union of India; JKNC: Jammu and Kashmir National Conference; Students (India)       1
Government of Pakistan (2018-2022); SP: Samajwadi Party                                                     1
Farmers (I

Видим очевидную проблему - к "неполитическим" объединениям присоединены названия стран, да еще и другие участники. Разделю их, чтобы можно было оценивать событие по участникам.

In [39]:
df['ACTOR1_TYPES'] = df["ASSOC_ACTOR_1"].fillna(df["ACTOR1"])
def process_string(row):
    new_string = row['ACTOR1_TYPES'].replace(' (' + row['COUNTRY'] + ')', '')
    return [new_string]
df['ACTOR1_TYPES'] = df.apply(lambda row: pd.Series(process_string(row)), axis=1)

А теперь сделаем признак "неполитичности" - если событие не организовано формальным объединением, то оно не-политическое, признак равен 1, и наоборот.

In [40]:
df_actorcheck = df["ACTOR1_TYPES"].str.split(",", expand = True)
my_list = ["Protesters", "Labor Group", "Health Workers", "Students", "Rioters", "Teachers", "Farmers", "Women", "Prisoners", "Taxi/Bus Drivers", "Lawyers", "Taxi Drivers", "No Vax", "Muslim Group", "Fishers", "Journalists", "Street Traders", "Orthodox Christian Group", "Protestant Christian Group", "Haredi Jewish Group", "Judges", "Refugees/IDPs", None]
def check_list(row):
    return int(all(elem in my_list for elem in row))

df['NONPOLITICAL'] = df_actorcheck.apply(lambda row: check_list(row), axis=1)
df["NONPOLITICAL"].value_counts()

1    43124
0    22592
Name: NONPOLITICAL, dtype: int64

In [41]:
df

Unnamed: 0_level_0,EVENT_DATE,YEAR,TIME_PRECISION,DISORDER_TYPE,EVENT_TYPE,SUB_EVENT_TYPE,ACTOR1,ASSOC_ACTOR_1,INTER1,ACTOR2,...,LONGITUDE,GEO_PRECISION,SOURCE,SOURCE_SCALE,NOTES,FATALITIES,TAGS,TIMESTAMP,ACTOR1_TYPES,NONPOLITICAL
EVENT_ID_CNTY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
XKX325,2020-03-13,2020,1,Strategic developments,Strategic developments,Change to group/activity,Government of Kosovo (2020-),,1,,...,21.172,3,Prishtina Insight,National,"On 13 March 2020, the government of Kosovo ena...",0,,1585075531,Government of Kosovo (2020-),0
XKX326,2020-03-19,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (Kosovo),,6,,...,21.172,1,RFE/RL,International,"On 19 March 2020, residents of Pristina, Kosov...",0,crowd size=no report,1585673085,Protesters,1
XKX327,2020-03-20,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (Kosovo),,6,,...,21.172,1,Kosovo Online,National,"On 20 March 2020, residents of Pristina, Kosov...",0,crowd size=no report,1585673085,Protesters,1
XKX328,2020-03-21,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (Kosovo),,6,,...,21.172,1,Sinjali,National,"On 21 March 2020, citizens took to their balco...",0,crowd size=no report,1585673076,Protesters,1
XKX329,2020-03-21,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (Kosovo),,6,,...,20.288,1,Sinjali,National,"On 21 March 2020, citizens took to their balco...",0,crowd size=no report,1585673076,Protesters,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZAM1292,2020-04-12,2020,1,Political violence,Violence against civilians,Attack,Police Forces of Zambia (2011-2021),,1,Civilians (Zambia),...,29.497,1,Zambia Watchdog,National,"On 12 April 2020, Zambian police attacked peop...",0,,1631575315,Police Forces of Zambia (2011-2021),0
ZAM1300,2020-07-24,2020,1,Political violence,Violence against civilians,Attack,Police Forces of Zambia (2011-2021),,1,Civilians (Zambia),...,31.325,1,Zambia Watchdog,National,"On 24 July 2020, a man was hit and injured by ...",0,,1631575315,Police Forces of Zambia (2011-2021),0
ZAM1315,2020-09-26,2020,1,Political violence,Violence against civilians,Attack,Police Forces of Zambia (2011-2021),,1,Civilians (Zambia),...,28.283,1,Mwebantu; Daily Nation (Zambia),National,"On 26 September 2020, a teenage boy was shot a...",1,,1631575311,Police Forces of Zambia (2011-2021),0
ZAM1323,2020-10-08,2020,2,Political violence,Riots,Mob violence,Rioters (Zambia),,5,Police Forces of Zambia (2011-2021),...,28.283,1,Lusaka Times,Subnational,"Around 8 October 2020 (as reported), a mob att...",1,crowd size=no report,1631575314,Rioters,1


# Чистка
Удалю ненужные столбцы, которые не помогут при анализе

In [42]:
useless = [
    'NOTES',
    'SOURCE', 
    'SOURCE_SCALE', 
    'TIME_PRECISION', 
    'TIMESTAMP', 
    'GEO_PRECISION', 
    'TAGS',
    'ACTOR1', 
    'ASSOC_ACTOR_1', 
    'ACTOR2', 
    'ASSOC_ACTOR_2', 
    'CIVILIAN_TARGETING',
    'ADMIN1',
    'ADMIN2',
    'ADMIN3',
    'LOCATION',
    'INTERACTION',
    'ACTOR1_TYPES'
]

In [43]:
df_data = df.drop(useless, axis = 1)

Сохраним все события в табличку для визуализаций

In [44]:
df_data.to_excel("disorders.xlsx", encoding='utf-8')

И отдельно сохраним именно протесты

In [60]:
df_protests = df_data[df_data["DISORDER_TYPE"] == "Demonstrations"]
df_protests.to_excel("protests.xlsx", encoding='utf-8')
df_protests

Unnamed: 0_level_0,EVENT_DATE,YEAR,DISORDER_TYPE,EVENT_TYPE,SUB_EVENT_TYPE,INTER1,INTER2,ISO,REGION,COUNTRY,LATITUDE,LONGITUDE,FATALITIES,NONPOLITICAL
EVENT_ID_CNTY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
XKX326,2020-03-19,2020,Demonstrations,Protests,Peaceful protest,6,0,0,Europe,Kosovo,42.667,21.172,0,1
XKX327,2020-03-20,2020,Demonstrations,Protests,Peaceful protest,6,0,0,Europe,Kosovo,42.667,21.172,0,1
XKX328,2020-03-21,2020,Demonstrations,Protests,Peaceful protest,6,0,0,Europe,Kosovo,42.667,21.172,0,1
XKX329,2020-03-21,2020,Demonstrations,Protests,Peaceful protest,6,0,0,Europe,Kosovo,42.659,20.288,0,1
XKX330,2020-03-21,2020,Demonstrations,Protests,Peaceful protest,6,0,0,Europe,Kosovo,42.638,21.093,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YEM54985,2020-06-18,2020,Demonstrations,Protests,Peaceful protest,6,0,887,Middle East,Yemen,15.943,48.793,0,1
YEM55677,2020-07-09,2020,Demonstrations,Protests,Peaceful protest,6,0,887,Middle East,Yemen,12.779,45.037,0,0
YEM56030,2020-07-19,2020,Demonstrations,Protests,Peaceful protest,6,0,887,Middle East,Yemen,14.602,49.239,0,1
YEM60731,2020-10-18,2020,Demonstrations,Protests,Peaceful protest,6,0,887,Middle East,Yemen,14.543,49.124,0,1
