In [1]:
import numpy as np
import pandas as pd
import datetime
import pyreadr
import pandas as pd
from IPython.display import display

from sklearn.preprocessing import OneHotEncoder

from nltk.corpus import stopwords 
import string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize 
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Coronanet dataset

In [2]:
# import data
df = pd.read_csv('../data/coronanet_release_allvars.csv', encoding='mac_roman')

In [None]:
# colonnes
df.columns

In [None]:
# NaNs
df.isnull().sum()

In [24]:
df_2 = df[['policy_id','entry_type','correct_type', 
           'description', 'date_start','date_end','country', 
           'type', 'ISO_A3','init_country_level', 'domestic_policy',
           'type_sub_cat', 'compliance', 'enforcer']]

In [4]:
df_2.shape

(45640, 14)

In [5]:
df_2.isnull().sum()

policy_id                 0
entry_type                0
correct_type              0
description               0
date_start                0
date_end              23998
country                   0
type                      0
ISO_A3                  243
init_country_level      162
domestic_policy           0
type_sub_cat          13819
compliance              244
enforcer                154
dtype: int64

In [6]:
# commençons par le type (mesures plus simples pour les restrictions pour commencer et possibilité d'encodage)
print(df_2['type'].unique())
print(df_2['type'].nunique())

['Anti-Disinformation Measures' 'Closure and Regulation of Schools'
 'Curfew' 'Declaration of Emergency' 'External Border Restrictions'
 'Health Monitoring' 'Health Resources' 'Health Testing' 'Hygiene'
 'Internal Border Restrictions' 'Lockdown'
 'New Task Force, Bureau or Administrative Configuration'
 'Other Policy Not Listed Above' 'Public Awareness Measures' 'Quarantine'
 'Restriction and Regulation of Businesses'
 'Restriction and Regulation of Government Services'
 'Restrictions of Mass Gatherings' 'Social Distancing']
19


## Encodage des données compliance

In [None]:
for _ in df_2['compliance'].unique():
    print('-'*50)
    print(_)

In [None]:
mask = df_2['compliance'].value_counts() > 1000

In [None]:
df_2 = df_2[df_2['compliance'].isin(mask.index)]

In [None]:
df_2.head()

In [None]:
def handle_compliance(x):
    if 'Voluntary/Recommended' in x:
        return 0
    if 'Mandatory (Unspecified/Implied)' in x:
        return 1
    if 'Mandatory with Fines' in x:
        return 2
    if 'Mandatory with Exceptions' in x:
        return 3
    if 'Mandatory with Legal Penalties' in x:
        return 4

In [None]:
df_2['n_compliance'] = df_2['compliance'].map(handle_compliance)

In [None]:
df_2['n_compliance'].nunique()

## Encode type colonne

In [None]:
df_2['type'].unique()

In [None]:
df_3 = pd.get_dummies(df_2, columns=["type"]).head()

In [None]:
df_3.columns

## Missing value for ISO_3

In [15]:
df_2['ISO_A3'].unique()

array(['AFG', 'ALB', 'DZA', 'AND', 'AGO', 'ATG', 'ARG', 'ARM', 'AUS',
       'AUT', 'AZE', 'BHS', 'BHR', 'BGD', 'BRB', 'BLR', 'BEL', 'BLZ',
       'BEN', 'BTN', 'BOL', 'BIH', 'BWA', 'BRA', 'BRN', 'BGR', 'BFA',
       'BDI', 'KHM', 'CMR', 'CAN', 'CPV', 'CAF', 'TCD', 'CHL', 'CHN',
       'COL', 'COM', 'CRI', nan, 'HRV', 'CUB', 'CYP', 'CZE', 'COD', 'DNK',
       'DJI', 'DMA', 'DOM', 'ECU', 'EGY', 'SLV', 'GNQ', 'ERI', 'EST',
       'SWZ', 'ETH', 'FJI', 'FIN', 'FRA', 'GAB', 'GMB', 'GEO', 'DEU',
       'GHA', 'GRC', 'GRD', 'GTM', 'GIN', 'GNB', 'GUY', 'HTI', 'HND',
       'HKG', 'HUN', 'ISL', 'IND', 'IDN', 'IRN', 'IRQ', 'IRL', 'ISR',
       'ITA', 'CIV', 'JAM', 'JPN', 'JOR', 'KAZ', 'KEN', 'KIR', '-', 'KWT',
       'KGZ', 'LAO', 'LVA', 'LBN', 'LSO', 'LBR', 'LBY', 'LIE', 'LTU',
       'LUX', 'MDG', 'MWI', 'MYS', 'MDV', 'MLI', 'MLT', 'MHL', 'MRT',
       'MUS', 'MEX', 'FSM', 'MDA', 'MCO', 'MNG', 'MNE', 'MAR', 'MOZ',
       'MMR', 'NAM', 'NRU', 'NPL', 'NLD', 'NZL', 'NIC', 'NER', 'NGA',
       'PR

In [16]:
df_2['ISO_A3'] = np.where(df_2['ISO_A3'] == '-', df_2['country'], df_2['ISO_A3'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [17]:
df_2['ISO_A3'] = df_2['ISO_A3'].fillna(df_2['country'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [18]:
df_2['ISO_A3'].isna().sum()

0

In [20]:
df_2['ISO_A3'] = df_2['ISO_A3'].replace('Northern Cyprus', 'CYP')
df_2['ISO_A3'] = df_2['ISO_A3'].replace('Kosovo', 'XK')
df_2['ISO_A3'] = df_2['ISO_A3'].replace('CÙte díIvoire', 'CIV')
df_2['ISO_A3'] = df_2['ISO_A3'].replace('European Union', 'EU')
df_2['ISO_A3'] = df_2['ISO_A3'].replace('S„o TomÈ & PrÌncipe', 'STP')
df_2['ISO_A3'] = df_2['ISO_A3'].replace('Macau', 'MAC')
df_2['ISO_A3'] = df_2['ISO_A3'].replace('Timor Leste', 'TLS')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

In [21]:
df_2['ISO_A3'].unique()

array(['AFG', 'ALB', 'DZA', 'AND', 'AGO', 'ATG', 'ARG', 'ARM', 'AUS',
       'AUT', 'AZE', 'BHS', 'BHR', 'BGD', 'BRB', 'BLR', 'BEL', 'BLZ',
       'BEN', 'BTN', 'BOL', 'BIH', 'BWA', 'BRA', 'BRN', 'BGR', 'BFA',
       'BDI', 'KHM', 'CMR', 'CAN', 'CPV', 'CAF', 'TCD', 'CHL', 'CHN',
       'COL', 'COM', 'CRI', 'CIV', 'HRV', 'CUB', 'CYP', 'CZE', 'COD',
       'DNK', 'DJI', 'DMA', 'DOM', 'ECU', 'EGY', 'SLV', 'GNQ', 'ERI',
       'EST', 'SWZ', 'ETH', 'EU', 'FJI', 'FIN', 'FRA', 'GAB', 'GMB',
       'GEO', 'DEU', 'GHA', 'GRC', 'GRD', 'GTM', 'GIN', 'GNB', 'GUY',
       'HTI', 'HND', 'HKG', 'HUN', 'ISL', 'IND', 'IDN', 'IRN', 'IRQ',
       'IRL', 'ISR', 'ITA', 'JAM', 'JPN', 'JOR', 'KAZ', 'KEN', 'KIR',
       'XK', 'KWT', 'KGZ', 'LAO', 'LVA', 'LBN', 'LSO', 'LBR', 'LBY',
       'LIE', 'LTU', 'LUX', 'MAC', 'MDG', 'MWI', 'MYS', 'MDV', 'MLI',
       'MLT', 'MHL', 'MRT', 'MUS', 'MEX', 'FSM', 'MDA', 'MCO', 'MNG',
       'MNE', 'MAR', 'MOZ', 'MMR', 'NAM', 'NRU', 'NPL', 'NLD', 'NZL',
       'NIC', 'NER', '

In [23]:
pd.set_option('display.max_rows', None)
df_2['ISO_A3'].value_counts()

RUS    5721
USA    5075
DEU    2292
CHE    1885
JPN    1777
ARG    1529
NGA    1114
BRA    1100
CAN     990
IDN     959
IND     891
AUS     674
CHN     670
ESP     421
DZA     416
BGD     413
MEX     361
NZL     359
AFG     348
QAT     346
ITA     334
KAZ     322
AUT     319
MMR     317
ISR     303
LBN     299
ARE     295
FRA     292
GTM     284
AZE     284
SGP     280
GBR     276
SAU     263
HND     242
OMN     241
CYP     235
GRD     234
IRL     230
UZB     226
CHL     215
FIN     215
IRQ     211
BLZ     210
BOL     204
EGY     203
THA     190
ETH     185
PAK     185
MAR     172
BHR     164
DOM     163
WSM     162
HKG     158
NOR     157
AGO     155
TCD     155
SMR     149
SDN     146
ZAF     146
TWN     145
TLS     134
PNG     134
BRN     133
CIV     133
TUR     131
SWE     131
UKR     129
CZE     129
TUN     128
KGZ     127
BLR     126
YEM     124
ZWE     124
CRI     124
COL     122
IRN     122
KOR     122
EST     120
KEN     117
NLD     115
PAN     114
JAM     113
GAB     110
POL 

In [22]:
df_2.isna().sum()

policy_id                 0
entry_type                0
correct_type              0
description               0
date_start                0
date_end              23998
country                   0
type                      0
ISO_A3                    0
init_country_level      162
domestic_policy           0
type_sub_cat          13819
compliance              244
enforcer                154
dtype: int64

 ## Vectorization des données description (pour plus tard)

In [None]:
def clean(x):
    x = x.lower()
    x = ''.join([letter for letter in x if not letter.isdigit()])
    for p in string.punctuation:
        x = x.replace(p, " ")
    stops = set(stopwords.words("english"))
    tokens = word_tokenize(x)
    tokens = ' '.join([token for token in tokens if token not in stops])
    return tokens

In [None]:
# données description
df_2['clean_description'] = df_2.description.apply(clean)
df_2['clean_description'] = df_2['clean_description'].astype('str')

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

X = vectorizer.fit_transform(df_2.clean_description)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation()

model = lda.fit(X)

In [None]:
def print_topics(model, vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-10 - 1:-1]])
        
print_topics(lda, vectorizer)

## Check type_sub_cat

In [None]:
for _ in df_2['type_sub_cat'].unique():
    print('-'*50)
    print(_)