In [2]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.model_selection import train_test_split
import numpy as np

In [3]:
df = pd.read_csv("africa.csv")
df

Unnamed: 0,case,cc3,country,year,systemic_crisis,exch_usd,domestic_debt_in_default,sovereign_external_debt_default,gdp_weighted_default,inflation_annual_cpi,independence,currency_crises,inflation_crises,banking_crisis
0,1,DZA,Algeria,1870,1,0.052264,0,0,0.0,3.441456,0,0,0,crisis
1,1,DZA,Algeria,1871,0,0.052798,0,0,0.0,14.149140,0,0,0,no_crisis
2,1,DZA,Algeria,1872,0,0.052274,0,0,0.0,-3.718593,0,0,0,no_crisis
3,1,DZA,Algeria,1873,0,0.051680,0,0,0.0,11.203897,0,0,0,no_crisis
4,1,DZA,Algeria,1874,0,0.051308,0,0,0.0,-3.848561,0,0,0,no_crisis
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1054,70,ZWE,Zimbabwe,2009,1,354.800000,1,1,0.0,-7.670000,1,1,0,crisis
1055,70,ZWE,Zimbabwe,2010,0,378.200000,1,1,0.0,3.217000,1,0,0,no_crisis
1056,70,ZWE,Zimbabwe,2011,0,361.900000,1,1,0.0,4.920000,1,0,0,no_crisis
1057,70,ZWE,Zimbabwe,2012,0,361.900000,1,1,0.0,3.720000,1,0,0,no_crisis


In [4]:
# колонка с категориальными переменными
df["banking_crisis"].unique()

array(['crisis', 'no_crisis'], dtype=object)

####### "crisis" = 1
####### "no_crisis" = 0

In [5]:
# перевод в численные значения 0 и 1
df["banking_crisis"] = df["banking_crisis"].apply(lambda x: ["crisis", "no_crisis"].index(x))

In [6]:
df.columns

Index(['case', 'cc3', 'country', 'year', 'systemic_crisis', 'exch_usd',
       'domestic_debt_in_default', 'sovereign_external_debt_default',
       'gdp_weighted_default', 'inflation_annual_cpi', 'independence',
       'currency_crises', 'inflation_crises', 'banking_crisis'],
      dtype='object')

In [None]:
# Для дальнейшего формирования данных для модели исключаются колонки: 
# case -- число, присвоенное стране, 
# cc3 -- трёхбуквенный код страны, 
# country -- название страны (так как первая модель строилась для одной страны, 
# year -- год исследования

In [7]:
import re

In [8]:
# выделение из всех данных страны из условия -- Ivory Coast
pat = r'Ivory Coast'
df2 = df[df['country'].str.contains(pat)]
df2

Unnamed: 0,case,cc3,country,year,systemic_crisis,exch_usd,domestic_debt_in_default,sovereign_external_debt_default,gdp_weighted_default,inflation_annual_cpi,independence,currency_crises,inflation_crises,banking_crisis
220,15,CIV,Ivory Coast,1952,0,0.0,0,0,0.0,16.216100,0,0,0,1
221,15,CIV,Ivory Coast,1953,0,0.0,0,0,0.0,2.326185,0,0,0,1
222,15,CIV,Ivory Coast,1954,0,0.0,0,0,0.0,-2.273304,0,0,0,1
223,15,CIV,Ivory Coast,1955,0,0.0,0,0,0.0,2.326185,0,0,0,1
224,15,CIV,Ivory Coast,1956,0,0.0,0,0,0.0,4.544795,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278,15,CIV,Ivory Coast,2010,0,0.0,0,1,0.0,1.800000,1,0,0,1
279,15,CIV,Ivory Coast,2011,0,0.0,0,1,0.0,4.448000,1,0,0,1
280,15,CIV,Ivory Coast,2012,0,0.0,0,1,0.0,1.300000,1,0,0,1
281,15,CIV,Ivory Coast,2013,0,0.0,0,0,0.0,2.584000,1,0,0,1


In [9]:
enc = OneHotEncoder(categories='auto')
enc.fit(df2[df.columns[5:14]])

OneHotEncoder(categorical_features=None, categories='auto', drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='error',
              n_values=None, sparse=True)

In [23]:
# модель предсказывает наличие (1) или отсутствие (0) системного кризиса
Y = df2["systemic_crisis"].values

In [10]:
# перевод данных в массив
df3 = enc.transform(df2[df.columns[5:14]]).toarray()

In [13]:
# разделение данных на обучающие выборки
train_X, test_X, train_Y, test_Y = train_test_split(df3, Y)

In [14]:
# в качестве модели выбрана логистическая регрессия
model = LogisticRegression()

In [15]:
# обучение модели
model.fit(train_X, train_Y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [16]:
# Предсказание системного кризиса в Ivory Coast
test_Yhat = model.predict(test_X)

In [25]:
# Проверка точности
accuracy_score(test_Y, test_Yhat)

0.9375

In [24]:
# Проверка точности вторым способом
balanced_accuracy_score(test_Y, test_Yhat)

0.5

### Ответы на вопросы

In [19]:
pd.crosstab(df["country"], df["systemic_crisis"])

systemic_crisis,0,1
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Algeria,81,4
Angola,77,0
Central African Republic,39,19
Egypt,149,6
Ivory Coast,59,4
Kenya,54,13
Mauritius,68,0
Morocco,73,2
Nigeria,50,10
South Africa,114,0


#### Больше всего системных кризисов произошло в Центральной Африканской республике

In [20]:
pd.crosstab(df["country"], df["banking_crisis"])

banking_crisis,0,1
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Algeria,4,81
Angola,7,70
Central African Republic,19,39
Egypt,11,144
Ivory Coast,4,59
Kenya,8,59
Mauritius,1,67
Morocco,2,73
Nigeria,11,49
South Africa,3,111


#### Больше всего кризисов банковской системы произошло в Египте

In [21]:
pd.crosstab(df["country"], df["inflation_crises"])

inflation_crises,0,1
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Algeria,71,14
Angola,51,26
Central African Republic,56,2
Egypt,144,11
Ivory Coast,59,4
Kenya,63,4
Mauritius,62,6
Morocco,65,10
Nigeria,48,12
South Africa,113,1


#### Больше всего инфляционных кризисов произошло в Анголе

##### Информации по ВВП в датасете нет, ответить на второй вопрос из условия затруднительно