# Python для анализа данных

*Рогович Татьяна, НИУ ВШЭ*

## Spider Chart в Plotly

Есть такой вид графика radarplot (он же Spiderchart), который позволяет нам сравнивать наблюдения по нескольким измерениям.

https://www.data-to-viz.com/caveat/spider.html

https://python-graph-gallery.com/390-basic-radar-chart/

Давайте поработаем с таким датасетом: у нас есть даннные, где наблюдение это страна, а признак - некоторый политико-экономический индекс.

Ссылка на исследование:
https://github.com/rogovich/2019-2020_PolSci_Data_Analysis_in_Python/blob/master/11week_Viz_Spiderchart_Scraping/GivingCreditWhereItsDue_LazardResearch_en.pdf

In [26]:
import pandas as pd

In [27]:
df = pd.read_csv('https://github.com/rogovich/Data/raw/master/data/countries_radar_plot.txt', encoding = 'utf8')

In [28]:
df.tail()

Unnamed: 0,Country,Index Spread,Voice & Accountability,Political Stability,Govt. Effectiveness,Regulatory Quality,Rule of Law,Control of Corruption,Corruption percetion,Ease of Doing Business,Human Dev. Index,Global Compet. Index,Eviron. Performance Index,Fragile States Index,UN World Risk Index,Freedom of the Press Index
60,Suriname,655,17,16,43,54,29,43,17,58,37,62,40,21,51,9
61,Ecuador,705,39,31,46,63,60,46,48,46,31,39,44,39,48,48
62,Belize,732,10,22,54,47,51,21,65,45,39,63,29,18,37,8
63,Mozambique,1770,40,43,57,48,56,54,59,53,65,61,65,55,52,28
64,Venezuela,24,57,58,54,64,65,65,63,63,65,21,60,20,49,27


In [29]:
df.columns

Index(['Country', 'Index Spread', 'Voice & Accountability',
       'Political Stability', 'Govt. Effectiveness', 'Regulatory Quality',
       'Rule of Law', 'Control of Corruption', 'Corruption percetion',
       'Ease of Doing Business', 'Human Dev. Index', 'Global Compet. Index',
       'Eviron. Performance Index', 'Fragile States Index',
       'UN World Risk Index', 'Freedom of the Press Index'],
      dtype='object')

In [30]:
df[df['Country']=='Slovakia'].loc[:, "Voice & Accountability":]

Unnamed: 0,Voice & Accountability,Political Stability,Govt. Effectiveness,Regulatory Quality,Rule of Law,Control of Corruption,Corruption percetion,Ease of Doing Business,Human Dev. Index,Global Compet. Index,Eviron. Performance Index,Fragile States Index,UN World Risk Index,Freedom of the Press Index
0,6,2,5,6,7,13,9,6,4,23,4,5,8,7


In [31]:
df.loc[:, "Voice & Accountability":].describe()

Unnamed: 0,Voice & Accountability,Political Stability,Govt. Effectiveness,Regulatory Quality,Rule of Law,Control of Corruption,Corruption percetion,Ease of Doing Business,Human Dev. Index,Global Compet. Index,Eviron. Performance Index,Fragile States Index,UN World Risk Index,Freedom of the Press Index
count,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0
mean,32.984615,33.061538,32.846154,32.984615,33.0,33.030769,33.0,32.969231,33.676923,32.4,33.615385,32.553846,33.338462,32.507692
std,18.887407,18.983444,18.690933,18.881615,18.90767,18.958812,18.90767,18.856339,19.255768,18.654591,19.127608,18.866331,18.995293,18.634748
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0
50%,33.0,33.0,33.0,33.0,33.0,33.0,33.0,33.0,34.0,32.0,34.0,32.0,34.0,32.0
75%,49.0,49.0,49.0,49.0,49.0,49.0,49.0,49.0,50.0,48.0,50.0,48.0,49.0,48.0
max,65.0,65.0,65.0,64.0,65.0,65.0,65.0,64.0,65.0,65.0,65.0,65.0,65.0,65.0


Мы хотим сравнивать страны по этим индексам. Для такого графика как Radarplot нам важно, чтобы индексы были стандартизированы. Здесь этой проблемы нет, потому что наши данные - это ранг стран. Но мы можем и стандартирозовать всю эту историю к другой шкале: например, от 0 до 1.

Для этого из модуля sklearn импортируем модуль предобработки данных.

In [32]:
from sklearn import preprocessing


In [33]:
x = df.loc[:, "Voice & Accountability":].values # достаем матрицу наших значений
x

array([[ 6,  2,  5,  6,  7, 13,  9,  6,  4, 23,  4,  5,  8,  7],
       [ 3,  3,  6,  4,  3,  5,  3,  5,  1,  5, 10,  2,  6, 13],
       [ 5,  5,  1,  2,  2,  6,  4,  3,  2,  4,  3,  4,  3,  3],
       [ 7, 11,  2,  3,  4,  7,  6,  1,  6, 12,  2,  7,  7,  6],
       [29, 48, 22, 28, 35, 33, 37, 38, 45, 17, 27, 52, 65, 23],
       [12,  4,  9,  8, 11, 14, 12, 11,  5, 26,  5, 11, 21, 22],
       [19, 52, 23, 43, 20, 29, 24, 52, 54,  7, 61, 45, 39, 21],
       [ 4, 13,  3,  1,  1,  2,  2, 17,  3,  3, 17,  3, 59, 10],
       [41, 18,  4,  7,  6,  8, 10,  4, 14,  1, 24, 19, 32, 52],
       [18, 17, 28,  9, 15, 17, 14,  8, 10, 21,  8, 12, 26, 16],
       [65, 41, 10, 39, 33, 24, 23, 28, 32,  2, 49, 36, 33, 64],
       [24, 15, 21, 23, 23, 22, 20, 14, 18, 38, 15, 29, 43, 29],
       [27, 39, 40, 11, 44, 45, 38, 16, 30, 25, 31, 30, 38, 24],
       [14, 12, 13, 16, 24, 27, 27, 25, 15,  9, 16, 13, 45, 18],
       [15, 10,  8, 17, 13, 11, 11, 12,  7, 30,  1, 10, 13, 17],
       [63, 25, 25, 49, 3

Мы воспользуемся функцией MinMaxScaler(), которая стандартизирует признаки по колонке (самое низкое значение становится 0, самое большое 1)

In [34]:
min_max_scaler = preprocessing.MinMaxScaler() 
x_scaled = min_max_scaler.fit_transform(x)

In [35]:
df.shape

(65, 16)

In [36]:
x_scaled.shape # вроде все получилось

(65, 14)

In [37]:
df_normalized = df.copy() # создаем копию данных

In [38]:
df_normalized.loc[:, "Voice & Accountability":] = x_scaled # перезаписываем оригинальные значение отшкалированными переменными

In [39]:
df_normalized.head(1)

Unnamed: 0,Country,Index Spread,Voice & Accountability,Political Stability,Govt. Effectiveness,Regulatory Quality,Rule of Law,Control of Corruption,Corruption percetion,Ease of Doing Business,Human Dev. Index,Global Compet. Index,Eviron. Performance Index,Fragile States Index,UN World Risk Index,Freedom of the Press Index
0,Slovakia,32,0.078125,0.015625,0.0625,0.079365,0.09375,0.1875,0.125,0.079365,0.046875,0.34375,0.046875,0.0625,0.109375,0.09375


In [40]:
df.head(1)

Unnamed: 0,Country,Index Spread,Voice & Accountability,Political Stability,Govt. Effectiveness,Regulatory Quality,Rule of Law,Control of Corruption,Corruption percetion,Ease of Doing Business,Human Dev. Index,Global Compet. Index,Eviron. Performance Index,Fragile States Index,UN World Risk Index,Freedom of the Press Index
0,Slovakia,32,6,2,5,6,7,13,9,6,4,23,4,5,8,7


Было бы здорово, кроме страны нанести на график еще медиану.

In [41]:
df_normalized.median()


Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.



Index Spread                  297.000000
Voice & Accountability          0.500000
Political Stability             0.500000
Govt. Effectiveness             0.500000
Regulatory Quality              0.507937
Rule of Law                     0.500000
Control of Corruption           0.500000
Corruption percetion            0.500000
Ease of Doing Business          0.507937
Human Dev. Index                0.515625
Global Compet. Index            0.484375
Eviron. Performance Index       0.515625
Fragile States Index            0.484375
UN World Risk Index             0.515625
Freedom of the Press Index      0.484375
dtype: float64

Radarplot работает с векторами numpy. Поэтому давайте поймем, что мы будем ему отдавать.

In [42]:
df_normalized[df_normalized['Country'] == 'Slovakia'].loc[:,"Voice & Accountability":].values[0]

array([0.078125  , 0.015625  , 0.0625    , 0.07936508, 0.09375   ,
       0.1875    , 0.125     , 0.07936508, 0.046875  , 0.34375   ,
       0.046875  , 0.0625    , 0.109375  , 0.09375   ])

Экспресс-график. А ниже построим в обычном синтаксисе с медианой.

In [43]:
import plotly.express as px

fig = px.line_polar(df_normalized, r=df_normalized.loc[:,"Voice & Accountability":].values[0], 
                    line_close=True, theta = df_normalized.columns[2:] )
fig.show()


А теперь давайте вернемся к нашей неотнормированной шкале и добавим на график еще и середину (33 место).

In [44]:
import plotly
import plotly.graph_objs as go
from plotly.subplots import make_subplots


trace0 = go.Scatterpolar(
      r = df[df['Country'] == 'Slovakia'].loc[:,"Voice & Accountability":].values[0], # данные на шкалы 
      theta = df.columns[2:], # подписи шкал
      fill = 'toself', # заливка
      name = 'Slovakia'
    )
    
trace_median= go.Scatterpolar(
      r = df.loc[:,"Voice & Accountability":].median().values,
      theta = df.columns[2:],
      fill = 'none',
      name = 'Average'
    )

layout = go.Layout(
    template="plotly_dark",
    polar = dict(
    radialaxis = dict(
      visible = True,
      range = [1, 65]
    )
  ),
  showlegend = False
)

fig = go.Figure(data=[trace0, trace_median], layout=layout)
fig.show()

Контур не замкнулся. Давайте схитрим и сделаем вид, что последня колонка называется как первая и содержит те же значения и Poltly построит их друг над другом.

In [45]:
col_names = list(df.columns)[2:]

In [46]:
col_names.append('Voice & Accountability')

In [47]:
df['Voice & Accountability 2'] = df['Voice & Accountability']

In [48]:
import plotly
import plotly.graph_objs as go


trace0 = go.Scatterpolar(
      r = df[df['Country'] == 'Slovakia'].loc[:,"Voice & Accountability":].values[0], # данные на шкалы 
      theta = col_names, # подписи шкал
      fill = 'toself', # заливка
      name = 'Slovakia',
      marker = dict(color = '#aaf0d1')
    )
    
trace_median= go.Scatterpolar(
      r = df.loc[:,"Voice & Accountability":].median().values,
      theta = col_names,
      fill = 'none',
      name = 'Median',
      marker = dict(color = '#fa8072')
    )

layout = go.Layout(
    title = 'Slovakia',
    template="plotly_dark",
    polar = dict(
    radialaxis = dict(
      visible = True,
      range = [1, 65]
    )
  ),
  showlegend = False
)

fig = go.Figure(data=[trace0, trace_median], layout=layout)
fig.show()

А теперь давайте построим самостоятельно графики для всех наших стран.

In [49]:
for country in df['Country']:
    trace0 = go.Scatterpolar(
          r = df[df['Country'] == country].loc[:,"Voice & Accountability":].values[0], # данные на шкалы 
          theta = col_names, # подписи шкал
          fill = 'toself', # заливка
          name = 'Slovakia',
          marker = dict(color = '#aaf0d1')
        )

    trace_median= go.Scatterpolar(
          r = df.loc[:,"Voice & Accountability":].median().values,
          theta = col_names,
          fill = 'none',
          name = 'Median',
          marker = dict(color = '#fa8072')
        )

    layout = go.Layout(
        title = country,
        template="plotly_dark",
        polar = dict(
        radialaxis = dict(
          visible = True,
          range = [65, 1]
        )
      ),
      showlegend = False
    )
    fig = go.Figure(data=[trace0, trace_median], layout=layout)
    fig.show()