# Настройка ноутбука

In [19]:
import pandas as pd
import numpy as np
import datetime as dt

from pandasql import sqldf

In [20]:
import warnings
# Install a blanket "ignore" filter: no message, category or module is given,
# so every warning raised after this cell runs is suppressed.
# NOTE(review): presumably done to keep the cell outputs free of library
# warnings; it will also hide warnings that matter.
warnings.filterwarnings('ignore')

In [21]:
# Stretch the notebook working area to the full width of the screen by
# injecting a CSS override for the `.container` element into the page.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Описание

Сводная таблица - это отличный способ представления данных в разрезе нескольких категорий. В этом ноутбуке представлены практики построения сводных таблиц с помощью языка SQL.

# Практики

In [22]:
# Toy long-format table for the pivot examples: three consecutive days, and on
# each day one row per category A/B/C carrying the values 1/2/3.
data = pd.DataFrame({
    'date': [dt.date(2022, 8, day) for day in (1, 2, 3) for _ in range(3)],
    'cat': ['A', 'B', 'C'] * 3,
    'value': [1, 2, 3] * 3,
})

data

Unnamed: 0,date,cat,value
0,2022-08-01,A,1
1,2022-08-01,B,2
2,2022-08-01,C,3
3,2022-08-02,A,1
4,2022-08-02,B,2
5,2022-08-02,C,3
6,2022-08-03,A,1
7,2022-08-03,B,2
8,2022-08-03,C,3


## С помощью объединения подзапросов

In [23]:
# Pivot via joined sub-queries: one aggregating sub-query per category, each
# left-joined to a shared list of distinct dates.
#
# Why every join is keyed on the date spine `d`:
#   * chaining the keys (c joined on b.date) loses cat_c for any date that has
#     no category B rows, because b.date is NULL there and NULL never matches;
#   * driving the joins from the category A sub-query drops every date that
#     has no category A rows at all.
# With the spine, each date present in `data` yields exactly one row and a
# missing category simply shows up as NULL, matching the CASE-based pivot.
query = """
select
    d.date, cat_a, cat_b, cat_c
from
              (select distinct date from data) d
    left join (select date, sum(value) as cat_a from data where cat = 'A' group by date) a on d.date = a.date
    left join (select date, sum(value) as cat_b from data where cat = 'B' group by date) b on d.date = b.date
    left join (select date, sum(value) as cat_c from data where cat = 'C' group by date) c on d.date = c.date
order by d.date
"""


def pysqldf(q):
    """Run the SQL text ``q`` with pandasql, passing this module's globals as the table environment."""
    return sqldf(q, globals())


pysqldf(query)

Unnamed: 0,date,cat_a,cat_b,cat_c
0,2022-08-01,1,2,3
1,2022-08-02,1,2,3
2,2022-08-03,1,2,3


## С помощью оператора case

In [24]:
# Pivot via conditional aggregation: a single pass over `data` grouped by
# date. Each CASE expression yields `value` only for its own category (and
# NULL otherwise), so every SUM adds up exactly one category per date.
query = """
select
      date
    , sum(case when cat = 'A' then value end) as cat_a
    , sum(case when cat = 'B' then value end) as cat_b
    , sum(case when cat = 'C' then value end) as cat_c
from data
group by date
"""


def pysqldf(q):
    """Run the SQL text ``q`` with pandasql, passing this module's globals as the table environment."""
    return sqldf(q, globals())


pysqldf(query)

Unnamed: 0,date,cat_a,cat_b,cat_c
0,2022-08-01,1,2,3
1,2022-08-02,1,2,3
2,2022-08-03,1,2,3


#### Примеры задач

- [Stratascratch: Premium vs Freemium - Hard](https://platform.stratascratch.com/coding/10300-premium-vs-freemium?tabname=solutions)

# Кейсы

## Разложить многоуровневые записи в строку

### Есть индекс группы и уровень

In [47]:
# Five groups (ids 1..5), each holding three records at levels 0, 1 and 2.
# The label 'val<g><p>' encodes the group id and the 1-based position of the
# record inside its group.
data = pd.DataFrame({
    'group_id': [group for group in range(1, 6) for _ in range(3)],
    'lvl': [0, 1, 2] * 5,
    'val': [f'val{group}{pos}' for group in range(1, 6) for pos in (1, 2, 3)],
})

data

Unnamed: 0,group_id,lvl,val
0,1,0,val11
1,1,1,val12
2,1,2,val13
3,2,0,val21
4,2,1,val22
5,2,2,val23
6,3,0,val31
7,3,1,val32
8,3,2,val33
9,4,0,val41


#### С помощью объединения подзапросов

In [64]:
# One sub-query per level (0, 1, 2), each exposing `val` under a level-specific
# column name. Inner joins on group_id then place the three records of a group
# side by side on a single output row.
query = """
select
    lvl_0, lvl_1, lvl_2
from
    (select group_id, val as lvl_0 from data where lvl=0) l0
    join
    (select group_id, val as lvl_1 from data where lvl=1) l1 on l0.group_id = l1.group_id
    join
    (select group_id, val as lvl_2 from data where lvl=2) l2 on l1.group_id = l2.group_id
"""


def pysqldf(q):
    """Run the SQL text ``q`` with pandasql, passing this module's globals as the table environment."""
    return sqldf(q, globals())


pysqldf(query)

Unnamed: 0,lvl_0,lvl_1,lvl_2
0,val11,val12,val13
1,val21,val22,val23
2,val31,val32,val33
3,val41,val42,val43
4,val51,val52,val53


### Есть индекс родителя и номер уровня

In [27]:
# Five three-record chains. Record ids are 10*g + p (g = chain 1..5,
# p = position 1..3); each record points at the previous record of its chain
# through parent_id, and the first record of a chain has no parent.
data = pd.DataFrame({
    'lvl': [0, 1, 2] * 5,
    'val_id': [float(10 * g + p) for g in range(1, 6) for p in (1, 2, 3)],
    'parent_id': [
        None if p == 1 else float(10 * g + p - 1)
        for g in range(1, 6)
        for p in (1, 2, 3)
    ],
    'val': [f'val{g}{p}' for g in range(1, 6) for p in (1, 2, 3)],
})
data

Unnamed: 0,lvl,val_id,parent_id,val
0,0,11.0,,val11
1,1,12.0,11.0,val12
2,2,13.0,12.0,val13
3,0,21.0,,val21
4,1,22.0,21.0,val22
5,2,23.0,22.0,val23
6,0,31.0,,val31
7,1,32.0,31.0,val32
8,2,33.0,32.0,val33
9,0,41.0,,val41


#### С помощью объединения подзапросов

In [28]:
# One sub-query per level, selected by the `lvl` column. The levels are then
# chained parent -> child: a level-1 row joins the level-0 row whose val_id
# equals its parent_id, and a level-2 row joins its level-1 parent likewise.
query = """
select
    lvl_0, lvl_1, lvl_2
from
    (select val_id, parent_id, val as lvl_0 from data where lvl = 0) a 
    join 
    (select val_id, parent_id, val as lvl_1 from data where lvl = 1) b on a.val_id = b.parent_id
    join
    (select val_id, parent_id, val as lvl_2 from data where lvl = 2) c on b.val_id = c.parent_id
"""


def pysqldf(q):
    """Run the SQL text ``q`` with pandasql, passing this module's globals as the table environment."""
    return sqldf(q, globals())


pysqldf(query)

Unnamed: 0,lvl_0,lvl_1,lvl_2
0,val11,val12,val13
1,val21,val22,val23
2,val31,val32,val33
3,val41,val42,val43
4,val51,val52,val53


### Есть только индекс родителя

In [13]:
# Same five three-record chains as before, but without a level column: the
# only structural information is the parent_id link (missing for the first
# record of each chain). Record ids are 10*g + p, g = 1..5, p = 1..3.
data = pd.DataFrame({
    'val_id': [float(10 * g + p) for g in range(1, 6) for p in (1, 2, 3)],
    'parent_id': [
        None if p == 1 else float(10 * g + p - 1)
        for g in range(1, 6)
        for p in (1, 2, 3)
    ],
    'val': [f'val{g}{p}' for g in range(1, 6) for p in (1, 2, 3)],
})
data

Unnamed: 0,val_id,parent_id,val
0,11.0,,val11
1,12.0,11.0,val12
2,13.0,12.0,val13
3,21.0,,val21
4,22.0,21.0,val22
5,23.0,22.0,val23
6,31.0,,val31
7,32.0,31.0,val32
8,33.0,32.0,val33
9,41.0,,val41


In [33]:
# No level column is available here, so the levels are derived from the
# parent links with three CTEs:
#   lvl_0_tab - rows with no parent,
#   lvl_1_tab - rows whose parent is a row of lvl_0_tab,
#   lvl_2_tab - rows whose parent is a row of lvl_1_tab.
# The final select joins each level to the next one on val_id = parent_id so
# that one chain ends up on one output row.
query = """
with

lvl_0_tab as (select val_id, parent_id, val from data where parent_id is null),
lvl_1_tab as (select val_id, parent_id, val from data where parent_id in (select val_id from lvl_0_tab)),
lvl_2_tab as (select val_id, parent_id, val from data where parent_id in (select val_id from lvl_1_tab))

select 
      l0.val as lvl_0
    , l1.val as lvl_1
    , l2.val as lvl_2
from
    lvl_0_tab l0
    join 
    lvl_1_tab l1 on l0.val_id = l1.parent_id
    join
    lvl_2_tab l2 on l1.val_id = l2.parent_id
"""


def pysqldf(q):
    """Run the SQL text ``q`` with pandasql, passing this module's globals as the table environment."""
    return sqldf(q, globals())


pysqldf(query)

Unnamed: 0,lvl_0,lvl_1,lvl_2
0,val11,val12,val13
1,val21,val22,val23
2,val31,val32,val33
3,val41,val42,val43
4,val51,val52,val53
