In [1]:
import numpy as np
import pandas as pd
import datetime as dt

# Создание сводных таблиц

- Статья в документации pandas, посвященная изменению формы таблиц - https://pandas.pydata.org/docs/user_guide/reshaping.html
- Хороший туториал по этой теме - https://towardsdatascience.com/pandas-pivot-the-ultimate-guide-5c693e0771f3

In [2]:
df = pd.DataFrame({
    'datetime': pd.date_range('1/1/2000', periods=9, freq='T'),
    'oper': ['A','B', 'A','C','C','C','A','A','B'],
    'reaction': [1,2,1,3,2,3,2,1,2],
    'chat_id': ['1000','1001','1002','1003','1004','1005','1006','1007','1008']
})

opers = df['oper'].unique()

df

Unnamed: 0,datetime,oper,reaction,chat_id
0,2000-01-01 00:00:00,A,1,1000
1,2000-01-01 00:01:00,B,2,1001
2,2000-01-01 00:02:00,A,1,1002
3,2000-01-01 00:03:00,C,3,1003
4,2000-01-01 00:04:00,C,2,1004
5,2000-01-01 00:05:00,C,3,1005
6,2000-01-01 00:06:00,A,2,1006
7,2000-01-01 00:07:00,A,1,1007
8,2000-01-01 00:08:00,B,2,1008


In [3]:
# Продублируем последнюю строку
wdf = df.append(
    pd.DataFrame({
        'datetime': [dt.datetime(2000,1,1,0,8,0)], 
        'oper': ['B'], 
        'reaction': [2],
        'chat_id': ['1009']
    })
)
wdf

Unnamed: 0,datetime,oper,reaction,chat_id
0,2000-01-01 00:00:00,A,1,1000
1,2000-01-01 00:01:00,B,2,1001
2,2000-01-01 00:02:00,A,1,1002
3,2000-01-01 00:03:00,C,3,1003
4,2000-01-01 00:04:00,C,2,1004
5,2000-01-01 00:05:00,C,3,1005
6,2000-01-01 00:06:00,A,2,1006
7,2000-01-01 00:07:00,A,1,1007
8,2000-01-01 00:08:00,B,2,1008
0,2000-01-01 00:08:00,B,2,1009


## Метод pivot - https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.pivot.html

In [4]:
df1 = df.set_index('datetime')
df1 = df1.pivot(columns='oper', values='reaction')
df1

oper,A,B,C
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-01-01 00:00:00,1.0,,
2000-01-01 00:01:00,,2.0,
2000-01-01 00:02:00,1.0,,
2000-01-01 00:03:00,,,3.0
2000-01-01 00:04:00,,,2.0
2000-01-01 00:05:00,,,3.0
2000-01-01 00:06:00,2.0,,
2000-01-01 00:07:00,1.0,,
2000-01-01 00:08:00,,2.0,


#### В принципе pivot делает то же, что и эта команда
<img src="data/img/reshaping_unstack.png" width="500">

In [5]:
df.set_index(["datetime", "oper"])["reaction"].unstack()

oper,A,B,C
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-01-01 00:00:00,1.0,,
2000-01-01 00:01:00,,2.0,
2000-01-01 00:02:00,1.0,,
2000-01-01 00:03:00,,,3.0
2000-01-01 00:04:00,,,2.0
2000-01-01 00:05:00,,,3.0
2000-01-01 00:06:00,2.0,,
2000-01-01 00:07:00,1.0,,
2000-01-01 00:08:00,,2.0,


##### А вот если в индексах или колонках есть дубликаты - pivot ломается

In [6]:
df1 = wdf.set_index('datetime')
df1 = df1.pivot(columns='oper', values='reaction')
df1

ValueError: Index contains duplicate entries, cannot reshape

##### Есть вариант получше

## Метод pivot_table - https://pandas.pydata.org/docs/reference/api/pandas.pivot_table.html

In [7]:
df2 = wdf.set_index('datetime')
df2 = df2.pivot_table(index='datetime', columns='oper', values='chat_id', aggfunc='count')
df2

oper,A,B,C
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-01-01 00:00:00,1.0,,
2000-01-01 00:01:00,,1.0,
2000-01-01 00:02:00,1.0,,
2000-01-01 00:03:00,,,1.0
2000-01-01 00:04:00,,,1.0
2000-01-01 00:05:00,,,1.0
2000-01-01 00:06:00,1.0,,
2000-01-01 00:07:00,1.0,,
2000-01-01 00:08:00,,2.0,


##### Механизм работы pivot_table больше напоминает эту команду.

In [8]:
wdf.groupby(['datetime', 'oper']).reaction.sum().unstack()

oper,A,B,C
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-01-01 00:00:00,1.0,,
2000-01-01 00:01:00,,2.0,
2000-01-01 00:02:00,1.0,,
2000-01-01 00:03:00,,,3.0
2000-01-01 00:04:00,,,2.0
2000-01-01 00:05:00,,,3.0
2000-01-01 00:06:00,2.0,,
2000-01-01 00:07:00,1.0,,
2000-01-01 00:08:00,,4.0,


##### Здесь агрегирующая функция - это sum. Но в принципе мы можем применить любую функцию

#### С помощью метода stack можно свернуть сводную таблицу
<img src="data/img/reshaping_stack.png" width="500">

In [9]:
df2.stack()

datetime             oper
2000-01-01 00:00:00  A       1.0
2000-01-01 00:01:00  B       1.0
2000-01-01 00:02:00  A       1.0
2000-01-01 00:03:00  C       1.0
2000-01-01 00:04:00  C       1.0
2000-01-01 00:05:00  C       1.0
2000-01-01 00:06:00  A       1.0
2000-01-01 00:07:00  A       1.0
2000-01-01 00:08:00  B       2.0
dtype: float64

## Как посчитать записи по заданным временным периодам? - метод resample

In [10]:
tmp_count = df2.resample('3min').count()
tmp_count

oper,A,B,C
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-01-01 00:00:00,2,1,0
2000-01-01 00:03:00,0,0,3
2000-01-01 00:06:00,2,1,0


#### Чтобы превести индексацию к привычному виду, достаточно применить метод .reset_index()

In [11]:
tmp_count = tmp_count.reset_index()
tmp_count

oper,datetime,A,B,C
0,2000-01-01 00:00:00,2,1,0
1,2000-01-01 00:03:00,0,0,3
2,2000-01-01 00:06:00,2,1,0


#### Правда в этом случае сохранится название индекса. Это можно исправить

In [12]:
tmp_count = tmp_count.rename_axis(None).rename_axis(None, axis=1)
tmp_count

Unnamed: 0,datetime,A,B,C
0,2000-01-01 00:00:00,2,1,0
1,2000-01-01 00:03:00,0,0,3
2,2000-01-01 00:06:00,2,1,0


In [13]:
tmp_count['date'] = tmp_count.datetime.dt.date
tmp_count['time'] = tmp_count.datetime.dt.strftime('%H:%M')
tmp_count['all'] = tmp_count[opers].sum(axis=1)

tmp_count = tmp_count[['date','time','all', *opers]].fillna(0)

tmp_count

Unnamed: 0,date,time,all,A,B,C
0,2000-01-01,00:00,3,2,1,0
1,2000-01-01,00:03,3,0,0,3
2,2000-01-01,00:06,3,2,1,0


### К счетным таблицам можно добавить heatmap

In [14]:
tmp_count = (
    tmp_count.style
             .format({col: "{}" for col in tmp_count.columns})
             .background_gradient(
                 subset=opers, # По каким колонкам считать heatmap 
                 axis=1, # 0 - считать по колонкам, 1 - считать по строкам
                 cmap='Greys' # выбор цвета
             )
)

tmp_count

Unnamed: 0,date,time,all,A,B,C
0,2000-01-01,00:00,3,2,1,0
1,2000-01-01,00:03,3,0,0,3
2,2000-01-01,00:06,3,2,1,0
