# Chap5

## 全部のColumnでどの値が何回出ているか調べる（Histogram的）

In [6]:
import pandas as pd

# Small demo frame: three integer columns whose values only partly overlap.
df = pd.DataFrame(
    {
        'a': [1, 2, 3, 4, 5],
        'b': [2, 4, 6, 8, 10],
        'c': [1, 3, 6, 9, 12],
    }
)

# Tally every distinct value per column. The result is indexed by the union
# of all values; a value missing from a column shows up as NaN there.
# The top-level pd.value_counts() is deprecated since pandas 2.1, so the
# Series method is applied to each column instead.
df.apply(pd.Series.value_counts)

      a    b    c
1   1.0  NaN  1.0
2   1.0  1.0  NaN
3   1.0  NaN  1.0
4   1.0  1.0  NaN
5   1.0  NaN  NaN
6   NaN  1.0  1.0
8   NaN  1.0  NaN
9   NaN  NaN  1.0
10  NaN  1.0  NaN
12  NaN  NaN  1.0

In [7]:
# Same per-column tally, with NaN (value absent from that column) shown as 0.
# The top-level pd.value_counts() is deprecated since pandas 2.1, so the
# Series method is applied to each column instead.
df.apply(pd.Series.value_counts).fillna(0)

      a    b    c
1   1.0  0.0  1.0
2   1.0  1.0  0.0
3   1.0  0.0  1.0
4   1.0  1.0  0.0
5   1.0  0.0  0.0
6   0.0  1.0  1.0
8   0.0  1.0  0.0
9   0.0  0.0  1.0
10  0.0  1.0  0.0
12  0.0  0.0  1.0

# Chap6

## mapの使い方

In [17]:
import pandas as pd

def test(x):
    return x

df = pd.DataFrame({
    'value': range(21),
})

df['value'].map(test)

0      0
1      1
2      2
3      3
4      4
5      5
6      6
7      7
8      8
9      9
10    10
11    11
12    12
13    13
14    14
15    15
16    16
17    17
18    18
19    19
20    20
Name: value, dtype: int64

In [18]:
import pandas as pd

def test(x):
    if x % 3 == 0:
        return 'fiz'
    elif x % 2 == 0:
        return 'buz'
    else:
        return 'fizbuz'

df = pd.DataFrame({
    'value': range(21),
})

df['value'].map(test)

0        fiz
1     fizbuz
2        buz
3        fiz
4        buz
5     fizbuz
6        fiz
7     fizbuz
8        buz
9        fiz
10       buz
11    fizbuz
12       fiz
13    fizbuz
14       buz
15       fiz
16       buz
17    fizbuz
18       fiz
19    fizbuz
20       buz
Name: value, dtype: object

## 数字をカテゴリーに分ける（Cut）

### 19 - 25, 26 - 35, 36 - 60, 61 - 100（各Binは右端を含み左端を含まない: (18, 25], (25, 35], (35, 60], (60, 100]）

In [21]:
# Sample ages to sort into buckets.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 41, 32]

# Bin edges: five edges define four consecutive intervals, each closed on
# the right, e.g. (18, 25].
bins = [18, 25, 35, 60, 100]

# Assign every age to its interval; the result is a Categorical.
cats = pd.cut(x=ages, bins=bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (35, 60], (25, 35], (60, 100], (35, 60], (25, 35]]
Length: 11
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [23]:
# Integer position of each age's bin within cats.categories.
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 1], dtype=int8)

In [26]:
# The interval bins themselves, in ascending order.
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]
              closed='right',
              dtype='interval[int64]')

In [28]:
# Look up a single bin by position; index 1 is the second interval.
cats.categories[1]

Interval(25, 35, closed='right')

In [29]:
# Count how many ages fall into each bin. The top-level pd.value_counts()
# is deprecated since pandas 2.1; wrapping the Categorical in a Series and
# calling its value_counts() method is the supported equivalent.
pd.Series(cats).value_counts()

(18, 25]     5
(25, 35]     3
(35, 60]     2
(60, 100]    1
dtype: int64

### name categories

In [30]:
# One human-readable label per bin, listed in ascending bin order.
group_names = [
    'youth',
    'young-adult',
    'middle-aged',
    'senior',
]

# Same binning as before, but each interval is shown under its label.
pd.cut(ages, bins=bins, labels=group_names)


[youth, youth, youth, young-adult, youth, ..., middle-aged, young-adult, senior, middle-aged, young-adult]
Length: 11
Categories (4, object): [youth < young-adult < middle-aged < senior]

### 勝手にBinに仕分ける方法

In [49]:
# 4 bins: passing an integer instead of explicit edges makes pd.cut split
# the data's range into that many equal-width intervals.
data = range(20)
cate = pd.cut(data, 4, precision=2)

# The top-level pd.value_counts() is deprecated since pandas 2.1; use the
# Series method to count the members of each bin.
pd.Series(cate).value_counts()

(14.25, 19.0]     5
(9.5, 14.25]      5
(4.75, 9.5]       5
(-0.019, 4.75]    5
dtype: int64

In [50]:
# 2 bins: passing an integer instead of explicit edges makes pd.cut split
# the data's range into that many equal-width intervals.
data = range(13)
cate = pd.cut(data, 2, precision=2)

# The top-level pd.value_counts() is deprecated since pandas 2.1; use the
# Series method to count the members of each bin.
pd.Series(cate).value_counts()

(-0.012, 6.0]    7
(6.0, 12.0]      6
dtype: int64

#### Cutは値の範囲を等幅のBinに分ける、Qcutは各Binの件数が同じぐらいになるように分ける

In [67]:
# qcut places the bin edges at quantiles, so even random numbers end up in
# bins of (nearly) equal size.
import numpy as np

# Fixed seed so that re-running the cell reproduces the same sample and
# therefore the same bin edges.
data = np.random.RandomState(0).randn(1000)
cate = pd.qcut(data, 4)

# The top-level pd.value_counts() is deprecated since pandas 2.1; use the
# Series method to count the members of each bin.
pd.Series(cate).value_counts()


(0.663, 2.876]      250
(0.0358, 0.663]     250
(-0.661, 0.0358]    250
(-4.013, -0.661]    250
dtype: int64

In [68]:
# cut uses equal-width bins, so with random numbers the bin sizes do not
# come out balanced.
import numpy as np

# Fixed seed so that re-running the cell reproduces the same sample and
# therefore the same bin edges.
data = np.random.RandomState(0).randn(1000)
cate = pd.cut(data, 4)

# The top-level pd.value_counts() is deprecated since pandas 2.1; use the
# Series method to count the members of each bin.
pd.Series(cate).value_counts()

(-1.376, 0.169]     456
(0.169, 1.714]      396
(-2.927, -1.376]     97
(1.714, 3.259]       51
dtype: int64