# 分类数据

- 对象创建
    - Series创建
    - DataFrame创建
    - 控制行为
    - 恢复原始数据
- CategoricalDtype
    - 相等语义
- 描述
- 类别处理操作
    - 重命名类别
    - 添加新类别
    - 移除类别
    - 删除未使用的类别
    - 设置类别
- 排序和设置顺序
    - 重排序
    - 多列排序
- 比较
- 操作
- 数据处理
    - 获得
    - 字符串和日期时间访问器
    - 设置
    - 合并
    - 联合
    - 级联
- 输入/输出数据
- 缺失数据
- 与R语言factor的差异
- 性能优化
    - 内存使用
    - Categorical不是numpy数组
    - apply中的dtype
    - 分类索引
    - 副作用

In [2]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

## 1 对象创建

### 1.1 Series创建

In [4]:
s = pd.Series(["a","b","c","a"], dtype="category")
s

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): [a, b, c]

In [5]:
df = pd.DataFrame({"A":["a","b","c","a"]})
df["B"] = df["A"].astype('category')
df

Unnamed: 0,A,B
0,a,a
1,b,b
2,c,c
3,a,a


In [8]:
# 20 random integers drawn from [0, 100).
df = pd.DataFrame({'value': np.random.randint(0, 100, 20)})
# One label per 10-wide bin: "0 - 9", "10 - 19", ..., "90 - 99".
labels = [f"{i} - {i+9}" for i in range(0, 100, 10)]
# Bin edges are 0, 10, ..., 100. right=False makes each bin closed on the left
# and open on the right, so e.g. 50 is labelled "50 - 59" rather than "40 - 49".
df['group'] = pd.cut(df.value, range(0, 105, 10), right=False, labels=labels)
df.head(10)

Unnamed: 0,value,group
0,11,10 - 19
1,56,50 - 59
2,35,30 - 39
3,71,70 - 79
4,23,20 - 29
5,59,50 - 59
6,50,50 - 59
7,7,0 - 9
8,68,60 - 69
9,80,80 - 89


- `cut`

In [9]:
pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3)

[(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], (0.994, 3.0]]
Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] < (5.0, 7.0]]

In [10]:
pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True)

([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], (0.994, 3.0]]
 Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] < (5.0, 7.0]],
 array([0.994, 3.   , 5.   , 7.   ]))

In [11]:
pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, labels=["bad", "medium", "good"])

[bad, good, medium, medium, good, bad]
Categories (3, object): [bad < medium < good]

In [14]:
s = pd.Series(np.array([2, 4, 6, 8, 10]), index=['a', 'b', 'c', 'd', 'e'])
pd.cut(s, 3)

a    (1.992, 4.667]
b    (1.992, 4.667]
c    (4.667, 7.333]
d     (7.333, 10.0]
e     (7.333, 10.0]
dtype: category
Categories (3, interval[float64]): [(1.992, 4.667] < (4.667, 7.333] < (7.333, 10.0]]

In [16]:
s = pd.Series(np.array([2, 4, 6, 8, 10]), index=['a', 'b', 'c', 'd', 'e'])
# labels=False returns the integer position of each value's bin instead of an
# interval label; retbins=True additionally returns the bin edges.
# With right=False the bins are left-closed, so the value 10 falls outside the
# last bin [8, 10) and comes back as NaN.
pd.cut(s, [0, 2, 4, 6, 8, 10], labels=False, retbins=True, right=False)

(a    1.0
 b    2.0
 c    3.0
 d    4.0
 e    NaN
 dtype: float64, array([ 0,  2,  4,  6,  8, 10]))

In [18]:
# Reuses `s` from the previous cell. The edge 10 is listed twice;
# duplicates='drop' discards the repeated edge, so the returned bins are
# [0, 2, 4, 6, 10].
pd.cut(s, [0, 2, 4, 6, 10, 10], labels=False, retbins=True, right=False, duplicates='drop')

(a    1.0
 b    2.0
 c    3.0
 d    3.0
 e    NaN
 dtype: float64, array([ 0,  2,  4,  6, 10], dtype=int64))

In [19]:
# Bins can be supplied as an IntervalIndex. The intervals need not cover the
# whole range: values that fall in no interval (here 0 and 1.5) become NaN.
bins = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)])
pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins)

[NaN, (0, 1], NaN, (2, 3], (4, 5]]
Categories (3, interval[int64]): [(0, 1] < (2, 3] < (4, 5]]

- `pd.Categorical`

In [20]:
# With an explicit `categories` list, values that are not listed ("a") become
# NaN, while "d" is kept as a category even though it never occurs in the data.
raw_cat = pd.Categorical(["a","b","c","a"], categories=["b","c","d"], ordered=False)
s = pd.Series(raw_cat)
s

0    NaN
1      b
2      c
3    NaN
dtype: category
Categories (3, object): [b, c, d]

In [22]:
df = pd.DataFrame({"A":["a","b","c","a"]})
df["B"] = raw_cat

display(
    df,
    df.dtypes
)

Unnamed: 0,A,B
0,a,
1,b,b
2,c,c
3,a,


A      object
B    category
dtype: object

### 1.2 DataFrame创建

In [26]:
df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')}, dtype="category")

display(
    df.dtypes,
    df['A'],
    df['B']
)

A    category
B    category
dtype: object

0    a
1    b
2    c
3    a
Name: A, dtype: category
Categories (3, object): [a, b, c]

0    b
1    c
2    c
3    d
Name: B, dtype: category
Categories (3, object): [b, c, d]

In [27]:
df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')})
df_cat = df.astype('category')

display(
    df_cat.dtypes,
    df_cat['A']
)

A    category
B    category
dtype: object

0    a
1    b
2    c
3    a
Name: A, dtype: category
Categories (3, object): [a, b, c]

### 1.3 控制行为

- 默认行为：
    - 类别是从数据中推断出来的。
    - 类别是无序的。

In [28]:
from pandas.api.types import CategoricalDtype

s = pd.Series(["a", "b", "c", "a"])
cat_type = CategoricalDtype(categories=["b", "c", "d"], ordered=True)
s_cat = s.astype(cat_type)
s_cat

0    NaN
1      b
2      c
3    NaN
dtype: category
Categories (3, object): [b < c < d]

In [29]:
df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')})
cat_type = CategoricalDtype(categories=list('abcd'), ordered=True)
df_cat = df.astype(cat_type)
df_cat['A']

0    a
1    b
2    c
3    a
Name: A, dtype: category
Categories (4, object): [a < b < c < d]

In [36]:
# Draw 5 integer codes from {0, 1} with probabilities 0.3 and 0.7.
splitter = np.random.choice([0,1], 5, p=[0.3,0.7])
# from_codes treats each code as a position in `categories`:
# 0 -> "train", 1 -> "test".
s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"]))
s

0     test
1    train
2    train
3     test
4     test
dtype: category
Categories (2, object): [train, test]

In [37]:
display(splitter)
pd.Categorical.from_codes(splitter, categories=["train", "test"])

array([1, 0, 0, 1, 1])

[test, train, train, test, test]
Categories (2, object): [train, test]

### 1.4 恢复原始数据

- 要恢复为原始的 `Series` 或 NumPy 数组，可以使用：
    - `Series.astype(original_dtype)`
    - `np.asarray(categorical)`

In [40]:
s = pd.Series(["a","b","c","a"])
s2 = s.astype('category')

display(
    s2,
    s2.astype(str),
    np.asarray(s2)
)

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): [a, b, c]

0    a
1    b
2    c
3    a
dtype: object

array(['a', 'b', 'c', 'a'], dtype=object)

## 2 CategoricalDtype

In [41]:
from pandas.api.types import CategoricalDtype

display(
    CategoricalDtype(['a', 'b', 'c']),
    CategoricalDtype(['a', 'b', 'c'], ordered=True),
    CategoricalDtype()
)

CategoricalDtype(categories=['a', 'b', 'c'], ordered=None)

CategoricalDtype(categories=['a', 'b', 'c'], ordered=True)

CategoricalDtype(categories=None, ordered=None)

In [43]:
CategoricalDtype(['a', 'b', 'c'])

CategoricalDtype(categories=['a', 'b', 'c'], ordered=None)

### 2.1 相等语义

In [46]:
c1 = CategoricalDtype(['a', 'b', 'c'], ordered=False)

display(
    c1 == CategoricalDtype(['b', 'c', 'a'], ordered=False),
    c1 == CategoricalDtype(['a',  'b', 'c'], ordered=True),
    c1 == 'category',
)

True

False

True

## 3 描述

In [48]:
cat = pd.Categorical(["a", "c", "c", np.nan], categories=["b", "a", "c"])
df = pd.DataFrame({"cat":cat, "s":["a", "c", "c", np.nan]})

display(
    df.describe(),
    df["cat"].describe(),
)

Unnamed: 0,cat,s
count,3,3
unique,2,2
top,c,c
freq,2,2


count     3
unique    2
top       c
freq      2
Name: cat, dtype: object

## 4 类别处理操作

- 类别：`s.cat.categories`
- 有序属性：`s.cat.ordered`

In [52]:
s = pd.Series(["a","b","c","a"], dtype="category")

display(
    s.cat.categories,
    s.cat.ordered
)

Index(['a', 'b', 'c'], dtype='object')

False

In [53]:
s = pd.Series(pd.Categorical(["a","b","c","a"], categories=["c","b","a"]))

display(
    s.cat.categories,
    s.cat.ordered
)

Index(['c', 'b', 'a'], dtype='object')

False

In [55]:
s = pd.Series(list('babc')).astype(CategoricalDtype(list('abcd')))

display(
    s,
    s.cat.categories,
    s.unique()
)

0    b
1    a
2    b
3    c
dtype: category
Categories (4, object): [a, b, c, d]

Index(['a', 'b', 'c', 'd'], dtype='object')

[b, a, c]
Categories (3, object): [b, a, c]

### 4.1 重命名类别

In [59]:
# Start from a categorical Series whose categories (a, b, c) are inferred.
s = pd.Series(["a","b","c","a"], dtype="category")
display(s)

# Rename through rename_categories() instead of assigning to
# `s.cat.categories`: that setter was deprecated and is no longer available
# in pandas 2.x. The resulting Series is the same.
s = s.cat.rename_categories([f"Group {g}" for g in s.cat.categories])
display(s)

# A list-like renames the categories positionally ...
s = s.cat.rename_categories([1,2,3])
display(s)

# ... while a dict maps each old category to its new name.
s = s.cat.rename_categories({1: 'x', 2: 'y', 3: 'z'})
display(s)

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): [a, b, c]

0    Group a
1    Group b
2    Group c
3    Group a
dtype: category
Categories (3, object): [Group a, Group b, Group c]

0    1
1    2
2    3
3    1
dtype: category
Categories (3, int64): [1, 2, 3]

0    x
1    y
2    z
3    x
dtype: category
Categories (3, object): [x, y, z]

In [60]:
# Categories must be unique, otherwise a ValueError is raised.
# rename_categories() is used instead of assigning to `s.cat.categories`:
# the setter is no longer available in pandas 2.x, where the assignment would
# raise AttributeError and slip past this handler.
try:
    s.cat.rename_categories([1,1,1])
except ValueError as e:
    print(f"ValueError: {e}")

ValueError: Categorical categories must be unique


### 4.2 添加新类别

In [61]:
s = s.cat.add_categories([4])

display(
    s.cat.categories, 
    s
)

Index(['x', 'y', 'z', 4], dtype='object')

0    x
1    y
2    z
3    x
dtype: category
Categories (4, object): [x, y, z, 4]

### 4.3 移除类别

In [62]:
s = s.cat.remove_categories([4])

display(
    s.cat.categories, 
    s
)

Index(['x', 'y', 'z'], dtype='object')

0    x
1    y
2    z
3    x
dtype: category
Categories (3, object): [x, y, z]

### 4.4 删除未使用的类别

In [64]:
s = pd.Series(pd.Categorical(["a","b","a"], categories=["a","b","c","d"]))


display(
    s,
    s.cat.remove_unused_categories()
)

0    a
1    b
2    a
dtype: category
Categories (4, object): [a, b, c, d]

0    a
1    b
2    a
dtype: category
Categories (2, object): [a, b]

### 4.5 设置类别

In [65]:
s = pd.Series(["one","two","four", "-"], dtype="category")
display(s)

s = s.cat.set_categories(["one","two","three","four"])
display(s)

0     one
1     two
2    four
3       -
dtype: category
Categories (4, object): [-, four, one, two]

0     one
1     two
2    four
3     NaN
dtype: category
Categories (4, object): [one, two, three, four]

## 5 排序和设置顺序

In [69]:
s = pd.Series(pd.Categorical(["a","b","c","a"], ordered=False))
s.sort_values(inplace=True)
display(s)

s = pd.Series(["a","b","c","a"]).astype(CategoricalDtype(ordered=True))
s.sort_values(inplace=True)
display(s)

display(s.min(), s.max())

0    a
3    a
1    b
2    c
dtype: category
Categories (3, object): [a, b, c]

0    a
3    a
1    b
2    c
dtype: category
Categories (3, object): [a < b < c]

'a'

'c'

In [72]:
s = pd.Series([1,2,3,1], dtype="category")
# ordered=True makes the listed order the category order: 2 < 3 < 1.
s = s.cat.set_categories([2,3,1], ordered=True)
display(s)

# Sorting follows the category order, not the numeric order of the values.
s.sort_values(inplace=True)
display(s)

# min/max also follow the category order: min is 2, max is 1.
display(s.min(), s.max())

0    1
1    2
2    3
3    1
dtype: category
Categories (3, int64): [2 < 3 < 1]

1    2
2    3
0    1
3    1
dtype: category
Categories (3, int64): [2 < 3 < 1]

2

1

### 5.1 重排序

In [73]:
s = pd.Series([1,2,3,1], dtype="category")
# Reorder the existing categories to 2 < 3 < 1 and mark them as ordered.
s = s.cat.reorder_categories([2,3,1], ordered=True)
display(s)

# Sorting follows the category order, not the numeric order of the values.
s.sort_values(inplace=True)
display(s)

# min/max also follow the category order: min is 2, max is 1.
display(s.min(), s.max())

0    1
1    2
2    3
3    1
dtype: category
Categories (3, int64): [2 < 3 < 1]

1    2
2    3
0    1
3    1
dtype: category
Categories (3, int64): [2 < 3 < 1]

2

1

### 5.2 多列排序

In [74]:
dfs = pd.DataFrame({'A' : pd.Categorical(list('bbeebbaa'), categories=['e','a','b'], ordered=True),
                    'B' : [1,2,1,2,2,1,2,1] })
dfs.sort_values(by=['A', 'B'])

Unnamed: 0,A,B
2,e,1
3,e,2
7,a,1
6,a,2
0,b,1
5,b,1
1,b,2
4,b,2


In [75]:
dfs['A'] = dfs['A'].cat.reorder_categories(['a','b','e'])
dfs.sort_values(by=['A','B'])

Unnamed: 0,A,B
7,a,1
6,a,2
0,b,1
5,b,1
1,b,2
4,b,2
2,e,1
3,e,2


## 6 比较

- 在以下三种情况下，可以将分类数据与其他对象进行比较：
     - `==` 与 `!=`：与长度相同的类列表对象（列表、Series、数组等）比较是否相等；
     - `==`、`!=`、`>`、`>=`、`<`、`<=`：与另一个分类 Series 比较，要求 `ordered==True` 且两者的类别相同；
     - 上述所有比较运算符：与标量进行比较。
 

## 7 操作

## 8 数据处理

### 8.1 获得

### 8.2 字符串和日期时间访问器

### 8.3 设置

### 8.4 合并

### 8.5 联合

### 8.6 级联

## 9 输入/输出数据

## 10 缺失数据

## 11 与R语言factor的差异

## 12 性能优化

### 12.1 内存使用

### 12.2 Categorical不是numpy数组

### 12.3 apply中的dtype

### 12.4 分类索引

### 12.5 副作用