In [41]:
import pandas as pd
import numpy as np
# Draw 20 random integers in [0, 100) and bucket them into ten-wide bins.
# No seed is set, so the sampled values differ from run to run.
df = pd.DataFrame({'value': np.random.randint(0, 100, 20)})

# One label per bin: "0 - 9", "10 - 19", ..., "90 - 99".
labels = []
for lower in range(0, 100, 10):
    labels.append("{0} - {1}".format(lower, lower + 9))

# right=False makes each bin closed on the left and open on the right,
# so the edges 0, 10, ..., 100 line up with the labels built above.
df['group'] = pd.cut(
    df.value,
    bins=range(0, 105, 10),
    right=False,
    labels=labels,
)
df.head(10)

Unnamed: 0,value,group
0,40,40 - 49
1,62,60 - 69
2,17,10 - 19
3,86,80 - 89
4,66,60 - 69
5,57,50 - 59
6,87,80 - 89
7,43,40 - 49
8,50,50 - 59
9,0,0 - 9


In [42]:
# Build an unordered Categorical whose category list omits "a".
raw_cat = pd.Categorical(
    ["a", "b", "c", "a"],
    categories=["b", "c", "d"],
    ordered=False,
)
s = pd.Series(raw_cat)
# "a" does not belong to any declared category, so it shows up as NaN.
s

0    NaN
1      b
2      c
3    NaN
dtype: category
Categories (3, object): [b, c, d]

In [43]:
from pandas.api.types import CategoricalDtype
# Cast a plain string Series to an ordered categorical dtype. "a" is not one
# of the declared categories, so it becomes missing after the cast.
s = pd.Series(["a", "b", "c", "a"])
cat_type = CategoricalDtype(ordered=True, categories=["b", "c", "d"])
s_cat = s.astype(dtype=cat_type)
s_cat.dtype  # evaluated only; a cell displays just its last expression
# Converting back to strings renders the missing entries as "nan" in the
# recorded output.
s_cat.astype(str)
# np.asarray(s_cat)  # would convert back to an object array

0    nan
1      b
2      c
3    nan
dtype: object

In [44]:
#splitter = np.random.choice([0,1], 5, p=[0.5,0.5])
#s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"]))

In [45]:
## Comparing categorical dtypes
# Two CategoricalDtype instances are compared here with identical category
# lists and the same `ordered` flag; the recorded result is True.
c1 = CategoricalDtype(categories=['a', 'b', 'c'], ordered=False)
c1 == CategoricalDtype(categories=['a', 'b', 'c'], ordered=False)

True

In [46]:
# Compare the dtype instance against the plain string 'category';
# the recorded result for this cell is True.
c1 == 'category'

True

In [47]:
## Renaming categories
s = pd.Series(["a", "b", "c", "a"], dtype="category")
# rename_categories returns a new Series. Direct assignment to
# `s.cat.categories` was deprecated in pandas 1.5 and removed in 2.0,
# so the result is rebound to `s` instead.
s = s.cat.rename_categories(["Group %s" % g for g in s.cat.categories])

## Adding a new category, then removing it again
s = s.cat.add_categories([4])
s = s.cat.remove_categories([4])

## set_categories does both at once: it replaces the whole category list.
# None of the current "Group ..." categories is in the new list, so every
# value in `s` becomes NaN after this call.
s = s.cat.set_categories(["one", "two", "three", "four"])

In [48]:
# Unioning categories
from pandas.api.types import union_categoricals
# Two categoricals with overlapping but different category sets.
a = pd.Categorical(["b", "c"])
b = pd.Categorical(["a", "b"])
# Concatenate them into one categorical; sort_categories=True sorts the
# combined categories lexically (a, b, c in the recorded output).
union_categoricals(to_union=[a, b], sort_categories=True)

[b, c, a, b]
Categories (3, object): [a, b, c]

In [49]:
# Important: sorting (and min/max) on an ordered categorical follows the
# order in which the categories were declared, not the natural order of
# the underlying values.
s = (
    pd.Series([1, 2, 3, 1], dtype="category")
    .cat.set_categories([2, 3, 1], ordered=True)
    .sort_values()
)
# With categories ordered 2 < 3 < 1, the minimum is 2 and the maximum is 1.
(s.min(), s.max())

(2, 1)

In [50]:
#s.cat.reorder_categories(['b','a','c'], ordered=True).sort_values()

In [51]:
# A plain string Series and a categorical version of the same data,
# used by the following cell to try the .str accessor.
str_s = pd.Series(["a", "a", "b", "b"])
str_cat = str_s.astype(dtype='category')
# Display the original (non-categorical) Series.
str_s

0    a
1    a
2    b
3    b
dtype: object

In [52]:
# Apply a .str method to the categorical Series; the recorded result is a
# plain bool Series, not a categorical.
str_cat.str.contains("a")

0     True
1     True
2    False
3    False
dtype: bool

In [53]:
# Five consecutive days starting 2015-01-01, as a datetime Series and as a
# categorical built from it.
date_s = pd.Series(pd.date_range(start='1/1/2015', periods=5))
date_cat = date_s.astype(dtype='category')
# Use the .dt accessor on the categorical Series; the recorded result is
# an object Series of dates.
date_cat.dt.date

0    2015-01-01
1    2015-01-02
2    2015-01-03
3    2015-01-04
4    2015-01-05
dtype: object

In [54]:
from io import StringIO
# Build a categorical Series whose inferred categories are a, b, d.
s = pd.Series(pd.Categorical(['a', 'b', 'b', 'a', 'a', 'd']))
# Relabel the three categories. rename_categories is used because direct
# assignment to `s.cat.categories` was deprecated in pandas 1.5 and
# removed in 2.0.
s = s.cat.rename_categories(["very good", "good", "bad"])
# Widen the category list to the full five-level scale.
s = s.cat.set_categories(["very bad", "bad", "medium", "good", "very good"])
df = pd.DataFrame({"cats": s, "vals": [1, 2, 3, 4, 5, 6]})

# Round-trip the frame through CSV text held in memory.
csv = StringIO()
df.to_csv(csv)
df2 = pd.read_csv(StringIO(csv.getvalue()))
# The recorded dtypes show that the categorical dtype is lost ("cats" comes
# back as object) and that the written index reappears as "Unnamed: 0".
df2.dtypes

Unnamed: 0     int64
cats          object
vals           int64
dtype: object

In [55]:
# Inspect the column after the CSV round trip: the values are intact, but
# the recorded dtype is object rather than category.
df2["cats"]

0    very good
1         good
2         good
3    very good
4    very good
5          bad
Name: cats, dtype: object

In [56]:
# Restore the categorical dtype and the full five-level category list that
# were lost in the CSV round trip.
# The result is assigned back explicitly: the `inplace` argument of
# set_categories was deprecated in pandas 1.3 and removed in 2.0, and an
# explicit assignment also makes this cell safe to re-run.
df2["cats"] = (
    df2["cats"]
    .astype("category")
    .cat.set_categories(["very bad", "bad", "medium", "good", "very good"])
)
df2["cats"]

0    very good
1         good
2         good
3    very good
4    very good
5          bad
Name: cats, dtype: category
Categories (5, object): [very bad, bad, medium, good, very good]

In [57]:
# fillna() on categorical data raises an error when the fill value is not
# one of the categories, so the fill value is added as a category first.
# A small sample Series with one missing entry is built here so that the
# cell runs on a fresh kernel.
g = pd.Series(["a", "b", None, "a"], dtype="category")
g = g.cat.add_categories([1])
g.fillna(1)
# To check if a Series contains Categorical data, use hasattr(s, 'cat'):
hasattr(pd.Series(['a'], dtype='category'), 'cat')

True

In [58]:
## Side effects: the Series below is built from `cat` without copy=True.
## The `cat` displayed at the end of this cell reflects both modifications
## made through the Series and the DataFrame (recorded output: [5, 5, 3, 5]).
cat = pd.Categorical([1,2,3,10], categories=[1,2,3,4,10])
s = pd.Series(cat, name="cat")
# Overwrite the first two values with the existing category 10.
s.iloc[0:2] = 10
df = pd.DataFrame(s)
# Relabel the five categories through the DataFrame column.
# NOTE(review): assigning to .cat.categories is believed to be deprecated in
# pandas 1.5 and removed in 2.0, and whether `cat` is shared with the Series
# may depend on the pandas version -- confirm which version this targets.
df["cat"].cat.categories = [1,2,3,4,5]
cat

[5, 5, 3, 5]
Categories (5, int64): [1, 2, 3, 4, 5]

In [60]:
## To prevent modifying the original Categorical, pass copy=True when
## building the Series. The same steps as in the side-effects cell are
## repeated, and the recorded output shows `cat` unchanged ([1, 2, 3, 10]).
cat = pd.Categorical([1,2,3,10], categories=[1,2,3,4,10])
s = pd.Series(cat, name="cat", copy=True)
# Overwrite the first two values of the Series with the existing category 10.
s.iloc[0:2] = 10
df = pd.DataFrame(s)
# Relabel the categories through the DataFrame column.
# NOTE(review): assigning to .cat.categories is believed to be deprecated in
# pandas 1.5 and removed in 2.0 -- confirm which version this targets.
df["cat"].cat.categories = [1,2,3,4,5]
cat

[1, 2, 3, 10]
Categories (5, int64): [1, 2, 3, 4, 10]