In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import pandas as pd

# use seaborn plotting defaults
import seaborn as sns; sns.set()

In [2]:
# Eight fruit labels: the four-item pattern repeated twice.
values = pd.Series(2 * ['apple', 'orange', 'apple', 'apple'])

In [3]:
# Share of each distinct value: tally once, then divide by the grand total.
values.value_counts().pipe(lambda counts: counts / counts.sum())

apple     0.75
orange    0.25
Name: count, dtype: float64

In [4]:
# Integer pattern repeated twice (rebinds `values`), plus a two-label Series.
values = pd.Series(2 * [0, 1, 0, 0])
dim = pd.Series(['peppa', 'pig'])

In [7]:
# Plain Python list of eight fruit labels (four-item pattern, doubled).
fruits = 2 * ['apple', 'orange', 'apple', 'apple']

In [8]:
# Seeded generator so every random draw taken from it is reproducible.
rng = np.random.default_rng(27)
# Row count used when building per-basket columns: one per fruit label.
N = len(fruits)

In [9]:
# One row per basket.  The keys are written in the final column order, so no
# separate `columns=` argument is needed.  `count` is still drawn from `rng`
# before `weight`, which keeps the random stream identical.
df = pd.DataFrame({
    'basket_id': np.arange(N),
    'fruit': fruits,
    'count': rng.integers(3, 15, size=N),
    'weight': rng.uniform(0, 4, size=N),
})

df

Unnamed: 0,basket_id,fruit,count,weight
0,0,apple,3,3.724847
1,1,orange,11,3.158669
2,2,apple,5,0.040076
3,3,apple,6,0.795733
4,4,apple,14,1.172455
5,5,orange,4,3.773663
6,6,apple,10,1.613748
7,7,apple,6,0.727264


In [10]:
# Categorical copy of the fruit column; `df` itself is left untouched here.
fruit_cat = df.loc[:, 'fruit'].astype(dtype='category')
fruit_cat

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): ['apple', 'orange']

In [11]:
# Grab the array backing the categorical Series; its repr (below) lists the
# values together with the two categories.
c = fruit_cat.array
c

['apple', 'orange', 'apple', 'apple', 'apple', 'orange', 'apple', 'apple']
Categories (2, object): ['apple', 'orange']

In [12]:
# Integer code stored for each element of `c` (an int8 array in the output).
c.codes

array([0, 1, 0, 0, 0, 1, 0, 0], dtype=int8)

In [13]:
# Lookup table from integer code to its category label.
{code: label for code, label in enumerate(c.categories)}

{0: 'apple', 1: 'orange'}

In [18]:
# Overwrite the 'fruit' column of `df` with its categorical version.
# NOTE: this mutates the DataFrame built in an earlier cell.
df['fruit'] = df['fruit'].astype('category')
df['fruit']

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): ['apple', 'orange']

In [21]:
# Integer codes, and the labels those codes index into.
codes = [0, 1, 2, 0, 0, 1]
categories = ['foo', 'bar', 'baz']

# Assemble a categorical directly from codes + labels; ordered=True gives the
# categories an ordering (foo < bar < baz in the repr).
pd.Categorical.from_codes(codes=codes, categories=categories, ordered=True)

['foo', 'bar', 'baz', 'foo', 'foo', 'bar']
Categories (3, object): ['foo' < 'bar' < 'baz']

In [22]:
# 1000 samples from the standard normal distribution; preview the first five.
draws = rng.standard_normal(size=1000)
draws[:5]

array([-1.83985714, -0.14555697,  0.39827897, -2.16206127,  0.01447731])

In [25]:
# Split the draws into four quantile-based bins, labelled q1..q4.
bins = pd.qcut(
    x=draws,
    q=4,
    labels=['q1', 'q2', 'q3', 'q4'],
)
bins

['q1', 'q2', 'q3', 'q1', 'q2', ..., 'q4', 'q4', 'q1', 'q4', 'q3']
Length: 1000
Categories (4, object): ['q1' < 'q2' < 'q3' < 'q4']

In [27]:
# Integer codes of the first five binned values (slice first, then read codes).
bins[:5].codes

array([0, 1, 2, 0, 1], dtype=int8)

In [28]:
# Wrap the bins in a Series and name it 'quartile' (rebinds `bins`).
bins = pd.Series(bins).rename('quartile')
bins

0      q1
1      q2
2      q3
3      q1
4      q2
       ..
995    q4
996    q4
997    q1
998    q4
999    q3
Name: quartile, Length: 1000, dtype: category
Categories (4, object): ['q1' < 'q2' < 'q3' < 'q4']

In [31]:
# Summary statistics of the draws per bin.  observed=False keeps every
# category in the result; reset_index turns the group key into a column.
results = pd.Series(draws).groupby(by=bins, observed=False).aggregate(
    ['count', 'min', 'max', 'mean']
).reset_index()

results

Unnamed: 0,quartile,count,min,max,mean
0,q1,250,-3.027768,-0.639834,-1.268908
1,q2,250,-0.637633,0.079614,-0.273015
2,q3,250,0.081223,0.72585,0.384137
3,q4,250,0.729181,2.773904,1.336958


In [32]:
# Eight single-letter labels (a-d, twice) and their categorical counterpart.
s = pd.Series(2 * ['a', 'b', 'c', 'd'])
cat_s = s.astype(dtype='category')
cat_s

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']

In [33]:
# Full label set, including 'e', which does not occur in the data.
actual_categories = ['a', 'b', 'c', 'd', 'e']
# New Series with the category set replaced; `cat_s` is not modified.
cat_s2 = cat_s.cat.set_categories(new_categories=actual_categories)
cat_s2

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (5, object): ['a', 'b', 'c', 'd', 'e']

In [34]:
# Per-category counts with the defaults spelled out; the unused category 'e'
# appears in the output with a count of 0.
cat_s2.value_counts(sort=True, dropna=True)

a    2
b    2
c    2
d    2
e    0
Name: count, dtype: int64

In [41]:
# Re-derive the categorical fruit Series from `df`.
fruit_cat = df['fruit'].astype(dtype='category')
fruit_cat

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): ['apple', 'orange']

In [46]:
# Category labels of the Series, read from its categorical dtype.
fruit_cat.dtype.categories

Index(['apple', 'orange'], dtype='object')