In [1]:
import pandas as pd
import numpy as np

# pd.cut

Use `cut` when you need to segment and sort data values into bins. This
function is also useful for going from a continuous variable to a
categorical variable. For example, `cut` could convert ages to groups of
age ranges. Supports binning into a given number of equal-width bins, or
into bins defined by a pre-specified array of bin edges.


```python
pd.cut(
    x,
    bins,
    right=True,
    labels=None,
    retbins=False,
    precision=3,
    include_lowest=False,
    duplicates='raise',
)
```

In [4]:
# right=False --> each interval is closed on the LEFT and open on the right, e.g. [18, 25), [25, 35)
# right=True (the default) --> each interval is open on the left and closed on the RIGHT, e.g. (18, 25], (25, 35]

In [6]:
# Split the range of the values into 3 equal-width bins
vals = list(range(1, 7))
pd.cut(vals, bins=3)

[(0.995, 2.667], (0.995, 2.667], (2.667, 4.333], (2.667, 4.333], (4.333, 6.0], (4.333, 6.0]]
Categories (3, interval[float64]): [(0.995, 2.667] < (2.667, 4.333] < (4.333, 6.0]]

In [8]:
# Bin at explicit edges instead of letting pandas choose them:
# the edges 0, 2, 4, 8 define the bins (0, 2], (2, 4], (4, 8]
delimiters = [0, 2, 4, 8]
pd.cut(vals, bins=delimiters)

[(0, 2], (0, 2], (2, 4], (2, 4], (4, 8], (4, 8]]
Categories (3, interval[int64]): [(0, 2] < (2, 4] < (4, 8]]

In [9]:
# Left-closed / right-open intervals: [a, b) instead of the default (a, b]
pd.cut(vals, bins=3, right=False)

[[1.0, 2.667), [1.0, 2.667), [2.667, 4.333), [2.667, 4.333), [4.333, 6.005), [4.333, 6.005)]
Categories (3, interval[float64]): [[1.0, 2.667) < [2.667, 4.333) < [4.333, 6.005)]

In [13]:
# Name each bin: one label per interval, in ascending bin order
pd.cut(
    vals,
    bins=delimiters,
    labels=['small', 'medium', 'large'],
)

[small, small, medium, medium, large, large]
Categories (3, object): [small < medium < large]

In [16]:
# With retbins=True, cut returns a pair: (binned values, array of bin edges)
binning_array, delimiter = pd.cut(vals, bins=3, retbins=True)

In [17]:
# First element of the retbins=True result: the interval assigned to each value of `vals`
binning_array

[(0.995, 2.667], (0.995, 2.667], (2.667, 4.333], (2.667, 4.333], (4.333, 6.0], (4.333, 6.0]]
Categories (3, interval[float64]): [(0.995, 2.667] < (2.667, 4.333] < (4.333, 6.0]]

In [19]:
# Show the bin edges returned by retbins=True (bound to `delimiter` above).
# `delimiters` (plural) is the unrelated hand-written edge list from an
# earlier cell, so displaying it here would not show what retbins returned.
delimiter

array([0.995     , 2.66666667, 4.33333333, 6.        ])

In [32]:
# Binning a Series keeps its index: every labelled value is mapped to an interval
nums = pd.Series(range(5), index=list('abcde'))
pd.cut(nums, bins=3)

a    (-0.004, 1.333]
b    (-0.004, 1.333]
c     (1.333, 2.667]
d       (2.667, 4.0]
e       (2.667, 4.0]
dtype: category
Categories (3, interval[float64]): [(-0.004, 1.333] < (1.333, 2.667] < (2.667, 4.0]]

In [33]:
# map each value to the label of its interval

In [34]:
# Same 3 equal-width bins as before, shown by label instead of by interval
pd.cut(
    nums,
    bins=3,
    labels=['low', 'medium', 'high'],
)

a       low
b       low
c    medium
d      high
e      high
dtype: category
Categories (3, object): [low < medium < high]

# pd.qcut

Quantile-based discretization function. Discretize variable into
equal-sized buckets based on rank or based on sample quantiles. For example
1000 values for 10 quantiles would produce a Categorical object indicating
quantile membership for each data point.

```python
pd.qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise')
```

In [24]:
# Truncate long Series/DataFrame reprs to 8 rows so outputs stay short
pd.options.display.max_rows = 8

In [25]:
# Seed a local generator so the sample (and therefore the bucket assignment)
# is identical on every Restart & Run All.
rng = np.random.RandomState(42)
heights = pd.Series(rng.rand(300) * 175)

# Quantile edges 0 / 25% / 50% / 75% / 100% give four quartile buckets.
# NOTE: the labels are only names for those quartiles; they are not
# standard-deviation bands.
bars = pd.qcut(
    heights,
    [0, .25, .5, .75, 1],
    labels=['-2 sigma', '-sigma', 'sigma', '2 sigma'],
)
bars

0         sigma
1         sigma
2         sigma
3      -2 sigma
         ...   
296      -sigma
297      -sigma
298       sigma
299       sigma
Length: 300, dtype: category
Categories (4, object): [-2 sigma < -sigma < sigma < 2 sigma]

In [31]:
# Quartile-bin the integers 0..4: bin edges are taken from the sample quantiles
pd.qcut(range(5), q=4)

[(-0.001, 1.0], (-0.001, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0]]
Categories (4, interval[float64]): [(-0.001, 1.0] < (1.0, 2.0] < (2.0, 3.0] < (3.0, 4.0]]