In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer
import seaborn as sns
from IPython.display import display
pd.set_option('max_rows', 5)

# 離散化

## 離散化の効果
---
連続変数をカテゴリ変数化することを離散化という。  
連続変数のほうが含まれる情報は多いが、例えば年齢を 10 歳ごとの階級に分割するなどしたほうがデータの理解がしやすかったり、単純な線形でない関係を捉えられたりするため、離散化することがある。

## 等間隔に分割
---
年齢を 10 歳ごとの階級に分割するなど、各階級の幅が等しくなるように分割。

### Pythonでの等間隔階級への分割の実行方法
---
`pandas.cut`または`sklearn.preprocessing.KBinsDiscretizer`を使用する。

In [2]:
fare = sns.load_dataset('titanic')['fare']
print('fare')
display(fare)

fare


0       7.2500
1      71.2833
        ...   
889    30.0000
890     7.7500
Name: fare, Length: 891, dtype: float64

In [3]:
help(pd.cut)

Help on function cut in module pandas.core.reshape.tile:

cut(x, bins, right: bool = True, labels=None, retbins: bool = False, precision: int = 3, include_lowest: bool = False, duplicates: str = 'raise')
    Bin values into discrete intervals.
    
    Use `cut` when you need to segment and sort data values into bins. This
    function is also useful for going from a continuous variable to a
    categorical variable. For example, `cut` could convert ages to groups of
    age ranges. Supports binning into an equal number of bins, or a
    pre-specified array of bins.
    
    Parameters
    ----------
    x : array-like
        The input array to be binned. Must be 1-dimensional.
    bins : int, sequence of scalars, or IntervalIndex
        The criteria to bin by.
    
        * int : Defines the number of equal-width bins in the range of `x`. The
          range of `x` is extended by .1% on each side to include the minimum
          and maximum values of `x`.
        * sequence of scal

In [4]:
pd.cut(fare, 5)

0      (-0.512, 102.466]
1      (-0.512, 102.466]
             ...        
889    (-0.512, 102.466]
890    (-0.512, 102.466]
Name: fare, Length: 891, dtype: category
Categories (5, interval[float64]): [(-0.512, 102.466] < (102.466, 204.932] < (204.932, 307.398] < (307.398, 409.863] < (409.863, 512.329]]

In [5]:
help(KBinsDiscretizer)

Help on class KBinsDiscretizer in module sklearn.preprocessing._discretization:

class KBinsDiscretizer(sklearn.base.TransformerMixin, sklearn.base.BaseEstimator)
 |  KBinsDiscretizer(n_bins=5, *, encode='onehot', strategy='quantile')
 |  
 |  Bin continuous data into intervals.
 |  
 |  Read more in the :ref:`User Guide <preprocessing_discretization>`.
 |  
 |  .. versionadded:: 0.20
 |  
 |  Parameters
 |  ----------
 |  n_bins : int or array-like, shape (n_features,) (default=5)
 |      The number of bins to produce. Raises ValueError if ``n_bins < 2``.
 |  
 |  encode : {'onehot', 'onehot-dense', 'ordinal'}, (default='onehot')
 |      Method used to encode the transformed result.
 |  
 |      onehot
 |          Encode the transformed result with one-hot encoding
 |          and return a sparse matrix. Ignored features are always
 |          stacked to the right.
 |      onehot-dense
 |          Encode the transformed result with one-hot encoding
 |          and return a dense array

In [6]:
range_transformer = KBinsDiscretizer(n_bins=5,
                                     encode='ordinal',
                                     strategy='uniform')
range_transformer.fit(fare.values.reshape((-1, 1)))
edges = range_transformer.bin_edges_[0]
interval = [f'{edges[i]}~{edges[i+1]}' for i in range(edges.size - 1)]
print(f'interval: {interval}')
display(
    pd.Series(
        range_transformer.transform(fare.values.reshape((-1, 1))).ravel()))

interval: ['0.0~102.46584', '102.46584~204.93168', '204.93168~307.39752', '307.39752~409.86336', '409.86336~512.3292']


0      0.0
1      0.0
      ... 
889    0.0
890    0.0
Length: 891, dtype: float64

練習問題

---
`age`データセットを 0 歳 ~ 80 歳 まで 10 歳刻みで離散化する。 (`sklearn.preprocessing.KBinsDiscretizer`は間隔を指定できないので、`pandas.cut`を使用する)

In [7]:
age = sns.load_dataset('titanic')['age']
age.fillna(0.0, inplace=True)
print('age')
display(age)

age


0      22.0
1      38.0
       ... 
889    26.0
890    32.0
Name: age, Length: 891, dtype: float64

解答例

---

In [8]:
pd.cut(age, bins=np.arange(0, 90, 10))

0      (20, 30]
1      (30, 40]
         ...   
889    (20, 30]
890    (30, 40]
Name: age, Length: 891, dtype: category
Categories (8, interval[int64]): [(0, 10] < (10, 20] < (20, 30] < (30, 40] < (40, 50] < (50, 60] < (60, 70] < (70, 80]]

---

## 頻度で分割
---
各階級に含まれるサンプルの数が等しくなるように分割。

### Pythonでの等頻度階級への分割の実行方法
---
`pandas.qcut`または`sklearn.preprocessing.KBinsDiscretizer`を使用する。

In [9]:
help(pd.qcut)

Help on function qcut in module pandas.core.reshape.tile:

qcut(x, q, labels=None, retbins: bool = False, precision: int = 3, duplicates: str = 'raise')
    Quantile-based discretization function.
    
    Discretize variable into equal-sized buckets based on rank or based
    on sample quantiles. For example 1000 values for 10 quantiles would
    produce a Categorical object indicating quantile membership for each data point.
    
    Parameters
    ----------
    x : 1d ndarray or Series
    q : int or list-like of int
        Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
        array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles.
    labels : array or False, default None
        Used as labels for the resulting bins. Must be of the same length as
        the resulting bins. If False, return only integer indicators of the
        bins. If True, raises an error.
    retbins : bool, optional
        Whether to return the (bins, labels) or not. Can be 

In [10]:
# 四分位範囲で分割
pd.qcut(fare, 4, duplicates='drop')

0       (-0.001, 7.91]
1      (31.0, 512.329]
            ...       
889     (14.454, 31.0]
890     (-0.001, 7.91]
Name: fare, Length: 891, dtype: category
Categories (4, interval[float64]): [(-0.001, 7.91] < (7.91, 14.454] < (14.454, 31.0] < (31.0, 512.329]]

In [11]:
help(KBinsDiscretizer)

Help on class KBinsDiscretizer in module sklearn.preprocessing._discretization:

class KBinsDiscretizer(sklearn.base.TransformerMixin, sklearn.base.BaseEstimator)
 |  KBinsDiscretizer(n_bins=5, *, encode='onehot', strategy='quantile')
 |  
 |  Bin continuous data into intervals.
 |  
 |  Read more in the :ref:`User Guide <preprocessing_discretization>`.
 |  
 |  .. versionadded:: 0.20
 |  
 |  Parameters
 |  ----------
 |  n_bins : int or array-like, shape (n_features,) (default=5)
 |      The number of bins to produce. Raises ValueError if ``n_bins < 2``.
 |  
 |  encode : {'onehot', 'onehot-dense', 'ordinal'}, (default='onehot')
 |      Method used to encode the transformed result.
 |  
 |      onehot
 |          Encode the transformed result with one-hot encoding
 |          and return a sparse matrix. Ignored features are always
 |          stacked to the right.
 |      onehot-dense
 |          Encode the transformed result with one-hot encoding
 |          and return a dense array

In [12]:
quantile_transformer = KBinsDiscretizer(n_bins=4,
                                        encode='ordinal',
                                        strategy='quantile')
quantile_transformer.fit(fare.values.reshape((-1, 1)))
edges = quantile_transformer.bin_edges_[0]
interval = [f'{edges[i]}~{edges[i+1]}' for i in range(edges.size - 1)]
print(f'interval: {interval}')
display(
    pd.Series(
        quantile_transformer.transform(fare.values.reshape((-1, 1))).ravel()))

interval: ['0.0~7.9104', '7.9104~14.4542', '14.4542~31.0', '31.0~512.3292']


0      0.0
1      3.0
      ... 
889    2.0
890    0.0
Length: 891, dtype: float64

練習問題

---
`age`データセットを四分位範囲で分割する。

解答例

---

In [13]:
pd.qcut(age, 4)

0       (6.0, 24.0]
1      (35.0, 80.0]
           ...     
889    (24.0, 35.0]
890    (24.0, 35.0]
Name: age, Length: 891, dtype: category
Categories (4, interval[float64]): [(-0.001, 6.0] < (6.0, 24.0] < (24.0, 35.0] < (35.0, 80.0]]

In [14]:
pd.Series(
    KBinsDiscretizer(n_bins=4, encode='ordinal',
                     strategy='quantile').fit_transform(
                         age.values.reshape((-1, 1))).ravel())

0      1.0
1      3.0
      ... 
889    2.0
890    2.0
Length: 891, dtype: float64

---

## クラスタリングを利用して分割
---
対象の変数にクラスタリングを適用し、その結果を利用して分割。

### Pythonでのクラスタリングを利用した分割の実行方法
---
`sklearn.preprocessing.KBinsDiscretizer`を使用する。 (使用されるクラスタリングは k-means)

In [15]:
help(KBinsDiscretizer)

Help on class KBinsDiscretizer in module sklearn.preprocessing._discretization:

class KBinsDiscretizer(sklearn.base.TransformerMixin, sklearn.base.BaseEstimator)
 |  KBinsDiscretizer(n_bins=5, *, encode='onehot', strategy='quantile')
 |  
 |  Bin continuous data into intervals.
 |  
 |  Read more in the :ref:`User Guide <preprocessing_discretization>`.
 |  
 |  .. versionadded:: 0.20
 |  
 |  Parameters
 |  ----------
 |  n_bins : int or array-like, shape (n_features,) (default=5)
 |      The number of bins to produce. Raises ValueError if ``n_bins < 2``.
 |  
 |  encode : {'onehot', 'onehot-dense', 'ordinal'}, (default='onehot')
 |      Method used to encode the transformed result.
 |  
 |      onehot
 |          Encode the transformed result with one-hot encoding
 |          and return a sparse matrix. Ignored features are always
 |          stacked to the right.
 |      onehot-dense
 |          Encode the transformed result with one-hot encoding
 |          and return a dense array

In [16]:
cluster_transformer = KBinsDiscretizer(n_bins=3,
                                       encode='ordinal',
                                       strategy='kmeans')
cluster_transformer.fit(fare.values.reshape((-1, 1)))
edges = cluster_transformer.bin_edges_[0]
interval = [f'{edges[i]}~{edges[i+1]}' for i in range(edges.size - 1)]
print(f'interval: {interval}')
display(
    pd.Series(
        cluster_transformer.transform(fare.values.reshape((-1, 1))).ravel()))

interval: ['0.0~94.96351024105013', '94.96351024105013~339.922139', '339.922139~512.3292']


0      0.0
1      0.0
      ... 
889    0.0
890    0.0
Length: 891, dtype: float64

練習問題

---
`age`データセットを k-means クラスタリングで 3 分割する。

解答例

---

In [17]:
pd.Series(
    KBinsDiscretizer(n_bins=3, encode='ordinal',
                     strategy='kmeans').fit_transform(
                         age.values.reshape((-1, 1))).ravel())

0      1.0
1      2.0
      ... 
889    1.0
890    1.0
Length: 891, dtype: float64

---