In [1]:
import random
from math import ceil
import numpy as np
from sklearn.datasets import make_blobs, make_classification, make_regression
from sklearn.cluster import MiniBatchKMeans
from sklearn.linear_model import SGDClassifier, SGDRegressor

# ミニバッチ学習
---
確率的勾配降下法のように、データセットのミニバッチを使って少しずつ学習させていく手法。

## Pythonでのミニバッチ作成
---
`yield`を使用してジェネレーターを作成する。  
DB からの読み込みと組み合わせると、サイズの大きいデータセットでも学習可能。

In [2]:
class BatchGenerator(object):
    def __init__(self, x, y=None, batch_size=10, random_state=None):
        self.x = x
        self.y = y
        self.batch_size = batch_size
        self.n_data = len(x)
        self.step = 0
        self.step_per_epoch = ceil(self.n_data / self.batch_size)
        self.index = np.arange(self.n_data)
        if random_state:
            random.seed(random_state)

    def __iter__(self):
        return self

    def __next__(self):
        if self.step == 0:
            random.shuffle(self.index)
        begin = self.batch_size * self.step
        end = min(begin + self.batch_size, self.n_data)
        batch_x = self.x[begin:end]
        if self.y is None:
            batch = batch_x
        else:
            batch_y = self.y[begin:end]
            batch = (batch_x, batch_y)
        self.step = (self.step + 1) % self.step_per_epoch

        return batch

In [3]:
x, y = make_regression(n_samples=100, n_features=2, random_state=1234)
gen = BatchGenerator(x, y, batch_size=3, random_state=1234)
for _ in range(2):
    x, y = next(gen)
    print(x, y)

[[ 0.85958841 -0.6365235 ]
 [-0.33407737  0.00211836]
 [-0.20393287 -0.18217541]] [-9.7636448  -3.49365597 -7.584586  ]
[[ 0.01569637 -2.24268495]
 [-0.8596683   0.22598549]
 [ 1.03380073 -2.40045363]] [-66.47629433  -2.43671178 -60.32574025]


## scikit-learnのミニバッチ学習実装
---
`partial_fit`メソッドにミニバッチを与えて学習する。

### Mini-Batch K-Means
---
ミニバッチ学習による K-Means クラスタリング。  
`sklearn.cluster.MiniBatchKMeans`を使用する。

In [4]:
help(MiniBatchKMeans)

Help on class MiniBatchKMeans in module sklearn.cluster._kmeans:

class MiniBatchKMeans(KMeans)
 |  MiniBatchKMeans(n_clusters=8, *, init='k-means++', max_iter=100, batch_size=100, verbose=0, compute_labels=True, random_state=None, tol=0.0, max_no_improvement=10, init_size=None, n_init=3, reassignment_ratio=0.01)
 |  
 |  Mini-Batch K-Means clustering.
 |  
 |  Read more in the :ref:`User Guide <mini_batch_kmeans>`.
 |  
 |  Parameters
 |  ----------
 |  
 |  n_clusters : int, default=8
 |      The number of clusters to form as well as the number of
 |      centroids to generate.
 |  
 |  init : {'k-means++', 'random'} or ndarray of shape             (n_clusters, n_features), default='k-means++'
 |      Method for initialization
 |  
 |      'k-means++' : selects initial cluster centers for k-mean
 |      clustering in a smart way to speed up convergence. See section
 |      Notes in k_init for more details.
 |  
 |      'random': choose k observations (rows) at random from data for
 |

In [5]:
x_cluster, _ = make_blobs(n_samples=100000, n_features=2, random_state=1234)
gen_cluster = BatchGenerator(x_cluster, batch_size=1000, random_state=1234)
km = MiniBatchKMeans(n_clusters=3, random_state=1234)
tol = 100
best_score = np.inf
early_stopping = 10
no_improvement = 0
while(best_score > tol):
    batch = next(gen_cluster)
    km.partial_fit(batch)
    if best_score <= km.inertia_:
        no_improvement += 1
    else:
        no_improvement = 0
        best_score = km.inertia_
    if no_improvement == early_stopping:
        break

### SGD Classifier
---
確率的勾配降下法 (SGD) による分類。`loss='hinge'`で SVM ・`loss='log'`でロジスティック回帰の SGD 版になる。  
`sklearn.linear_model.SGDClassifier`を使用する。

In [6]:
help(SGDClassifier)

Help on class SGDClassifier in module sklearn.linear_model._stochastic_gradient:

class SGDClassifier(BaseSGDClassifier)
 |  SGDClassifier(loss='hinge', *, penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=0.001, shuffle=True, verbose=0, epsilon=0.1, n_jobs=None, random_state=None, learning_rate='optimal', eta0=0.0, power_t=0.5, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, class_weight=None, warm_start=False, average=False)
 |  
 |  Linear classifiers (SVM, logistic regression, etc.) with SGD training.
 |  
 |  This estimator implements regularized linear models with stochastic
 |  gradient descent (SGD) learning: the gradient of the loss is estimated
 |  each sample at a time and the model is updated along the way with a
 |  decreasing strength schedule (aka learning rate). SGD allows minibatch
 |  (online/out-of-core) learning via the `partial_fit` method.
 |  For best results using the default learning rate schedule, the data sho

In [7]:
x_clf, y_clf = make_classification(n_samples=100000, random_state=1234)
gen_clf = BatchGenerator(x_clf, y_clf, batch_size=100, random_state=1234)
clf = SGDClassifier(loss='log', n_jobs=-1, random_state=1234)
classes = np.unique(y_clf)
for _ in range(10000):
    batch_x, batch_y = next(gen_clf)
    clf.partial_fit(batch_x, batch_y, classes=classes)

### SGD Regressor
---
確率的勾配降下法 (SGD) による回帰。  
`sklearn.linear_model.SGDRegressor`を使用する。

In [8]:
help(SGDRegressor)

Help on class SGDRegressor in module sklearn.linear_model._stochastic_gradient:

class SGDRegressor(BaseSGDRegressor)
 |  SGDRegressor(loss='squared_loss', *, penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=0.001, shuffle=True, verbose=0, epsilon=0.1, random_state=None, learning_rate='invscaling', eta0=0.01, power_t=0.25, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, warm_start=False, average=False)
 |  
 |  Linear model fitted by minimizing a regularized empirical loss with SGD
 |  
 |  SGD stands for Stochastic Gradient Descent: the gradient of the loss is
 |  estimated each sample at a time and the model is updated along the way with
 |  a decreasing strength schedule (aka learning rate).
 |  
 |  The regularizer is a penalty added to the loss function that shrinks model
 |  parameters towards the zero vector using either the squared euclidean norm
 |  L2 or the absolute norm L1 or a combination of both (Elastic Net). If the
 | 

In [9]:
x_reg, y_reg = make_regression(n_samples=100000, random_state=1234)
gen_reg = BatchGenerator(x_reg, y_reg, batch_size=100, random_state=1234)
reg = SGDRegressor(random_state=1234)
for _ in range(10000):
    batch_x, batch_y = next(gen_reg)
    reg.partial_fit(batch_x, batch_y)