<a href="https://colab.research.google.com/github/ShinAsakawa/ShinAsakawa.github.io/blob/master/notebooks/2020SightVisit_kmeans_gmm_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

出典: <http://codh.rois.ac.jp/kmnist/>

<center>
<img src="http://codh.rois.ac.jp/img/kmnist.png" style="width:59%">
</center>

In [None]:
# データの読み込み
!wget http://codh.rois.ac.jp/kmnist/dataset/kmnist/kmnist-train-imgs.npz
!wget http://codh.rois.ac.jp/kmnist/dataset/kmnist/kmnist-train-labels.npz
!wget http://codh.rois.ac.jp/kmnist/dataset/kmnist/kmnist-test-imgs.npz
!wget http://codh.rois.ac.jp/kmnist/dataset/kmnist/kmnist-test-labels.npz

In [None]:
import numpy as np

def load(f):
    """Return the array stored under the key ``'arr_0'`` in the ``.npz`` archive ``f``.

    Parameters
    ----------
    f : str, path-like or file object
        Passed unchanged to ``np.load``.

    Returns
    -------
    numpy.ndarray
        The array saved under ``'arr_0'``.

    Raises
    ------
    KeyError
        If the archive has no ``'arr_0'`` entry.
    """
    # np.load returns an NpzFile that keeps the underlying file open; the
    # with-block closes it once the array has been read into memory.
    with np.load(f) as archive:
        return archive['arr_0']

# Load the KMNIST image and label arrays from the downloaded .npz archives.
x_train = load('kmnist-train-imgs.npz')
x_test = load('kmnist-test-imgs.npz')
y_train = load('kmnist-train-labels.npz')
y_test = load('kmnist-test-labels.npz')

# Flatten every image to a 784-element row vector and keep only the first
# n_samples training examples (the test set is kept whole).
n_samples = 10000
x_train = x_train.reshape(-1, 784)[:n_samples]
y_train = y_train[:n_samples]
x_test = x_test.reshape(-1, 784)

# Class index -> hiragana character. The order follows the official KMNIST
# class map (0=お, 1=き, 2=す, 3=つ, 4=な, 5=は, 6=ま, 7=や, 8=れ, 9=を);
# は and ま were previously swapped, mislabelling classes 5 and 6 in the plots.
ind2c = list('おきすつなはまやれを')

In [None]:
!pip install japanize-matplotlib

In [None]:
%pylab inline
import matplotlib.pyplot as plt
import japanize_matplotlib

In [None]:
def print_digits(images, y, max_n=10):
    """Draw up to ``max_n`` images in one figure, each annotated with its character.

    Parameters
    ----------
    images : sequence of arrays
        Image ``i`` is reshaped to 28 rows before being displayed.
    y : sequence of int
        ``ind2c[y[i]]`` is written above image ``i``.
    max_n : int, default 10
        Upper bound on the number of images drawn. Fewer are drawn when
        ``images`` or ``y`` holds fewer entries. Subplots are placed on a
        20x20 grid, so more than 400 images cannot be laid out.
    """
    fig = plt.figure(figsize=(14, 14))
    fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.8, wspace=0.05)
    # Never index past the end of either input: callers such as a per-cluster
    # view may pass fewer than max_n images (or none at all).
    n_shown = min(max_n, len(images), len(y))
    for i in range(n_shown):
        ax = fig.add_subplot(20, 20, i + 1, xticks=[], yticks=[])
        ax.imshow(images[i].reshape(28, -1), cmap=plt.cm.bone)
        ax.text(10, 0, ind2c[y[i]])


print_digits(x_train, y_train, max_n=10)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the (already truncated) training data for evaluating the
# clustering. random_state is fixed, matching the seed used for the models in
# this notebook, so the split and every score derived from it are reproducible.
_X_train, _X_test, _y_train, _y_test = train_test_split(
    x_train, y_train, test_size=0.2, random_state=42)

# NOTE: this rebinds n_samples (set earlier as the truncation size) to the
# number of rows in the training split.
n_samples, n_features = _X_train.shape
n_digits = len(np.unique(_y_train))


In [None]:
from sklearn import cluster

# Cluster the training images with K-means. K-means is unsupervised, so no
# labels are passed to fit (KMeans.fit ignores its y argument).
kmeans_clf = cluster.KMeans(init='k-means++', n_clusters=10, random_state=42)
kmeans_clf.fit(_X_train)

# labels_ holds one cluster index per sample passed to fit, i.e. per row of
# _X_train, so it must be shown alongside _X_train (pairing it with _X_test
# misaligns images and labels). The character drawn is ind2c[cluster index],
# which identifies the cluster and is not a ground-truth class.
print_digits(_X_train, kmeans_clf.labels_, max_n=10)



In [None]:
# Predict the cluster of every held-out image.
y_pred = kmeans_clf.predict(_X_test)

def print_cluster(X, y, cluster_number):
    """Show up to 10 of the rows of ``X`` whose entry in ``y`` equals ``cluster_number``.

    Parameters
    ----------
    X : numpy.ndarray
        One flattened image per row.
    y : numpy.ndarray
        Cluster index of each row of ``X``.
    cluster_number : int
        The cluster to display.
    """
    in_cluster = (y == cluster_number)
    images = X[in_cluster]
    cluster_labels = y[in_cluster]
    # A cluster may have fewer than 10 members (or none); cap the count so
    # print_digits never indexes past the end of the selection.
    print_digits(images, cluster_labels, max_n=min(10, len(images)))

# One figure per K-means cluster.
for i in range(kmeans_clf.n_clusters):
    print_cluster(_X_test, y_pred, i)


In [None]:
from sklearn import metrics

# Score the K-means cluster assignments of the held-out images against their
# true labels. Cluster indices are arbitrary, so the confusion matrix (rows:
# true labels, columns: cluster indices) is not expected to be diagonal.
print("Adjusted rand score:{:.2}".format(metrics.adjusted_rand_score(_y_test, y_pred)))
print("Homogeneity score:{:.2} ".format(metrics.homogeneity_score(_y_test, y_pred)))
print("Completeness score: {:.2} ".format(metrics.completeness_score(_y_test, y_pred)))
print("Confusion matrix")
print(metrics.confusion_matrix(_y_test, y_pred))

In [None]:
from sklearn import mixture

# Define a held-out split for comparing covariance types. random_state is
# fixed so the comparison is reproducible from run to run.
X_train_heldout, X_test_heldout, y_train_heldout, y_test_heldout = train_test_split(
    x_train, y_train, test_size=0.2, random_state=42)

for covariance_type in ['spherical', 'tied', 'diag', 'full']:
    gm = mixture.GaussianMixture(n_components=n_digits, covariance_type=covariance_type,
                                 random_state=42, n_init=5)
    gm.fit(X_train_heldout)
    # Kept in its own name so the notebook-level y_pred (the K-means
    # predictions scored in an earlier cell) is not silently overwritten.
    heldout_pred = gm.predict(X_test_heldout)
    print("Adjusted rand score for covariance={}:{:.2}".format(
        covariance_type, metrics.adjusted_rand_score(y_test_heldout, heldout_pred)))


In [None]:
# Final mixture model: n_digits Gaussian components sharing one ('tied')
# covariance matrix, fitted on the whole truncated training set.
gm = mixture.GaussianMixture(
    covariance_type='tied',
    n_components=n_digits,
    random_state=42,
)
gm.fit(x_train)

In [None]:
# Evaluate the fitted mixture model on the test set: print the clustering
# scores, show sample images for each component, then the confusion matrix.
y_pred = gm.predict(x_test)
print("Adjusted rand score:{:.2}".format(metrics.adjusted_rand_score(y_test, y_pred)))
print("Homogeneity score:{:.2} ".format(metrics.homogeneity_score(y_test, y_pred)))
print("Completeness score: {:.2} ".format(metrics.completeness_score(y_test, y_pred)))
# Iterate over the model's own component count rather than a hard-coded 10.
for i in range(gm.n_components):
    print_cluster(x_test, y_pred, i)
print("Confusion matrix")
print(metrics.confusion_matrix(y_test, y_pred))