In [1]:
import os

import numpy as np
import polars as pl

import joblib
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import cm
from PIL import Image
from pathlib import Path

from sklearn.preprocessing import StandardScaler
from sklearn.mixture import BayesianGaussianMixture
from sklearn.decomposition import PCA

from scipy.stats import rankdata


In [2]:
def scatter_each_classes(data, class_labels, rank, markers, colors, xylabel, facecolor='valid', scatter_classe=(None,)):
    """Draw one 2-D scatter series per cluster and show the figure.

    Parameters
    ----------
    data : array-like
        Indexed as ``data[mask, 0]`` / ``data[mask, 1]``; the first two
        columns are used as the x and y coordinates.
    class_labels : array-like
        Cluster label of every row of ``data``; compared with ``==`` against
        each label to build the row mask.
    rank : iterable
        Paired with the plotted labels (in order) and used as the ``zorder``
        of the corresponding series. Pairing stops at the shorter of the two.
    markers : indexable or None
        Marker per cluster, looked up as ``markers[label]``. ``None`` draws
        every cluster with ``'o'``.
    colors : 'tab10', matplotlib Colormap, or indexable
        ``'tab10'`` expands to the ten tab10 colours. A colormap is sampled at
        ``label / len(labels)``. Anything else is looked up as
        ``colors[label]``.
    xylabel : sequence
        ``xylabel[0]`` is the x-axis label, ``xylabel[1]`` the y-axis label.
    facecolor : str
        When exactly ``'None'``, every marker except ``'x'`` is drawn hollow
        (coloured edge, no fill). Any other value draws filled markers.
    scatter_classe : iterable
        Labels to draw. If it contains ``None`` (the default) every unique
        value of ``class_labels`` is drawn.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    labels = np.unique(class_labels) if None in scatter_classe else scatter_classe

    if isinstance(colors, str) and colors == 'tab10':
        colors = [plt.get_cmap('tab10')(i) for i in range(10)]
    # Accept every colormap flavour (ListedColormap as well as
    # LinearSegmentedColormap); a colormap is callable, not subscriptable.
    use_colormap = isinstance(colors, matplotlib.colors.Colormap)

    for label, z in zip(labels, rank):
        marker = 'o' if markers is None else markers[label]
        color = colors(label / len(labels)) if use_colormap else colors[label]
        mask = class_labels == label

        series_style = {'label': f'cluster{label}', 'marker': marker, 'zorder': z}
        if facecolor == 'None' and marker != 'x':
            # Hollow marker: only the outline carries the cluster colour.
            # 'x' has no face, so it always goes through the filled branch.
            series_style.update(edgecolors=color, facecolor=facecolor)
        else:
            # `color=` rather than `c=`: an RGBA tuple passed through `c` is
            # read as per-point values whenever the cluster happens to hold
            # as many points as the tuple has entries.
            series_style.update(color=color)

        plt.scatter(data[mask, 0], data[mask, 1], **series_style)

    plt.xlabel(xylabel[0])
    plt.ylabel(xylabel[1])
    # Legend sits outside the axes, anchored to the top-right corner.
    plt.legend(loc='upper left', bbox_to_anchor=(1,1))
    plt.tight_layout()
    plt.show()

In [3]:
def concat_images(imgs, n_col, n_row, padding):
    """Tile images into one grayscale contact sheet.

    Every cell has the size of ``imgs[0]``; cells are separated from each
    other and from the sheet border by ``padding`` pixels. Images are placed
    row by row, left to right.

    Parameters
    ----------
    imgs : sequence of images
        Objects exposing ``.size`` (width, height) and accepted by
        ``Image.paste``. If fewer than ``n_col * n_row`` images are given the
        remaining cells are left blank; surplus images are ignored.
    n_col, n_row : int
        Grid dimensions.
    padding : int
        Gap in pixels around every cell.

    Returns
    -------
    Image
        A new mode ``'L'`` image of size
        ``((w + padding) * n_col + padding, (h + padding) * n_row + padding)``.
    """
    tile_w, tile_h = imgs[0].size
    step_x = tile_w + padding
    step_y = tile_h + padding

    sheet = Image.new('L', (step_x * n_col + padding, step_y * n_row + padding))

    # Row-major cell coordinates; zip() stops at whichever runs out first, so
    # a short image list no longer leaks StopIteration out of the function.
    cells = ((col, row) for row in range(n_row) for col in range(n_col))
    for (col, row), img in zip(cells, imgs):
        sheet.paste(img, (padding + step_x * col, padding + step_y * row))
    return sheet

In [None]:
def make_path_from_df(row):
    """Return ``Path("<dirname>/<filename>")`` built from one table row.

    ``row`` only needs to support ``row['dirname']`` and ``row['filename']``.
    """
    relative = f"{row['dirname']}/{row['filename']}"
    return Path(relative)

In [4]:
def image_concat_and_imshow(df, labels, colrow, image_root, random_state=None):
    """Show a random contact sheet of images for every cluster.

    For each unique value in ``labels`` this samples ``colrow[0] * colrow[1]``
    rows of ``df``, opens the corresponding image files, tiles them with
    ``concat_images`` (2 px padding) and draws the sheets side by side.

    Parameters
    ----------
    df : DataFrame
        Must support boolean-mask row selection, ``.sample(n, random_state=...)``
        and ``.apply(func, axis=1)``, and carry the ``dirname`` / ``filename``
        columns read by ``make_path_from_df``.
        NOTE(review): this is the pandas API; confirm callers do not pass the
        polars frame loaded elsewhere in this notebook.
    labels : array-like
        Cluster label of every row of ``df``.
    colrow : sequence
        ``(n_col, n_row)`` of the per-cluster tile grid.
    image_root : path-like supporting ``/``
        Directory the per-row relative paths are joined onto.
    random_state : optional
        Forwarded to ``df.sample`` so a run can be reproduced. The default
        ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    list
        One tiled image per unique label, in ``np.unique(labels)`` order.
    """
    unique_labels = np.unique(labels)
    n_col, n_row = colrow[0], colrow[1]

    concat_imgs = []
    for label in unique_labels:
        sampled = df[labels == label].sample(n_col * n_row, random_state=random_state)
        imgs = []
        for rel_path in sampled.apply(make_path_from_df, axis=1):
            # Image.open is lazy and keeps the file handle open; copy the
            # pixels out and close the file right away so sampling many
            # images cannot exhaust file descriptors.
            with Image.open(image_root / rel_path) as opened:
                imgs.append(opened.copy())
        concat_imgs.append(concat_images(imgs, n_col, n_row, 2))

    fig = plt.figure(figsize=(10,11))
    panels = zip(unique_labels, concat_imgs)
    for position, (label, sheet) in enumerate(panels, start=1):
        ax = fig.add_subplot(1, len(unique_labels), position)
        ax.imshow(sheet)
        # Title with the real cluster label, not the subplot position, so
        # non-contiguous labels are reported correctly.
        ax.set_title(f'cluster {label}')
        ax.axis('off')

    plt.show()

    return concat_imgs

In [5]:
# Per-cluster plot styling: ten colours and ten markers.
# Presumably passed to scatter_each_classes, which looks both up by cluster
# label -- confirm against the calling cells.
colors = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple', 'tab:brown', 'tab:pink', 'tab:gray', 'tab:olive', 'tab:cyan']
# NOTE(review): 'D' appears at both index 1 and index 6, so those two entries
# differ only by colour -- confirm this is intended.
markers = ['s', 'D', 'o', 'p', '*', 'h', 'D', '8', 'v', 'x']

# CSV holding the extracted features. Set the FEATURE_CSV_PATH environment
# variable to run the notebook against another file or on another machine;
# without it the original location is used.
feature_csv_path = os.environ.get(
    "FEATURE_CSV_PATH",
    "/home/shinsei/MyResearchs/feat_extrc/reports/features/SimpleCAE32/2023-07-07/12-15-24/features.csv",
)

In [9]:
# Load the feature table from disk.
df_train = pl.read_csv(feature_csv_path)
# Preview every column except the two file-location columns
# (the cell's last expression is what the notebook renders).
df_train.select(pl.exclude(["filename", "dirname"]))

column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,column_10,column_11,column_12,column_13,column_14,column_15,column_16,column_17,column_18,column_19,column_20,column_21,column_22,column_23,column_24,column_25,column_26,column_27,column_28,column_29,column_30,column_31,column_32,column_33,column_34,column_35,column_36,…,column_91,column_92,column_93,column_94,column_95,column_96,column_97,column_98,column_99,column_100,column_101,column_102,column_103,column_104,column_105,column_106,column_107,column_108,column_109,column_110,column_111,column_112,column_113,column_114,column_115,column_116,column_117,column_118,column_119,column_120,column_121,column_122,column_123,column_124,column_125,column_126,column_127
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.006472,-0.000018,-0.000016,0.00096,0.004346,-0.000056,-0.000052,0.004446,0.001696,-0.000026,-0.000029,-0.000078,-0.000073,-0.000062,-0.000051,0.001236,-0.000012,-0.000003,-0.00013,-0.000075,0.000357,-0.000005,0.004875,0.001264,0.005539,-0.000088,-0.000036,-0.000017,-0.000011,-0.000057,-0.000053,-0.000053,0.000609,-0.000048,-0.000033,0.005627,-0.000013,…,-0.000076,-0.000022,-0.000066,-0.000005,-0.000003,-0.000022,0.004437,0.002324,0.000861,0.001192,-0.000012,-0.000025,-0.000126,0.000836,-0.000086,0.000888,-0.000043,0.000553,-0.000025,-0.000063,0.004848,0.000804,-0.000026,0.003336,0.001801,-0.000065,-0.000036,0.001521,0.001728,-0.000019,0.000818,0.00031,-0.000059,-0.000033,0.00436,0.00197,0.000142
0.005489,-0.000009,-0.00002,-0.000002,0.003268,-0.000051,-0.000027,0.003189,0.001194,-0.000032,-0.000021,-0.000063,-0.000073,-0.000058,-0.000043,0.001139,-0.000021,-0.000013,-0.000094,-0.000051,-0.000006,-0.000005,0.00406,-0.000003,0.004351,-0.000079,-0.000036,-0.000023,-0.000019,-0.000056,-0.000052,-0.000048,0.000639,-0.000015,-0.000032,0.005356,-0.000034,…,-0.000045,-0.000021,-0.000064,0.000125,0.000054,-0.000027,0.002891,0.001542,-0.000018,0.001638,-0.000043,-0.000009,-0.000096,0.000891,-0.000082,-0.000007,-0.000037,0.001002,0.000177,-0.000036,0.004962,-2.9109e-7,0.000016,0.002736,0.001173,-0.000052,-0.000035,0.001707,0.001849,-0.000018,0.000713,-0.000013,-0.000042,-0.000017,0.002883,0.001838,-0.000035
0.005089,-0.000001,-0.000033,0.001084,0.003872,-0.000064,-0.000049,0.003297,-0.000002,-0.000023,-0.000024,-0.000062,-0.000059,-0.000069,-0.000017,0.001358,-0.000012,-0.000006,-0.000106,-0.000066,-0.000009,-0.000009,0.004427,0.000392,0.004185,-0.000091,-0.000052,-0.000009,-0.000049,-0.000059,-0.000034,-0.000055,0.002325,-0.000011,-0.000047,0.005932,-0.000011,…,-0.000057,-0.000022,-0.000044,-0.000004,-0.000018,-0.000026,0.003398,0.001535,0.001097,0.001118,-0.00003,-0.000015,-0.000094,-0.000003,-0.000077,-0.000015,-0.000015,0.000538,0.000845,-0.000022,0.003508,-0.000031,0.000012,0.001627,0.000156,-0.000056,-0.000029,0.00004,0.001158,-0.000008,0.000564,-0.000015,-0.000057,-0.000009,0.003786,0.000918,-0.000018
0.005846,-0.000003,-0.000029,-0.000021,0.003356,-0.000066,-0.000049,0.003043,0.00073,-0.000014,-0.000035,-0.000083,-0.000048,-0.000059,-0.000038,0.001954,-0.000029,-0.000008,-0.000083,-0.000061,0.001383,0.000381,0.004402,0.000524,0.003496,-0.00007,-0.000033,-0.000006,-0.000004,-0.000062,-0.000062,-0.000014,0.000582,0.000082,-0.000016,0.004784,-0.000002,…,-0.000042,-0.000009,-0.000073,-0.000008,-0.000008,-0.000027,0.004133,0.001721,-0.000013,0.001491,-0.000006,-0.000024,-0.000108,0.001624,-0.000075,-0.000003,-0.000048,0.001534,-0.000019,-0.000059,0.002916,0.00014,-0.000016,0.001632,0.002316,-0.000054,-0.000062,0.00062,0.00134,-0.000024,0.000512,0.00051,-0.000039,-0.000032,0.003557,0.00215,-0.000014
0.004845,-0.000029,-0.000011,-1.4778e-7,0.003868,-0.000058,-0.000061,0.003428,0.001491,-0.000023,-0.000042,-0.000075,-0.000059,-0.000072,-0.000044,0.001844,-0.000035,0.000304,-0.000123,-0.000053,0.001069,-8.6979e-7,0.005026,0.000379,0.002774,-0.000091,-0.000043,-0.000016,0.000124,-0.00004,-0.000075,-0.000039,0.00106,-0.000052,0.000804,0.00482,-0.000005,…,-0.000073,-0.000004,-0.000069,-0.00001,0.000131,-0.000027,0.004213,0.002442,0.000494,0.001931,-0.000026,-0.000022,-0.000113,0.001011,-0.000068,0.000755,-0.000044,0.001587,-0.000012,-0.000067,0.003504,-0.00001,-0.000011,0.002861,0.000715,-0.00005,-0.00004,0.001997,0.001362,-0.00001,-0.000005,0.000716,-0.000028,-0.000013,0.003236,0.001758,-0.000023
0.005323,0.000252,-0.000015,0.000779,0.002941,-0.000052,-0.000047,0.003653,0.001041,-0.000005,-0.000044,-0.000065,-0.00007,-0.000049,-0.000058,-0.000007,-0.00004,0.000171,-0.000088,-0.000063,0.000468,-0.000008,0.005354,0.001106,0.003548,-0.000086,-0.000058,0.000169,-0.000034,-0.000055,-0.000031,-0.000036,0.001841,-0.00004,0.000069,0.003554,-0.000013,…,-0.000045,0.000015,-0.00006,-0.000011,-8.3981e-7,-0.000023,0.003592,0.002678,0.00125,0.001786,-0.000044,-0.000008,-0.000112,0.000077,-0.000065,0.000842,-0.000067,0.000789,-0.000003,-0.000049,0.004402,-0.000007,-0.000027,0.002782,0.001846,-0.000055,-0.000047,0.001452,0.001489,0.000037,-0.000011,-0.000013,-0.000037,-0.000033,0.002537,0.001965,-0.000019
0.004296,-0.000015,-0.000018,0.000286,0.003486,-0.000055,-0.000054,0.004113,0.00239,-0.000034,-0.000047,-0.000082,-0.000068,-0.000074,-0.000044,0.001412,-0.00004,0.001355,-0.00013,-0.000082,0.001353,-0.000023,0.003709,0.001069,0.002991,-0.000084,-0.000047,-0.000018,-0.000034,-0.000058,-0.000046,-0.00004,0.001443,-0.000041,-0.000025,0.00499,-0.000018,…,-0.000058,-0.000012,-0.000041,0.001238,-0.00002,-0.000011,0.003905,0.001044,0.000381,0.001489,-0.000037,-0.000024,-0.000111,0.001576,-0.000055,-0.000013,-0.000048,-0.000011,-0.000004,-0.000039,0.004605,-0.000008,-0.000021,0.002425,0.001712,-0.000059,-0.000017,0.001507,0.000774,-0.000017,-0.000008,-0.000037,-0.000062,-0.000018,0.003515,0.00175,-0.000006
0.005646,-0.000015,-0.000028,-0.000015,0.002945,-0.000069,-0.000066,0.003088,0.001676,-0.000009,-0.000037,-0.000059,-0.000069,-0.000066,-0.000026,0.00221,-0.00002,-0.000015,-0.000128,-0.000085,0.001908,-0.000002,0.004664,0.000339,0.004087,-0.000091,-0.000039,-0.000012,-0.000023,-0.00007,-0.000067,-0.000048,0.001959,-0.000035,-0.000028,0.005752,-0.000026,…,-0.000058,-0.00002,-0.000043,-0.000018,0.000601,-0.000042,0.003666,0.002496,0.000989,0.000728,-0.000038,-0.000038,-0.000091,0.000125,-0.000101,-0.000021,-0.00003,-0.000023,0.000038,-0.000061,0.003635,-0.000043,-0.000025,0.002647,0.00222,-0.000036,-0.000032,0.00238,0.001305,-0.000005,0.001012,-0.000007,-0.000047,-0.000013,0.003853,0.000826,-0.000035
0.005797,-0.000025,-0.00002,-0.000016,0.004093,-0.000048,-0.000041,0.003936,0.0002,-0.000037,-0.000039,-0.000071,-0.000071,-0.000059,-0.000035,0.002471,-0.000039,0.000324,-0.000109,-0.00008,0.000812,0.000254,0.004947,0.00064,0.003204,-0.000061,-0.000022,-0.000033,-0.000009,-0.000058,-0.000053,-0.000026,0.000803,-0.000035,-0.000024,0.00618,-0.000012,…,-0.000051,-0.000015,-0.000063,0.000305,-0.00001,-0.000004,0.002769,0.001287,0.00083,0.001202,-0.000004,-0.000018,-0.000108,0.000194,-0.000043,-0.000008,-0.000043,0.000711,-0.000032,-0.00004,0.003671,-0.000017,-0.00002,0.001194,0.002002,-0.000045,-0.000044,0.001198,0.002462,-0.000022,-0.000009,0.000522,-0.000043,-0.000015,0.003238,0.001397,-0.00001
0.006759,-0.000011,-0.000007,-0.000011,0.003961,-0.000056,-0.000014,0.004746,0.00209,-0.000036,-0.000036,-0.000073,-0.000084,-0.00008,-0.000055,0.0015,-0.000039,-0.000001,-0.000101,-0.000054,0.001146,-0.000018,0.005008,0.001181,0.002965,-0.000101,-0.00006,0.000774,-0.000029,-0.000052,-0.000043,-0.000038,0.001129,-0.000034,-0.000022,0.006346,-0.000026,…,-0.000046,-0.000008,-0.000065,-0.000005,-0.000017,-0.000017,0.004202,0.002257,0.00112,0.002714,-0.000048,-0.00003,-0.000132,0.000088,-0.00005,-0.000024,-0.000062,0.000947,0.000244,-0.000045,0.00576,-0.000008,-0.000008,0.003473,0.000626,-0.000057,-0.000047,0.000396,0.001185,0.000853,-0.00001,-0.000005,-0.000031,-0.000029,0.003853,0.001411,0.000282


In [None]:
# Standardise the feature columns before PCA / clustering.
stds = StandardScaler()
# The CSV is read again here, so this cell does not depend on the preview cell.
df_train = pl.read_csv(feature_csv_path)
# Keep everything except the two file-location columns.
X_train = df_train.select(pl.exclude(["filename", "dirname"]))
# Fit the scaler on X_train and transform it in one step.
X_train_std = stds.fit_transform(X_train)

In [None]:
# `n_features` is not assigned by any earlier cell, so this cell raised
# NameError on Restart & Run All. Derive it from the standardised matrix.
# NOTE(review): assumes the intent was "keep as many components as there are
# feature columns" -- confirm.
n_features = X_train_std.shape[1]

pca = PCA(n_features)
X_train_pca = pca.fit_transform(X_train_std)
cumsum_contrb_rate = pca.explained_variance_ratio_.cumsum()

# Cumulative explained variance, with a leading 0 so the curve starts at
# "zero components -> zero variance explained".
plt.plot(range(n_features+1), [0]+list(cumsum_contrb_rate))
plt.xlabel('Number of principal components')
plt.ylabel('Cumulative contribution rate')
plt.yticks(np.arange(0., 1.1, 0.1))
plt.grid()
plt.show()