In [None]:
import os

import numpy as np
import polars as pl

import joblib
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import cm
from PIL import Image
from pathlib import Path

from sklearn.preprocessing import StandardScaler
from sklearn.mixture import BayesianGaussianMixture
from sklearn.decomposition import PCA

from scipy.stats import rankdata


In [None]:
def scatter_each_classes(data, class_labels, rank, markers, colors, xylabel, facecolor='valid', scatter_classe=(None,)):
    """Draw a 2-D scatter plot with one series per class label, then show it.

    Parameters
    ----------
    data : array-like
        Indexed as ``data[mask, 0]`` (x) and ``data[mask, 1]`` (y).
    class_labels : array-like
        One label per row of ``data``; rows are grouped with
        ``class_labels == label``.
    rank : iterable
        Paired with the plotted labels via ``zip``; each value becomes the
        ``zorder`` of the corresponding series. Plotting stops at the shorter
        of ``rank`` and the label list.
    markers : sequence or None
        Marker per class, indexed by the label value. ``None`` draws every
        class with ``'o'``.
    colors : 'tab10', sequence, or matplotlib Colormap
        ``'tab10'`` expands to the ten colours of the tab10 colormap. A
        sequence is indexed by the label value. A colormap is called with
        ``label / len(labels)``.
    xylabel : sequence
        ``xylabel[0]`` is the x-axis label, ``xylabel[1]`` the y-axis label.
    facecolor : str
        Pass the string ``'None'`` to draw hollow markers (edge coloured only).
        Any other value draws filled markers.
    scatter_classe : sequence
        Labels to draw. If it contains ``None`` (the default), every unique
        value of ``class_labels`` is drawn.
    """
    labels = scatter_classe if None not in scatter_classe else np.unique(class_labels)

    # Guard with isinstance so array-like `colors` are never compared to a string.
    if isinstance(colors, str) and colors == 'tab10':
        colors = [plt.get_cmap('tab10')(i) for i in range(10)]
    # Accept any colormap class (e.g. ListedColormap), not only LinearSegmentedColormap.
    use_cmap = isinstance(colors, matplotlib.colors.Colormap)

    for label, z in zip(labels, rank):
        mask = class_labels == label
        # With no explicit markers every class uses 'o'; this also works when
        # the label values are not the contiguous range 0..n-1.
        marker = 'o' if markers is None else markers[label]
        color = colors(label / len(labels)) if use_cmap else colors[label]

        if facecolor == 'None' and marker != 'x':
            # Hollow markers: colour the edge only. 'x' is excluded because an
            # unfilled marker with no face colour would not be visible.
            style = {'edgecolors': color, 'facecolor': facecolor}
        else:
            # `color=` rather than `c=`: a single RGBA tuple passed as `c` can
            # be interpreted as values to colour-map when the number of points
            # equals the tuple length.
            style = {'color': color}

        plt.scatter(data[mask, 0], data[mask, 1], label=f'cluster{label}', marker=marker, zorder=z, **style)

    plt.xlabel(xylabel[0])
    plt.ylabel(xylabel[1])
    plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
    plt.tight_layout()
    plt.show()

In [None]:
def concat_images(imgs, n_col, n_row, padding):
    """Tile images into an ``n_col`` x ``n_row`` grid on a single 'L'-mode canvas.

    Parameters
    ----------
    imgs : sequence of PIL images
        Tiles, placed row by row (left to right, then top to bottom). The
        cell size is taken from ``imgs[0].size``.
    n_col, n_row : int
        Grid dimensions.
    padding : int
        Gap in pixels between neighbouring tiles and around the outer border.

    Returns
    -------
    PIL.Image.Image
        New image of size
        ``((w + padding) * n_col + padding, (h + padding) * n_row + padding)``.

    Notes
    -----
    Images beyond ``n_col * n_row`` are ignored. If fewer images are given,
    the remaining cells are left as the blank canvas instead of raising
    ``StopIteration``.
    """
    w, h = imgs[0].size
    cell_w = w + padding
    cell_h = h + padding
    canvas_size = (cell_w * n_col + padding, cell_h * n_row + padding)

    dst = Image.new('L', canvas_size)
    # Row-major cell coordinates; zip stops at the shorter of images and cells.
    cells = ((i, j) for j in range(n_row) for i in range(n_col))
    for img, (i, j) in zip(imgs, cells):
        dst.paste(img, (padding + cell_w * i, padding + cell_h * j))
    return dst

In [None]:
def make_path_from_df(row):
    """Return ``Path('<dirname>/<filename>')`` built from a row's two fields.

    ``row`` only needs to support ``row['dirname']`` and ``row['filename']``.
    """
    directory, name = row['dirname'], row['filename']
    return Path(f"{directory}/{name}")

In [None]:
def image_concat_and_imshow(df, labels, colrow, image_root):
    """Show a grid of randomly sampled images for every cluster label.

    Parameters
    ----------
    df : DataFrame
        Must support boolean-mask indexing, ``.sample(n)`` and
        ``.apply(func, axis=1)``, with rows exposing ``'dirname'`` and
        ``'filename'`` (read by ``make_path_from_df``).
    labels : array-like
        One cluster label per row of ``df``.
    colrow : sequence
        ``colrow[0]`` columns by ``colrow[1]`` rows per grid; that many images
        are sampled for each label.
    image_root : path-like supporting ``/``
        Directory that each row's relative path is joined onto.

    Returns
    -------
    list
        One tiled image per unique label, in ``np.unique(labels)`` order.
    """
    n_col, n_row = colrow[0], colrow[1]
    unique_labels = np.unique(labels)
    num_labels = len(unique_labels)

    concat_imgs = []
    for l in unique_labels:
        sampled = df[labels == l].sample(n_col * n_row)
        imgs = []
        try:
            for p in sampled.apply(make_path_from_df, axis=1):
                imgs.append(Image.open(image_root / p))
            concat_imgs.append(concat_images(imgs, n_col, n_row, 2))
        finally:
            # The tiles have been pasted into the grid; release their files.
            for img in imgs:
                img.close()

    fig = plt.figure(figsize=(10, 11))
    for idx, (label, grid) in enumerate(zip(unique_labels, concat_imgs)):
        ax = fig.add_subplot(1, num_labels, idx + 1)
        ax.imshow(grid)
        # Title with the real label value, not the subplot position.
        ax.set_title(f'cluster {label}')
        ax.axis('off')

    plt.show()

    return concat_imgs

In [None]:
# Per-cluster plotting styles, ten entries each.
colors = [f'tab:{name}' for name in (
    'blue', 'orange', 'green', 'red', 'purple',
    'brown', 'pink', 'gray', 'olive', 'cyan',
)]
# NOTE: 'D' appears twice (positions 1 and 6).
markers = [
    's', 'D', 'o', 'p', '*',
    'h', 'D', '8', 'v', 'x',
]

# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
feature_csv_path = "/home/shinsei/MyResearchs/feat_extrc/reports/features/SimpleCAE32/2023-07-07/12-15-24/features.csv"

In [None]:
# Load the extracted features and standardise each column.
df_train = pl.read_csv(feature_csv_path)

# The original statement (`X_train = df_train.`) was left unfinished.
# NOTE(review): assumes the feature columns are exactly the floating-point
# columns of the CSV and that everything else (e.g. the 'dirname'/'filename'
# path columns) is metadata -- confirm against the CSV schema.
X_train = df_train.select(pl.col([pl.Float32, pl.Float64])).to_numpy()
assert X_train.ndim == 2 and X_train.shape[1] > 0, "no floating-point feature columns found"

stds = StandardScaler()
X_train_std = stds.fit_transform(X_train)

In [None]:
# Fit a PCA that keeps as many components as there are input features and
# plot the cumulative explained-variance ratio.
# `n_features` was previously used without being defined anywhere above, so it
# is derived here from the standardised matrix.
# NOTE(review): assumes there are at least as many samples as features;
# PCA rejects n_components larger than min(n_samples, n_features).
n_features = X_train_std.shape[1]

pca = PCA(n_features)
X_train_pca = pca.fit_transform(X_train_std)
cumsum_contrb_rate = pca.explained_variance_ratio_.cumsum()

# Prepend 0 so the curve starts at "zero components explain nothing".
plt.plot(range(n_features + 1), [0] + list(cumsum_contrb_rate))
plt.xlabel('Number of principal components')
plt.ylabel('Cumulative contribution rate')
plt.yticks(np.arange(0., 1.1, 0.1))
plt.grid()
plt.show()