# Write a function that saves a trained SOM to disk using pickle.

In [None]:
import pickle

def save_trained_som(som, filename):
    """Pickle a trained SOM object to ``filename``.

    This is a best-effort helper: any exception raised while opening the
    file, serialising the object or reporting success is caught and reported
    on stdout instead of being propagated.

    Args:
        som: The object to serialise (any picklable object).
        filename: Destination path; the file is opened in binary write mode,
            so an existing file is overwritten.

    Returns:
        None, both on success and on failure.
    """
    try:
        with open(filename, 'wb') as handle:
            pickle.dump(som, handle)
        print(f"训练好的 SOM 已成功保存到文件：{filename}")
    except Exception as err:
        # Deliberately broad: the outcome is reported to the user, not raised.
        print(f"保存 SOM 时出现错误：{err}")


# Write a function that loads a saved SOM.

In [None]:
import pickle

def load_trained_som(filename):
    """Load a pickled SOM object from ``filename``.

    Failures are reported on stdout rather than raised.

    NOTE: unpickling can execute arbitrary code, so only load files that
    come from a trusted source.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        The unpickled object, or None when the file does not exist or
        cannot be read/unpickled.
    """
    try:
        with open(filename, 'rb') as handle:
            restored = pickle.load(handle)
        print(f"已成功从文件 {filename} 加载 SOM。")
        return restored
    except FileNotFoundError:
        print(f"错误：文件 {filename} 不存在。")
    except Exception as err:
        # Deliberately broad: any other failure is reported, not raised.
        print(f"加载 SOM 时出现错误：{err}")
    # Reached only after one of the failure branches above.
    return None


# Write a function that uses PCA to reduce the dimensionality of a SOM (trained on data with at least 4 dimensions, such as image data) to 3 dimensions, after which the three PCA components are used to display the SOM with RGB values. Do this first by fitting PCA to the entire training set and then taking the PCA transformation of each cell of the SOM, and second by fitting PCA to the SOM cells only. Show the difference and explain why the two results differ.

In [23]:
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

def visualize_som_with_pca_comparison(som_data, training_data):
    """Plot the SOM units in two 3-component PCA spaces, coloured as RGB.

    Two PCA models are fitted: one on ``training_data`` and one on
    ``som_data`` itself. The SOM units are transformed by each model and
    drawn as a 3D scatter plot, where each unit's colour is its three PCA
    coordinates min-max scaled to [0, 1] and read as (R, G, B).

    Why the two panels differ: PCA centres on the mean of, and aligns its
    axes with the directions of largest variance of, whatever it was fitted
    on. The training set weights every sample, so dense regions dominate;
    the SOM units form a much smaller set in which every cell counts once.
    The two fits therefore generally have different means and principal
    axes, which gives the same units different coordinates and colours.

    Args:
        som_data: 2-D array of SOM weight vectors, one row per unit
            (``shape[1]`` is used as the feature count).
        training_data: 2-D array of training samples with the same number
            of features as ``som_data``.

    Returns:
        Tuple ``(pca_training, pca_som)`` of the two fitted PCA objects.

    Raises:
        ValueError: If either input has fewer than 4 features.
    """
    if som_data.shape[1] < 4 or training_data.shape[1] < 4:
        raise ValueError("SOM 和训练数据的维度必须至少为 4！")

    # Variant 1: principal axes learned from the full training distribution.
    pca_training = PCA(n_components=3)
    pca_training.fit(training_data)
    som_pca_training = pca_training.transform(som_data)

    # Variant 2: principal axes learned from the SOM units only.
    pca_som = PCA(n_components=3)
    pca_som.fit(som_data)
    som_pca_som = pca_som.transform(som_data)

    som_rgb_training = _scale_to_unit_range(som_pca_training)
    som_rgb_som = _scale_to_unit_range(som_pca_som)

    fig = plt.figure(figsize=(14, 6))
    panels = (
        (121, som_pca_training, som_rgb_training, "PCA 拟合到训练数据集"),
        (122, som_pca_som, som_rgb_som, "PCA 拟合到 SOM 单元"),
    )
    for position, coords, colors, title in panels:
        ax = fig.add_subplot(position, projection='3d')
        ax.scatter(
            coords[:, 0], coords[:, 1], coords[:, 2],
            c=colors, marker='o', s=100, edgecolor='k', alpha=0.8
        )
        ax.set_title(title)
        ax.set_xlabel("PCA 1")
        ax.set_ylabel("PCA 2")
        ax.set_zlabel("PCA 3")

    plt.show()

    return pca_training, pca_som


def _scale_to_unit_range(components):
    """Min-max scale each column to [0, 1]; constant columns become 0."""
    lowest = np.min(components, axis=0)
    span = np.ptp(components, axis=0)
    # A constant component has zero range; dividing by it would produce NaN
    # colours, so such columns are divided by 1 instead (result: all zeros).
    safe_span = np.where(span > 0, span, 1.0)
    return (components - lowest) / safe_span


# Create a function that displays the SOM based on the images that are projected to their BMU. When filling the SOM with images, make sure that the images that are displayed are in fact the images that are closest to the SOM vector of that particular cell (in the current notebook this is a random image from the list of images in that cell)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import euclidean_distances
from PIL import Image

def display_som_with_images(som_data, image_data, image_paths, grid_shape):
    """Show the SOM grid, filling each cell with its best-matching image.

    Every image is first assigned to its best-matching unit (BMU), i.e. the
    SOM unit whose weight vector is nearest in Euclidean distance. For each
    cell, the image drawn is the one among the images assigned to that cell
    that lies closest to the cell's weight vector. Cells with no assigned
    image are left blank.

    Args:
        som_data: 2-D array of SOM weight vectors, one row per unit, in
            row-major grid order (unit ``i * cols + j`` is cell ``(i, j)``).
        image_data: 2-D array of image feature vectors, one row per image.
        image_paths: Sequence of image file paths, aligned with
            ``image_data``.
        grid_shape: Tuple ``(rows, cols)`` describing the SOM grid.

    Raises:
        ValueError: If the number of units differs from ``rows * cols`` or
            ``image_data`` and ``image_paths`` differ in length.
    """
    rows, cols = grid_shape
    n_units = som_data.shape[0]

    if n_units != rows * cols:
        raise ValueError("SOM 的单元数量必须等于网格大小 (rows * cols)。")

    if len(image_data) != len(image_paths):
        raise ValueError("图像数据与图像路径的长度必须一致。")

    # distances[u, k] is the distance between SOM unit u and image k.
    distances = euclidean_distances(som_data, image_data)

    # BMU of every image: the unit (row) with the smallest distance.
    bmu_indices = np.argmin(distances, axis=0)

    # squeeze=False keeps `axes` 2-D even for a single-row/column grid.
    fig, axes = plt.subplots(
        rows, cols, figsize=(cols * 2, rows * 2), squeeze=False
    )

    for unit_idx, ax in enumerate(axes.flat):
        ax.axis('off')

        members = np.flatnonzero(bmu_indices == unit_idx)
        if members.size == 0:
            # No image was projected onto this cell; leave it empty.
            continue

        # Among the images mapped to this cell, take the nearest one.
        img_idx = members[np.argmin(distances[unit_idx, members])]

        # The context manager closes the file; resize() returns a loaded copy.
        with Image.open(image_paths[img_idx]) as img:
            ax.imshow(img.resize((64, 64)))

    plt.suptitle("SOM Visualization with Images", fontsize=16)
    plt.tight_layout()
    plt.show()


# Create a function that shows an activation of the SOM based on an input as an overlay over the image SOM

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import euclidean_distances

def visualize_som_activation(som_data, input_data, grid_shape,
                             background_image=None, overlay_alpha=0.6):
    """Show how strongly each SOM unit responds to the given input(s).

    The activation of a unit for one input is ``1 / (1 + d)``, where ``d``
    is the Euclidean distance between the input and the unit's weight
    vector; activations are averaged over all inputs. The result is drawn
    as a heatmap on the SOM grid, optionally on top of a background image.

    Args:
        som_data: 2-D array of SOM weight vectors, one row per unit, in
            row-major grid order.
        input_data: Either a 2-D array with one input per row or a single
            1-D input vector.
        grid_shape: Tuple ``(rows, cols)`` describing the SOM grid.
        background_image: Optional image (anything ``plt.imshow`` accepts),
            e.g. a rendering of the image SOM. When given, it is stretched
            over the grid and the heatmap is drawn on top of it.
        overlay_alpha: Opacity of the heatmap when a background image is
            given; ignored otherwise.

    Raises:
        ValueError: If the number of units differs from ``rows * cols`` or
            the feature dimensions of SOM and input differ.
    """
    rows, cols = grid_shape
    n_units = som_data.shape[0]

    if n_units != rows * cols:
        raise ValueError("SOM 的单元数量必须等于网格大小 (rows * cols)。")

    # Promote a single 1-D input vector to a one-row matrix.
    input_data = np.atleast_2d(input_data)

    if som_data.shape[1] != input_data.shape[1]:
        raise ValueError("SOM 数据和输入数据的特征维度 (data_dim) 必须一致。")

    # distances[k, u] is the distance between input k and SOM unit u.
    distances = euclidean_distances(input_data, som_data)

    # Distance 0 maps to activation 1; larger distances decay towards 0.
    activation_map = np.mean(1 / (1 + distances), axis=0)

    activation_grid = activation_map.reshape(rows, cols)

    plt.figure(figsize=(10, 8))

    heat_alpha = None
    if background_image is not None:
        # Same extent imshow uses for a (rows, cols) array, so that the
        # background and the heatmap cells line up.
        plt.imshow(
            background_image,
            extent=(-0.5, cols - 0.5, rows - 0.5, -0.5),
        )
        heat_alpha = overlay_alpha

    heat = plt.imshow(
        activation_grid, cmap='hot', interpolation='nearest', alpha=heat_alpha
    )
    plt.colorbar(heat, label='Activation Intensity')
    plt.title("SOM Activation Map")
    plt.axis('off')
    plt.show()


# Write a function that returns a grid of images where each image is the closest image from the training data set to that cell in the SOM. (This is the variant in which one image can show up multiple times in the SOM, as we discussed in class.)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import euclidean_distances
from PIL import Image

def create_image_grid(som_data, image_data, image_paths, grid_shape):
    """Build a grid figure showing, per SOM cell, its nearest training image.

    For every SOM unit, the image whose feature vector has the smallest
    Euclidean distance to the unit's weight vector is displayed. The search
    runs over all images for every cell, so the same image may appear in
    several cells and no cell is left empty.

    Args:
        som_data: 2-D array of SOM weight vectors, one row per unit, in
            row-major grid order (unit ``i * cols + j`` is cell ``(i, j)``).
        image_data: 2-D array of image feature vectors, one row per image.
        image_paths: Sequence of image file paths, aligned with
            ``image_data``.
        grid_shape: Tuple ``(rows, cols)`` describing the SOM grid.

    Returns:
        The matplotlib figure containing the grid.

    Raises:
        ValueError: If the number of units differs from ``rows * cols`` or
            ``image_data`` and ``image_paths`` differ in length.
    """
    rows, cols = grid_shape
    n_units = som_data.shape[0]

    if n_units != rows * cols:
        raise ValueError("SOM 的单元数量必须等于网格大小 (rows * cols)。")

    if len(image_data) != len(image_paths):
        raise ValueError("图像数据与图像路径的长度必须一致。")

    # distances[k, u] is the distance between image k and SOM unit u.
    distances = euclidean_distances(image_data, som_data)

    # For each unit (column), the index of the nearest image (row).
    closest_image_indices = np.argmin(distances, axis=0)

    # squeeze=False keeps `axes` 2-D even for a single-row/column grid.
    fig, axes = plt.subplots(
        rows, cols, figsize=(cols * 2, rows * 2), squeeze=False
    )

    # axes.flat iterates row-major, matching the unit ordering.
    for unit_idx, ax in enumerate(axes.flat):
        img_idx = closest_image_indices[unit_idx]

        # The context manager closes the file; resize() returns a loaded copy.
        with Image.open(image_paths[img_idx]) as img:
            ax.imshow(img.resize((64, 64)))
        ax.axis('off')

    plt.suptitle("SOM Image Grid", fontsize=16)
    plt.tight_layout()
    plt.show()

    return fig


# Now do the same with a text dataset, or another dataset of your choosing that is not image based.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances

def create_text_grid(som_data, text_data, grid_shape, vectorizer=None):
    """Build a grid figure showing a representative text per SOM cell.

    Texts are turned into TF-IDF vectors and each text is assigned to its
    best-matching unit (nearest SOM weight vector in Euclidean distance).
    Each cell shows the text that is closest to the cell's weight vector
    among the texts assigned to it, or "No Data" if none were assigned.

    Args:
        som_data: 2-D array of SOM weight vectors, one row per unit, in
            row-major grid order.
        text_data: Iterable of text documents (strings).
        grid_shape: Tuple ``(rows, cols)`` describing the SOM grid.
        vectorizer: Optional already-fitted vectorizer exposing
            ``transform``. Pass the vectorizer used to build the SOM's
            training vectors so that the texts land in the same feature
            space. When omitted, a new ``TfidfVectorizer`` limited to
            ``som_data.shape[1]`` features is fitted on ``text_data``; its
            features match the SOM's only if the SOM was trained on
            vectors produced the same way.

    Returns:
        The matplotlib figure containing the grid.

    Raises:
        ValueError: If the number of units differs from ``rows * cols``,
            ``text_data`` is a single string, or the vectorised texts do
            not have the same number of features as ``som_data``.
    """
    rows, cols = grid_shape
    n_units = som_data.shape[0]

    if n_units != rows * cols:
        raise ValueError("SOM 的单元数量必须等于网格大小 (rows * cols)。")

    if isinstance(text_data, str):
        # A bare string would otherwise be split into single characters.
        raise ValueError("text_data 必须是由字符串组成的可迭代对象，而不是单个字符串。")

    # Materialise once so positional indexing below is well-defined for any
    # iterable (e.g. a pandas Series with a non-default index).
    texts = list(text_data)

    if vectorizer is None:
        vectorizer = TfidfVectorizer(max_features=som_data.shape[1])
        text_vectors = vectorizer.fit_transform(texts)
    else:
        text_vectors = vectorizer.transform(texts)
    if hasattr(text_vectors, "toarray"):
        # Sparse matrix -> dense array.
        text_vectors = text_vectors.toarray()

    if som_data.shape[1] != text_vectors.shape[1]:
        raise ValueError("SOM 数据的特征维度与文本向量化后的维度必须一致。")

    # distances[k, u] is the distance between text k and SOM unit u.
    distances = euclidean_distances(text_vectors, som_data)

    # BMU of every text: the unit (column) with the smallest distance.
    bmu_indices = np.argmin(distances, axis=1)

    # squeeze=False keeps `axes` 2-D even for a single-row/column grid.
    fig, axes = plt.subplots(
        rows, cols, figsize=(cols * 3, rows * 3), squeeze=False
    )

    # axes.flat iterates row-major, matching the unit ordering.
    for unit_idx, ax in enumerate(axes.flat):
        mapped_texts = np.flatnonzero(bmu_indices == unit_idx)

        if mapped_texts.size > 0:
            # Most representative text: the mapped one nearest to the unit.
            text_idx = mapped_texts[np.argmin(distances[mapped_texts, unit_idx])]
            ax.text(0.5, 0.5, texts[text_idx], ha='center', va='center', wrap=True, fontsize=8)
        else:
            ax.text(0.5, 0.5, "No Data", ha='center', va='center', fontsize=8)

        ax.axis('off')

    plt.suptitle("SOM Text Grid", fontsize=16)
    plt.tight_layout()
    plt.show()

    return fig
