In [None]:
%reset -f
import numpy as np                             # Import NumPy library for numerical computation and array operations
import pandas as pd                            # Import Pandas library for data manipulation and analysis
import matplotlib.pyplot as plt                # Import Matplotlib library for data visualization
from concurrent.futures import ProcessPoolExecutor  # Parallel processing
import os, gc, time, zipfile                   # Import os module for file path operations
from tqdm import tqdm                          # Import tqdm library for progress bar display
from datetime import datetime                  # Import datetime module for time-related operations
from sklearn.preprocessing import StandardScaler
import umap.umap_ as umap
import umap
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import warnings

# Import the project-local bootstrap helper and run it once, before any paths are configured below.
# NOTE(review): what init_project_environment() changes (cwd, sys.path, env vars, ...) is not visible
# from this notebook — confirm, because the path configuration that follows uses os.path.abspath(".").
from init_project_environment import init_project_environment
init_project_environment()

# ============ Project Path Configuration ============
# The current working directory (made absolute) acts as the project root.
project_root = os.path.abspath(".")

# Every project folder sits directly under the root; build all of the paths in one pass.
data_dir, log_dir, result_dir, model_dir, saved_models = (
    os.path.join(project_root, folder_name)
    for folder_name in ("A_Data", "B_log", "D_Result", "C_Model", "saved_models")
)

In [None]:
# === Step 1: Read Data ===
data_path = os.path.join(data_dir, "lag_merged_all.parquet")
drop_cols = ["Estbdt_13_lag", "Ipodt_13_lag", "Province_lag"]

# Load the merged table and discard the columns that are not needed
# (errors="ignore" means a listed column that is absent is silently skipped).
df = pd.read_parquet(data_path).drop(columns=drop_cols, errors="ignore")

# === Step 2: Separate X, y, and ID ===
id_cols = ["Stkcd", "Trddt"]
y = df["insider_trading"]
# Feature matrix = everything except the label and the identifier columns
X = df.drop(columns=["insider_trading", *id_cols], errors="ignore")

# X_sample = X
# y_sample = y

# === Step 3: Separate positive and negative samples ===
# Split features and labels by class: label 1 first, label 0 second.
# Down-sampling of the negative class is performed later, inside each visualization cell.
df_pos, df_neg = (X[y == label_value] for label_value in (1, 0))
y_pos, y_neg = (y[y == label_value] for label_value in (1, 0))

# === General Settings ===
seed = 42
n_runs = 5
rng = np.random.RandomState(seed)

In [None]:
# Column handles shared by all statistics below
stock_codes = df["Stkcd"]
insider_flag = df["insider_trading"]

# Distinct stock codes in the data set, and how many there are
all_stocks = stock_codes.unique()
total_stocks = stock_codes.nunique()

# Stocks with at least one record labelled 1
stocks_with_insider = stock_codes[insider_flag == 1].unique()
num_stocks_with_insider = len(stocks_with_insider)

# Stocks that never appear with label 1 (set difference against the full list)
stocks_without_insider = np.setdiff1d(all_stocks, stocks_with_insider)
num_stocks_without_insider = len(stocks_without_insider)

# Row-level counts: all records, label == 1 records, label == 0 records
total_samples = len(df)
num_positive = (insider_flag == 1).sum()
num_negative = (insider_flag == 0).sum()

# Report everything in one call, one statistic per line
print(
    f"Total number of stocks: {total_stocks}",
    f"Stocks with at least one insider trading incident: {num_stocks_with_insider}",
    f"Stocks with no insider trading incidents: {num_stocks_without_insider}",
    f"Total number of samples (trading days): {total_samples}",
    f"Number of insider trading samples (label = 1): {num_positive}",
    f"Number of non-insider trading samples (label = 0): {num_negative}",
    sep="\n",
)

### Only PCA 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
import os
import shutil

# Directory that receives the PCA-only figures (created on demand).
# NOTE(review): absolute, machine-specific path — consider making it configurable.
out_dir = "/root/autodl-tmp/UMAP_fig_20250928/OnlyPCA"
os.makedirs(out_dir, exist_ok=True)

# Negatives drawn per positive, and number of independent resamples plotted per ratio
sampling_ratios = [1, 2, 5]
n_runs = 10

# Loop-invariant values are built once instead of on every iteration
n_pos = len(df_pos)
legend_handles = [
    plt.Line2D([0], [0], marker='o', color='w', label='Non-Insider Trading (0)', markerfacecolor='black', markersize=6),
    plt.Line2D([0], [0], marker='o', color='w', label='Insider Trading (1)', markerfacecolor='red', markersize=6),
]

for ratio in sampling_ratios:
    n_neg = int(n_pos * ratio)  # number of negative rows to draw for this ratio

    for run in range(1, n_runs + 1):
        # Draw the negative subset; the seed changes with the run so every run sees a different sample.
        # NOTE(review): sklearn.utils.resample samples WITH replacement unless replace=False is passed —
        # confirm that repeated negative rows are acceptable here.
        df_neg_resampled, y_neg_resampled = resample(df_neg, y_neg, n_samples=n_neg, random_state=42 + run)

        # Stack positives on top of the sampled negatives (features and labels stay row-aligned)
        df_sampled = pd.concat([df_pos, df_neg_resampled], axis=0)
        y_sampled = pd.concat([y_pos, y_neg_resampled], axis=0)
        print(y_sampled.value_counts())

        # Step 1: Data cleaning – map +/-inf to NaN, fill NaN with 0, then standardize every feature
        df_sampled = df_sampled.replace([np.inf, -np.inf], np.nan).fillna(0)
        scaler = StandardScaler()
        df_scaled = scaler.fit_transform(df_sampled)

        # Step 2: PCA down to two components for plotting
        pca = PCA(n_components=2)
        pca_result = pca.fit_transform(df_scaled)

        # Step 3: Visualization (red = label 1, black = everything else).
        # The axes aspect ratio is left at matplotlib's default in this cell.
        colors = np.where(y_sampled == 1, 'red', 'black')
        fig, ax = plt.subplots(figsize=(6, 6))
        ax.scatter(pca_result[:, 0], pca_result[:, 1], c=colors, alpha=0.5, s=5)
        ax.legend(handles=legend_handles, loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=2, fontsize=10, frameon=False)
        ax.set_title(f'PCA Visualization - Sample {ratio}:1 - Run {run}')
        ax.set_xlabel('Principal Component 1')
        ax.set_ylabel('Principal Component 2')

        # Save figure with transparent background, display it, then release it
        out_path = os.path.join(out_dir, f"OnlyPCA_{ratio}_run{run}.png")
        fig.savefig(out_path, dpi=300, bbox_inches="tight", transparent=True)
        plt.show()
        plt.close(fig)  # avoid accumulating open figures across the loop
        # Fixed: the original line ended with a stray double quote, which was a SyntaxError for the whole cell
        print(f"Run {run} completed and figure saved to {out_path}")

### Only t-SNE

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
import os
import shutil

# Directory that receives the t-SNE-only figures (created on demand).
# NOTE(review): absolute, machine-specific path — consider making it configurable.
out_dir = "/root/autodl-tmp/UMAP_fig_20250928/Onlyt-SNE"
#shutil.rmtree(out_dir)  # Remove all contents in the specified folder
os.makedirs(out_dir, exist_ok=True)

# Negatives drawn per positive, number of resamples per ratio, and the t-SNE seed
sampling_ratios = [1, 2, 5]
n_runs = 10
random_state = 42

# Loop-invariant values are built once instead of on every iteration
n_pos = len(df_pos)
legend_handles = [
    plt.Line2D([0], [0], marker='o', color='w', label='Non-Insider Trading (0)', markerfacecolor='black', markersize=6),
    plt.Line2D([0], [0], marker='o', color='w', label='Insider Trading (1)', markerfacecolor='red', markersize=6),
]

for ratio in sampling_ratios:
    n_neg = int(n_pos * ratio)  # number of negative rows to draw for this ratio

    for run in range(1, n_runs + 1):
        # Draw the negative subset; the seed changes with the run so every run sees a different sample.
        # NOTE(review): sklearn.utils.resample samples WITH replacement unless replace=False is passed —
        # confirm that repeated negative rows are acceptable here.
        df_neg_resampled, y_neg_resampled = resample(df_neg, y_neg, n_samples=n_neg, random_state=42 + run)

        # Stack positives on top of the sampled negatives (features and labels stay row-aligned)
        df_sampled = pd.concat([df_pos, df_neg_resampled], axis=0)
        y_sampled = pd.concat([y_pos, y_neg_resampled], axis=0)
        print(y_sampled.value_counts())

        # Step 1: Data cleaning — map +/-inf to NaN, fill NaN with 0, then standardize every feature
        df_sampled = df_sampled.replace([np.inf, -np.inf], np.nan).fillna(0)
        scaler = StandardScaler()
        df_scaled = scaler.fit_transform(df_sampled)

        # Step 2: Dimensionality reduction with t-SNE (perplexity and other parameters can be tuned here).
        # The t-SNE seed is fixed, so differences between runs come from the resampled negatives only.
        tsne = TSNE(n_components=2, random_state=random_state, perplexity=30)
        tsne_result = tsne.fit_transform(df_scaled)

        # Step 3: Visualization (red = label 1, black = everything else), equal aspect ratio on both axes
        colors = np.where(y_sampled == 1, 'red', 'black')
        fig, ax = plt.subplots(figsize=(6, 6))
        ax.set_aspect('equal', adjustable='box')
        ax.scatter(tsne_result[:, 0], tsne_result[:, 1], c=colors, alpha=0.5, s=5)
        ax.legend(handles=legend_handles, loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=2, fontsize=10, frameon=False)
        ax.set_title(f't-SNE Visualization - Sample {ratio}:1 - Run {run}')
        ax.set_xlabel('t-SNE Component 1')
        ax.set_ylabel('t-SNE Component 2')

        # Save the figure with a transparent background, display it, then release it
        out_path = os.path.join(out_dir, f"Onlyt-SNE_{ratio}_per30_run{run}.png")
        fig.savefig(out_path, dpi=300, bbox_inches="tight", transparent=True)
        plt.show()
        plt.close(fig)  # avoid accumulating open figures across the loop
        print(f"Completed run {run} and saved figure to {out_path}")

### Only UMAP

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from umap import UMAP
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
import os
import shutil

# Directory that receives the UMAP-only figures (created on demand).
# NOTE(review): absolute, machine-specific path — consider making it configurable.
out_dir = "/root/autodl-tmp/UMAP_fig_20250928/OnlyUMAP"
os.makedirs(out_dir, exist_ok=True)

# Negatives drawn per positive, number of resamples per ratio, UMAP seed and neighbourhood sizes to compare
sampling_ratios = [1, 2, 5]
n_runs = 5
random_state = 42
n_neighbors_values = [15, 50, 100, 5, 10]

# Loop-invariant values are built once instead of on every iteration
n_pos = len(df_pos)
legend_handles = [
    plt.Line2D([0], [0], marker='o', color='w', label='Non-Insider Trading (0)', markerfacecolor='black', markersize=6),
    plt.Line2D([0], [0], marker='o', color='w', label='Insider Trading (1)', markerfacecolor='red', markersize=6),
]

for ratio in sampling_ratios:
    n_neg = int(n_pos * ratio)  # number of negative rows to draw for this ratio

    for run in range(1, n_runs + 1):
        # Draw the negative subset; the seed changes with the run so every run sees a different sample.
        # NOTE(review): sklearn.utils.resample samples WITH replacement unless replace=False is passed —
        # confirm that repeated negative rows are acceptable here.
        df_neg_resampled, y_neg_resampled = resample(df_neg, y_neg, n_samples=n_neg, random_state=42 + run)

        # Stack positives on top of the sampled negatives (features and labels stay row-aligned)
        df_sampled = pd.concat([df_pos, df_neg_resampled], axis=0)
        y_sampled = pd.concat([y_pos, y_neg_resampled], axis=0)
        print(y_sampled.value_counts())

        # Step 1: Data cleaning — map +/-inf to NaN, fill NaN with 0, then standardize every feature
        df_sampled = df_sampled.replace([np.inf, -np.inf], np.nan).fillna(0)
        scaler = StandardScaler()
        df_scaled = scaler.fit_transform(df_sampled)

        # Point colours depend only on the labels, so compute them once per sample
        colors = np.where(y_sampled == 1, 'red', 'black')

        for n_neighbors in n_neighbors_values:
            # Step 2: Dimensionality reduction with UMAP for the current n_neighbors.
            # The estimator is named umap_model (not `umap`) so the `umap` module imported at the
            # top of the notebook is not overwritten by an estimator instance.
            umap_model = UMAP(n_components=2, random_state=random_state, n_neighbors=n_neighbors, min_dist=0.1)
            umap_result = umap_model.fit_transform(df_scaled)

            # Step 3: Visualization (red = label 1, black = everything else), equal aspect ratio on both axes
            fig, ax = plt.subplots(figsize=(6, 6))
            ax.set_aspect('equal', adjustable='box')
            ax.scatter(umap_result[:, 0], umap_result[:, 1], c=colors, alpha=0.5, s=5)
            ax.legend(handles=legend_handles, loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=2, fontsize=10, frameon=False)
            ax.set_title(f'UMAP Visualization - Sample {ratio}:1 - Run {run} - n_neighbors={n_neighbors}')
            ax.set_xlabel('UMAP Component 1')
            ax.set_ylabel('UMAP Component 2')

            # Save figure with transparent background, display it, then release it
            out_path = os.path.join(out_dir, f"OnlyUMAP_{ratio}_n{n_neighbors}_run{run}.png")
            fig.savefig(out_path, dpi=300, bbox_inches="tight", transparent=True)
            plt.show()
            plt.close(fig)  # avoid accumulating open figures across the nested loops
            print(f"Completed run {run} and saved figure to {out_path}, n_neighbors={n_neighbors}")
### PCA then UMAP

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from umap import UMAP
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA  # 引入PCA
from sklearn.utils import resample
import os
import shutil

# Directory that receives the PCA+UMAP figures (created on demand).
# NOTE(review): absolute, machine-specific path — consider making it configurable.
out_dir = "/root/autodl-tmp/UMAP_fig_20250928/PCA+UMAP"
#shutil.rmtree(out_dir) # Remove all contents in the specified folder
os.makedirs(out_dir, exist_ok=True)

# Negatives drawn per positive, number of resamples per ratio, seed and UMAP neighbourhood sizes to compare
sampling_ratios = [1, 2, 5]
n_runs = 5
random_state = 42
n_neighbors_values = [15, 50, 100, 5, 10]  # can be set to multiple values
# Fraction of variance the PCA step must retain (a float in (0, 1) lets PCA choose the component count)
variance_threshold = 0.90

# Loop-invariant values are built once instead of on every iteration
n_pos = len(df_pos)
legend_handles = [
    plt.Line2D([0], [0], marker='o', color='w', label='Non-Insider Trading (0)', markerfacecolor='black', markersize=6),
    plt.Line2D([0], [0], marker='o', color='w', label='Insider Trading (1)', markerfacecolor='red', markersize=6),
]

for ratio in sampling_ratios:
    n_neg = int(n_pos * ratio)  # number of negative rows to draw for this ratio

    for run in range(1, n_runs + 1):
        # Draw the negative subset; the seed changes with the run so every run sees a different sample.
        # NOTE(review): sklearn.utils.resample samples WITH replacement unless replace=False is passed —
        # confirm that repeated negative rows are acceptable here.
        df_neg_resampled, y_neg_resampled = resample(df_neg, y_neg, n_samples=n_neg, random_state=42 + run)

        # Stack positives on top of the sampled negatives (features and labels stay row-aligned)
        df_sampled = pd.concat([df_pos, df_neg_resampled], axis=0)
        y_sampled = pd.concat([y_pos, y_neg_resampled], axis=0)
        print(y_sampled.value_counts())

        # Step 1: Data cleaning — map +/-inf to NaN, fill NaN with 0, then standardize every feature
        df_sampled = df_sampled.replace([np.inf, -np.inf], np.nan).fillna(0)
        scaler = StandardScaler()
        df_scaled = scaler.fit_transform(df_sampled)

        # Step 2: PCA keeping enough components to reach the variance threshold
        pca = PCA(n_components=variance_threshold, random_state=random_state)
        pca_result = pca.fit_transform(df_scaled)
        print(f"Selected PCA components: {pca.n_components_}")  # number of components actually kept

        # Point colours depend only on the labels, so compute them once per sample
        colors = np.where(y_sampled == 1, 'red', 'black')

        # Step 3: Reduce the PCA scores to 2-D with UMAP, once per n_neighbors value
        for n_neighbors in n_neighbors_values:
            # The estimator is named umap_model (not `umap`) so the `umap` module imported at the
            # top of the notebook is not overwritten by an estimator instance.
            umap_model = UMAP(n_components=2, random_state=random_state, n_neighbors=n_neighbors, min_dist=0.1)
            umap_result = umap_model.fit_transform(pca_result)

            # Step 4: Visualization (red = label 1, black = everything else), equal aspect ratio on both axes
            fig, ax = plt.subplots(figsize=(6, 6))
            ax.set_aspect('equal', adjustable='box')
            ax.scatter(umap_result[:, 0], umap_result[:, 1], c=colors, alpha=0.5, s=5)
            ax.legend(handles=legend_handles, loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=2, fontsize=10, frameon=False)
            ax.set_title(f'PCA + UMAP Visualization - Sample {ratio}:1 - Run {run} - n_neighbors={n_neighbors}')
            ax.set_xlabel('Component 1')
            ax.set_ylabel('Component 2')

            # Save the figure with a transparent background, display it, then release it
            out_path = os.path.join(out_dir, f"PCA_UMAP_{ratio}_n{n_neighbors}_run{run}.png")
            fig.savefig(out_path, dpi=300, bbox_inches="tight", transparent=True)
            plt.show()
            plt.close(fig)  # avoid accumulating open figures across the nested loops
            print(f"Completed run {run} and saved figure to {out_path}, n_neighbors={n_neighbors}")

### PCA then t-SNE 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA  # 引入PCA
from sklearn.utils import resample
import os
import shutil

# Directory that receives the PCA+t-SNE figures (created on demand).
# NOTE(review): absolute, machine-specific path — consider making it configurable.
out_dir = "/root/autodl-tmp/UMAP_fig_20250928/PCA+t-SNE"
#shutil.rmtree(out_dir) # Remove all contents in the specified folder
os.makedirs(out_dir, exist_ok=True)

# Negatives drawn per positive, number of resamples per ratio, seed and t-SNE perplexities to compare
sampling_ratios = [1, 2, 5]
n_runs = 5
random_state = 42
perplexity_values = [5, 10, 30, 50]
# Fraction of variance the PCA step must retain (a float in (0, 1) lets PCA choose the component count)
variance_threshold = 0.90

# scikit-learn 1.5 renamed TSNE's `n_iter` argument to `max_iter` and later removed the old name.
# Pick whichever keyword the installed version accepts so the cell runs on both old and new releases.
tsne_iter_kwarg = "max_iter" if "max_iter" in TSNE().get_params() else "n_iter"

# Loop-invariant values are built once instead of on every iteration
n_pos = len(df_pos)
legend_handles = [
    plt.Line2D([0], [0], marker='o', color='w', label='Non-Insider Trading (0)', markerfacecolor='black', markersize=6),
    plt.Line2D([0], [0], marker='o', color='w', label='Insider Trading (1)', markerfacecolor='red', markersize=6),
]

for ratio in sampling_ratios:
    n_neg = int(n_pos * ratio)  # number of negative rows to draw for this ratio

    for run in range(1, n_runs + 1):
        # Draw the negative subset; the seed changes with the run so every run sees a different sample.
        # NOTE(review): sklearn.utils.resample samples WITH replacement unless replace=False is passed —
        # confirm that repeated negative rows are acceptable here.
        df_neg_resampled, y_neg_resampled = resample(df_neg, y_neg, n_samples=n_neg, random_state=42 + run)

        # Stack positives on top of the sampled negatives (features and labels stay row-aligned)
        df_sampled = pd.concat([df_pos, df_neg_resampled], axis=0)
        y_sampled = pd.concat([y_pos, y_neg_resampled], axis=0)
        print(y_sampled.value_counts())

        # Step 1: Data cleaning — map +/-inf to NaN, fill NaN with 0, then standardize every feature
        df_sampled = df_sampled.replace([np.inf, -np.inf], np.nan).fillna(0)
        scaler = StandardScaler()
        df_scaled = scaler.fit_transform(df_sampled)

        # Step 2: PCA keeping enough components to reach the variance threshold
        pca = PCA(n_components=variance_threshold, random_state=random_state)
        pca_result = pca.fit_transform(df_scaled)

        print(f"Selected PCA components: {pca.n_components_}")  # number of components actually kept

        # Point colours depend only on the labels, so compute them once per sample
        colors = np.where(y_sampled == 1, 'red', 'black')

        # Step 3: Reduce the PCA scores to 2-D with t-SNE, once per perplexity value
        for perp in perplexity_values:
            # The t-SNE seed follows the run index, like the resampling seed above
            tsne = TSNE(
                n_components=2,
                perplexity=perp,
                learning_rate="auto",
                init="pca",
                random_state=42 + run,
                **{tsne_iter_kwarg: 2000},
            )
            tsne_result = tsne.fit_transform(pca_result)

            # Step 4: Visualization (red = label 1, black = everything else), equal aspect ratio on both axes
            fig, ax = plt.subplots(figsize=(6, 6))
            ax.set_aspect('equal', adjustable='box')
            ax.scatter(tsne_result[:, 0], tsne_result[:, 1], c=colors, alpha=0.5, s=5)
            ax.legend(handles=legend_handles, loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=2, fontsize=10, frameon=False)
            ax.set_title(f'PCA + t-SNE Visualization - Sample {ratio}:1 - Run {run} - perplexity={perp}')
            ax.set_xlabel('Component 1')
            ax.set_ylabel('Component 2')

            # Save the figure with a transparent background, display it, then release it
            out_path = os.path.join(out_dir, f"PCA_tSNE_{ratio}_perp{perp}_run{run}.png")
            fig.savefig(out_path, dpi=300, bbox_inches="tight", transparent=True)
            plt.show()
            plt.close(fig)  # avoid accumulating open figures across the nested loops
            print(f"Completed run {run} and saved figure to {out_path}, perplexity={perp}")

### UMAP + Leiden

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from umap import UMAP
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
import igraph as ig
import leidenalg
import os

# Directory that receives the UMAP+Leiden figures (created on demand).
# NOTE(review): absolute, machine-specific path — consider making it configurable.
out_dir = "/root/autodl-tmp/UMAP_fig_20250928/UMAP+Leiden1029"
os.makedirs(out_dir, exist_ok=True)

# Parameters: sampling ratios, resamples per ratio, base seed, UMAP neighbourhood sizes, Leiden resolutions
sampling_ratios = [1]
n_runs = 5
random_state = 42
n_neighbors_values = [15]
resolution_values = [0.05, 0.1, 0.2, 0.3, 0.4, 0.7, 1.0, 1.5]

# Loop-invariant values are built once instead of on every iteration
n_pos = len(df_pos)
legend_handles = [
    plt.Line2D([0], [0], marker='o', color='w', label='Non-Insider Trading (0)',
               markerfacecolor='black', markersize=6),
    plt.Line2D([0], [0], marker='o', color='w', label='Insider Trading (1)',
               markerfacecolor='red', markersize=6),
]

for ratio in sampling_ratios:
    n_neg = int(n_pos * ratio)  # number of negative rows to draw for this ratio

    for run in range(1, n_runs + 1):
        # Draw the negative subset; the seed changes with the run so every run sees a different sample.
        # NOTE(review): sklearn.utils.resample samples WITH replacement unless replace=False is passed —
        # confirm that repeated negative rows are acceptable here.
        df_neg_resampled, y_neg_resampled = resample(
            df_neg, y_neg, n_samples=n_neg, random_state=random_state + run
        )
        df_sampled = pd.concat([df_pos, df_neg_resampled], axis=0)
        y_sampled = pd.concat([y_pos, y_neg_resampled], axis=0)
        print(y_sampled.value_counts())

        # Clean (+/-inf -> NaN -> 0) and standardize
        df_clean = df_sampled.replace([np.inf, -np.inf], np.nan).fillna(0)
        scaler = StandardScaler()
        df_scaled = scaler.fit_transform(df_clean)

        # Point colours for the true-label panel depend only on the labels
        colors = np.where(y_sampled == 1, 'red', 'black')

        for n_neighbors in n_neighbors_values:
            # UMAP reduction to 2-D
            umap_model = UMAP(
                n_components=2,
                random_state=random_state,
                n_neighbors=n_neighbors,
                min_dist=0.1,
                metric='euclidean'
            )
            umap_result = umap_model.fit_transform(df_scaled)

            # Build the weighted graph for Leiden from UMAP's fitted graph_ matrix.
            # * Converting to COO keeps row / col / data aligned entry by entry (pairing nonzero()
            #   with .data relied on the matrix holding no explicitly stored zeros).
            # * Only entries with row < col are kept: for a symmetric matrix this inserts each
            #   undirected edge once instead of twice, and it also drops any self-loops.
            #   NOTE(review): assumes graph_ is symmetric, as produced by UMAP's fuzzy union — confirm.
            # * n= fixes the vertex count to the number of samples; without it igraph sizes the graph
            #   from the largest edge index, so isolated trailing samples would be missing and the
            #   membership vector would be shorter than the embedding.
            knn_graph = umap_model.graph_.tocoo()
            upper = knn_graph.row < knn_graph.col
            edges = list(zip(knn_graph.row[upper].tolist(), knn_graph.col[upper].tolist()))
            weights = knn_graph.data[upper].tolist()
            g = ig.Graph(n=knn_graph.shape[0], edges=edges, directed=False)
            g.es['weight'] = weights

            for resolution in resolution_values:
                # Seeded so that the partition is reproducible for a given run
                partition = leidenalg.find_partition(
                    g,
                    leidenalg.RBConfigurationVertexPartition,
                    weights=g.es['weight'],
                    resolution_parameter=resolution,
                    seed=random_state + run
                )
                cluster_labels = np.array(partition.membership)
                n_clusters = len(set(cluster_labels))
                print(f"n_neighbors={n_neighbors}, resolution={resolution} → {n_clusters} clusters")

                # Visualization: clusters and true labels side by side
                fig, axes = plt.subplots(1, 2, figsize=(12, 6), sharex=True, sharey=True)
                # Left panel: clusters
                # NOTE(review): 'tab10' has ten distinct colours; with more clusters some will share a colour.
                sc0 = axes[0].scatter(
                    umap_result[:, 0], umap_result[:, 1],
                    c=cluster_labels, cmap='tab10', alpha=0.6, s=5
                )
                axes[0].set_title('Clusters Only')
                axes[0].set_xlabel('UMAP Component 1')
                axes[0].set_ylabel('UMAP Component 2')
                axes[0].set_aspect('equal', 'box')
                # Right panel: true labels
                axes[1].scatter(
                    umap_result[:, 0], umap_result[:, 1],
                    c=colors, alpha=0.4, s=5
                )
                axes[1].set_title('True Labels (Y)')
                axes[1].set_xlabel('UMAP Component 1')
                axes[1].set_ylabel('UMAP Component 2')
                axes[1].set_aspect('equal', 'box')
                # Add legend for true labels
                axes[1].legend(handles=legend_handles, title='Legend', loc='best')

                # Main title
                fig.suptitle(
                    f'UMAP + Leiden Clustering | Sample {ratio}:1 - Run {run}\n' +
                    f'n_neighbors={n_neighbors} - res={resolution} ({n_clusters} clusters)',
                    fontsize=14
                )

                # Colorbar: align height with subplots.
                # The panels are laid out and rendered once BEFORE the colorbar axes are placed, so
                # get_position() reports the final equal-aspect box; tight_layout does not manage
                # manually added axes, so running it afterwards would leave the bar misaligned.
                fig.tight_layout(rect=[0, 0, 1, 0.95])
                fig.canvas.draw()
                pos = axes[0].get_position()
                cax = fig.add_axes([pos.x1 + 0.01, pos.y0, 0.02, pos.height])
                fig.colorbar(sc0, cax=cax, label='Cluster ID')

                # Save, show, and release the figure
                path = os.path.join(
                    out_dir,
                    f"UMAP_side_by_side_centerbar_{ratio}_n{n_neighbors}_run{run}_res{resolution}.png"
                )
                fig.savefig(path, dpi=300, bbox_inches='tight', transparent=True)
                plt.show()
                plt.close(fig)

                # Visualization: cluster purity heatmap without numbers.
                # Each row is one cluster, normalized to sum to 1 across the true labels.
                df_eval = pd.DataFrame({'cluster': cluster_labels, 'true': y_sampled.values})
                ct = pd.crosstab(df_eval['cluster'], df_eval['true'], normalize='index')
                fig_hm, ax_hm = plt.subplots(figsize=(8, 6))
                sns.heatmap(
                    ct,
                    annot=False,
                    cmap='Blues',
                    ax=ax_hm
                )
                ax_hm.set_xlabel('True Label (Y)')
                ax_hm.set_ylabel('Cluster ID')
                ax_hm.set_title(
                    f'Cluster Purity Heatmap  |  Sample {ratio}:1 - Run {run} - ' +
                    f'n_neighbors={n_neighbors} - res={resolution}'
                )
                path = os.path.join(
                    out_dir,
                    f"cluster_purity_heatmap_{ratio}_n{n_neighbors}_run{run}_res{resolution}.png"
                )
                fig_hm.savefig(path, dpi=300, bbox_inches='tight', transparent=True)
                plt.show()
                plt.close(fig_hm)  # avoid accumulating open figures across the nested loops