VAE feature extractor; the main entry point is extract_features().
Change image folder to your labeled SEM image folder.
Set the model path to your trained VAE model.
Set the output to your desired CSV file name.
Specify latent dimensions.
Combines with template.csv to include full Magpie Features.


In [1]:
import torch
import torch.nn as nn
from torchvision import transforms
from PIL import Image
import os
import csv
import pandas as pd

In [2]:
# Preprocessing pipeline for single-channel images fed to the VAE encoder.
# torchvision's Resize takes (height, width). The VAE in this file hard-codes
# a 128 x 18 x 32 encoder feature map, i.e. a 144 x 256 (H x W) input after
# three stride-2 convolutions, so the target size must be (144, 256) -- the
# same size extract_features() uses.
transform = transforms.Compose([
    transforms.Resize((144, 256)),      # (H, W): 144 / 8 = 18, 256 / 8 = 32
    transforms.Grayscale(),             # Collapse to a single channel
    transforms.ToTensor(),              # Converts to CxHxW and scales to [0,1]
    transforms.Normalize([0.5], [0.5])  # Maps [0,1] to [-1,1]
])

In [3]:
import torch
import torch.nn as nn

class VAE(nn.Module):
    """Convolutional variational autoencoder for single-channel images.

    The encoder applies three stride-2 convolutions (kernel 4, padding 1),
    so each spatial dimension shrinks by a factor of 8. The hard-coded
    encoder feature shape of 128 x 18 x 32 therefore corresponds to inputs
    of shape (batch, 1, 144, 256).

    Args:
        latent_dim: Size of the latent vector produced by the encoder.
    """

    def __init__(self, latent_dim=2048):
        super().__init__()
        self.latent_dim = latent_dim

        # NOTE: the position of every module inside each nn.Sequential
        # determines its state_dict key (e.g. "encoder.4.weight"), so the
        # layer order must not change if saved checkpoints are to load.
        # Only the first conv block uses batch normalisation.
        self.encoder = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=4, stride=2, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.Dropout(0.2),

            nn.Conv2d(32, 64, kernel_size=4, stride=2, padding=1),
            nn.ReLU(),
            nn.Dropout(0.2),

            nn.Conv2d(64, 128, kernel_size=4, stride=2, padding=1),
            nn.ReLU(),
            nn.Dropout(0.2),
        )

        # (channels, height, width) of the encoder output for 144 x 256 input.
        self._feature_shape = (128, 18, 32)
        self.flattened_size = 128 * 18 * 32

        self.fc_mu = nn.Linear(self.flattened_size, latent_dim)
        self.fc_logvar = nn.Linear(self.flattened_size, latent_dim)

        self.decoder_input = nn.Linear(latent_dim, self.flattened_size)

        # Decoder mirrors the encoder with lighter dropout; the final
        # Sigmoid bounds the reconstruction to (0, 1).
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(128, 64, kernel_size=4, stride=2, padding=1),
            nn.ReLU(),
            nn.Dropout(0.1),

            nn.ConvTranspose2d(64, 32, kernel_size=4, stride=2, padding=1),
            nn.ReLU(),
            nn.Dropout(0.1),

            nn.ConvTranspose2d(32, 1, kernel_size=4, stride=2, padding=1),
            nn.Sigmoid(),
        )

    def encode(self, x):
        """Return ``(mu, logvar)`` for a batch of images.

        Features are flattened per sample (batch dimension preserved). A
        mis-sized input therefore fails with a shape error in the linear
        layers instead of being silently regrouped into a different number
        of rows, which ``view(-1, flattened_size)`` would do.
        """
        features = self.encoder(x)
        features = torch.flatten(features, start_dim=1)
        return self.fc_mu(features), self.fc_logvar(features)

    def reparameterize(self, mu, logvar):
        """Sample ``z = mu + eps * std`` with ``eps`` drawn from N(0, I)."""
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z):
        """Map latent vectors back to image space."""
        x = self.decoder_input(z)
        x = x.view(-1, *self._feature_shape)
        return self.decoder(x)

    def forward(self, x):
        """Return ``(reconstruction, mu, logvar)`` for a batch of images."""
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar



In [4]:
def extract_features(model_path, image_folder, output_csv, latent_dim=2048,
                     template_csv=None, append_template_cols=True):
    """
    Extract latent features from images using a VAE model and save to CSV.

    Every image found (recursively) under ``image_folder`` is encoded and the
    encoder mean ``mu`` is used as its descriptor vector. Images are processed
    in sorted path order; when a template is given, rows are matched to the
    template purely by that order.

    Output columns order:
    - Column 1: image filename (no path)
    - Columns 2-4: empty placeholders
    - Column 5: template's 5th column (index 4), or empty without a template
    - Columns 6...: latent descriptors
    - Optionally appended columns from "MagpieData minimum Number" onward
      from the template

    Parameters:
    - model_path: path to saved VAE model state_dict
    - image_folder: folder containing images
    - output_csv: CSV file to save features (parent folder is created if
      it does not exist)
    - latent_dim: latent vector dimension
    - template_csv: optional path to template CSV for replacing and
      appending columns
    - append_template_cols: if True, append columns from
      "MagpieData minimum Number" onward from the template CSV

    Returns:
    - None. The result is written to ``output_csv``.

    Raises:
    - ValueError: if the template lacks the "MagpieData minimum Number"
      column (when append_template_cols is True), or if the number of
      successfully extracted images differs from the template row count.
    """
    # Load and validate the template first so a bad template fails before
    # the (slow) feature extraction rather than after it.
    df_template = None
    col5_values = []
    magpie_headers = []
    magpie_values = None
    if template_csv is not None:
        df_template = pd.read_csv(template_csv)
        col5_values = df_template.iloc[:, 4].tolist()
        if append_template_cols:
            if "MagpieData minimum Number" not in df_template.columns:
                raise ValueError("Template CSV missing 'MagpieData minimum Number' column.")
            magpie_start_idx = df_template.columns.get_loc("MagpieData minimum Number")
            magpie_headers = df_template.columns[magpie_start_idx:].tolist()
            magpie_values = df_template.iloc[:, magpie_start_idx:].values

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = VAE(latent_dim=latent_dim).to(device)
    # NOTE(review): torch.load unpickles the file -- only load checkpoints
    # from a trusted source.
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    # Resize takes (height, width); 144 x 256 matches the VAE's fixed
    # 128 x 18 x 32 encoder feature map.
    transform = transforms.Compose([
        transforms.Resize((144, 256)),
        transforms.Grayscale(),
        transforms.ToTensor(),
        transforms.Normalize([0.5], [0.5])
    ])

    rows = []
    for img_path in _collect_image_files(image_folder):
        try:
            # Context manager closes the underlying file handle promptly.
            with Image.open(img_path) as img:
                image = img.convert('L')
            tensor = transform(image).unsqueeze(0).to(device)
            with torch.no_grad():
                mu, _ = model.encode(tensor)
            # squeeze(0) drops only the batch axis, so the result stays a
            # 1-D vector even when latent_dim == 1.
            mu_np = mu.squeeze(0).cpu().numpy()
            file_name = os.path.basename(img_path)
            # Row layout: filename, three empty placeholders, a slot for the
            # template's 5th column, then the latent features.
            rows.append([file_name, '', '', '', ''] + mu_np.tolist())
            print(f"Extracted: {img_path}")
        except Exception as e:
            # Best effort: report and skip unreadable images. With a
            # template, a skipped image surfaces below as a row-count
            # mismatch.
            print(f"Failed: {img_path}, Error: {e}")

    latent_headers = [f'descriptor_{i}' for i in range(latent_dim)]

    if df_template is None:
        # Save without template merging
        header = ['filename', 'col2', 'col3', 'col4', 'template_col5'] + latent_headers
        _write_feature_csv(output_csv, header, rows)
        print(f"Saved features to {output_csv}")
        return

    if len(rows) != len(df_template):
        raise ValueError(f"Row count mismatch: extracted {len(rows)} rows vs template {len(df_template)} rows.")

    merged_rows = []
    for i, row in enumerate(rows):
        new_row = list(row)
        new_row[4] = col5_values[i]  # Replace column 5
        if append_template_cols:
            new_row.extend(magpie_values[i])  # Append Magpie columns
        merged_rows.append(new_row)

    # Header: filename, placeholders, the template's 5th column name,
    # latent descriptors, then the Magpie headers (empty list when
    # append_template_cols is False).
    base_header = ['filename', 'col2', 'col3', 'col4', df_template.columns[4]]
    header = base_header + latent_headers + magpie_headers

    _write_feature_csv(output_csv, header, merged_rows)

    print(f"Saved merged CSV to {output_csv} with template columns appended={append_template_cols}")


def _collect_image_files(image_folder):
    """Return the sorted paths of all .jpg/.jpeg/.png/.tif files under image_folder."""
    extensions = ('.jpg', '.jpeg', '.png', '.tif')
    image_files = [
        os.path.join(root, name)
        for root, _, names in os.walk(image_folder)
        for name in names
        if name.lower().endswith(extensions)
    ]
    # Sorted so the row order is reproducible between runs.
    image_files.sort()
    return image_files


def _write_feature_csv(output_csv, header, rows):
    """Write header and rows to output_csv, creating its parent folder if needed."""
    out_dir = os.path.dirname(output_csv)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(output_csv, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(rows)

In [None]:
if __name__ == "__main__":
    # Example run. Paths are built with os.path.join so they resolve on any
    # OS (hard-coded backslashes only work on Windows); on Windows the
    # resulting strings are identical to the previous literals.
    image_folder = os.path.join("Preprocessed Images", "Labeled SEM", "ourimg_normgrey")
    model_path = os.path.join("Trained Models", "Example VAE Finetuned Model.pth")
    output_csv = os.path.join("Extracted Features", "Example VAE Features.csv")
    # Folder name kept exactly as spelled in the original path.
    template = os.path.join("Miscelleous", "template.csv")
    latent_dim = 2048

    extract_features(model_path, image_folder, output_csv, latent_dim, template)

Extracted: Preprocessed Images\Labeled SEM\ourimg_normgrey\0perc_00.jpg
Extracted: Preprocessed Images\Labeled SEM\ourimg_normgrey\0perc_01.jpg
Extracted: Preprocessed Images\Labeled SEM\ourimg_normgrey\0perc_02.jpg
Extracted: Preprocessed Images\Labeled SEM\ourimg_normgrey\0perc_03.jpg
Extracted: Preprocessed Images\Labeled SEM\ourimg_normgrey\0perc_04.jpg
Extracted: Preprocessed Images\Labeled SEM\ourimg_normgrey\10_00.jpg
Extracted: Preprocessed Images\Labeled SEM\ourimg_normgrey\10_01.jpg
Extracted: Preprocessed Images\Labeled SEM\ourimg_normgrey\10_02.jpg
Extracted: Preprocessed Images\Labeled SEM\ourimg_normgrey\10_03.jpg
Extracted: Preprocessed Images\Labeled SEM\ourimg_normgrey\10_04.jpg
Extracted: Preprocessed Images\Labeled SEM\ourimg_normgrey\10perc_00.jpg
Extracted: Preprocessed Images\Labeled SEM\ourimg_normgrey\10perc_01.jpg
Extracted: Preprocessed Images\Labeled SEM\ourimg_normgrey\10perc_02.jpg
Extracted: Preprocessed Images\Labeled SEM\ourimg_normgrey\10perc_03.jpg
Ext