AE feature extractor; the entry point is main().
Set the image folder to your labeled SEM image folder.
Set the model path to your trained AE model.
Set the output path to your desired CSV file name.
The extracted latents are combined with template.csv to include the full Magpie features.
Specify the latent dimension and make sure it is consistent with the trained model.


In [1]:
import os
import csv
from PIL import Image
import torch
from torchvision import transforms
from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn

In [2]:
# Define image transform
transform = transforms.Compose([
    transforms.Grayscale(),
    transforms.Resize((144, 256)),  # Your model expects (1, 144, 256)
    transforms.ToTensor()
])

In [3]:
import torch
import torch.nn as nn

class AE(nn.Module):
    """Convolutional autoencoder for single-channel 144x256 images.

    The encoder is three stride-2 convolutions (each followed by batch norm,
    ReLU and dropout) and a linear projection to ``latent_dim`` values. The
    decoder mirrors it with a linear layer and three transposed convolutions,
    ending in a sigmoid.

    The linear layers are sized for an encoder output of shape
    ``(128, 18, 32)``, i.e. an input of height 144 and width 256.

    Args:
        latent_dim: Size of the latent vector produced by :meth:`encode`.
    """

    def __init__(self, latent_dim=32):
        super().__init__()
        self.latent_dim = latent_dim

        # Encoder: every convolution halves the spatial resolution.
        # NOTE: the layer order inside each nn.Sequential determines the
        # state-dict keys, so it must stay as is to load saved weights.
        self.encoder = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=4, stride=2, padding=1),   # (B, 32, H/2, W/2)
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.Dropout(0.2),

            nn.Conv2d(32, 64, kernel_size=4, stride=2, padding=1),  # (B, 64, H/4, W/4)
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.Dropout(0.2),

            nn.Conv2d(64, 128, kernel_size=4, stride=2, padding=1), # (B, 128, H/8, W/8)
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.Dropout(0.2),
        )

        # 128 channels x (144 / 8) x (256 / 8) for a 144x256 input.
        self.flattened_size = 128 * 18 * 32
        self.fc_enc = nn.Linear(self.flattened_size, latent_dim)
        self.fc_dec = nn.Linear(latent_dim, self.flattened_size)

        # Decoder: every transposed convolution doubles the spatial resolution.
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(128, 64, kernel_size=4, stride=2, padding=1),  # (B, 64, H/4, W/4)
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.Dropout(0.1),

            nn.ConvTranspose2d(64, 32, kernel_size=4, stride=2, padding=1),   # (B, 32, H/2, W/2)
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.Dropout(0.1),

            nn.ConvTranspose2d(32, 1, kernel_size=4, stride=2, padding=1),    # (B, 1, H, W)
            nn.Sigmoid(),  # Output in [0, 1]
        )

    def encode(self, x):
        """Map a batch of images ``(B, 1, 144, 256)`` to latents ``(B, latent_dim)``.

        Raises:
            RuntimeError: From the linear layer when the encoder output does
                not hold ``flattened_size`` values per sample (wrong input
                height/width).
        """
        x = self.encoder(x)
        # Flatten per sample and keep the batch dimension fixed. A
        # ``view(-1, flattened_size)`` would silently turn a wrongly sized
        # image into several bogus "samples" instead of failing.
        x = x.flatten(1)
        return self.fc_enc(x)

    def decode(self, z):
        """Reconstruct images ``(B, 1, 144, 256)`` from latents ``(B, latent_dim)``."""
        x = self.fc_dec(z)
        x = x.view(-1, 128, 18, 32)
        return self.decoder(x)

    def forward(self, x):
        """Encode ``x`` and return its reconstruction."""
        z = self.encode(x)
        x_recon = self.decode(z)
        return x_recon


In [4]:

def get_all_image_paths(root_dir, extensions=(".jpg", ".png", ".jpeg", ".bmp")):
    """Recursively collect the paths of all image files below ``root_dir``.

    Args:
        root_dir: Directory to walk.
        extensions: File suffix, or iterable of suffixes, to accept. Matching
            is case-insensitive.

    Returns:
        List of paths (``dirpath`` joined with the file name) in the order
        ``os.walk`` yields them; the list is not sorted.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(extensions, str):
        extensions = (extensions,)
    # File names are lower-cased below, so the suffixes must be as well.
    suffixes = tuple(ext.lower() for ext in extensions)

    paths = []
    for dirpath, _, filenames in os.walk(root_dir):
        paths.extend(
            os.path.join(dirpath, name)
            for name in filenames
            if name.lower().endswith(suffixes)
        )
    return paths

In [5]:
def extract_latents(image_paths, model, transform, device):
    """Encode every image and build one CSV row per successfully processed image.

    Args:
        image_paths: Iterable of image file paths.
        model: Model exposing ``encode(tensor)``; called under ``torch.no_grad()``.
        transform: Callable applied to the grayscale PIL image; its result is
            given a leading batch dimension and moved to ``device``.
        device: Device the input tensor is moved to.

    Returns:
        List of rows ``[relative_path, "", "", "", "", *latent_values]``.
        Images that fail to process are reported on stdout and skipped, so
        the result can be shorter than ``image_paths``.
    """
    rows = []
    for path in tqdm(image_paths):
        # Best effort on purpose: one unreadable image must not abort the run.
        try:
            # Close the file handle as soon as the pixel data has been
            # converted instead of leaving it to the garbage collector.
            with Image.open(path) as img:
                image = img.convert('L')
            image_tensor = transform(image).unsqueeze(0).to(device)
            with torch.no_grad():
                latent = model.encode(image_tensor)
            # reshape(-1) always yields a 1-D vector; squeeze() would collapse
            # a one-element latent to a scalar and break the concatenation.
            values = latent.reshape(-1).cpu().numpy().tolist()
            rows.append([os.path.relpath(path)] + [""] * 4 + values)
        except Exception as e:
            print(f"Failed to process {path}: {e}")
    return rows

In [6]:
import csv
import pandas as pd

def save_to_csv(rows, template_csv, output_csv,
                              base_header_first4=None):
    """Merge extracted feature rows with template columns and write a CSV.

    Row ``i`` of ``rows`` is paired with row ``i`` of the template: its 5th
    value is replaced by the template's 5th column, and the template columns
    from ``"MagpieData minimum Number"`` onward are appended after the
    extracted features.

    Args:
        rows: Non-empty list of lists; each row has 5 leading columns
            followed by the extracted features. The input is not modified.
        template_csv: Path to the template CSV.
        output_csv: Path of the CSV to write; its parent directory is
            created when missing.
        base_header_first4: Optional list of 4 header names for the first
            4 columns. Defaults to ``["filename", "col2", "col3", "col4"]``.

    Raises:
        ValueError: If the template has fewer than 5 columns or lacks the
            ``"MagpieData minimum Number"`` column, if ``base_header_first4``
            does not hold exactly 4 names, if ``rows`` is empty, or if the
            template has fewer rows than ``rows``.
    """
    df_template = pd.read_csv(template_csv)

    # The 5th template column supplies both the header and the values of
    # the 5th output column.
    if df_template.shape[1] < 5:
        raise ValueError("Template must contain at least 5 columns.")
    col5_name = df_template.columns[4]

    if "MagpieData minimum Number" not in df_template.columns:
        raise ValueError("Template does not contain column 'MagpieData minimum Number'.")
    start_idx = df_template.columns.get_loc("MagpieData minimum Number")

    # All template headers from that column onward
    extra_template_headers = df_template.columns[start_idx:].tolist()

    if base_header_first4 is None:
        base_header_first4 = ["filename", "col2", "col3", "col4"]
    if len(base_header_first4) != 4:
        raise ValueError("base_header_first4 must have exactly 4 names.")

    # The feature count is taken from the first row, so there must be one.
    if not rows:
        raise ValueError("No rows to save: 'rows' is empty.")
    # Checked up front so nothing is built or written for a short template.
    if len(rows) > len(df_template):
        raise ValueError("Template has fewer rows than your data.")

    n_extracted_features = len(rows[0]) - 5  # exclude first 5 columns
    feature_headers = [f"f_{i}" for i in range(n_extracted_features)]

    final_header = base_header_first4 + [col5_name] + feature_headers + extra_template_headers

    col5_values = df_template.iloc[:, 4].tolist()
    extra_template_values = df_template.iloc[:, start_idx:].values

    new_rows = []
    for i, row in enumerate(rows):
        merged = list(row)  # copy so the caller's rows stay untouched
        merged[4] = col5_values[i]
        merged.extend(extra_template_values[i])
        new_rows.append(merged)

    # Create the target folder if needed so the run does not fail at the
    # very last step.
    out_dir = os.path.dirname(output_csv)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Explicit UTF-8 (the pd.read_csv default) keeps the output independent
    # of the platform's locale encoding.
    with open(output_csv, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(final_header)
        writer.writerows(new_rows)

    print(f"✅ Saved CSV with extracted features and template columns to {output_csv}")


In [None]:
def main():
    """Extract AE latent features for a folder of images and save them as CSV.

    Loads the trained AE weights, encodes every image found below the image
    folder and writes the latents, merged with the template columns, to the
    output CSV. Edit the paths and ``latent_dim`` below for your setup.

    Raises:
        FileNotFoundError: If no image is found below the image folder.
        RuntimeError: If no image could be encoded.
    """
    # Paths are built with os.path.join so they also resolve on systems
    # that do not use "\" as the path separator.
    image_folder = os.path.join("Preprocessed Images", "Labeled SEM", "ourimg_normgrey")
    model_path = os.path.join("Trained Models", "Example AE Finetuned Model.pth")
    output_csv = os.path.join("Extracted Features", "Example AE Features.csv")
    template = os.path.join("Miscelleous", "template.csv")

    # The layer shapes in the saved weights depend on this value, so it has
    # to be the latent dimension the weights were trained with.
    latent_dim = 2048

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load model
    model = AE(latent_dim=latent_dim)
    # NOTE(review): torch.load unpickles the file; only load checkpoints
    # from a trusted source.
    model.load_state_dict(torch.load(model_path, map_location=device))
    model = model.to(device)
    model.eval()  # inference mode for dropout and batch norm

    # Process images
    image_paths = get_all_image_paths(image_folder)
    if not image_paths:
        raise FileNotFoundError(f"No images found under {image_folder!r}.")
    rows = extract_latents(image_paths, model, transform, device)
    if not rows:
        raise RuntimeError("None of the images could be encoded; nothing to save.")
    save_to_csv(rows, template, output_csv)
    print(f"Saved {len(rows)} entries to {output_csv}")


if __name__ == "__main__":
    main()

100%|██████████| 344/344 [00:01<00:00, 233.09it/s]


✅ Saved CSV with extracted features and template columns to Example AE Features.csv
Saved 344 entries to Example AE Features.csv
