In [7]:
import pandas as pd
from pathlib import Path
from typing import List, Optional
import re

In [8]:
class Spindle3DMetadata:
    """Names of the Spindle3D measurement columns kept when a table is loaded.

    ``columns_df`` is a plain list of column headers, in the order listed
    below; it is passed as ``usecols`` when reading a measurements table.
    """
    columns_df = (
        # Source image reference
        ["Path_InputImage"]
        # Spindle geometry
        + [
            "Spindle_Angle_Degrees",
            "Spindle_Length_um",
            "Spindle_Volume_um3",
            "Spindle_Width_Avg_um",
        ]
        # Intensity / chromatin measurements
        + [
            "Tubulin_Spindle_Average_Intensity",
            "Chromatin_Volume_um3",
        ]
        # Metaphase plate dimensions
        + [
            "MetaphasePlate_Length_um",
            "MetaphasePlate_Width_um",
        ]
        # Version column as written in the measurements table
        + ["Version"]
    )

# Timepoint labels looked for in a folder path, in priority order.
_SPINDLE3D_TIMEPOINTS = ("24h", "48h", "72h")

# Species tags looked for in a folder path (as "_<tag>_"), in priority order.
_SPINDLE3D_SPECIES = ("IM", "CM")

# Matches "_<name>_<version>_" where <name> is alphanumeric and <version> is
# one of init / new / orig. Group 1 captures "<name>_<version>" without the
# enclosing underscores.
_SPINDLE3D_RNA_PATTERN = re.compile(r'_([A-Za-z0-9]+_(?:init|new|orig))_')


def _extract_timepoint(folder_path: Path) -> Optional[str]:
    """Return the first of '24h', '48h', '72h' contained in the lower-cased path, else None."""
    folder_str = str(folder_path).lower()
    return next((label for label in _SPINDLE3D_TIMEPOINTS if label in folder_str), None)


def _extract_species(folder_path: Path) -> Optional[str]:
    """Return 'IM' or 'CM' when the path contains '_IM_' / '_CM_' (case-sensitive), else None."""
    folder_str = str(folder_path)
    return next((tag for tag in _SPINDLE3D_SPECIES if f"_{tag}_" in folder_str), None)


def _extract_rna_info(folder_path: Path) -> str:
    """
    Return the '<name>_<version>' token found in the path.

    The token is the first match of ``_SPINDLE3D_RNA_PATTERN``. When nothing
    matches, a warning is printed and an empty string is returned.
    """
    folder_name = str(folder_path)
    match = _SPINDLE3D_RNA_PATTERN.search(folder_name)
    if not match:
        print(f"Warning: Could not parse RNA info from folder: {folder_name}")
        return ""
    return match.group(1)


def concat_spindle3d(input_folders: List[str]) -> pd.DataFrame:
    """
    Concatenate Spindle3D measurement tables found under the given folders.

    Every ``measurements.txt`` below each folder (searched recursively) is read
    as a tab-separated table restricted to ``Spindle3DMetadata.columns_df``.
    The following columns are then added to each table, all derived from the
    path of the directory containing the file:

    * ``Timepoint`` - '24h', '48h' or '72h' (None if absent from the path)
    * ``Hours``     - the timepoint as an integer (None if no timepoint)
    * ``Species``   - 'IM' or 'CM' (None if absent from the path)
    * ``siRNA``     - '<name>_<version>' token ('' if it cannot be parsed)

    plus ``Spindle_Aspect_Ratio`` = Spindle_Length_um / Spindle_Width_Avg_um.

    Folders that do not exist are reported and skipped. A file that cannot be
    read or processed is reported and skipped (best effort), so one bad table
    does not abort the whole run.

    Args:
        input_folders: Folder paths to search; each entry is passed to
            ``Path()``, so strings and Path objects both work.

    Returns:
        A single DataFrame with the rows of all successfully read tables and a
        fresh integer index.

    Raises:
        FileNotFoundError: If no table could be loaded - either because no
            ``measurements.txt`` exists under the folders, or because every
            file that was found failed to load (the message says which).
    """
    dataframes = []
    total_files = 0

    for folder in input_folders:
        folder_path = Path(folder)
        if not folder_path.exists():
            print(f"Warning: Folder '{folder}' does not exist")
            continue

        # rglob order depends on the filesystem; sort so the row order of the
        # combined table is reproducible between runs.
        measurement_files = sorted(folder_path.rglob('measurements.txt'))
        total_files += len(measurement_files)

        for file_path in measurement_files:
            try:
                # NOTE(review): 'unicode_escape' also interprets backslash
                # sequences inside the file content - confirm this encoding
                # is what the measurement files need.
                df = pd.read_csv(
                    file_path,
                    delimiter="\t",
                    encoding='unicode_escape',
                    usecols=Spindle3DMetadata.columns_df,
                )

                # All annotations come from the directory that holds the file.
                source_dir = file_path.parent

                timepoint = _extract_timepoint(source_dir)
                df['Timepoint'] = timepoint
                df['Hours'] = int(timepoint[:-1]) if timepoint else None

                df['Species'] = _extract_species(source_dir)
                df['siRNA'] = _extract_rna_info(source_dir)

                df["Spindle_Aspect_Ratio"] = (
                    df["Spindle_Length_um"] / df["Spindle_Width_Avg_um"]
                )

                dataframes.append(df)
            except Exception as e:
                # Deliberate best effort: report the failing file and carry on.
                print(f"Error processing {file_path}: {str(e)}")

    if not dataframes:
        if total_files:
            # Files exist but all of them failed above; say so instead of
            # claiming that nothing was found.
            raise FileNotFoundError(
                f"Found {total_files} measurement file(s) but none could be read; "
                "see the errors reported above"
            )
        raise FileNotFoundError("No measurement files found in specified folders")

    return pd.concat(dataframes, ignore_index=True)

def save_spindle_data(df: Optional[pd.DataFrame], output_dir: str) -> None:
    """
    Write the combined spindle table to ``MainDataFrame_Spindle3D.csv``.

    The file is placed inside ``output_dir``, which is created (including
    missing parents) when needed. The index is not written. A failure while
    writing the CSV is printed rather than raised.

    Args:
        df: Table to write.
        output_dir: Directory that will contain the CSV file.

    Raises:
        ValueError: If ``df`` is None.
    """
    if df is None:
        raise ValueError("DataFrame is None")

    target_dir = Path(output_dir)
    csv_file = target_dir / "MainDataFrame_Spindle3D.csv"

    # csv_file.parent is target_dir; make sure it exists before writing.
    csv_file.parent.mkdir(parents=True, exist_ok=True)

    try:
        df.to_csv(csv_file, index=False)
        print(f"Successfully saved master spindle 3D dataframe to: {csv_file}")
    except Exception as write_error:
        print(f"Error saving dataframe: {str(write_error)}")

In [9]:
# Usage: collect every measurements.txt below the selected folders and write
# the combined table as a single CSV.
# The names assigned below (root_spindle_*, spindle_input_folders, spindle_df)
# become notebook-level variables once this cell has run.
if __name__ == "__main__":
    # Root folders of the two datasets (absolute paths on a mounted volume).
    # root_spindle_1 is only referenced by the disabled entries below.
    root_spindle_1 = Path("/Volumes/arxivBeta/_Tobias/Opera/20250204")
    root_spindle_2 = Path("/Volumes/arxivBeta/_Tobias/Opera/20250314")
    # Folders that concat_spindle3d searches recursively. The commented-out
    # entries are the per-timepoint folders of the first dataset; uncomment
    # them to include that dataset as well.
    spindle_input_folders = [
        #root_spindle_1 / "05_Spindle3D/24h/Bipolar",
        #root_spindle_1 / "05_Spindle3D/48h/Bipolar",
        #root_spindle_1 / "05_Spindle3D/72h/Bipolar",
        root_spindle_2 / "04_Spindle3D/Bipolar",
        
    ]
    
    try:
        # Build the combined table, then write it to the DataFrames folder.
        spindle_df = concat_spindle3d(spindle_input_folders)
        save_spindle_data(spindle_df, "/Volumes/arxivBeta/_Tobias/Opera/DataFrames")
    except Exception as e:
        # Any failure (e.g. no measurement files found) is reported, not raised.
        print(f"Processing failed: {str(e)}")

Successfully saved master spindle 3D dataframe to: /Volumes/arxivBeta/_Tobias/Opera/DataFrames/MainDataFrame_Spindle3D.csv
