This code imports CSV files containing detection analysis of in situ data from QuPath. 
It assumes you have used classifiers in QuPath to define each cell as positive or negative for each gene of interest. Data are structured as one QuPath project (and one CSV output) per mouse, with multiple images per project.

This code was adapted for use in Python from previous code written in R, available at https://github.com/SodenLab/Simon-et-al. Cursor AI coding software was used to convert the code from R to Python, and to assist with adding additional features.

### Loading libraries and defining functions


In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import datetime
import subprocess
from collections import Counter
import json

### Defining functions for subsetting cells 

In [None]:
def pos_cells(data, gene):
    """Select the rows of `data` whose 'Classification' lists `gene`.

    The classification string is split on ':' and every piece is stripped of
    surrounding whitespace, so only exact gene-name matches count. Rows with
    a missing classification are never selected.
    """
    def _is_positive(label):
        if pd.isna(label):
            return False
        names = [piece.strip() for piece in label.split(':')]
        return gene in names

    mask = data['Classification'].apply(_is_positive)
    return data[mask]

def neg_cells(data, gene):
    """Select the rows of `data` whose 'Classification' does not list `gene`.

    The classification string is split on ':' and every piece is stripped of
    surrounding whitespace before the exact-match comparison. Rows with a
    missing classification are always selected.
    """
    def _is_negative(label):
        if pd.isna(label):
            return True
        names = [piece.strip() for piece in label.split(':')]
        return gene not in names

    mask = data['Classification'].apply(_is_negative)
    return data[mask]

def posOR_cells(data, gene1, gene2):
    """Select the rows of `data` whose 'Classification' lists `gene1` or `gene2`.

    The classification string is split on ':' and every piece is stripped of
    surrounding whitespace before the exact-match comparison. Rows with a
    missing classification are never selected.
    """
    wanted = [gene1, gene2]

    def _matches_either(label):
        if pd.isna(label):
            return False
        for piece in label.split(':'):
            if piece.strip() in wanted:
                return True
        return False

    mask = data['Classification'].apply(_matches_either)
    return data[mask]

### Importing Data

In [None]:
# Defining some functions to check columns of CSVs
def check_csv_consistency(filepath):
    """
    Check if all CSV files have the same column names and order

    Only the header row of each file is read. The first file whose header
    could be read is the reference every other readable file is compared to.
    Files that fail to read are reported in "column_info" with an "error"
    entry and are left out of the comparisons.

    Parameters:
    filepath: path to directory containing CSV files

    Returns:
    dict with consistency information, or {"error": ...} when the directory
    contains no CSV files
    """
    # Sorted so the reference file is deterministic (os.listdir order is arbitrary)
    csv_files = sorted(f for f in os.listdir(filepath) if f.endswith('.csv'))
    print(f"Found {len(csv_files)} CSV files: {csv_files}")

    if not csv_files:
        return {"error": "No CSV files found"}

    # Dictionary to store results
    results = {
        "files_checked": csv_files,
        "column_info": {},
        "all_same_columns": True,
        "all_same_order": True,
        "column_differences": [],
        "order_differences": []
    }

    # Read headers from all files; keyed by filename, in csv_files order
    file_columns = {}

    for csv_file in csv_files:
        try:
            # nrows=0 reads only the header row
            sample = pd.read_csv(os.path.join(filepath, csv_file), nrows=0)
        except Exception as e:
            # Best effort: record the failure and keep checking the other files
            print(f"Error reading {csv_file}: {e}")
            results["column_info"][csv_file] = {"error": str(e)}
            continue

        columns = list(sample.columns)
        file_columns[csv_file] = columns
        results["column_info"][csv_file] = {
            "num_columns": len(columns),
            "columns": columns
        }
        print(f"{csv_file}: {len(columns)} columns")

    if len(file_columns) > 1:
        # Take the file names from file_columns, not csv_files: unreadable
        # files are missing from file_columns, so positions in the two
        # collections do not line up.
        readable = list(file_columns.items())
        reference_file, reference_order = readable[0]
        reference_set = set(reference_order)

        for filename, columns in readable[1:]:
            # Same columns regardless of order?
            col_set = set(columns)
            if col_set != reference_set:
                results["all_same_columns"] = False
                results["column_differences"].append({
                    "file1": reference_file,
                    "file2": filename,
                    "missing_in_file2": sorted(reference_set - col_set),
                    "extra_in_file2": sorted(col_set - reference_set)
                })

            # Same columns in the same order?
            if columns != reference_order:
                results["all_same_order"] = False
                results["order_differences"].append({
                    "reference_file": reference_file,
                    "different_file": filename,
                    "reference_order": reference_order,
                    "different_order": columns
                })

    return results

In [None]:
# Run this check on your data to confirm that all files have the same column names
filepath = './Data'
consistency_results = check_csv_consistency(filepath)

# check_csv_consistency returns only {"error": ...} when the folder has no CSV
# files; stop with a clear message instead of a KeyError in the summary below.
if 'error' in consistency_results:
    raise FileNotFoundError(f"{consistency_results['error']} in '{filepath}'")

# Print summary
print("\n" + "="*50)
print("CONSISTENCY CHECK RESULTS")
print("="*50)

print(f"\nFiles checked: {len(consistency_results['files_checked'])}")
print(f"All files have same columns: {consistency_results['all_same_columns']}")
print(f"All files have same column order: {consistency_results['all_same_order']}")

if not consistency_results['all_same_columns']:
    print(f"\nCOLUMN DIFFERENCES FOUND:")
    for diff in consistency_results['column_differences']:
        print(f"\nBetween {diff['file1']} and {diff['file2']}:")
        if diff['missing_in_file2']:
            print(f"  Missing in {diff['file2']}: {diff['missing_in_file2']}")
        if diff['extra_in_file2']:
            print(f"  Extra in {diff['file2']}: {diff['extra_in_file2']}")

if not consistency_results['all_same_order']:
    print(f"\nCOLUMN ORDER DIFFERENCES FOUND:")
    for diff in consistency_results['order_differences']:
        print(f"\n{diff['different_file']} has different order than {diff['reference_file']}")
        print("First 10 columns comparison:")
        # Only the first 10 positions are shown, side by side
        ref_cols = diff['reference_order'][:10]
        diff_cols = diff['different_order'][:10]
        for i, (ref, diff_col) in enumerate(zip(ref_cols, diff_cols)):
            match = "✓" if ref == diff_col else "✗"
            print(f"  {i+1:2d}: {ref:30} | {diff_col:30} {match}")

# Additional detailed analysis
print(f"\n" + "="*50)
print("DETAILED COLUMN ANALYSIS")
print("="*50)

# Show column count summary (files that failed to read have no 'num_columns')
column_counts = [
    info['num_columns']
    for info in consistency_results['column_info'].values()
    if 'num_columns' in info
]
if column_counts:
    print(f"\nColumn counts across files:")
    count_summary = Counter(column_counts)
    for count, freq in sorted(count_summary.items()):
        print(f"  {count} columns: {freq} files")

    if len(count_summary) > 1:
        print(f"\n⚠️  WARNING: Files have different numbers of columns!")
        for filename, info in consistency_results['column_info'].items():
            if 'num_columns' in info:
                print(f"    {filename}: {info['num_columns']} columns")

In [None]:
#Only run after you have verified that all files have the same column names.
#Column names do not need to be in the same order, this will sort them and concatenate.

filepath = './Data'

# Get list of all CSV files in the directory.
# Sorted because os.listdir returns names in arbitrary order; this keeps the
# row order of all_data reproducible from run to run and across platforms.
csv_files = sorted(f for f in os.listdir(filepath) if f.endswith('.csv'))
print(f"Found CSV files: {csv_files}")

# Extract mouseIDs from CSV filenames (removing .csv extension)
mouseIDs = sorted({os.path.splitext(f)[0] for f in csv_files})
print(f"Found mouseIDs: {mouseIDs}")

# Read all CSV files into a list first
print("Reading CSV files...")
dataframes = []
for csv_file in csv_files:
    print(f"Reading {csv_file}...")
    # Read CSV with columns 4 and 5 as strings to avoid mixed type warning.
    # na_filter=False keeps empty cells as '' instead of NaN, so an empty
    # 'Classification' is an empty string here.
    data = pd.read_csv(os.path.join(filepath, csv_file),
                       dtype={4: str, 5: str},
                       na_filter=False)
    # Add mouseID column from filename (without .csv extension)
    data['mouseID'] = os.path.splitext(csv_file)[0]
    # Add NumGenes column before concatenating: number of ':'-separated
    # entries in the classification, 0 when the classification is empty
    data['NumGenes'] = data['Classification'].apply(lambda x: len(x.split(':')) if x else 0)

    # Sort columns alphabetically to ensure consistent order
    data = data.reindex(sorted(data.columns), axis=1)

    dataframes.append(data)

# Concatenate all dataframes at once
all_data = pd.concat(dataframes, ignore_index=True)
print(f"Loaded {len(csv_files)} CSV files with {len(all_data)} total rows")

all_data.head()

### Importing Annotation Data

In [None]:
#Importing QuPath annotation files so we have the area of each subregion. Used to calculate normalized cell counts.

# Directory containing CSV files
annotations_filepath = r'./Annotations'

# Get list of all CSV files in the directory (sorted for a reproducible row order)
csv_files = sorted(f for f in os.listdir(annotations_filepath) if f.endswith('.csv'))
print(f"Found CSV files: {csv_files}")

# Read all CSV files into a list first
print("Reading CSV files...")
dataframes = []
all_columns = set()
# Each file is read from disk once and kept here for the second pass
annotation_tables = {}

# First pass: collect all unique column names (excluding "Num" columns)
for csv_file in csv_files:
    print(f"Scanning columns in {csv_file}...")
    data = pd.read_csv(os.path.join(annotations_filepath, csv_file))

    # Filter out columns that start with "Num"
    filtered_columns = [col for col in data.columns if not col.startswith('Num')]
    all_columns.update(filtered_columns)

    print(f"  {len(data.columns)} total columns, {len(filtered_columns)} after excluding 'Num' columns")

    # .copy() so the columns added in the second pass go onto an independent
    # frame rather than a slice of the raw table
    annotation_tables[csv_file] = data[filtered_columns].copy()

# Convert to sorted list for consistent ordering
all_columns = sorted(all_columns)
# Add our custom columns (skipped if a file already provides a column of that
# name, so the standard column list never contains duplicates)
all_columns.extend(col for col in ('mouseID', 'source_file') if col not in all_columns)
print(f"\nTotal unique columns across all files (excluding 'Num' columns): {len(all_columns)}")

# Second pass: standardize columns of the tables read above
for csv_file, data in annotation_tables.items():
    print(f"Reading {csv_file}...")

    # Extract mouseID from filename.
    # NOTE(review): assumes files are named '<mouseID> annotations.csv'; any
    # other name is kept unchanged (including '.csv') — confirm naming scheme.
    mouseID = csv_file.replace(' annotations.csv', '')
    print(f"  Extracted mouseID: {mouseID}")

    # Add our custom columns
    data['mouseID'] = mouseID
    data['source_file'] = csv_file

    # Add any missing columns with NaN values
    for col in all_columns:
        if col not in data.columns:
            data[col] = pd.NA
            print(f"    Added missing column: {col}")

    # Reorder columns to match the standard order
    data = data[all_columns]

    dataframes.append(data)

# Concatenate all dataframes at once
print(f"\nConcatenating {len(dataframes)} dataframes...")
annotations = pd.concat(dataframes, ignore_index=True)
print(f"Loaded {len(csv_files)} CSV files with {len(annotations)} total rows")

# Display basic info about the combined dataframe
print(f"\nAnnotations dataframe shape: {annotations.shape}")
print(f"Columns: {len(annotations.columns)}")
print(f"Unique mouseIDs: {annotations['mouseID'].unique()}")
annotations.head()

### Making Plots

In [None]:
# Isolates all cells that express a given gene (cannon_gene) and then plots expression of up to 3 genes in those cells
# Cannon_OR and _AND genes are optional, and can be used to isolate cells that express either or both of the two genes.

# Plot settings ('none' disables the corresponding gene)
cannon_gene = 'Nts'
cannon_OR_gene = 'none'
cannon_AND_gene = 'none'
sub_gene1 = 'Nts'
sub_gene2 = 'Slc32a1'
sub_gene3 = 'Slc17a6'

# Colors for different genes
color1 = 'darkcyan'
color2 = 'darkorange'
color3 = 'darkmagenta'

# Create Plots subfolder within the main filepath
plots_dir = os.path.join(filepath, 'Plots')
os.makedirs(plots_dir, exist_ok=True)

# Get unique mouseIDs from the mouseID column
mouseIDs = sorted(all_data['mouseID'].unique())
print(f"Found mouseIDs: {mouseIDs}")

# Function to create a single subplot
def create_subplot(ax, data, secname):
    """Scatter the cells of one section onto `ax`, one colour per sub-gene.

    Parameters:
    ax: matplotlib Axes to draw on
    data: DataFrame of the (already pre-filtered) cells of one section
    secname: title for the subplot

    Reads sub_gene1..3 and color1..3 from the notebook globals. When
    sub_gene1 is 'none', all rows of `data` are drawn in color1.
    """
    # Get cells positive for each gene
    sub_cells1 = pos_cells(data=data, gene=sub_gene1) if sub_gene1 != 'none' else data
    sub_cells2 = pos_cells(data=data, gene=sub_gene2) if sub_gene2 != 'none' else None
    sub_cells3 = pos_cells(data=data, gene=sub_gene3) if sub_gene3 != 'none' else None

    # Plot points for each gene
    ax.scatter(sub_cells1['Centroid X µm'], sub_cells1['Centroid Y µm'],
               color=color1, alpha=0.5, s=25, label=sub_gene1)

    if sub_cells2 is not None:
        ax.scatter(sub_cells2['Centroid X µm'], sub_cells2['Centroid Y µm'],
                   color=color2, alpha=0.5, s=25, label=sub_gene2)

    if sub_cells3 is not None:
        ax.scatter(sub_cells3['Centroid X µm'], sub_cells3['Centroid Y µm'],
                   color=color3, alpha=0.5, s=25, label=sub_gene3)

    # Set plot properties (Y limits are given high-to-low to flip the axis)
    ax.set_xlim(0, 2500)
    ax.set_ylim(2500, 0)
    ax.set_title(secname, fontsize=10)
    # NOTE(review): ax.axis('equal') may re-enable autoscaling and override the
    # limits set just above — confirm the 0-2500 window is what gets drawn.
    ax.axis('equal')
    ax.grid(False)
    ax.legend(loc='lower center', fontsize=8)

# Process each mouse
for mouse in mouseIDs:
    # Filter data for this mouse
    mouseonly = all_data[all_data['mouseID'] == mouse]

    # Get unique section numbers for this mouse from the Image column
    # (the two characters following 'Image_' in the image name)
    sections = sorted(set([int(image.split('Image_')[1][:2]) for image in mouseonly['Image'].unique()]))
    print(f"Mouse {mouse} has sections: {sections}")

    # Calculate grid dimensions
    n_sections = len(sections)
    n_cols = min(3, n_sections)  # Max 3 columns
    n_rows = (n_sections + n_cols - 1) // n_cols  # Ceiling division

    # Create figure for this mouse
    fig = plt.figure(figsize=(5*n_cols, 5*n_rows))
    fig.suptitle(f'Mouse {mouse}', fontsize=16)

    # Create subplots for each section
    for i, section in enumerate(sections, 1):
        # Filter data for this section using Image column
        section_str = f"{section:02d}"  # Convert to zero-padded 2-digit string
        seconly = mouseonly[mouseonly['Image'].str.contains(f'Image_{section_str}')]

        # Select the cells to plot and build the matching subplot title
        if cannon_gene == 'none':
            secname = f'Section {section}'
            cannon_cells = seconly
        else:
            if cannon_OR_gene == 'none':
                filter_label = f'{cannon_gene}+'
                cannon_cells = pos_cells(data=seconly, gene=cannon_gene)
            else:
                filter_label = f'{cannon_gene}+ or {cannon_OR_gene}+'
                cannon_cells = posOR_cells(data=seconly, gene1=cannon_gene, gene2=cannon_OR_gene)

            if cannon_AND_gene != 'none':
                # Appended to whichever label applies, so the title never
                # mentions an OR gene that was not used
                filter_label = f'{filter_label} and {cannon_AND_gene}+'
                cannon_cells = pos_cells(data=cannon_cells, gene=cannon_AND_gene)

            secname = f'Section {section}\n{filter_label}'

        # Create subplot
        ax = fig.add_subplot(n_rows, n_cols, i)
        create_subplot(ax, cannon_cells, secname)

    # Adjust layout and save
    plt.tight_layout()
    plt.savefig(os.path.join(plots_dir, f'{mouse}_all_sections.pdf'))
    plt.show()
    plt.close()

### Plot one gene with shading based on intensity

In [None]:
#Plots all cells positive for a single gene, with shading based on fluorescence intensity.
# Color scaling set to 10-90th percentile for each mouse to prevent outliers from dominating the plot, can be adjusted below

gene = 'Nts'

# Measurement column used BOTH for the colour of each point and for the
# percentile limits of the colour scale, so the scale always describes the
# values that are drawn.
# NOTE(review): the limits were previously taken from f'Cell: {gene} mean'
# while the points were coloured by the nucleus mean — switch this one line
# if the whole-cell measurement is the intended one.
intensity_col = f'Nucleus: {gene} mean'

# Create Plots subfolder within the main filepath
plots_dir = os.path.join(filepath, 'Plots')
os.makedirs(plots_dir, exist_ok=True)

# Get unique mouseIDs from the mouseID column
mouseIDs = sorted(all_data['mouseID'].unique())
print(f"Found mouseIDs: {mouseIDs}")

# Function to create a single subplot
def create_intensity_subplot(ax, cells, secname, vmin, vmax):
    """Scatter the cells of one section onto `ax`, shaded by intensity.

    Parameters:
    ax: matplotlib Axes to draw on
    cells: DataFrame of the cells to draw
    secname: title for the subplot
    vmin, vmax: limits of the colour scale

    Returns:
    the scatter artist (used by the caller to attach a colorbar)

    The intensity column is read from the notebook global `intensity_col`.
    """
    scatter = ax.scatter(cells['Centroid X µm'], cells['Centroid Y µm'],
                         c=cells[intensity_col],  # Using raw intensity values
                         cmap='Blues',
                         vmin=vmin, vmax=vmax,
                         s=25)

    # Set plot properties (Y limits are given high-to-low to flip the axis)
    ax.set_xlim(0, 2500)
    ax.set_ylim(2500, 0)
    ax.set_title(secname, fontsize=10)
    ax.axis('equal')
    ax.grid(False)

    return scatter

# Process each mouse
for mouse in mouseIDs:
    # Filter data for this mouse
    mouseonly = all_data[all_data['mouseID'] == mouse]

    # Calculate intensity limits for this mouse only, from the same cells
    # (exact gene match via pos_cells) that are plotted below
    mouse_intensities = pos_cells(data=mouseonly, gene=gene)[intensity_col]
    vmin = mouse_intensities.quantile(0.1)  # 10th percentile
    vmax = mouse_intensities.quantile(0.9)  # 90th percentile
    print(f"Mouse {mouse} intensity range: {vmin:.2f} to {vmax:.2f}")

    # Get unique section numbers for this mouse from the Image column
    sections = sorted(set([int(image.split('Image_')[1][:2]) for image in mouseonly['Image'].unique()]))
    print(f"Mouse {mouse} has sections: {sections}")

    # Calculate grid dimensions
    n_sections = len(sections)
    n_cols = min(3, n_sections)
    n_rows = (n_sections + n_cols - 1) // n_cols

    # Create figure for this mouse
    fig = plt.figure(figsize=(5*n_cols, 5*n_rows))
    fig.suptitle(f'Mouse {mouse} - {gene} Raw Intensity', fontsize=16)

    # Create subplots for each section
    for i, section in enumerate(sections, 1):
        # Filter data for this section using Image column
        section_str = f"{section:02d}"  # Convert to zero-padded 2-digit string
        seconly = mouseonly[mouseonly['Image'].str.contains(f'Image_{section_str}')]
        secname = f'Section {section}'
        cells = pos_cells(data=seconly, gene=gene)

        # Create subplot
        ax = fig.add_subplot(n_rows, n_cols, i)
        scatter = create_intensity_subplot(ax, cells, secname, vmin, vmax)

        # Add colorbar to the first subplot only (all subplots share vmin/vmax)
        if i == 1:
            plt.colorbar(scatter, ax=ax, label=f'{gene} Raw Intensity')

    # Adjust layout and save
    plt.tight_layout()
    plt.savefig(os.path.join(plots_dir, f'{mouse}_{gene}_raw_intensity_all_sections.pdf'),
                bbox_inches='tight', dpi=300)
    plt.show()
    plt.close()

### Rostral-Caudal Cell Counts

In [None]:
#Counts cells positive for each gene in each region (defined in QuPath), separated by section.

#pre_subset_gene is optional, and can be used to isolate only cells positive for a given gene.
#pre_subset_not_genes are optional, and can be used to count cells negative for a given gene(s) (i.e. Nts cells with no Vglut2 or Vgat)

pre_subset_gene = 'none'
pre_subset_not_gene1 = 'none'
pre_subset_not_gene2 = 'none'
# Every gene name that appears in any classification string
genelist = sorted(set(
    gene.strip() 
    for classification in all_data['Classification'].dropna().unique()
    for gene in classification.split(':')
    if gene.strip()  # This will exclude empty strings and whitespace-only entries
))

# Get unique overlapping regions (';'-separated; 'DAPI cells' is not a region)
all_regions = set()
for regions_str in all_data['Overlapping Regions'].dropna().unique():
    regions = [r.strip() for r in regions_str.split(';') if r.strip() and r.strip() != 'DAPI cells']
    all_regions.update(regions)

overlapping_regions = sorted(all_regions)
print(f"Found overlapping regions: {overlapping_regions}")

# Create main output directory
main_plots_dir = os.path.join(filepath, 'Counts_by_Region')
os.makedirs(main_plots_dir, exist_ok=True)

# Initialize dictionaries to store data for all regions
all_raw_data = {}
all_normalized_data = {}

# Process each overlapping region separately
for region in overlapping_regions:
    print(f"\n=== Processing region: {region} ===")
    
    # Filter data to only include cells in this overlapping region
    region_data = all_data[all_data['Overlapping Regions'].apply(lambda x: region in [r.strip() for r in str(x).split(';')] if pd.notna(x) else False)]
    print(f"Found {len(region_data)} cells in {region}")
    
    if len(region_data) == 0:
        print(f"No data found for region {region}, skipping...")
        continue
    
    # Initialize lists to store data for this region
    region_raw_rows = []
    region_normalized_rows = []
    
    for mouse in mouseIDs:
        # Filter data for specific mouse and region
        mouseonly = region_data[region_data['mouseID'] == mouse]
        
        if len(mouseonly) == 0:
            print(f"No data for mouse {mouse} in region {region}")
            continue
        
        # Annotations of THIS mouse only. The annotation table holds every
        # mouse, and section numbers repeat across mice, so looking a section
        # up without this filter can return another mouse's region area.
        mouse_annotations = annotations[annotations['mouseID'] == mouse]
        if len(mouse_annotations) == 0:
            print(f"  Warning: no annotation rows with mouseID '{mouse}'; normalized counts for this mouse will be empty")
        
        # Get unique section numbers for this mouse from the Image column.
        # Done before pre-subsetting so sections with zero matching cells are
        # still reported.
        sections = sorted(set([int(image.split('Image_')[1][:2]) for image in mouseonly['Image'].unique()]))
        print(f"Mouse {mouse} in {region} has sections: {sections}")
        
        # Pre-subset by gene if specified
        if pre_subset_gene != 'none':
            mouseonly = pos_cells(data=mouseonly, gene=pre_subset_gene)
            print(f"Pre-subsetting data to {pre_subset_gene}+ cells")

        if pre_subset_not_gene1 != 'none':
            mouseonly = neg_cells(data=mouseonly, gene=pre_subset_not_gene1)
            print(f"Pre-subsetting data to {pre_subset_not_gene1}- cells")

        if pre_subset_not_gene2 != 'none':
            mouseonly = neg_cells(data=mouseonly, gene=pre_subset_not_gene2)
            print(f"Pre-subsetting data to {pre_subset_not_gene2}- cells")
        
        for section in sections:
            # Filter data for specific section using Image column
            section_str = f"{section:02d}"  # Convert to zero-padded 2-digit string
            seconly = mouseonly[mouseonly['Image'].str.contains(f'Image_{section_str}')]
            
            # Find the region area for this section in annotations (if available)
            section_annotations = mouse_annotations[mouse_annotations['Image'].str.contains(f'Image_{section_str}')]
            region_annotation = section_annotations[section_annotations['Name'] == region]
            
            if len(region_annotation) > 0:
                region_area = region_annotation['Area µm^2'].iloc[0]  # Get the area in square microns
                print(f"  Section {section}: {region} area = {region_area:.2f} µm²")
            else:
                print(f"  Warning: No {region} annotation found for section {section}, using PAG area as fallback")
                # Fallback to PAG area if region annotation not found
                pag_annotation = section_annotations[section_annotations['Name'] == 'PAG']
                if len(pag_annotation) > 0:
                    region_area = pag_annotation['Area µm^2'].iloc[0]
                else:
                    region_area = None
            
            # Count cells for each gene
            for gene in genelist:
                sub_cells = pos_cells(data=seconly, gene=gene)
                cell_count = len(sub_cells)
                
                # Add raw count row
                region_raw_rows.append({
                    'Gene': gene,
                    'Section': section,
                    'MouseID': mouse,
                    'Count': cell_count
                })
                
                # Calculate normalized count (cells per square micron)
                if region_area is not None and region_area > 0:
                    normalized_count = cell_count / region_area
                else:
                    # NaN rather than None keeps the column numeric even when
                    # no area is available for any row
                    normalized_count = np.nan
                
                # Add normalized count row
                region_normalized_rows.append({
                    'Gene': gene,
                    'Section': section,
                    'MouseID': mouse,
                    'Normalized_Count': normalized_count,
                    'Area_um2': region_area
                })
    
    # Convert to DataFrames and store in dictionaries
    if region_raw_rows:
        raw_df = pd.DataFrame(region_raw_rows)
        # Pivot to have mice as columns, with MultiIndex for Gene and Section
        raw_pivot = raw_df.pivot_table(
            index=['Gene', 'Section'], 
            columns='MouseID', 
            values='Count', 
            fill_value=0
        )
        all_raw_data[region] = raw_pivot
        
        normalized_df = pd.DataFrame(region_normalized_rows)
        # Pivot normalized data - mice as columns, genes and sections as rows
        normalized_pivot = normalized_df.pivot_table(
            index=['Gene', 'Section'], 
            columns='MouseID', 
            values='Normalized_Count', 
            fill_value=None
        )
        all_normalized_data[region] = normalized_pivot
        
        print(f"Processed {len(raw_df)} records for {region}")

# Save all raw counts to one Excel file (one sheet per region)
raw_counts_file = os.path.join(main_plots_dir, 'All_Regions_Raw_Counts.xlsx')
with pd.ExcelWriter(raw_counts_file, engine='openpyxl') as writer:
    for region, data in all_raw_data.items():
        # Clean sheet name 
        sheet_name = region.replace(' ', '_').replace('/', '_')[:31]  # Excel limit is 31 chars
        data.to_excel(writer, sheet_name=sheet_name)
        print(f"Added raw counts for {region} to Excel file")

print(f"Saved all raw counts to: {raw_counts_file}")

# Save all normalized counts to one Excel file (one sheet per region)
normalized_counts_file = os.path.join(main_plots_dir, 'All_Regions_Normalized_Counts.xlsx')
with pd.ExcelWriter(normalized_counts_file, engine='openpyxl') as writer:
    for region, data in all_normalized_data.items():
        # Clean sheet name 
        sheet_name = region.replace(' ', '_').replace('/', '_')[:31]  # Excel limit is 31 chars
        data.to_excel(writer, sheet_name=sheet_name)
        print(f"Added normalized counts for {region} to Excel file")

print(f"Saved all normalized counts to: {normalized_counts_file}")

print(f"\nCompleted processing all overlapping regions. Results saved in: {main_plots_dir}")
print(f"Created {len(all_raw_data)} region tabs in each Excel file")

### Dorsal-Ventral Plots

In [None]:
# Function to find minimum Y coordinate for normalization
def min_y(data):
    """Return the smallest 'Centroid Y µm' value in `data`, or 0 if `data` has no rows."""
    return data['Centroid Y µm'].min() if len(data) else 0
    
# Highest section number found in any image name (the two characters after 'Image_')
num_sections = max(
    int(image.split('Image_')[1][:2]) 
    for image in all_data['Image'].unique()
)

# Add minimum Y values and adjusted Y coordinates to data
adj_data_dv = all_data.copy()

# Section number of every row, parsed the same way as above
section_of_row = adj_data_dv['Image'].map(lambda image: int(image.split('Image_')[1][:2]))

# Minimum Y of each (mouse, section) group, written back onto that group's own
# rows. transform() aligns on the row index, so the result is correct whatever
# order the rows of all_data are in; building a list mouse-by-mouse and
# assigning it positionally is only right when the rows already happen to be
# sorted by mouse and then section.
adj_data_dv['min_y_vec'] = (
    adj_data_dv
    .groupby([adj_data_dv['mouseID'], section_of_row])['Centroid Y µm']
    .transform('min')
)
adj_data_dv['YMinAdj'] = adj_data_dv['Centroid Y µm'] - adj_data_dv['min_y_vec']

# Kept as a plain list, in row order, under its original name
min_y_vec = adj_data_dv['min_y_vec'].tolist()


In [None]:
# Generates a histogram of the dorsal-ventral distribution of cells positive for a given gene.

cannon_gene = 'Nts'
cannon_OR_gene = 'none'
cannon_AND_gene = 'none'

color = 'darkcyan'

# For smoothed density plot
from scipy.stats import gaussian_kde

# Create output directory if it doesn't exist
plots_dir = os.path.join(filepath, 'DV Histograms')
os.makedirs(plots_dir, exist_ok=True)

# Process data for all mice: select the matching cells of each mouse and pool
# them, so the plot covers every mouse rather than only the last one visited
selected_per_mouse = []

for mouse in mouseIDs:
    mouseonly = adj_data_dv[adj_data_dv['mouseID'] == mouse]
    
    if cannon_OR_gene == 'none':
        mouse_cells = pos_cells(data=mouseonly, gene=cannon_gene)
    else:
        mouse_cells = posOR_cells(data=mouseonly, gene1=cannon_gene, gene2=cannon_OR_gene)
    
    if cannon_AND_gene != 'none':
        mouse_cells = pos_cells(data=mouse_cells, gene=cannon_AND_gene)
    
    selected_per_mouse.append(mouse_cells)

if selected_per_mouse:
    all_sub_cells = pd.concat(selected_per_mouse, ignore_index=True)
else:
    all_sub_cells = adj_data_dv.iloc[0:0]  # no mice: empty frame with the same columns

# Pooled selection, also available under the name used for the per-mouse subsets
cannon_cells = all_sub_cells

# Adjusted Y positions that feed the density estimate
y_values = all_sub_cells['YMinAdj'].dropna()

# Create histogram plot for dorsal-ventral distribution
plt.figure(figsize=(6, 8))  

# A kernel density estimate needs at least two distinct values
if y_values.nunique() > 1:
    # Create density plot
    density = gaussian_kde(y_values)
    
    # Determine reasonable range based on data
    y_min = y_values.min()
    y_max = y_values.max()
    
    # Add some padding to the range
    y_range = y_max - y_min
    plot_min = max(0, y_min - y_range * 0.1)  
    plot_max = y_max + y_range * 0.1
    
    ys = np.linspace(plot_min, plot_max, 200)
    density_values = density(ys)
    
    # Plot with Y coordinates on Y-axis and density on X-axis
    plt.fill_betweenx(ys, density_values, alpha=0.8, color=color)
    
    plt.ylim(plot_min, plot_max)
    
    # Set x-limit for density (you may need to adjust this based on your data)
    max_density = density_values.max()
    plt.xlim(0, max_density * 1.1)
    
    plt.ylabel('Dorsal-Ventral Position (Y adjusted, µm)', fontsize=12)
    plt.xlabel('Density', fontsize=12)
else:
    plt.text(0.5, 0.5, 'No data available', ha='center', va='center', 
             transform=plt.gca().transAxes, fontsize=12)
    plt.ylim(0, 1000)
    plt.xlim(0, 0.001)

# Invert Y-axis so 0 is at the top (dorsal) and increases downward (ventral)
plt.gca().invert_yaxis()

plt.title(f"Dorsal-Ventral Distribution: {cannon_gene}_OR_{cannon_OR_gene}_AND_{cannon_AND_gene}")
plt.grid(False)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

# Save plot
plt.savefig(os.path.join(plots_dir, f"DV_{cannon_gene}_OR_{cannon_OR_gene}_AND_{cannon_AND_gene}.pdf"))
plt.show()
plt.close()

print(f"Saved dorsal-ventral histogram to: {plots_dir}")
print(f"Data range: Y adjusted from {y_values.min():.1f} to {y_values.max():.1f} µm")

### Euler Plots for gene overlap visualization


In [None]:
def euler_cells_dynamic(data, genes, pos_neg_pattern):
    """
    Count the cells that match a positive/negative pattern over several genes

    Parameters:
    data: DataFrame with a 'Classification' column
    genes: list of gene names
    pos_neg_pattern: list of 'pos' or 'neg' values, same length as genes
                     (any value other than 'pos' is handled as 'neg')

    Returns:
    number of rows that satisfy every gene's requirement; 0 as soon as no
    rows are left

    Gene names are matched exactly against the ':'-separated, whitespace-
    stripped entries of the classification. A missing or blank classification
    counts as negative for every gene.
    """
    remaining = data.copy()

    # Nothing to filter
    if len(remaining) == 0:
        return 0

    def _gene_names(label):
        # List of gene names in a classification, or None when it is missing/blank
        if pd.notna(label) and label.strip() != '':
            return [name.strip() for name in label.split(':')]
        return None

    for position, gene in enumerate(genes):
        want_positive = pos_neg_pattern[position] == 'pos'

        def _keep(label, gene=gene, want_positive=want_positive):
            names = _gene_names(label)
            if names is None:
                # Unclassified cells pass only the 'neg' requirement
                return not want_positive
            return (gene in names) == want_positive

        remaining = remaining[remaining['Classification'].apply(_keep)]

        # Stop early once every row has been filtered out
        if len(remaining) == 0:
            return 0

    return len(remaining)

def euler_values_dynamic(data, genes):
    """
    Count cells for every positive/negative combination used by an Euler diagram
    Works with 2-4 genes

    Parameters:
    data: DataFrame
    genes: list of gene names

    Returns:
    list of counts, one per non-empty subset of genes that is positive while
    all other genes are negative. Subsets are ordered by size (1 positive
    gene, then 2, ...) and, within a size, in itertools.combinations order.

    Raises:
    ValueError if fewer than 2 or more than 4 genes are given
    """
    from itertools import combinations

    gene_total = len(genes)
    if gene_total < 2 or gene_total > 4:
        raise ValueError("Number of genes must be between 2 and 4")

    counts = []
    for n_positive in range(1, gene_total + 1):
        for chosen in combinations(range(gene_total), n_positive):
            # 'pos' at the chosen positions, 'neg' everywhere else
            pattern = ['pos' if idx in chosen else 'neg' for idx in range(gene_total)]
            counts.append(euler_cells_dynamic(data, genes, pattern))

    return counts


In [None]:
# Settings for Euler plots
cannon_gene = 'none'
cannon_OR_gene = 'none'
# Define sub_genes as a list - can be 2, 3, or 4 genes
sub_genes = ['Nts','Slc32a1','Slc17a6']  
colors = ['darkcyan', 'darkorange', 'darkmagenta', 'darkgreen']  
shape = 'ellipse'  # can be 'ellipse' or 'circle'

# Create output directory if it doesn't exist
plots_dir = os.path.join(filepath, 'Euler Plots')
os.makedirs(plots_dir, exist_ok=True)

# Initialize DataFrame for cell counts - columns will depend on number of genes
def get_column_names(n_genes):
    """
    Build the Euler-region column labels for n genes.

    Genes are represented by the letters A-D. One label is produced for every
    non-empty combination of the first n_genes letters, joined with '&'
    (e.g. 'A', 'B', 'A&B'), ordered by combination size and then by
    itertools.combinations order.
    """
    from itertools import combinations

    labels = 'ABCD'[:n_genes]
    return [
        '&'.join(group)
        for size in range(1, n_genes + 1)
        for group in combinations(labels, size)
    ]

# --- Build per-mouse counts for every Euler region -------------------------
n_genes = len(sub_genes)
if not 2 <= n_genes <= 4:
    raise ValueError("Number of sub_genes must be 2, 3, or 4")

# One row per mouse, one column per region label ('A', 'B', 'A&B', ...)
cell_counts = pd.DataFrame(columns=get_column_names(n_genes))

for mouse in mouseIDs:
    mouseonly = all_data[all_data['mouseID'] == mouse]

    # Apply canonical gene filters if specified
    if cannon_gene == 'none':
        cannon_cells = mouseonly
    elif cannon_OR_gene == 'none':
        cannon_cells = pos_cells(data=mouseonly, gene=cannon_gene)
    else:
        cannon_cells = posOR_cells(data=mouseonly, gene1=cannon_gene, gene2=cannon_OR_gene)

    # euler_values_dynamic returns counts in the same order as get_column_names
    counts = euler_values_dynamic(cannon_cells, sub_genes)
    cell_counts.loc[mouse] = counts

# Sum counts across all mice
totals = cell_counts.sum()

# --- Output locations -------------------------------------------------------
# Timestamped names avoid overwriting earlier plots/scripts
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
output_base = f"{'_'.join(sub_genes)}_{timestamp}"
# R string literals treat backslashes as escapes, so use forward slashes
output_path = os.path.join(plots_dir, f'{output_base}_euler.pdf').replace('\\', '/')
png_path = os.path.join(plots_dir, f'{output_base}_euler.png').replace('\\', '/')

# Create color string for R
color_string = ', '.join(f'"{c}"' for c in colors[:n_genes])

# --- Translate region labels (A&B) into gene names (Nts&Slc32a1) ------------
# The mapping does not change between regions, so build it once
letter_to_gene = dict(zip(['A', 'B', 'C', 'D'], sub_genes))

print("Column mappings for Euler plot:")
print(f"Sub-genes: {sub_genes}")
count_lines = []
for col, val in totals.items():
    # Unknown parts are kept as-is (fallback if something goes wrong)
    r_name = '&'.join(letter_to_gene.get(part, part) for part in col.split('&'))
    print(f"  {col} -> {r_name} = {int(val)}")
    count_lines.append(f"    `{r_name}` = {int(val)}")

# --- Assemble the R script --------------------------------------------------
# The same plot call is emitted twice: once for the PDF device, once for PNG
plot_call = (
    "plot(fit,\n"
    f"     fills = c({color_string}),\n"
    "     alpha = 0.4,\n"
    "     quantities = TRUE,\n"
    '     main = "Gene Expression Overlap")\n'
)

r_script_parts = [
    "library(eulerr)\n\n",
    "# Create the counts\n",
    "counts <- c(\n",
    # Joining with ',\n' leaves no trailing comma after the last entry
    ",\n".join(count_lines) + "\n",
    ")\n\n",
    "# Create euler diagram\n",
    f'fit <- euler(counts, shape = "{shape}")\n\n',
    "# Save as PDF\n",
    f'pdf("{output_path}", width=8, height=8)\n',
    plot_call,
    "dev.off()\n\n",
    "# Save as PNG for notebook display\n",
    f'png("{png_path}", width=800, height=800)\n',
    plot_call,
    "dev.off()\n",
]

r_script = ''.join(r_script_parts)

# Write R script to file with timestamp
script_path = os.path.join(plots_dir, f'euler_script_{timestamp}.R')
with open(script_path, 'w') as f:
    f.write(r_script)

# Preview exactly the count lines that were written to the script
print("\nR script content preview:")
print("counts <- c(")
print(",\n".join(count_lines))
print(")")

# --- Locate Rscript ---------------------------------------------------------
# Prefer the configured install; fall back to whatever Rscript is on PATH so the
# notebook also runs on machines with a different R version or operating system
import shutil

r_path = "C:/Program Files/R/R-4.1.2/bin/x64/Rscript.exe"
if not os.path.isfile(r_path):
    r_on_path = shutil.which("Rscript")
    if r_on_path is None:
        raise FileNotFoundError(
            f"Rscript not found at '{r_path}' or on PATH. Install R (with the "
            "eulerr package) or update r_path to point at Rscript."
        )
    r_path = r_on_path

# Execute R script with error capture
try:
    result = subprocess.run([r_path, script_path],
                            check=True,
                            capture_output=True,
                            text=True)
    print("\nR script executed successfully!")
    print(f"Files created: {output_base}_euler.pdf and {output_base}_euler.png")
except subprocess.CalledProcessError as e:
    # Surface R's own error output before re-raising
    print(f"R script failed with return code {e.returncode}")
    print(f"STDERR: {e.stderr}")
    raise

# Display the plot
from IPython.display import Image, display
display(Image(png_path))

Calculating Nts cells expressing any other peptide

In [None]:
def cells_expressing_any(data, gene_list):
    """
    Return the rows of `data` whose 'Classification' contains at least one
    gene from `gene_list`.

    The classification string is split on ':' and each piece is stripped of
    surrounding whitespace before an exact comparison, so 'Cck2' does not
    match 'Cck'. Rows with a missing classification are excluded.
    """
    def _has_listed_gene(classification):
        if pd.isna(classification):
            return False
        for token in classification.split(':'):
            if token.strip() in gene_list:
                return True
        return False

    keep = data['Classification'].apply(_has_listed_gene)
    return data[keep]


# Subset all cells to those classified as Nts-positive, then keep the ones that
# are also positive for at least one of the listed peptide genes
nts_cells = pos_cells(data=all_data, gene='Nts')
anypep = cells_expressing_any(data=nts_cells, gene_list=['Adcyap1', 'Cck', 'Pdyn', 'Penk', 'Tac1'])

# Report totals pooled across the whole of all_data (not split by mouse)
print("number of Nts cells:", len(nts_cells))
print("number of Nts cells expressing any other peptide:", len(anypep))

Calculating the number of Nts cells positive for each possible number of peptide genes

In [None]:
# Settings 
# Cells are first restricted to those positive for primary_gene; each gene in
# coexpression_genes is then checked within that subset
primary_gene = 'Nts'  # Gene to subset by (e.g., 'Nts')
coexpression_genes = ['Adcyap1', 'Cck', 'Pdyn', 'Penk', 'Tac1']  # List of genes to check co-expression
#coexpression_genes = ['Ntsr1']

# Optional: filter by region first (set to None to use all data)
# When set, it is matched exactly against the ';'-separated 'Overlapping Regions' entries
target_region = None  

# Helper function to count genes expressed in a cell
def count_coexpressed_genes(classification, gene_list):
    """
    Return how many entries of gene_list appear in a cell's classification.

    The classification is split on ':' and each piece is whitespace-stripped,
    so only exact gene names match. A missing classification counts as 0.
    """
    if pd.isna(classification):
        return 0

    tokens = [piece.strip() for piece in classification.split(':')]
    hits = 0
    for gene in gene_list:
        if gene in tokens:
            hits += 1
    return hits

def is_gene_positive(classification, gene):
    """
    Return True when `gene` is one of the ':'-separated, whitespace-stripped
    entries of `classification`; False otherwise or when it is missing.
    """
    if pd.notna(classification):
        return gene in map(str.strip, classification.split(':'))
    return False


# Start with all data or filter by region
if target_region is not None:
    # Use exact region matching 
    # ('Overlapping Regions' is split on ';' and each entry stripped before comparing)
    working_data = all_data[all_data['Overlapping Regions'].apply(
        lambda x: target_region in [r.strip() for r in str(x).split(';')] if pd.notna(x) else False
    )]
    print(f"Filtered to {target_region}: {len(working_data)} cells")
else:
    working_data = all_data.copy()
    print(f"Using all data: {len(working_data)} cells")

# Subset to primary gene positive cells
primary_positive = pos_cells(working_data, primary_gene)
print(f"Found {len(primary_positive)} {primary_gene}+ cells")

# Calculate co-expression statistics per mouse
# Each list collects one dict (one output row) per mouse
coexpression_results = []
distribution_results = []

for mouse in mouseIDs:
    mouse_data = primary_positive[primary_positive['mouseID'] == mouse]
    total_cells = len(mouse_data)
    
    # Mice without any primary+ cells are skipped, which also avoids dividing by zero below
    if total_cells == 0:
        print(f"No {primary_gene}+ cells found for mouse {mouse}")
        continue
    
    print(f"\nMouse {mouse}: {total_cells} {primary_gene}+ cells")
    
    # --- Part 1: Percent co-expressing each gene ---
    coexp_row = {'MouseID': mouse, 'Total_Primary_Positive': total_cells}
    
    for gene in coexpression_genes:
        # Count cells positive for this gene
        positive_count = mouse_data['Classification'].apply(
            lambda x: is_gene_positive(x, gene)
        ).sum()
        
        # Percentage is relative to this mouse's primary+ cells
        percent_positive = (positive_count / total_cells) * 100
        coexp_row[f'{gene}_count'] = positive_count
        coexp_row[f'{gene}_percent'] = percent_positive
        
        print(f"  {gene}: {positive_count} cells ({percent_positive:.1f}%)")
    
    coexpression_results.append(coexp_row)
    
    # --- Part 2: Distribution of number of co-expressed genes ---
    # Count how many of the coexpression_genes each cell expresses
    # Copy first so the new column is added to an independent frame rather than
    # to a filtered slice of primary_positive
    mouse_data = mouse_data.copy()
    mouse_data['num_coexpressed'] = mouse_data['Classification'].apply(
        lambda x: count_coexpressed_genes(x, coexpression_genes)
    )
    
    # Calculate distribution (0 to max possible genes)
    max_genes = len(coexpression_genes)
    dist_row = {'MouseID': mouse, 'Total_Primary_Positive': total_cells}
    
    print(f"  Co-expression distribution:")
    for n in range(0, max_genes + 1):
        # Cells expressing exactly n of the co-expression genes
        count_n = (mouse_data['num_coexpressed'] == n).sum()
        percent_n = (count_n / total_cells) * 100
        dist_row[f'{n}_genes_count'] = count_n
        dist_row[f'{n}_genes_percent'] = percent_n
        print(f"    {n} genes: {count_n} cells ({percent_n:.1f}%)")
    
    distribution_results.append(dist_row)

# Create summary DataFrames
coexpression_df = pd.DataFrame(coexpression_results)
distribution_df = pd.DataFrame(distribution_results)

# display is the IPython/Jupyter rich-output function (imported in the Euler plot cell)
print("\n" + "="*60)
print("CO-EXPRESSION SUMMARY (% of primary+ cells expressing each gene)")
print("="*60)
display(coexpression_df)

print("\n" + "="*60)
print("DISTRIBUTION SUMMARY (% of primary+ cells co-expressing N genes)")
print("="*60)
display(distribution_df)

# Save to Excel
output_dir = os.path.join(filepath, 'Coexpression_Analysis')
os.makedirs(output_dir, exist_ok=True)

# File name records the primary gene and the region filter used for this run
region_suffix = f"_{target_region}" if target_region else "_all_regions"
output_file = os.path.join(output_dir, f'{primary_gene}_coexpression{region_suffix}.xlsx')

# One workbook with one sheet per summary table
with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
    coexpression_df.to_excel(writer, sheet_name='Gene_Coexpression', index=False)
    distribution_df.to_excel(writer, sheet_name='Num_Genes_Distribution', index=False)

print(f"\nResults saved to: {output_file}")

Calculating intensity of Nts staining across subdivisions

In [None]:
# Gene whose positive cells are analysed; also used to build the intensity column name below
target_gene = 'Nts'  

# The intensity column name pattern is typically "Nucleus: {gene} mean"
# Adjust if your column naming is different
intensity_column = f'Nucleus: {target_gene} mean'

# Helper function for exact region matching
def has_exact_region(regions_str, target_region):
    """
    Return True if target_region is one of the ';'-separated region names in
    regions_str (whitespace-stripped, whole-name comparison rather than a
    substring search). A missing regions_str yields False.
    """
    if pd.isna(regions_str):
        return False

    for name in regions_str.split(';'):
        if name.strip() == target_region:
            return True
    return False

# Get all unique regions
# 'Overlapping Regions' holds ';'-separated names; the 'DAPI cells' entry and
# empty pieces are ignored when collecting region names
all_regions = set()
for regions_str in all_data['Overlapping Regions'].dropna().unique():
    regions = [r.strip() for r in regions_str.split(';') if r.strip() and r.strip() != 'DAPI cells']
    all_regions.update(regions)

overlapping_regions = sorted(all_regions)
print(f"Found regions: {overlapping_regions}")

# Verify intensity column exists
# Row filtering below never changes the columns, so this check is done once
has_intensity_column = intensity_column in all_data.columns
if not has_intensity_column:
    print(f"\nWARNING: Column '{intensity_column}' not found!")
    print("Available columns containing 'mean':")
    mean_cols = [col for col in all_data.columns if 'mean' in col.lower()]
    for col in mean_cols[:20]:  # Show first 20
        print(f"  - {col}")
    # Try to suggest the correct column
    gene_cols = [col for col in mean_cols if target_gene.lower() in col.lower()]
    if gene_cols:
        print(f"\nColumns matching '{target_gene}':")
        for col in gene_cols:
            print(f"  - {col}")
else:
    print(f"Using intensity column: {intensity_column}")

# Filter to gene-positive cells
gene_positive_cells = pos_cells(all_data, target_gene)
print(f"\nFound {len(gene_positive_cells)} {target_gene}+ cells total")

# Calculate mean intensity per region per mouse
# One dict (one long-format output row) per region/mouse pair that has cells
intensity_results = []

for region in overlapping_regions:
    print(f"\n=== Region: {region} ===")

    # Filter gene-positive cells to this EXACT region
    region_cells = gene_positive_cells[gene_positive_cells['Overlapping Regions'].apply(
        lambda x: has_exact_region(x, region)
    )]

    print(f"  {len(region_cells)} {target_gene}+ cells in {region}")

    if len(region_cells) == 0:
        continue

    for mouse in mouseIDs:
        mouse_cells = region_cells[region_cells['mouseID'] == mouse]
        n_cells = len(mouse_cells)

        if n_cells == 0:
            continue

        # Calculate mean intensity
        if has_intensity_column:
            intensities = mouse_cells[intensity_column]
            mean_intensity = intensities.mean()
            std_intensity = intensities.std()
            # Series.sem divides by the number of non-missing values, matching
            # the observations mean/std actually used; dividing by the row
            # count would understate the SEM when some intensities are missing.
            # It is NaN when fewer than two values are available.
            sem_intensity = intensities.sem()
        else:
            # Column missing: keep the row (with its cell count) but no statistics
            mean_intensity = np.nan
            std_intensity = np.nan
            sem_intensity = np.nan

        intensity_results.append({
            'Region': region,
            'MouseID': mouse,
            'N_Cells': n_cells,
            'Mean_Intensity': mean_intensity,
            'Std_Intensity': std_intensity,
            'SEM_Intensity': sem_intensity
        })

        print(f"    Mouse {mouse}: {n_cells} cells, Mean = {mean_intensity:.2f} ± {sem_intensity:.2f} (SEM)")

# Create summary DataFrame
intensity_df = pd.DataFrame(intensity_results)

# Pivot for easier viewing - mice as columns, regions as rows
# (skipped when there are no rows, since the pivot columns would not exist)
if len(intensity_df) > 0:
    pivot_mean = intensity_df.pivot_table(
        index='Region',
        columns='MouseID',
        values='Mean_Intensity'
    )

    pivot_n = intensity_df.pivot_table(
        index='Region',
        columns='MouseID',
        values='N_Cells'
    )

    print("\n" + "="*60)
    print(f"MEAN {target_gene} INTENSITY BY REGION AND MOUSE")
    print("="*60)
    display(pivot_mean)

    print("\n" + "="*60)
    print("CELL COUNTS BY REGION AND MOUSE")
    print("="*60)
    display(pivot_n)

# Save to Excel
output_dir = os.path.join(filepath, 'Intensity_Analysis')
os.makedirs(output_dir, exist_ok=True)

output_file = os.path.join(output_dir, f'{target_gene}_intensity_by_region.xlsx')

with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
    # Full data (long format)
    intensity_df.to_excel(writer, sheet_name='Full_Data', index=False)

    # Pivoted views (only exist when there was at least one result row)
    if len(intensity_df) > 0:
        pivot_mean.to_excel(writer, sheet_name='Mean_Intensity_Pivot')
        pivot_n.to_excel(writer, sheet_name='Cell_Counts_Pivot')

print(f"\nResults saved to: {output_file}")