# Yale Colon TMA Spatial Transcriptomics Analysis

This notebook analyzes spatial transcriptomics data from Yale's Colon Tissue Microarray (TMA) dataset.

## Data Overview
The dataset contains:
- `Colon_TMA_exprMat_file.csv.gz`: Gene expression matrix
- `Colon_TMA_fov_positions_file.csv.gz`: Field of view positions
- `Colon_TMA_metadata_file.csv.gz`: Sample metadata
- `Colon_TMA_tx_file.csv.gz`: Transcript information
- `Colon_TMA-polygons.csv.gz`: Spatial polygon coordinates

## Analysis Pipeline
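1. Data loading and exploration
2. Data preprocessing and quality control
3. Gene expression analysis (filtering, normalization, highly variable genes, PCA)
4. Clustering and dimensionality reduction
5. Spatial analysis
6. Differential expression analysis
7. Gene expression visualization
8. Summary and export of results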


In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import anndata as ad
from scipy import sparse
import warnings
warnings.filterwarnings('ignore')

# Configure scanpy once for the whole notebook:
#   - verbosity 3 makes scanpy emit its more detailed progress messages
#   - figures are rendered at 300 dpi on a white background
sc.settings.verbosity = 3
sc.settings.set_figure_params(facecolor='white', dpi=300)

print("Libraries imported successfully!")


Libraries imported successfully!


## 1. Data Loading and Exploration


In [None]:
# Read the five gzip-compressed CSV files that make up the dataset.
print("Loading data files...")


def _read_gz_csv(path, **read_kwargs):
    """Read one gzip-compressed CSV file into a DataFrame."""
    return pd.read_csv(path, compression='gzip', **read_kwargs)


# Expression matrix: the first CSV column is used as the row index.
# NOTE(review): confirm that this first column is a unique per-spot/cell ID.
expr_mat = _read_gz_csv('Colon_TMA_exprMat_file.csv.gz', index_col=0)
print(f"Expression matrix shape: {expr_mat.shape}")

# Field-of-view positions
fov_positions = _read_gz_csv('Colon_TMA_fov_positions_file.csv.gz')
print(f"FOV positions shape: {fov_positions.shape}")

# Sample metadata
metadata = _read_gz_csv('Colon_TMA_metadata_file.csv.gz')
print(f"Metadata shape: {metadata.shape}")

# Transcript table
tx_file = _read_gz_csv('Colon_TMA_tx_file.csv.gz')
print(f"Transcript file shape: {tx_file.shape}")

# Polygon coordinates
polygons = _read_gz_csv('Colon_TMA-polygons.csv.gz')
print(f"Polygons shape: {polygons.shape}")

print("\nData loading completed!")


Loading data files...


In [None]:
# Print a quick structural overview of every loaded table.

# Expression matrix: shape, a peek at both axes, dtype and memory footprint.
expr_mem_mb = expr_mat.memory_usage(deep=True).sum() / 1024**2
first_spots = expr_mat.index[:5].tolist()
first_genes = expr_mat.columns[:5].tolist()

print("=== EXPRESSION MATRIX ===")
print(f"Shape: {expr_mat.shape}")
print(f"Index (spots/cells): {first_spots}...")
print(f"Columns (genes): {first_genes}...")
print(f"Data type: {expr_mat.dtypes.iloc[0]}")
print(f"Memory usage: {expr_mem_mb:.2f} MB")

# The remaining tables all get the same treatment: header, first rows, column names.
auxiliary_tables = (
    ("FOV POSITIONS", fov_positions),
    ("METADATA", metadata),
    ("TRANSCRIPT FILE", tx_file),
    ("POLYGONS", polygons),
)
for table_title, table in auxiliary_tables:
    print(f"\n=== {table_title} ===")
    print(table.head())
    print(f"\nColumns: {table.columns.tolist()}")


## 2. Data Preprocessing and Quality Control


In [None]:
# Create the AnnData object used by every downstream step.
print("Creating AnnData object...")

# AnnData expects observations (spots/cells) in rows and variables (genes) in
# columns. The exploration cell already reads expr_mat.index as spot/cell IDs
# and expr_mat.columns as genes, so the matrix is used as-is. Transposing it
# would turn genes into observations and break the gene-level steps that
# follow (MT-/ribosomal flags on var_names, gene filtering, HVG selection).
# NOTE(review): confirm the orientation against the printed index/columns; if
# the file really stores genes in rows, transpose here instead.
obs_df = expr_mat.index.to_frame(name='spot_id')
var_df = expr_mat.columns.to_frame(name='gene_id')

# AnnData stores obs/var names as strings; convert explicitly so later joins
# against obs_names compare like with like.
obs_df.index = obs_df.index.astype(str)
var_df.index = var_df.index.astype(str)

adata = ad.AnnData(X=expr_mat.to_numpy(), obs=obs_df, var=var_df)

# Duplicate IDs make label-based alignment of coordinates/metadata ambiguous.
if not adata.obs_names.is_unique:
    print("Warning: spot IDs are not unique; ID-based alignment of coordinates/metadata may be ambiguous")

print(f"AnnData object created with shape: {adata.shape}")
print(f"Observations (spots): {adata.n_obs}")
print(f"Variables (genes): {adata.n_vars}")


In [None]:
# Attach spatial coordinates and sample metadata to `adata`.
#
# Both tables are aligned to adata.obs_names by ID before assignment. Writing
# the raw values positionally fails when the table has a different number of
# rows than adata, and silently assigns coordinates to the wrong spots when the
# row order differs.

# Add spatial coordinates if available
if 'fov_positions' in locals():
    # Try to match spot IDs with spatial coordinates
    if 'spot_id' in fov_positions.columns or 'cell_id' in fov_positions.columns:
        coord_col = 'spot_id' if 'spot_id' in fov_positions.columns else 'cell_id'

        spatial_coords = fov_positions.set_index(coord_col)
        # obs_names are strings, so compare IDs as strings; drop repeated IDs
        # because reindex() needs unique labels.
        spatial_coords.index = spatial_coords.index.astype(str)
        spatial_coords = spatial_coords[~spatial_coords.index.duplicated(keep='first')]

        # Find coordinate columns (x, y positions) among the numeric columns only.
        # NOTE(review): the substring match is loose (any name containing
        # 'x', 'y' or 'pos'); confirm the two columns reported below.
        numeric_cols = spatial_coords.select_dtypes(include='number').columns
        coord_cols = [col for col in numeric_cols if any(coord in col.lower() for coord in ['x', 'y', 'pos'])]

        if len(coord_cols) >= 2:
            aligned_coords = spatial_coords.reindex(adata.obs_names)[coord_cols[:2]]
            n_matched = int(aligned_coords.notna().all(axis=1).sum())
            if n_matched == 0:
                print("No spot IDs in FOV positions match the expression matrix; spatial coordinates not added")
            else:
                adata.obsm['spatial'] = aligned_coords.to_numpy(dtype=float)
                print(f"Added spatial coordinates: {coord_cols[:2]}")
                if n_matched < adata.n_obs:
                    print(f"Warning: {adata.n_obs - n_matched} spots have no spatial coordinates (NaN)")
        else:
            print("Could not identify spatial coordinate columns")
    else:
        print("No spot/cell ID column found in FOV positions")

# Add metadata if available
if 'metadata' in locals():
    # Try to merge metadata
    if 'spot_id' in metadata.columns or 'cell_id' in metadata.columns:
        meta_col = 'spot_id' if 'spot_id' in metadata.columns else 'cell_id'
        metadata_indexed = metadata.set_index(meta_col)
        # Same ID normalization as for the coordinates; without it an integer
        # ID column never matches the string obs_names and every added column
        # comes out all-NaN.
        metadata_indexed.index = metadata_indexed.index.astype(str)
        metadata_indexed = metadata_indexed[~metadata_indexed.index.duplicated(keep='first')]
        metadata_aligned = metadata_indexed.reindex(adata.obs_names)

        # Add metadata to observations, never overwriting existing obs columns
        added_cols = []
        for col in metadata_aligned.columns:
            if col not in adata.obs.columns:
                adata.obs[col] = metadata_aligned[col]
                added_cols.append(col)
        # Report only the columns that were actually added.
        print(f"Added metadata columns: {added_cols}")
    else:
        print("No spot/cell ID column found in metadata")


In [None]:
# Basic quality control metrics
print("Calculating quality control metrics...")

# Flag gene groups whose share of counts is reported per spot.
adata.var['mt'] = adata.var_names.str.startswith('MT-')  # mitochondrial genes
adata.var['ribo'] = adata.var_names.str.match('^RP[SL]')  # ribosomal genes

# Pass the flags as qc_vars: without them scanpy never creates
# 'pct_counts_mt' / 'pct_counts_ribo', and the QC plot that looks for
# 'pct_counts_mt' always falls back to "MT genes not detected".
# This call also writes obs['total_counts'] and obs['n_genes_by_counts'],
# so they are not recomputed by hand.
sc.pp.calculate_qc_metrics(
    adata,
    qc_vars=['mt', 'ribo'],
    percent_top=None,
    log1p=False,
    inplace=True,
)

print("QC metrics calculated!")
print(f"\nQC metrics summary:")
print(f"Total counts per spot - Mean: {adata.obs['total_counts'].mean():.2f}, Std: {adata.obs['total_counts'].std():.2f}")
print(f"Genes per spot - Mean: {adata.obs['n_genes_by_counts'].mean():.2f}, Std: {adata.obs['n_genes_by_counts'].std():.2f}")


In [None]:
# Four-panel overview of the per-spot QC metrics.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
(ax_counts, ax_genes), (ax_scatter, ax_mt) = axes

spot_counts = adata.obs['total_counts']
spot_genes = adata.obs['n_genes_by_counts']

# Top-left: histogram of total counts
ax_counts.hist(spot_counts, bins=50, alpha=0.7)
ax_counts.set_xlabel('Total counts per spot')
ax_counts.set_ylabel('Frequency')
ax_counts.set_title('Distribution of Total Counts')

# Top-right: histogram of detected genes
ax_genes.hist(spot_genes, bins=50, alpha=0.7)
ax_genes.set_xlabel('Number of genes per spot')
ax_genes.set_ylabel('Frequency')
ax_genes.set_title('Distribution of Genes per Spot')

# Bottom-left: total counts against detected genes
ax_scatter.scatter(spot_counts, spot_genes, alpha=0.5, s=1)
ax_scatter.set_xlabel('Total counts per spot')
ax_scatter.set_ylabel('Number of genes per spot')
ax_scatter.set_title('Total Counts vs Genes per Spot')

# Bottom-right: mitochondrial percentage, or a placeholder when the column is absent
has_mt_pct = 'pct_counts_mt' in adata.obs.columns
if not has_mt_pct:
    ax_mt.text(0.5, 0.5, 'MT genes not detected', ha='center', va='center', transform=ax_mt.transAxes)
    ax_mt.set_title('Mitochondrial Gene %')
else:
    ax_mt.hist(adata.obs['pct_counts_mt'], bins=50, alpha=0.7)
    ax_mt.set_xlabel('Mitochondrial gene percentage')
    ax_mt.set_ylabel('Frequency')
    ax_mt.set_title('Distribution of MT Gene %')

plt.tight_layout()
plt.show()


## 3. Gene Expression Analysis


In [None]:
# Filter genes and spots, then normalize and log-transform.
print("Filtering data...")

# Filter genes (keep genes expressed in at least 10 spots)
sc.pp.filter_genes(adata, min_cells=10)
print(f"After gene filtering: {adata.shape}")

# Filter spots (keep spots with at least 200 genes)
sc.pp.filter_cells(adata, min_genes=200)
print(f"After spot filtering: {adata.shape}")

# Keep the untouched counts in a layer so they stay available after
# normalization overwrites adata.X.
adata.layers['counts'] = adata.X.copy()

# Library-size normalization followed by log1p
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# Freeze the log-normalized, all-gene matrix in .raw only AFTER normalization.
# scanpy's rank_genes_groups and gene-colored plots read .raw by default when
# it exists, and the marker test expects logarithmized data. Setting .raw
# before normalization would make those steps run on raw counts.
adata.raw = adata

print("Data preprocessing completed!")


In [None]:
# Flag highly variable genes (HVGs) on the normalized data and plot the selection.
print("Finding highly variable genes...")
hvg_thresholds = dict(min_mean=0.0125, max_mean=3, min_disp=0.5)
sc.pp.highly_variable_genes(adata, **hvg_thresholds)

# Diagnostic plot of the HVG selection
sc.pl.highly_variable_genes(adata)
plt.show()

# The boolean 'highly_variable' column sums to the number of selected genes.
n_hvg = adata.var['highly_variable'].sum()
print(f"Number of highly variable genes: {n_hvg}")


In [None]:
# Restrict to the highly variable genes; .copy() detaches the subset from
# `adata` so the scaling below does not touch the full object.
hvg_mask = adata.var.highly_variable
adata_hvg = adata[:, hvg_mask].copy()
print(f"Data with highly variable genes: {adata_hvg.shape}")

# Scale each gene, clipping values above 10
sc.pp.scale(adata_hvg, max_value=10)

# PCA on the scaled HVG matrix
sc.tl.pca(adata_hvg, svd_solver='arpack')

# Variance explained per component (log scale, first 50 PCs)
sc.pl.pca_variance_ratio(adata_hvg, n_pcs=50, log=True)
plt.show()

print("PCA completed!")


## 4. Clustering and Dimensionality Reduction


In [None]:
# Build the neighbor graph on the PCA space, then derive the UMAP embedding
# and the Leiden clusters from that graph.
print("Computing neighborhood graph...")
sc.pp.neighbors(adata_hvg, n_pcs=40, n_neighbors=10)

# 2-D embedding for visualization
sc.tl.umap(adata_hvg)

# Graph-based clustering; labels are written to obs['leiden']
sc.tl.leiden(adata_hvg, resolution=0.5)

n_clusters = adata_hvg.obs['leiden'].nunique()
print("Clustering completed!")
print(f"Number of clusters: {n_clusters}")


In [None]:
# Side-by-side UMAPs: cluster labels (left) and total counts (right).
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

umap_panels = [
    # Cluster labels are drawn directly on the embedding
    dict(color='leiden', legend_loc='on data', title='UMAP - Leiden Clusters'),
    # Total counts per spot
    dict(color='total_counts', title='UMAP - Total Counts'),
]
for panel_ax, panel_kwargs in zip(axes, umap_panels):
    sc.pl.umap(adata_hvg, ax=panel_ax, show=False, **panel_kwargs)

plt.tight_layout()
plt.show()


## 5. Spatial Analysis


In [None]:
# Spatial visualization and a distance-vs-expression-similarity check.
# Runs only when coordinates were attached to obsm['spatial'].
if 'spatial' in adata_hvg.obsm:
    print("Spatial coordinates available - creating spatial plots...")

    fig, axes = plt.subplots(1, 2, figsize=(15, 6))

    # Spatial plot colored by cluster
    sc.pl.spatial(adata_hvg, color='leiden', spot_size=50,
                  title='Spatial - Leiden Clusters', ax=axes[0], show=False)

    # Spatial plot colored by total counts
    sc.pl.spatial(adata_hvg, color='total_counts', spot_size=50,
                  title='Spatial - Total Counts', ax=axes[1], show=False)

    plt.tight_layout()
    plt.show()

    # Calculate spatial autocorrelation
    print("Calculating spatial autocorrelation...")
    try:
        from scipy.spatial.distance import pdist

        # Pairwise quantities grow quadratically with the number of spots, so
        # cap the number of spots with a seeded (reproducible) subsample
        # instead of building dense n x n matrices for the whole dataset.
        max_spots = 2000
        n_spots = adata_hvg.n_obs
        if n_spots > max_spots:
            rng = np.random.default_rng(0)
            spot_idx = np.sort(rng.choice(n_spots, size=max_spots, replace=False))
            print(f"Subsampling {max_spots} of {n_spots} spots for the pairwise comparison")
        else:
            spot_idx = np.arange(n_spots)

        # Condensed pairwise distances: one value per unordered pair (i < j)
        coords = np.asarray(adata_hvg.obsm['spatial'])[spot_idx]
        pair_distances = pdist(coords)

        # Expression profile over the first 50 HVGs in matrix order
        # (var_names order, not ranked by variability).
        hvg_genes = adata_hvg.var_names[:50]
        expr_subset = adata_hvg[spot_idx, hvg_genes].X
        if sparse.issparse(expr_subset):
            expr_subset = expr_subset.toarray()

        # Spot-by-spot correlation; keep only the strict upper triangle so the
        # values line up with pdist's pair order and self-pairs / mirrored
        # duplicates are not plotted.
        expr_corr = np.corrcoef(expr_subset)
        pair_corr = expr_corr[np.triu_indices(len(spot_idx), k=1)]

        # Plot spatial vs expression correlation
        plt.figure(figsize=(8, 6))
        plt.scatter(pair_distances, pair_corr, alpha=0.1, s=1)
        plt.xlabel('Spatial Distance')
        plt.ylabel('Expression Correlation')
        plt.title('Spatial vs Expression Correlation')
        plt.show()

    except Exception as e:
        # Best-effort diagnostic: report the failure and keep the notebook running.
        print(f"Spatial autocorrelation analysis failed: {e}")

else:
    print("No spatial coordinates available for spatial analysis")


## 6. Differential Expression Analysis


In [None]:
# Rank genes per Leiden cluster with a Wilcoxon test and list the top hits.
print("Finding marker genes for each cluster...")
sc.tl.rank_genes_groups(adata_hvg, 'leiden', method='wilcoxon')

# Overview plot: five highest-ranked genes per cluster, independent y-axes
sc.pl.rank_genes_groups(adata_hvg, n_genes=5, sharey=False)
plt.show()

# The results are record arrays with one field per cluster
result = adata_hvg.uns['rank_genes_groups']
groups = result['names'].dtype.names

print(f"\nTop 3 marker genes for each cluster:")
for group in groups:
    cluster_genes = result['names'][group]
    cluster_scores = result['scores'][group]
    print(f"\nCluster {group}:")
    for rank in range(3):
        gene, score = cluster_genes[rank], cluster_scores[rank]
        print(f"  {gene}: score = {score:.3f}")


## 7. Gene Expression Visualization


In [None]:
# Collect the three best-ranked genes from each of the first five clusters,
# then drop repeats while keeping first-seen order.
candidate_markers = [
    gene
    for group in groups[:5]
    for gene in result['names'][group][:3]
]
top_markers = list(dict.fromkeys(candidate_markers))

print(f"Plotting expression of {len(top_markers)} top marker genes...")

# At most six genes are shown, three panels per row
genes_to_plot = top_markers[:6]

# Expression on the UMAP embedding
sc.pl.umap(adata_hvg, color=genes_to_plot, ncols=3,
           colorbar_loc='right', show=False)
plt.tight_layout()
plt.show()

# Expression in tissue space, only when coordinates were attached
has_spatial = 'spatial' in adata_hvg.obsm
if has_spatial:
    sc.pl.spatial(adata_hvg, color=genes_to_plot, ncols=3,
                  spot_size=50, colorbar_loc='right', show=False)
    plt.tight_layout()
    plt.show()


## 8. Summary and Export Results


In [None]:
# Summary statistics for the processed dataset
print("=== ANALYSIS SUMMARY ===")
print(f"Total spots analyzed: {adata_hvg.n_obs}")
print(f"Total genes analyzed: {adata_hvg.n_vars}")
print(f"Number of clusters: {adata_hvg.obs['leiden'].nunique()}")
print(f"Cluster sizes: {adata_hvg.obs['leiden'].value_counts().sort_index().to_dict()}")

if 'spatial' in adata_hvg.obsm:
    spatial_xy = adata_hvg.obsm['spatial']
    print("Spatial coordinates: Available")
    print(f"Spatial range X: {spatial_xy[:, 0].min():.2f} to {spatial_xy[:, 0].max():.2f}")
    print(f"Spatial range Y: {spatial_xy[:, 1].min():.2f} to {spatial_xy[:, 1].max():.2f}")
else:
    print("Spatial coordinates: Not available")

# var_names[:5] is just the first five genes in matrix order, not the most
# variable ones. Rank by the normalized dispersion that
# sc.pp.highly_variable_genes stores in var, when that column is present.
if 'dispersions_norm' in adata_hvg.var.columns:
    top_hvgs = adata_hvg.var['dispersions_norm'].nlargest(5).index.tolist()
    print(f"\nTop 5 highly variable genes: {top_hvgs}")
else:
    # No ranking statistic available: label the fallback honestly.
    print(f"\nFirst 5 highly variable genes (unranked): {adata_hvg.var_names[:5].tolist()}")


In [None]:
# Write the processed object, the marker table and the cluster table to disk.
print("Exporting results...")

# Processed AnnData object
adata_hvg.write('processed_yale_colon_tma.h5ad')
print("Processed data saved as 'processed_yale_colon_tma.h5ad'")


def _top_ten(field):
    """Flatten the ten best-ranked entries of one result field, cluster by cluster."""
    return [result[field][group][i] for group in groups for i in range(10)]


# Marker table: ten rows per cluster, in ranking order
marker_genes_df = pd.DataFrame({
    'cluster': [group for group in groups for _ in range(10)],
    'gene': _top_ten('names'),
    'score': _top_ten('scores'),
    'pval': _top_ten('pvals'),
    'pval_adj': _top_ten('pvals_adj'),
})

marker_genes_df.to_csv('yale_colon_tma_marker_genes.csv', index=False)
print("Marker genes saved as 'yale_colon_tma_marker_genes.csv'")

# Per-spot table: cluster label, QC metrics and (when present) coordinates
export_columns = ['leiden', 'total_counts', 'n_genes_by_counts']
cluster_assignments = adata_hvg.obs[export_columns].copy()
if 'spatial' in adata_hvg.obsm:
    spatial_xy = adata_hvg.obsm['spatial']
    cluster_assignments['x_coord'] = spatial_xy[:, 0]
    cluster_assignments['y_coord'] = spatial_xy[:, 1]

cluster_assignments.to_csv('yale_colon_tma_clusters.csv')
print("Cluster assignments saved as 'yale_colon_tma_clusters.csv'")

# Closing report of what was written
for closing_line in (
    "\nAnalysis completed successfully!",
    "\nGenerated files:",
    "- processed_yale_colon_tma.h5ad: Processed AnnData object",
    "- yale_colon_tma_marker_genes.csv: Marker genes for each cluster",
    "- yale_colon_tma_clusters.csv: Cluster assignments and metadata",
):
    print(closing_line)
