In [None]:
import anndata as ad
import pandas as pd

path = r'C:\Users\panag\OneDrive\Documents\coding\Projects\Liliana\data\GSE192456_SC_GEO_raw_counts.csv.gz'

# Peek at the first few lines of the gzip-compressed CSV to see how it is
# laid out before handing it to the real parser.
import gzip
with gzip.open(path, 'rt') as handle:
    # zip() against range(7) stops after seven lines (or earlier on a short file)
    for _, raw_line in zip(range(7), handle):
        print(raw_line.strip())

# Parse the whole file into an AnnData object; the first column supplies the row names.
adata = ad.read_csv(path, first_column_names=True)
# Alternative with an explicit delimiter, should the default not fit:
# adata = ad.read_csv(path, delimiter=',', first_column_names=True)

print(adata)

In [None]:
import h5py

# Inspect the top-level HDF5 groups of the .h5ad file.
# unfortunately this data file is totally useless
h5ad_path = r'C:\Users\panag\OneDrive\Documents\coding\Projects\Liliana\data\output_anndata.h5ad'

# Open explicitly read-only and inside a context manager so the handle is
# always released; the key names are copied into a list because the keys view
# is tied to the open file. The result is deliberately not bound to `ad`, so
# the `anndata as ad` alias from the first cell stays intact here.
with h5py.File(h5ad_path, 'r') as h5_file:
    top_level_keys = list(h5_file.keys())
top_level_keys

In [None]:
import scanpy as sc

# Load the .h5ad file with scanpy and display its summary.
# NOTE: binding the result to `ad` replaces the `anndata as ad` module alias
# imported in the first cell; the following cells rely on `ad` being this object.
h5ad_path = r'C:\Users\panag\OneDrive\Documents\coding\Projects\Liliana\data\output_anndata.h5ad'
ad = sc.read_h5ad(h5ad_path)
ad


In [None]:
# Prefer the counts stored under `.raw`; when there are none, fall back to a
# copy of the object itself (i.e. whatever is in X).
if ad.raw is None:
    print("No raw counts found. Using the X matrix instead.")
    raw_ad = ad.copy()
else:
    raw_ad = ad.raw.to_adata()
raw_ad

In [None]:
# Print the available metadata column names: first `obs` (cell metadata),
# then `var` (gene metadata).
for metadata_frame in (raw_ad.obs, raw_ad.var):
    print(metadata_frame.columns)

In [None]:
import pandas as pd
import os
import tempfile
import gzip
import scipy
import scipy.sparse as sp
import zipfile
import scanpy as sc
import logging

# Set up logging: emit records at INFO level and above, each line prefixed
# with a timestamp and the record's level name.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def h5ad_to_10x(ad, 
                gene_id_key="gene_id", 
                gene_name_key="gene_name", 
                cell_type_key="cell_type", 
                output_path="matrix.zip", 
                barcode_key=None, 
                subsample_rate=None):
    """
    Convert an AnnData object to 10x Genomics format and package the results into a zip file.

    The archive contains ``matrix.mtx.gz`` (genes x cells), ``features.tsv.gz``,
    ``barcodes.tsv.gz`` and ``celltypes.csv``. The barcodes written to
    ``celltypes.csv`` are the same values written to ``barcodes.tsv.gz``.

    Parameters:
      ad (AnnData): The input AnnData object. It is not modified.
      gene_id_key (str): Column name in ad.var (or the name of its index) for gene IDs.
      gene_name_key (str): Column name in ad.var (or the name of its index) for gene names.
      cell_type_key (str): Column name in ad.obs for cell type annotations; when
        absent, every cell is annotated as "Unknown".
      output_path (str): Path for the output zip file.
      barcode_key (str): Optional key in ad.obs for barcodes; if None (or not
        present in ad.obs), uses ad.obs.index.
      subsample_rate (float): Optional fraction (0-1) for subsampling cells.

    Returns:
      None. The result is the zip file written to ``output_path``.

    Raises:
      KeyError: If ``gene_id_key`` or ``gene_name_key`` cannot be found in ad.var.
    """
    # Import the submodule explicitly: a bare `import scipy` does not guarantee
    # that `scipy.io` is available as an attribute on every SciPy version.
    from scipy.io import mmwrite

    if subsample_rate:
        # copy=True returns a subsampled copy instead of shrinking the caller's
        # object in place.
        ad = sc.pp.subsample(ad, fraction=subsample_rate, copy=True)

    # Extract gene information (reset_index exposes the var index as a column too)
    var_table = ad.var.reset_index()
    missing_keys = [key for key in (gene_id_key, gene_name_key) if key not in var_table.columns]
    if missing_keys:
        raise KeyError(
            f"Gene column(s) {missing_keys} not found in ad.var; "
            f"available columns: {list(var_table.columns)}"
        )
    genes = var_table[[gene_id_key, gene_name_key]].copy()
    genes.columns = ["gene_id", "gene_name"]  # Rename columns explicitly

    # Add feature type
    if "feature_types" in ad.var.columns:
        genes["feature_type"] = ad.var["feature_types"].values
    else:
        genes["feature_type"] = "Gene Expression"

    # Extract barcode information once, so every output file uses the same values
    if barcode_key and barcode_key in ad.obs.columns:
        barcode_values = ad.obs[barcode_key].to_numpy()
    else:
        if barcode_key:
            logging.warning(
                f"barcode_key '{barcode_key}' not found in ad.obs; using ad.obs.index instead."
            )
        barcode_values = ad.obs.index.to_numpy()
    barcodes = pd.DataFrame({"barcode": barcode_values})

    # Extract cell type annotations, keyed by the same barcodes as barcodes.tsv
    if cell_type_key in ad.obs.columns:
        annotations = ad.obs[cell_type_key].to_numpy()
    else:
        annotations = "Unknown"
    celltypes = pd.DataFrame({"barcode": barcode_values, "annotation": annotations})

    with tempfile.TemporaryDirectory() as tmp_dir:
        # Write matrix in compressed MTX format (transposed to genes x cells)
        matrix_file = os.path.join(tmp_dir, "matrix.mtx.gz")
        with gzip.open(matrix_file, "wb") as handle:
            mmwrite(handle, sp.csc_matrix(ad.X.T))

        # Write features file (genes)
        features_file = os.path.join(tmp_dir, "features.tsv.gz")
        genes.to_csv(features_file, sep="\t", index=False, header=False, compression="gzip")

        # Write barcodes file
        barcodes_file = os.path.join(tmp_dir, "barcodes.tsv.gz")
        barcodes.to_csv(barcodes_file, sep="\t", index=False, header=False, compression="gzip")

        # Write cell types file
        celltypes_file = os.path.join(tmp_dir, "celltypes.csv")
        celltypes.to_csv(celltypes_file, index=False)

        # Log a summary of what was written; tolerate empty files instead of crashing
        with gzip.open(features_file, "rt") as f:
            lines = [line.strip() for line in f]
        if lines:
            logging.info(f"Features.tsv first line: {lines[0]}")
        else:
            logging.warning("Features.tsv is empty.")
        logging.info(f"Features.tsv has {len(lines)} rows (should match matrix.mtx rows).")

        with gzip.open(matrix_file, "rt") as f:
            # First non-comment line of a Matrix Market file is the size line
            header = next((line for line in f if not line.startswith("%")), None)
        if header is not None:
            logging.info(f"Matrix header: {header.strip()}")

        # Package all files into a zip archive (must happen before tmp_dir is removed)
        with zipfile.ZipFile(output_path, "w") as zip_handle:
            for file_name in ["matrix.mtx.gz", "features.tsv.gz", "barcodes.tsv.gz", "celltypes.csv"]:
                zip_handle.write(os.path.join(tmp_dir, file_name), arcname=file_name)

    logging.info(f"10x formatted files zipped successfully to {output_path}.")

if __name__ == "__main__":
    # Example usage: export the raw-count AnnData prepared in the earlier cells
    import scanpy as sc

    export_options = dict(
        gene_id_key="gene_id",
        gene_name_key="gene",  # Ensure this is distinct from gene_id_key
        cell_type_key="cell_type",
        output_path=r"C:\Users\panag\OneDrive\Documents\coding\Projects\Liliana\matrix.zip",
    )

    # Convert to 10x format
    h5ad_to_10x(raw_ad, **export_options)

In [None]:
import zipfile

def list_zip_contents(zip_path):
    """Print the name of every member stored in the zip archive at ``zip_path``.

    Nothing is returned or raised: an invalid archive, a missing file, or any
    other failure is reported as a printed message instead.
    """
    try:
        with zipfile.ZipFile(zip_path, 'r') as archive:
            print(f"Contents of {zip_path}:")
            entries = [f"  - {member}" for member in archive.namelist()]
            for entry in entries:
                print(entry)
    except zipfile.BadZipFile:
        # The path exists but does not hold a readable zip archive
        print(f"Error: {zip_path} is not a valid zip file.")
    except FileNotFoundError:
        print(f"Error: File not found - {zip_path}")
    except Exception as err:
        # Best-effort helper: report anything else rather than propagate it
        print(f"An unexpected error occurred: {err}")

# Example usage: show which files ended up in the exported archive
zip_path = r"C:\Users\panag\OneDrive\Documents\coding\Projects\Liliana\matrix.zip"
list_zip_contents(zip_path=zip_path)

In [1]:
import tempfile
import scanpy as sc
import zipfile


# Unpack the archive into a throwaway directory and load it as a 10x matrix
# while that directory still exists.
# NOTE(review): this reads ...\Projects\sc2Xenium\matrix.zip, whereas the export
# cell writes ...\Projects\Liliana\matrix.zip - confirm this is the intended archive.
archive_path = r"C:\Users\panag\OneDrive\Documents\coding\Projects\sc2Xenium\matrix.zip"
with tempfile.TemporaryDirectory() as extract_dir:
    with zipfile.ZipFile(archive_path, "r") as archive:
        archive.extractall(path=extract_dir)
    mtx = sc.read_10x_mtx(extract_dir)



In [2]:
# Preview the first 10 rows of the loaded matrix as a DataFrame
# (in the recorded output, rows are cell barcodes and columns are gene names).
mtx.to_df().head(10)

Unnamed: 0,MIR1302-2HG,FAM138A,OR4F5,AL627309.1,AL627309.3,AL627309.2,AL627309.4,AL732372.1,OR4F29,AC114498.1,...,AC007325.2,BX072566.1,AL354822.1,AC023491.2,AC004556.1,AC233755.2,AC233755.1,AC240274.1,AC213203.1,FAM231C
AAACCCAGTTGTGGCC-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACCCATCCATCGTC-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACGAAAGTTGTCAC-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACGCTCAATTGTGC-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACGCTTCGTCTCAC-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAAGAACAGGGCAACT-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
AAAGAACCAATGCTCA-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAAGGATAGGCATTTC-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAAGGATAGTACAGCG-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAAGGGCAGTGTTCAC-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Check the element dtype of the loaded matrix (the recorded output is float32).
mtx.X.dtype

dtype('float32')