In [1]:
import urllib.request
import pandas as pd
import subprocess
import numpy as np
import h5py

from scipy.sparse import csc_matrix


data_root="/home/mcn26/palmer_scratch/tabula_data"
output_root="/home/mcn26/palmer_scratch/tabula_data/formatted"

In [2]:
name="GSE269037_RAW"
seelig_data_path=f"{data_root}/{name}.tar"

urllib.request.urlretrieve("https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE269037&format=file",
                           seelig_data_path)

('/home/mcn26/palmer_scratch/tabula_data/GSE269037_RAW.tar',
 <http.client.HTTPMessage at 0x1471112dc7d0>)

In [3]:
subprocess.run(['tar', '-xf', seelig_data_path, '-C', f"{data_root}"], check=True)

CompletedProcess(args=['tar', '-xf', '/home/mcn26/palmer_scratch/tabula_data/GSE269037_RAW.tar', '-C', '/home/mcn26/palmer_scratch/tabula_data'], returncode=0)

Two files extracted. What are they?

In [4]:
nontx_handle=h5py.File(f"{data_root}/GSM8305416_R1-scMPRA.h5",'r')
tx_handle=h5py.File(f"{data_root}/GSM8305417_R1-scMPRA_tx.h5")
files=(nontx_handle,tx_handle)

In [5]:
for i in files:
    print(i.keys())

<KeysViewHDF5 ['unknown']>
<KeysViewHDF5 ['unknown']>


Both just have one key, "unknown". What's inside?

In [6]:
for i in files:
    print(list(i[list(i.keys())[0]]))

['barcodes', 'data', 'gene_names', 'genes', 'indices', 'indptr', 'shape']
['barcodes', 'data', 'gene_names', 'genes', 'indices', 'indptr', 'shape']


I don't have the code (still a preprint) so hard to say what exactly this format is, but.
Prelim guess : tx is transcripts (GEX), and the non-tx is . both are sparse matricies. 

In [7]:
#avoiding error accessing two files at once with this ugly-as-sin code
print("nontx")
for name in list(files[0][list(files[0].keys())[0]]):
    print(f"length of {name} : {len(files[0]['unknown'][name][:])}")
print("tx")
for name in list(files[1][list(files[1].keys())[0]]):
    print(f"length of {name} : {len(files[1]['unknown'][name][:])}")

nontx
length of barcodes : 10640
length of data : 250078
length of gene_names : 1345
length of genes : 1345
length of indices : 250078
length of indptr : 10641
length of shape : 2
tx
length of barcodes : 10640
length of data : 48979197
length of gene_names : 62700
length of genes : 62700
length of indices : 48979197
length of indptr : 10641
length of shape : 2


In [8]:
print("nontx")
print(list(files[0]['unknown']['shape']))
print("tx")
print(list(files[1]['unknown']['shape']))

nontx
[np.int32(1345), np.int32(10640)]
tx
[np.int32(62700), np.int32(10640)]


Ok, so based on these dimensions and list-lengths, this looks like CSC format. ([wikipedia has a good breakdown](https://en.wikipedia.org/wiki/Sparse_matrix)). Let's try and decode it under that assumption. 

In [12]:
def decode(handle):
    unknown_group = handle['unknown']
    
    # Load the sparse matrix components
    data = unknown_group['data'][:]
    indices = unknown_group['indices'][:]
    indptr = unknown_group['indptr'][:]
    shape = unknown_group['shape'][:]
    
    # Load barcodes and gene names
    barcodes = unknown_group['barcodes'][:].astype(str)
    genes = unknown_group['genes'][:].astype(str)
    genes = [gene.decode('utf-8') for gene in unknown_group['genes'][:]]
    gene_names = unknown_group['gene_names'][:].astype(str)

    sparse_matrix = csc_matrix((data, indices, indptr), shape=shape)

    # Determine which has fewer unique values for the outer index
    multi_index=None
    if len(set(genes)) < len(set(gene_names)):
        multi_index = pd.MultiIndex.from_tuples(list(zip(genes, gene_names)), names=['Gene', 'Gene Name'])
    else:
        multi_index = pd.MultiIndex.from_tuples(list(zip(gene_names, genes)), names=['Gene Name', 'Gene'])

    df= pd.DataFrame.sparse.from_spmatrix(sparse_matrix, index=multi_index, columns=barcodes)
    df.columns.name = 'Cell Barcode'
    return df

    
MPRA=decode(files[0])
GEX=decode(files[1])

In [14]:
MPRA

Unnamed: 0_level_0,Cell Barcode,A9_A2_A2,A6_A2_A2,A2_B1_A2,A5_B2_A2,A1_B2_A2,A7_B3_A2,A4_A1_A2,A5_A1_A2,A12_B4_A2,A3_F10_A2,...,A3_F2_F8,A5_F4_F8,A9_F4_F8,A6_F5_F8,A4_F6_F8,A7_F6_F8,A12_F7_F8,A5_F7_F8,A5_F8_F8,A6_F8_F8
Gene Name,Gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
AAAATATCTCTGTAGGCAGATGCTTACAGCTGCTGCCGCAGACATATTTGGAGGTATCTGCCAAGCCCAGTCTCTCTGCCGCAGACAATTCCTGTAACCACACACTTCCTCTGCCAAGAGGGTGGAACCAAGGTCATACTCCCTC,AAAATATCTCTGTAGGCAGATGCTTACAGCTGCTGCCGCAGACATATTTGGAGGTATCTGCCAAGCCCAGTCTCTCTGCCGCAGACAATTCCTGTAACCACACACTTCCTCTGCCAAGAGGGTGGAACCAAGGTCATACTCCCTC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAAATTAAACACTCGGGACTTGTCCGGGCATGCTGGCTGACTTGGCTGAACTTTGGCTGTTGGTATGGTGACGTGACATAGCTTTGCAACGTACTGTCTGTAACCTTGGACTTTGCACAAACGTACAAAGCATGCCGGAGGGGAA,AAAATTAAACACTCGGGACTTGTCCGGGCATGCTGGCTGACTTGGCTGAACTTTGGCTGTTGGTATGGTGACGTGACATAGCTTTGCAACGTACTGTCTGTAACCTTGGACTTTGCACAAACGTACAAAGCATGCCGGAGGGGAA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2.0,0,0,0
AAAATTAGCCGGGCGTGGTAGCAGGCGCCTGTAGTCCCAGCTACTCTGGAGGCTGAGGCAGGAAAATGGCGGGAACCCGAGAGGCGGAGCTTGCAGTGAGCCGATATCGCGCCACTGAACTCCAGCCCGGACAACAGAGCGAGAC,AAAATTAGCCGGGCGTGGTAGCAGGCGCCTGTAGTCCCAGCTACTCTGGAGGCTGAGGCAGGAAAATGGCGGGAACCCGAGAGGCGGAGCTTGCAGTGAGCCGATATCGCGCCACTGAACTCCAGCCCGGACAACAGAGCGAGAC,0,2.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAACAGGTCGGGGGTTAATCCATACACACGCTGGGGTTTTGCCCAGGCAGGCCGGAATGGTCAACCTTTGGTCTTTGTACCGTCATGTTGACCTCGTCTGGACGGTTGAACTTTGCCCGTGTGCATTGGTACACTCGGTATGTAC,AAACAGGTCGGGGGTTAATCCATACACACGCTGGGGTTTTGCCCAGGCAGGCCGGAATGGTCAACCTTTGGTCTTTGTACCGTCATGTTGACCTCGTCTGGACGGTTGAACTTTGCCCGTGTGCATTGGTACACTCGGTATGTAC,0,1.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAACATTCATGTCAGGGCATGTGGGCTTGTAACTTTGAACCCCTGCGCGATCAAACAAAGGTTGAGCGAAAATCCACCCGTGCAAACATGTCCGGGCATGCCTGCTGGCGAACGTGCGAACTCCGACCGGAGAGTTTGGCGCACG,AAACATTCATGTCAGGGCATGTGGGCTTGTAACTTTGAACCCCTGCGCGATCAAACAAAGGTTGAGCGAAAATCCACCCGTGCAAACATGTCCGGGCATGCCTGCTGGCGAACGTGCGAACTCCGACCGGAGAGTTTGGCGCACG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTTTCTTACGCGGGTCATCACTCGTATGAAATGACTCACGCGACTTCTGATGAGTAATCACGCAATCGGTTCCGCGAAGTAAGATCTGCTATCGGACGCGGCGCTTCCTTCTTATATTGGGAAATCGCTTCATTAGTATGAGGC,TTTTTCTTACGCGGGTCATCACTCGTATGAAATGACTCACGCGACTTCTGATGAGTAATCACGCAATCGGTTCCGCGAAGTAAGATCTGCTATCGGACGCGGCGCTTCCTTCTTATATTGGGAAATCGCTTCATTAGTATGAGGC,0,6.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TTTTTGTTGGCGCGCGCGCCTGAAGCGGGACTGCCAGGTGGCGCGCGCGCCTGAAGGCGATTATGGCGCGCGCGCCTGACACGGGGTATTGTCCTGGAGTTATGCGGCTACAGGAATGGGACGGCGCCACTGCGGGGTTTGCCGG,TTTTTGTTGGCGCGCGCGCCTGAAGCGGGACTGCCAGGTGGCGCGCGCGCCTGAAGGCGATTATGGCGCGCGCGCCTGACACGGGGTATTGTCCTGGAGTTATGCGGCTACAGGAATGGGACGGCGCCACTGCGGGGTTTGCCGG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TTTTTGTTTGACCCCTGTAATGTTTGTTCCCAGGGAACATGCCGGGGCACGTGACCTCTGTCCGGTAATGTTTGAACAAAGGTCACATGCCTGGGCATGTCCTTTGAACAAAGCGAACATTACATAAACGTTCACAGGTCACCTG,TTTTTGTTTGACCCCTGTAATGTTTGTTCCCAGGGAACATGCCGGGGCACGTGACCTCTGTCCGGTAATGTTTGAACAAAGGTCACATGCCTGGGCATGTCCTTTGAACAAAGCGAACATTACATAAACGTTCACAGGTCACCTG,0,1.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TTTTTGTTTGTACAAAGCTCTGTTTGACCCCTGCGGGCATGCCGGGGCACGTGACCTCTGTCCGGTAATGTTTGAACAAAGCGAACATTACATGAACCTCTGCTGCTCTCTCTGTTTGAACATTCAAAGCGAACATTACATAAAC,TTTTTGTTTGTACAAAGCTCTGTTTGACCCCTGCGGGCATGCCGGGGCACGTGACCTCTGTCCGGTAATGTTTGAACAAAGCGAACATTACATGAACCTCTGCTGCTCTCTCTGTTTGAACATTCAAAGCGAACATTACATAAAC,0,1.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


The various values of cell-barcode are things like "A9_A2_A2" which I am dead certain are **cell**-barcodes. These are clearly parse-biosciences well position combintorial-barcoding cell-barcodes.

It may be a good idea to insert some standard QC here? Though if these data are derived from the filtered matrix from the standard parse biosciences pipeline, might be OK. 

Gene name and gene seem redundant for MPRA data. Are they identical?

In [23]:
flatty=MPRA.reset_index()
all(flatty["Gene Name"]==flatty["Gene"])

True

Yes they are.