In [1]:
import pandas as pd
import pysam
import numpy as np
import scanpy as sc

In [2]:
# Read the stat table and keep only the cell ID, cell-cycle phase and metacell columns
stat_columns = ['cellid', 'Phase', 'metacell']
cell_list = pd.read_csv('data/single_cell/patski.dis.stat.txt', sep='\t')[stat_columns]

In [3]:
# Expose the 'metacell' column under the name 'pseudotime'
cell_list = cell_list.rename({'metacell': 'pseudotime'}, axis='columns')

In [4]:
# Open the BAM file for reading ('rb' = read a binary BAM)
bam_path = 'data/scATAC_RNA-seq_Patski/patski.allele.flt.M-003-002.bam'
bam = pysam.AlignmentFile(bam_path, 'rb')

In [5]:
# Walk the BAM once, collecting the cell barcode (CB), gene ID (GX) and gene name (GN)
# of every read that carries all three tags; reads missing any of them are skipped.
required_tags = ('CB', 'GX', 'GN')
cells, genes, gene_names = [], [], []

for read in bam:
    if all(read.has_tag(tag) for tag in required_tags):
        cells.append(read.get_tag('CB'))
        genes.append(read.get_tag('GX'))
        gene_names.append(read.get_tag('GN'))

[E::bgzf_uncompress] CRC32 checksum mismatch
[E::bgzf_read] Read block operation failed with error 33 after 0 of 4 bytes


OSError: truncated file

In [22]:
# Map each gene ID to a gene name; when an ID occurs more than once,
# the name seen first is the one that is kept
genes_dict = {}

for gene, gene_name in zip(genes, gene_names):
    genes_dict.setdefault(gene, gene_name)

In [29]:
# Tabulate the collected read data: one row per entry, pairing a cell barcode with a gene ID
cell_gene = pd.DataFrame(data={'cellid': cells, 'gene': genes})

In [41]:
# Drop rows whose cell barcode does not appear in the stat table (cell_list)
annotated_cells = set(cell_list['cellid'])
cell_gene = cell_gene.loc[cell_gene['cellid'].isin(annotated_cells)]

In [59]:
# Build a cell x gene count table: one row per cell, one column per gene, each entry
# the number of rows in cell_gene for that (cell, gene) pair, with 0 where there are none
cell_gene2 = (
    cell_gene
    .groupby('cellid')['gene']
    .value_counts(dropna=False)
    .unstack(fill_value=0)
)

In [60]:
# Turn the current index back into an ordinary column, leaving a fresh 0..n-1 row index
cell_gene2 = cell_gene2.reset_index(drop=False)

In [61]:
# Preview the first five rows of the count table
cell_gene2.head()

gene,cellid,ENSMUSG00000000001,ENSMUSG00000000028,ENSMUSG00000000031,ENSMUSG00000000037,ENSMUSG00000000049,ENSMUSG00000000056,ENSMUSG00000000058,ENSMUSG00000000078,ENSMUSG00000000085,...,ENSMUSG00000118406,ENSMUSG00000118436,ENSMUSG00000118445,ENSMUSG00000118449,ENSMUSG00000118454,ENSMUSG00000118505,ENSMUSG00000118506,ENSMUSG00000118519,ENSMUSG00000118537,ENSMUSG00000118550
0,SCG0088_AAACAGCCAACAGGTG-1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,SCG0088_AAACAGCCACCAACCG-1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,SCG0088_AAACAGCCATGATTGT-1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,SCG0088_AAACATGCACAACCTA-1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,SCG0088_AAACATGCAGTCTAGC-1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [65]:
# Give every distinct pseudotime value an integer code, numbered in the order the
# values first appear in cell_list, and keep lookup tables in both directions.
# NOTE(review): the codes follow row order, not sorted pseudotime order - confirm
# that this is the ordering intended for downstream sorting.
pseudotime_vals = list(cell_list['pseudotime'].unique())
pseudotime_proxy = list(range(len(pseudotime_vals)))
pseudotime_dict = dict(zip(pseudotime_vals, pseudotime_proxy))
pseudotime_dict_rev = dict(zip(pseudotime_proxy, pseudotime_vals))

# Attach the integer code to each cell
cell_list['pseudotime_proxy'] = cell_list['pseudotime'].map(pseudotime_dict)

In [67]:
# Restrict cell_list to the cells that also have a row in the count table
cells_with_counts = set(cell_gene2['cellid'])
cell_list = cell_list.loc[cell_list['cellid'].isin(cells_with_counts)]

In [69]:
# Combine the per-cell annotations with the per-cell gene counts, matching rows on
# the 'cellid' key. Matching by key (rather than concatenating side by side) matters
# because the two frames carry unrelated row labels: cell_list keeps the labels of the
# rows that survived filtering, while cell_gene2 has a fresh 0..n-1 index from
# reset_index(). Label-based alignment would pair annotations with the wrong cells'
# counts. join(on=...) is a left join that preserves cell_list's index and row order,
# and the result has a single 'cellid' column instead of two.
full_cell_data = cell_list.join(cell_gene2.set_index('cellid'), on='cellid')

In [76]:
# Export table: cell ID and integer pseudotime code followed by the gene counts.
# Rows are matched on 'cellid' rather than on row labels, because cell_list and
# cell_gene2 do not share an index (cell_gene2 was re-indexed 0..n-1 by reset_index()),
# so a side-by-side concat would attach counts to the wrong cells. join(on=...) keeps
# cell_list's index and row order and produces a single 'cellid' column.
out_data = cell_list[['cellid', 'pseudotime_proxy']].join(
    cell_gene2.set_index('cellid'), on='cellid'
)

In [77]:
# Write the export table to disk; the file is tab-separated despite its .csv suffix
out_path = 'data/CycleMeetsExpressions/cell_data_full.csv'
out_data.to_csv(out_path, sep='\t')

KeyboardInterrupt: 

In [75]:
# Display the column labels currently present in cell_list
cell_list.columns

Index(['cellid', 'Phase', 'pseudotime', 'pseudotime_proxy'], dtype='object')