In [1]:
import pandas as pd
import pysam
import pickle

In [2]:
# Read the per-cell stat table (tab-separated) and keep only the cell id,
# the cell-cycle phase and the metacell (pseudo-time) columns, in that order.
cell_list = pd.read_csv(
    'data/single_cell/patski.dis.stat.txt',
    sep='\t',
).loc[:, ['cellid', 'Phase', 'metacell']]

In [3]:
# The 'metacell' column is used as pseudo-time from here on, so name it accordingly
cell_list = cell_list.rename({'metacell': 'pseudotime'}, axis='columns')

In [4]:
# Open the BAM file for reading ('rb' = read, binary BAM).
# The handle is consumed and closed by the tag-extraction cell below.
bam = pysam.AlignmentFile(
    'data/scATAC_RNA-seq_Patski/patski.allele.flt.M-003-002.bam',
    'rb',
)

In [5]:
# Extract, for every read that carries all three tags, the cell barcode (CB),
# the gene ID (GX) and the gene name (GN). The three lists are index-aligned:
# position i in each list comes from the same read.
cells = []
genes = []
gene_names = []

# Iterating a damaged BAM can raise part-way through (e.g. "OSError: truncated
# file"); the try/finally guarantees the file handle is released in that case
# too. The exception still propagates, and the lists then only hold the reads
# seen before the failure.
try:
    for read in bam:
        # Reads missing any of the tags cannot be attributed to a cell/gene pair
        if not (read.has_tag('CB') and read.has_tag('GX') and read.has_tag('GN')):
            continue

        cells.append(read.get_tag('CB'))
        genes.append(read.get_tag('GX'))
        gene_names.append(read.get_tag('GN'))
finally:
    bam.close()

[E::bgzf_uncompress] CRC32 checksum mismatch
[E::bgzf_read] Read block operation failed with error 33 after 0 of 4 bytes


OSError: truncated file

In [6]:
# Map each gene ID to a gene name; when an ID occurs on several reads,
# the name from its first occurrence is the one that is kept.
genes_dict = {}

for gene_id, name in zip(genes, gene_names):
    genes_dict.setdefault(gene_id, name)

In [7]:
# One row per retained read: the cell barcode and the gene ID it was tagged with
cell_gene = pd.DataFrame(dict(cellid=cells, gene=genes))

In [8]:
# Keep only reads from cells which have cell-cycle information in the stat data
cell_gene = cell_gene[cell_gene['cellid'].isin(set(cell_list['cellid']))]

# Count how many reads each gene has per cell, then pivot to a wide table:
# one row per cell, one column per gene, 0 where a gene was not seen.
cell_gene2 = cell_gene.groupby('cellid')['gene'].value_counts(dropna=False).unstack(fill_value=0)
cell_gene2 = cell_gene2.reset_index()

# Normalise each cell's counts by that cell's total read count, scaled to 1e4.
# After reset_index the first column is 'cellid'; every other column is a gene.
genelist = cell_gene2.columns[1:]
cell_gene2['sum'] = cell_gene2[genelist].sum(axis=1)

# Fail loudly instead of silently leaving raw counts in place: the minimum is
# NaN when no cells survived the filter, and <= 0 would mean a division by zero.
if not (cell_gene2['sum'].min() > 0):
    raise ValueError(
        'Cannot normalise gene counts: no cells left after filtering, '
        'or a cell has a total read count of 0 (division by zero)'
    )

# [:, None] turns the per-cell totals into a column so each row is divided by its own total
cell_gene2[genelist] = (cell_gene2[genelist].values / cell_gene2['sum'].values[:, None]) * 1e4

In [9]:
# Replace every distinct pseudotime value with an integer code so it is easy to
# sort on. Codes are handed out in order of first appearance in cell_list
# (0, 1, 2, ...), not by sorting the pseudotime values themselves.
pseudotime_vals = list(cell_list['pseudotime'].unique())
pseudotime_proxy = list(range(len(pseudotime_vals)))

# Forward lookup (value -> code) and its inverse (code -> value)
pseudotime_dict = dict(zip(pseudotime_vals, pseudotime_proxy))
pseudotime_dict_rev = dict(zip(pseudotime_proxy, pseudotime_vals))

NameError: name 'wb' is not defined

In [11]:
# Persist the code -> pseudotime lookup so later analyses can decode the
# integer proxy back into the original pseudotime values.
with open('data/CycleMeetsExpressions/pseudotime_dict.pickle', 'wb') as pickle_out:
    pickle.dump(pseudotime_dict_rev, pickle_out)

In [None]:
# Restrict the stat table to cells that also appear in the expression table
has_expression = cell_list['cellid'].isin(set(cell_gene2['cellid']))
cell_list = cell_list.loc[has_expression]

In [29]:
# Add the integer pseudotime code as a new column, looked up via pseudotime_dict
cell_list = cell_list.assign(
    pseudotime_proxy=cell_list['pseudotime'].map(pseudotime_dict)
)

In [32]:
# Merge the two tables into a single dataframe holding cell-cycle data and gene
# data for every cell in cell_list.
# A left merge keeps the rows in cell_list order. The result carries a fresh
# 0..n-1 index, so it must NOT be reindexed with cell_list.index: after
# cell_list has been filtered its labels are no longer 0..n-1, and reindexing
# by them would drop rows and insert all-NaN rows.
full_cell_data = pd.merge(cell_list, cell_gene2, on='cellid', how='left')

In [33]:
# Export table: cell id, integer pseudotime code and the per-gene columns.
# A left merge keeps the rows in cell_list order. The merged frame has a fresh
# 0..n-1 index, so reindexing it with cell_list.index (whose labels are no
# longer contiguous once cell_list has been filtered) would drop rows and
# insert all-NaN rows; no reindex is applied.
out_data = pd.merge(cell_list[['cellid', 'pseudotime_proxy']], cell_gene2, on='cellid', how='left')

In [34]:
# Write the merged table to parquet; the row index is not stored in the file
out_data.to_parquet(
    'data/CycleMeetsExpressions/cell_data_full.parquet',
    index=False,
)