In [None]:
import loompy
import glob
import velocyto as vcy
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [202]:
# loompy.combine takes the characteristics of the combined loom file from the
# first input file, so the order of the inputs matters. The CellID column holds
# the sample name plus the barcode; if the first file belongs to a sample with
# a short name, the cell IDs of 44BSAT-3000_cells and BAT8-3000_cells will not
# fit into the CellID column of the combined file.
# Sorting by file name (the last path component) puts 44BSAT-3000_cells first.
# loompy.combine can also be told how many characters a column may hold, but it
# has not been checked whether other columns contain the cell ID as well.
files = sorted(
    glob.glob('/data/sc-10x/data-runs/171120-scheele-adipose/*/velocyto/*.loom'),
    key=lambda path: path.split('/')[-1],
)
loompy.combine(
    files,
    '/projects/pytrik/sc_adipose/analyze_10x_fluidigm/data/velocyto/all10x.loom',
    key="Accession",
)

In [369]:
# Open the combined loom file as a VelocytoLoom object; all later cells work on `vlm`.
vlm = vcy.VelocytoLoom('/projects/pytrik/sc_adipose/analyze_10x_fluidigm/data/velocyto/all10x.loom')

In [370]:
# Distinct sample prefixes: the part of every CellID before the first ':'.
print({cell_id.split(':')[0] for cell_id in vlm.ca['CellID']})

{'Subq_1', 'Supra_3', 'Peri_3', 'Visce_2', 'Supra_1', 'Supra_2', 'Subq_2', 'Peri_2', 'Visce_1', 'Peri_1', 'Subq_3', 'Visce_3', 'BAT8-3000_cells', '44BSAT-3000_cells'}


In [371]:
def rename_cellid(x):
    """Return cell ID ``x`` with a legacy sample label replaced by its new name.

    'BAT8-3000_cells' becomes 'Supra_4' and '44BSAT-3000_cells' becomes
    'Subq_4'. Only the first label found (checked in that order) is replaced;
    IDs that contain neither label are returned unchanged.
    """
    # Ordered (legacy label, new label) pairs; the order fixes which one wins.
    legacy_to_new = (
        ('BAT8-3000_cells', 'Supra_4'),
        ('44BSAT-3000_cells', 'Subq_4'),
    )
    for legacy, new in legacy_to_new:
        if legacy in x:
            return x.replace(legacy, new)
    return x

In [372]:
# Rename the two legacy samples inside the cell IDs, then store each cell's
# sample name (the text before the first ':' of its renamed ID).
vlm.ca['CellID'] = np.asarray([rename_cellid(cell_id) for cell_id in vlm.ca['CellID']])
vlm.ca['sample_name'] = np.asarray([cell_id.split(':')[0] for cell_id in vlm.ca['CellID']])
#vlm.set_clusters(vlm.ca['sample_name'])

In [373]:
# Show the distinct values stored in the 'sample_name' column attribute.
print(set(vlm.ca['sample_name']))

{'Supra_3', 'Peri_3', 'Visce_2', 'Supra_1', 'Supra_2', 'Subq_4', 'Subq_2', 'Peri_2', 'Visce_1', 'Peri_1', 'Subq_3', 'Visce_3', 'Subq_1', 'Supra_4'}


In [374]:
# Read the tab-separated metadata table; its row index is used as the
# original cell ID below.
metadata = pd.read_table('/projects/pytrik/sc_adipose/analyze_10x_fluidigm/data/metadata-downsampled-36.txt', sep='\t')

metadata['CellID-old'] = metadata.index
# Build IDs of the form "<sample_name>:<old ID up to the first '-'>x" so they
# can be compared with vlm.ca['CellID'].
# The columns are accessed by name: indexing a string-labelled row with x[0] /
# x[1] relies on positional fallback, which pandas has deprecated.
metadata['CellID'] = [
    sample + ':' + old_id.split('-')[0] + 'x'
    for old_id, sample in zip(metadata['CellID-old'], metadata['sample_name'])
]

cells_to_keep = np.array(metadata['CellID'].tolist())
cellids_vlm = np.array(vlm.ca['CellID'])

# Keep only the cells whose ID also occurs in the metadata table.
vlm.filter_cells(bool_array=np.isin(cellids_vlm, cells_to_keep))

In [375]:
# Put the column attributes of `vlm` into a DataFrame (one column per attribute).
ca_df = pd.DataFrame(vlm.ca)

In [376]:
# (rows, columns) of the attribute table.
ca_df.shape

(13904, 5)

In [377]:
# Join the metadata onto the column attributes by CellID.
# A left join keeps the rows in ca_df order and drops metadata rows that have
# no matching cell. The merged columns are later written back into vlm.ca, so
# they must line up with the cells of `vlm`; a right join would follow the
# order of `metadata` instead.
merged = ca_df.merge(metadata, on='CellID', how='left')

In [378]:
# Sanity check: shape of vlm.S on the first line, number of cell IDs on the second.
print(vlm.S.shape, len(vlm.ca['CellID']), sep='\n')

(32738, 13904)
13904


In [379]:
# Convert the merged table to a dict that maps each column name to a list of
# its values.
# NOTE: this rebinds `merged` from a DataFrame to a dict, so re-running this
# cell on its own fails; re-run the merge cell first.
merged = merged.to_dict('list')

In [380]:
# List the column names available in the merged dict.
print(merged.keys())

dict_keys(['CellID', 'Clusters', '_X', '_Y', 'sample_name_x', 'nGene', 'nUMI', 'orig.ident', 'sample_name2', 'sample_name_y', 'diff', 'ucp1.ctrl', 'ucp1.ne', 'bmi', 'age', 'percent.ribo', 'percent.mito', 'res.0.5', 'res.0.6', 'res.0.7', 'res.0.8', 'res.0.9', 'res.1', 'S.Score', 'G2M.Score', 'Phase', 'old.ident', 'CellID-old'])


In [381]:
# Store the text attributes as UTF-8 byte strings.
# NOTE(review): presumably required by the HDF5 export - confirm.
vlm.ca['CellID'] = np.asarray([x.encode('utf8') for x in vlm.ca['CellID']])
vlm.ca['sample_name'] = np.asarray([x.encode('utf8') for x in vlm.ca['sample_name']])

# Copy every merged column into vlm.ca (this includes 'CellID' again, taken
# from the merged table).
for key, values in merged.items():
    try:
        vlm.ca[key] = np.asarray([x.encode('utf8') for x in values])
    except AttributeError:
        # At least one value is not a string (it has no .encode), e.g. a
        # numeric column: store the values unchanged. Only this case is
        # caught, so unrelated errors and Ctrl-C are no longer swallowed.
        vlm.ca[key] = np.asarray(values)

In [382]:
# Write the filtered and annotated `vlm` object to an HDF5 file.
vlm.to_hdf5('/projects/pytrik/sc_adipose/analyze_10x_fluidigm/data/velocyto/all10x-downsampled-36.hdf5')