In [None]:
import loompy
import glob
import velocyto as vcy
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [202]:
files = glob.glob('/data/sc-10x/data-runs/171120-scheele-adipose/*/velocyto/*.loom')

#Loompy.combine looks at the first loom file and uses the 
#characteristcs of that one for the combined loom file, meaning the order of 
#the files matters. The CellID column contains the sample name plus bar code, 
#so if the first file in files is a sample with a short name, the cellID's of 
#44BSAT-3000_cells and BAT8-3000_cells won't fit in the new cellID column. 
#With the sorting step below 44BSAT-3000_cells will be the first file.
#You can also specify how many characters a column can contain with loompy.combine,
#but I don't know if there are more columns containing the cellID. 
files = sorted(files, key=lambda x: x.split('/')[-1])
loompy.combine(files, '/projects/pytrik/sc_adipose/analyze_10x_fluidigm/data/velocyto/all10x.loom', key="Accession")

In [301]:
vlm = vcy.VelocytoLoom('/projects/pytrik/sc_adipose/analyze_10x_fluidigm/data/velocyto/all10x.loom')

In [292]:
print(set(list(map(lambda x: x.split(':')[0], vlm.ca['CellID']))))

{'Subq_1', 'Supra_3', 'Peri_3', 'Visce_2', 'Supra_1', 'Supra_2', 'Subq_2', 'Peri_2', 'Visce_1', 'Peri_1', 'Subq_3', 'Visce_3', 'BAT8-3000_cells', '44BSAT-3000_cells'}


In [302]:
def rename_cellid(x):
    if ('BAT8-3000_cells' in x):
        return x.replace('BAT8-3000_cells', 'Supra_4')
    elif ('44BSAT-3000_cells' in x):
        return x.replace('44BSAT-3000_cells', 'Subq_4')
    else:
        return x

In [303]:
vlm.ca['CellID'] = list(map(rename_cellid, vlm.ca['CellID']))
vlm.ca['sample_name'] = list(map(lambda x: x.split(':')[0], vlm.ca['CellID']))
#vlm.set_clusters(vlm.ca['sample_name'])

In [304]:
print(set(vlm.ca['sample_name']))

{'Supra_3', 'Peri_3', 'Visce_2', 'Supra_1', 'Supra_2', 'Subq_4', 'Subq_2', 'Peri_2', 'Visce_1', 'Peri_1', 'Subq_3', 'Visce_3', 'Subq_1', 'Supra_4'}


In [305]:
metadata = pd.read_table('/projects/pytrik/sc_adipose/analyze_10x_fluidigm/data/metadata-downsampled-36.txt', sep='\t')
print(metadata.shape)
print(metadata.head(5))

(13904, 22)
                    nGene   nUMI     orig.ident sample_name2 sample_name diff  \
GCTCCTAGTCGACTAT-1   4274  26791  SeuratProject        Supra     Supra_4  80%   
GGGATGAAGACACGAC-1   4251  35598  SeuratProject        Supra     Supra_4  80%   
TAGGCATAGCCCTAAT-1   3945  23764  SeuratProject        Supra     Supra_4  80%   
CAGAGAGTCCCATTTA-1   5170  36709  SeuratProject        Supra     Supra_4  80%   
GTTCTCGAGACCTAGG-1   3985  19051  SeuratProject        Supra     Supra_4  80%   

                   ucp1.ctrl ucp1.ne bmi  age    ...      res.0.5  res.0.6  \
GCTCCTAGTCGACTAT-1   medium     high  26   43    ...            1        1   
GGGATGAAGACACGAC-1   medium     high  26   43    ...            1        1   
TAGGCATAGCCCTAAT-1   medium     high  26   43    ...            1        1   
CAGAGAGTCCCATTTA-1   medium     high  26   43    ...           13       14   
GTTCTCGAGACCTAGG-1   medium     high  26   43    ...            1        1   

                    res.0.7  res

In [306]:
ca_df = pd.DataFrame(vlm.ca)
print(ca_df.shape)
print(ca_df.head(5))

(57519, 5)
                     CellID  Clusters         _X         _Y sample_name
0  Subq_4:AAAGATGTCACGAAGGx         0  -5.738976   4.225660      Subq_4
1  Subq_4:AAATGCCCAGATCGGAx         5 -18.617771  -2.644823      Subq_4
2  Subq_4:AAATGCCTCGCCAAATx         2  -9.881272  -7.504128      Subq_4
3  Subq_4:AAACGGGGTAGCGCAAx         1   5.266000  20.643030      Subq_4
4  Subq_4:AAACCTGAGCGACGTAx         4  -5.705080  22.961088      Subq_4


In [307]:
metadata['CellID-old'] = metadata.index
metadata['CellID'] = metadata[['CellID-old', 'sample_name']].apply(lambda x: x[1] + ':' + x[0].split('-')[0] + 'x', axis=1)

In [310]:
cells_to_keep = np.array(metadata['CellID'].tolist())
cellids_vlm = np.array(vlm.ca['CellID'])
boolean_list_cells = np.isin(cellids_vlm, cells_to_keep)
vlm.filter_cells(boolean_list_cells)
##Don't use merge, filter VLM on cells from metadata, then add metadata. -
#merged = ca_df.merge(metadata, on='CellID', how='right')

IndexError: boolean index did not match indexed array along dimension 1; dimension is 13904 but corresponding boolean dimension is 57519

In [300]:
print(vlm.S.shape)

AttributeError: 'VelocytoLoom' object has no attribute 'cluster_labels'

In [191]:
merged = merged.to_dict('list')

In [192]:
print(merged.keys())

dict_keys(['CellID', 'Clusters', '_X', '_Y', 'sample_name_x', 'nGene', 'nUMI', 'orig.ident', 'sample_name2', 'sample_name_y', 'diff', 'ucp1.ctrl', 'ucp1.ne', 'bmi', 'age', 'percent.ribo', 'percent.mito', 'res.0.5', 'res.0.6', 'res.0.7', 'res.0.8', 'res.0.9', 'res.1', 'S.Score', 'G2M.Score', 'Phase', 'old.ident', 'CellID-old'])


In [193]:
#Apparently it's not possible to store unicode type in HDF5, so you have to manually encode everything

vlm.ca['CellID'] = [x.encode('utf8') for x in vlm.ca['CellID']]
vlm.ca['sample_name'] = [x.encode('utf8') for x in vlm.ca['sample_name']]

for key in merged:
    try:
        vlm.ca[key] = [x.encode('utf8') for x in merged[key]]
    except:
        vlm.ca[key] = merged[key] 

In [182]:
#print(vlm.ca.keys())
#print(type(vlm.ca['sample_name']))

dict_keys(['CellID', 'Clusters', '_X', '_Y', 'sample_name'])
<class 'list'>


In [194]:
vlm.to_hdf5('/projects/pytrik/sc_adipose/analyze_10x_fluidigm/data/velocyto/all10x-downsampled-36.hdf5')

In [183]:
#print(vlm.ca['Clusters'])
#vlm.ca['Clusters'] = [x.encode('utf8') for x in vlm.ca['Clusters']]
#vlm.set_clusters(vlm.ca["sample_name"])

[0 5 2 ..., 5 2 4]


In [196]:
#vlm = vcy.load_velocyto_hdf5('/projects/pytrik/sc_adipose/analyze_10x_fluidigm/data/velocyto/all10x-downsampled-36.hdf5')

In [246]:
print(cellid_vlm[1:10])

['Subq_4:AAATGCCCAGATCGGAx', 'Subq_4:AAATGCCTCGCCAAATx', 'Subq_4:AAACGGGGTAGCGCAAx', 'Subq_4:AAACCTGAGCGACGTAx', 'Subq_4:AAAGCAACAGGTGCCTx', 'Subq_4:AAAGATGAGATAGCATx', 'Subq_4:AAACCTGAGAAGCCCAx', 'Subq_4:AAACCTGGTCCGAGTCx', 'Subq_4:AAAGTAGCAGATAATGx']


[False, False, True, False, False, False, True, False, False]
