In [1]:
import scanpy as sc
import anndata as ad
import pandas as pd
import numpy as np
import os

print(sc.__version__)



1.10.3


---------------------------

# Here I want to look for duplicates

In [10]:
adata = sc.read_10x_h5('data files/GSM6186021_DeJager_MO_LOAD11_G_filtered_feature_bc_matrix.h5')

  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


In [11]:
adata

AnnData object with n_obs × n_vars = 1359 × 27984
    var: 'gene_ids', 'feature_types', 'genome'

In [3]:
adata.var_names[adata.var_names.duplicated()] # list of duplicates

Index(['LINC01115', 'STPG4', 'PDE11A', 'PRSS50', 'TXNRD3NB', 'ARL14EPL',
       'MATR3', 'RABGEF1', 'TMSB15B', 'KBTBD11-OT1', 'LINC00484', 'LINC01505',
       'EMG1', 'SPATA13', 'GOLGA8M', 'COG8', 'LINC01297', 'LINC01422', 'SCO2',
       'H2BFS'],
      dtype='object')

In [4]:
adata.var.loc[adata.var_names == 'STPG4']

Unnamed: 0,gene_ids,feature_types,genome
STPG4,ENSG00000239605,Gene Expression,refdata-cellranger-3.1.0-Homo.sapiens.GRCh38.91
STPG4,ENSG00000273269,Gene Expression,refdata-cellranger-3.1.0-Homo.sapiens.GRCh38.91


They have different ensembls

In [5]:
for gene in adata.var_names[adata.var_names.duplicated()]:
    length = len(adata.var.loc[adata.var_names == gene])
    print(length, gene)

2 LINC01115
2 STPG4
2 PDE11A
2 PRSS50
2 TXNRD3NB
2 ARL14EPL
2 MATR3
2 RABGEF1
2 TMSB15B
2 KBTBD11-OT1
2 LINC00484
2 LINC01505
2 EMG1
2 SPATA13
2 GOLGA8M
2 COG8
2 LINC01297
2 LINC01422
2 SCO2
2 H2BFS


**Making the genes unique (and saving not unique too):**

In [6]:
adata_copy = adata.copy()

# Before making unique
print((adata.var_names == adata_copy.var_names).all())  # This will print True

# Make var_names unique in one object
adata.var_names_make_unique()

# Check again
print((adata.var_names == adata_copy.var_names).all())  # This will print False


  utils.warn_names_duplicates("var")


True
False


In [7]:
adata.var_names[adata_copy.var_names.duplicated()] # How the genes were named before

Index(['LINC01115-1', 'STPG4-1', 'PDE11A-1', 'PRSS50-1', 'TXNRD3NB-1',
       'ARL14EPL-1', 'MATR3-1', 'RABGEF1-1', 'TMSB15B-1', 'KBTBD11-OT1-1',
       'LINC00484-1', 'LINC01505-1', 'EMG1-1', 'SPATA13-1', 'GOLGA8M-1',
       'COG8-1', 'LINC01297-1', 'LINC01422-1', 'SCO2-1', 'H2BFS-1'],
      dtype='object')

In [8]:
adata_copy.var_names[adata_copy.var_names.duplicated()] # How the genes are named after (with '-1')

Index(['LINC01115', 'STPG4', 'PDE11A', 'PRSS50', 'TXNRD3NB', 'ARL14EPL',
       'MATR3', 'RABGEF1', 'TMSB15B', 'KBTBD11-OT1', 'LINC00484', 'LINC01505',
       'EMG1', 'SPATA13', 'GOLGA8M', 'COG8', 'LINC01297', 'LINC01422', 'SCO2',
       'H2BFS'],
      dtype='object')

**Check if thses duplicates are being removed after filtering\
I want to do this before comparing the expression levels of two genes across all cells**

In [9]:
adata_copy_filtered = adata_copy.copy()

sc.pp.filter_cells(adata_copy_filtered, min_genes=100)
sc.pp.filter_genes(adata_copy_filtered, min_cells=3)

print(adata_copy_filtered.var_names[adata_copy_filtered.var_names.duplicated()])

Index(['EMG1', 'COG8'], dtype='object')


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


**After filtering by min_genes=100 and min_cells=3 there are only 2 genes left:** \
['EMG1', 'COG8']

**Compares the expression levels of two genes across all cells**

In [10]:
print(adata[:, 'COG8'].X == adata[:, 'COG8-1'].X)

<Compressed Sparse Row sparse matrix of dtype 'bool'
	with 1308 stored elements and shape (1359, 1)>
  Coords	Values
  (0, 0)	True
  (1, 0)	True
  (2, 0)	True
  (3, 0)	True
  (4, 0)	True
  (5, 0)	True
  (6, 0)	True
  (7, 0)	True
  (8, 0)	True
  (9, 0)	True
  (10, 0)	True
  (11, 0)	True
  (12, 0)	True
  (13, 0)	True
  (14, 0)	True
  (15, 0)	True
  (16, 0)	True
  (17, 0)	True
  (18, 0)	True
  (19, 0)	True
  (20, 0)	True
  (21, 0)	True
  (22, 0)	True
  (23, 0)	True
  (24, 0)	True
  :	:
  (1333, 0)	True
  (1334, 0)	True
  (1335, 0)	True
  (1336, 0)	True
  (1337, 0)	True
  (1338, 0)	True
  (1339, 0)	True
  (1340, 0)	True
  (1341, 0)	True
  (1342, 0)	True
  (1343, 0)	True
  (1344, 0)	True
  (1345, 0)	True
  (1346, 0)	True
  (1347, 0)	True
  (1348, 0)	True
  (1349, 0)	True
  (1350, 0)	True
  (1351, 0)	True
  (1353, 0)	True
  (1354, 0)	True
  (1355, 0)	True
  (1356, 0)	True
  (1357, 0)	True
  (1358, 0)	True


  exec(code_obj, self.user_global_ns, self.user_ns)


In [11]:
comparison = adata[:, 'COG8'].X == adata[:, 'COG8-1'].X
false_values = comparison != True

print(false_values.sum()) # Number of cells with different expression for the gene
non_zero_pos = false_values.nonzero()[0]
print(non_zero_pos) # Cells with different expression for the gene

51
[  88  136  170  230  245  266  277  285  286  296  355  429  509  512
  534  559  566  580  603  614  649  737  758  811  823  827  839  861
  863  867  901  907  927  943  955  992  999 1010 1071 1084 1087 1105
 1142 1158 1178 1198 1258 1266 1295 1305 1352]


  exec(code_obj, self.user_global_ns, self.user_ns)


In [12]:
print('COG8-1', 'COG8')
for pos in non_zero_pos:
    print(adata[pos, ['COG8-1', 'COG8']].X.toarray())

COG8-1 COG8
[[1. 0.]]
[[0. 1.]]
[[1. 0.]]
[[1. 0.]]
[[1. 0.]]
[[1. 0.]]
[[1. 0.]]
[[1. 0.]]
[[1. 0.]]
[[1. 0.]]
[[1. 0.]]
[[1. 0.]]
[[1. 0.]]
[[0. 1.]]
[[1. 0.]]
[[1. 0.]]
[[0. 1.]]
[[0. 1.]]
[[1. 0.]]
[[1. 0.]]
[[1. 0.]]
[[1. 0.]]
[[1. 0.]]
[[1. 0.]]
[[1. 0.]]
[[1. 0.]]
[[1. 0.]]
[[1. 0.]]
[[1. 0.]]
[[0. 1.]]
[[1. 0.]]
[[0. 1.]]
[[0. 1.]]
[[1. 0.]]
[[1. 0.]]
[[1. 0.]]
[[1. 0.]]
[[0. 1.]]
[[0. 1.]]
[[1. 0.]]
[[0. 1.]]
[[1. 0.]]
[[1. 0.]]
[[0. 1.]]
[[2. 0.]]
[[0. 1.]]
[[1. 0.]]
[[0. 1.]]
[[0. 1.]]
[[0. 1.]]
[[0. 1.]]


**So, these duplicates are two completely separated genes**

-----------------------------------------

# Check for genes and duplicates in all the files

In [12]:
os.listdir('data files')

['GSM6186021_DeJager_MO_LOAD11_G_filtered_feature_bc_matrix.h5',
 'GSM6186022_MA001_filtered_feature_bc_matrix.h5',
 'GSM6186023_MA002_filtered_feature_bc_matrix.h5',
 'GSM6186024_MA003_filtered_feature_bc_matrix.h5',
 'GSM6186025_MA004_filtered_feature_bc_matrix.h5',
 'GSM6186026_PM004_filtered_feature_bc_matrix.h5',
 'GSM6186027_PM005_filtered_feature_bc_matrix.h5',
 'GSM6186028_PM006_filtered_feature_bc_matrix.h5',
 'GSM6186029_PM007_filtered_feature_bc_matrix.h5',
 'GSM6186030_PM013_filtered_feature_bc_matrix.h5',
 'GSM6186031_PM014_filtered_feature_bc_matrix.h5',
 'GSM6186032_PM016_filtered_feature_bc_matrix.h5',
 'GSM6186033_PM017_filtered_feature_bc_matrix.h5',
 'GSM6186034_PM018_filtered_feature_bc_matrix.h5',
 'GSM6186035_PM020_filtered_feature_bc_matrix.h5',
 'GSM6186036_PM021_filtered_feature_bc_matrix.h5',
 'GSM6186037_PM022_filtered_feature_bc_matrix.h5',
 'GSM6186038_PM023_filtered_feature_bc_matrix.h5',
 'GSM6186039_PM024_filtered_feature_bc_matrix.h5',
 'GSM6186040_PM02

**Find all the files in this directory**

In [13]:
file_names = os.listdir('data files')

**3 files for test**

In [20]:
three_files_only_test = file_names[0:3]
print(three_files_only_test)

['GSM6186021_DeJager_MO_LOAD11_G_filtered_feature_bc_matrix.h5', 'GSM6186022_MA001_filtered_feature_bc_matrix.h5', 'GSM6186023_MA002_filtered_feature_bc_matrix.h5']


In [17]:
# import warnings
# warnings.filterwarnings("ignore", message="Variable names are not unique")


# duplicates_for_comparison = adata_copy.var_names[adata_copy.var_names.duplicated()]

# for file_name in three_files_only_test:
#     path = os.path.join(directory, file_name)

#     adata_cycle = sc.read_10x_h5(path)
    
#     number_of_genes = adata_cycle.shape[1]
#     list_of_duplicates = adata_cycle.var_names[adata_cycle.var_names.duplicated()]
#     number_of_duplicates = len(list_of_duplicates)
#     are_genes_the_same = list(list_of_duplicates) == list(duplicates_for_comparison)
    
#     print(number_of_genes, number_of_duplicates, are_genes_the_same)

In [30]:
# The next code was used (on 3 files) to determine if the function var_names_make_unique() make name 
# changes the same way for all the files
for file_name in three_files_only_test:
    path = os.path.join(directory, file_name)

    adata_cycle = sc.read_10x_h5(path)

    print(adata_cycle.var.loc['COG8'])

    adata_cycle.var_names_make_unique()

    print(adata_cycle.var.loc[['COG8-1', 'COG8']])


             gene_ids    feature_types  \
COG8  ENSG00000213380  Gene Expression   
COG8  ENSG00000272617  Gene Expression   

                                               genome  
COG8  refdata-cellranger-3.1.0-Homo.sapiens.GRCh38.91  
COG8  refdata-cellranger-3.1.0-Homo.sapiens.GRCh38.91  
               gene_ids    feature_types  \
COG8-1  ENSG00000272617  Gene Expression   
COG8    ENSG00000213380  Gene Expression   

                                                 genome  
COG8-1  refdata-cellranger-3.1.0-Homo.sapiens.GRCh38.91  
COG8    refdata-cellranger-3.1.0-Homo.sapiens.GRCh38.91  
             gene_ids    feature_types  \
COG8  ENSG00000213380  Gene Expression   
COG8  ENSG00000272617  Gene Expression   

                                               genome  
COG8  refdata-cellranger-3.1.0-Homo.sapiens.GRCh38.91  
COG8  refdata-cellranger-3.1.0-Homo.sapiens.GRCh38.91  
               gene_ids    feature_types  \
COG8-1  ENSG00000272617  Gene Expression   
COG8    ENSG000

**Yes, COG8 with gene_ids ENSG00000272617 is always renamed to COG8-1, while COG8 with gene_ids ENSG00000213380 always stay COG8**

**Find out if duplicates are the same for all the files**

In [21]:
import warnings
warnings.filterwarnings("ignore", message="Variable names are not unique")


# Identify duplicated gene names in the file
duplicates_for_comparison = adata_copy.var_names[adata_copy.var_names.duplicated()]

# Iterate through each file
for file_name in file_names:
    path = os.path.join(directory, file_name)

    adata_cycle = sc.read_10x_h5(path)

    # Find the gene count, the number of duplicates, and whether the duplicates match the reference for each file
    number_of_genes = adata_cycle.shape[1]
    list_of_duplicates = adata_cycle.var_names[adata_cycle.var_names.duplicated()]
    number_of_duplicates = len(list_of_duplicates)
    do_genes_match = list(list_of_duplicates) == list(duplicates_for_comparison)
    
    print(number_of_genes, number_of_duplicates, do_genes_match)

27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True
27984 20 True


--------------------------------------

# Combine files

In [42]:
# Should I firstly read all the files and save them to the adata_list and then use concat()?
# Or should I from the beginning use concat() in the cycle and add 1 file per iteration?

In [106]:
"""
!Sample_description	"LOAD11 GM"
!Sample_description	"ALS4 GM"	"ALS4 GM"	"ALS4 FN"	"ALS4 SC"	"DLBD-PD2 GM"	"DLBD-PD2 SN"	"DLBD-PD1 GM"	"DLBD-PD1 SN"	"LOAD12 GM"	"LOAD12 AH"	"LOAD13 GM"	"LOAD13 H"	"LOAD13 SN"	"LOAD14 GM"	"LOAD14 H"	"LOAD14 SN"	"LOAD15 GM"	"LOAD15 H"	"LOAD15 SN"	"PSP2 H"	"PSP2 SN"	"PSP3 GM"	"FTD1 GM"	"FTD1 H"	"ALS1 GM"	"ALS1 GM"	"ALS1 FN"	"ALS1 SC"	"ALS2 GM"	"ALS2 GM"	"ALS2 FN"	"ALS2 SC"	"ALS3 GM"	"ALS3 GM"	"ALS3 FN"	"ALS3 SC"	"ALS5 GM"	"ALS5 SC"	"DLBD-PD3 GM"	"DLBD-PD3 SN"	"MS1 AWS"	"MS1 GM"	"MS1 TH"	"ALS6 BA9+BA4+SC"	"LOAD1 BA9+BA20+H"	"LOAD2 BA9+BA20+H"	"EOAD1 BA9+BA20+H"	"ALS7 BA9+BA4+SC"	"ALS/FTD1 BA9+BA4+SC"	"MS2 BA9+TH+Lesion+AWS"	"LOAD20 BA9+BA20+H"	"EOAD2 BA9+BA20+H"	"LOAD21 BA9+BA20+H"	"DLBD-PD4 BA9+SN"	"ALS8 BA9+BA4+SC"	"CNTRL1 BA9+BA4+H+SN"	"LOAD26 GM"	"LOAD27 BA9+H"	"ALS/FTD2 BA9+BA4+SC; DLBD-PD5 BA9+SN"	"LOAD28 BA9+BA20+H"	"ALS9 BA9+BA4+SC"	"LOAD31 BA9+BA20+H"	"LOAD32 BA9+BA20+H"	"LOAD33 BA9+BA20+H"	"LOAD34 BA9+AWS"	"LOAD35 BA9+BA20+H;HD1 BA9+BA20+H"
"""

'\n!Sample_description\t"LOAD11 GM"\n!Sample_description\t"ALS4 GM"\t"ALS4 GM"\t"ALS4 FN"\t"ALS4 SC"\t"DLBD-PD2 GM"\t"DLBD-PD2 SN"\t"DLBD-PD1 GM"\t"DLBD-PD1 SN"\t"LOAD12 GM"\t"LOAD12 AH"\t"LOAD13 GM"\t"LOAD13 H"\t"LOAD13 SN"\t"LOAD14 GM"\t"LOAD14 H"\t"LOAD14 SN"\t"LOAD15 GM"\t"LOAD15 H"\t"LOAD15 SN"\t"PSP2 H"\t"PSP2 SN"\t"PSP3 GM"\t"FTD1 GM"\t"FTD1 H"\t"ALS1 GM"\t"ALS1 GM"\t"ALS1 FN"\t"ALS1 SC"\t"ALS2 GM"\t"ALS2 GM"\t"ALS2 FN"\t"ALS2 SC"\t"ALS3 GM"\t"ALS3 GM"\t"ALS3 FN"\t"ALS3 SC"\t"ALS5 GM"\t"ALS5 SC"\t"DLBD-PD3 GM"\t"DLBD-PD3 SN"\t"MS1 AWS"\t"MS1 GM"\t"MS1 TH"\t"ALS6 BA9+BA4+SC"\t"LOAD1 BA9+BA20+H"\t"LOAD2 BA9+BA20+H"\t"EOAD1 BA9+BA20+H"\t"ALS7 BA9+BA4+SC"\t"ALS/FTD1 BA9+BA4+SC"\t"MS2 BA9+TH+Lesion+AWS"\t"LOAD20 BA9+BA20+H"\t"EOAD2 BA9+BA20+H"\t"LOAD21 BA9+BA20+H"\t"DLBD-PD4 BA9+SN"\t"ALS8 BA9+BA4+SC"\t"CNTRL1 BA9+BA4+H+SN"\t"LOAD26 GM"\t"LOAD27 BA9+H"\t"ALS/FTD2 BA9+BA4+SC; DLBD-PD5 BA9+SN"\t"LOAD28 BA9+BA20+H"\t"ALS9 BA9+BA4+SC"\t"LOAD31 BA9+BA20+H"\t"LOAD32 BA9+BA20+H"\t"LOAD33 BA

In [134]:
sample_description = 'LOAD11 GM"	"ALS4 GM"	"ALS4 GM"	"ALS4 FN"	"ALS4 SC"	"DLBD-PD2 GM"	"DLBD-PD2 SN"	"DLBD-PD1 GM"	"DLBD-PD1 SN"	"LOAD12 GM"	"LOAD12 AH"	"LOAD13 GM"	"LOAD13 H"	"LOAD13 SN"	"LOAD14 GM"	"LOAD14 H"	"LOAD14 SN"	"LOAD15 GM"	"LOAD15 H"	"LOAD15 SN"	"PSP2 H"	"PSP2 SN"	"PSP3 GM"	"FTD1 GM"	"FTD1 H"	"ALS1 GM"	"ALS1 GM"	"ALS1 FN"	"ALS1 SC"	"ALS2 GM"	"ALS2 GM"	"ALS2 FN"	"ALS2 SC"	"ALS3 GM"	"ALS3 GM"	"ALS3 FN"	"ALS3 SC"	"ALS5 GM"	"ALS5 SC"	"DLBD-PD3 GM"	"DLBD-PD3 SN"	"MS1 AWS"	"MS1 GM"	"MS1 TH"	"ALS6 BA9+BA4+SC"	"LOAD1 BA9+BA20+H"	"LOAD2 BA9+BA20+H"	"EOAD1 BA9+BA20+H"	"ALS7 BA9+BA4+SC"	"ALS/FTD1 BA9+BA4+SC"	"MS2 BA9+TH+Lesion+AWS"	"LOAD20 BA9+BA20+H"	"EOAD2 BA9+BA20+H"	"LOAD21 BA9+BA20+H"	"DLBD-PD4 BA9+SN"	"ALS8 BA9+BA4+SC"	"CNTRL1 BA9+BA4+H+SN"	"LOAD26 GM"	"LOAD27 BA9+H"	"ALS/FTD2 BA9+BA4+SC; DLBD-PD5 BA9+SN"	"LOAD28 BA9+BA20+H"	"ALS9 BA9+BA4+SC"	"LOAD31 BA9+BA20+H"	"LOAD32 BA9+BA20+H"	"LOAD33 BA9+BA20+H"	"LOAD34 BA9+AWS"	"LOAD35 BA9+BA20+H;HD1 BA9+BA20+H"'

sample_description = sample_description.replace('"', '')
sample_description_list = sample_description.split('\t')

processed_sample_description_list = []
for element in sample_description_list:
    # Remove space and anything after it
    processed_element = element.split(' ')[0]
    processed_sample_description_list.append(processed_element)


condition_list = []
import re
for element in processed_sample_description_list:
    # Use a regular expression to remove trailing numbers
    cleaned_element = re.sub(r'\d+$', '', element)
    condition_list.append(cleaned_element)
    
print(condition_list)

['LOAD', 'ALS', 'ALS', 'ALS', 'ALS', 'DLBD-PD', 'DLBD-PD', 'DLBD-PD', 'DLBD-PD', 'LOAD', 'LOAD', 'LOAD', 'LOAD', 'LOAD', 'LOAD', 'LOAD', 'LOAD', 'LOAD', 'LOAD', 'LOAD', 'PSP', 'PSP', 'PSP', 'FTD', 'FTD', 'ALS', 'ALS', 'ALS', 'ALS', 'ALS', 'ALS', 'ALS', 'ALS', 'ALS', 'ALS', 'ALS', 'ALS', 'ALS', 'ALS', 'DLBD-PD', 'DLBD-PD', 'MS', 'MS', 'MS', 'ALS', 'LOAD', 'LOAD', 'EOAD', 'ALS', 'ALS/FTD', 'MS', 'LOAD', 'EOAD', 'LOAD', 'DLBD-PD', 'ALS', 'CNTRL', 'LOAD', 'LOAD', 'ALS/FTD', 'LOAD', 'ALS', 'LOAD', 'LOAD', 'LOAD', 'LOAD', 'LOAD']


In [135]:
# Create a list for all the adata objects corresponding to the files 
adata_list = []

# for file_name in three_files_only_test:
for i, file_name in enumerate(three_files_only_test):

    path = os.path.join(directory, file_name)

    adata_cycle = sc.read_10x_h5(path)
    
    adata_cycle.var_names_make_unique()

    adata_cycle.var['file_number']= i
    adata_cycle.var['condition'] = condition_list[i]
    
    adata_list.append(adata_cycle)

print(adata_list)

[AnnData object with n_obs × n_vars = 1359 × 27984
    var: 'gene_ids', 'feature_types', 'genome', 'file_number', 'condition', AnnData object with n_obs × n_vars = 2177 × 27984
    var: 'gene_ids', 'feature_types', 'genome', 'file_number', 'condition', AnnData object with n_obs × n_vars = 2510 × 27984
    var: 'gene_ids', 'feature_types', 'genome', 'file_number', 'condition']


In [120]:
adata_combined = ad.concat(adata_list,  join="outer")

  utils.warn_names_duplicates("obs")


In [121]:
adata_combined

AnnData object with n_obs × n_vars = 6046 × 27984

In [122]:
adata_combined.var # It is empty for some reason

MIR1302-2HG
FAM138A
OR4F5
AL627309.1
AL627309.3
...
AC233755.2
AC233755.1
AC240274.1
AC213203.1
FAM231D


In [136]:
print(adata_cycle.var)

                    gene_ids    feature_types  \
MIR1302-2HG  ENSG00000243485  Gene Expression   
FAM138A      ENSG00000237613  Gene Expression   
OR4F5        ENSG00000186092  Gene Expression   
AL627309.1   ENSG00000238009  Gene Expression   
AL627309.3   ENSG00000239945  Gene Expression   
...                      ...              ...   
AC233755.2   ENSG00000277856  Gene Expression   
AC233755.1   ENSG00000275063  Gene Expression   
AC240274.1   ENSG00000271254  Gene Expression   
AC213203.1   ENSG00000277475  Gene Expression   
FAM231D      ENSG00000268674  Gene Expression   

                                                      genome  file_number  \
MIR1302-2HG  refdata-cellranger-3.1.0-Homo.sapiens.GRCh38.91            2   
FAM138A      refdata-cellranger-3.1.0-Homo.sapiens.GRCh38.91            2   
OR4F5        refdata-cellranger-3.1.0-Homo.sapiens.GRCh38.91            2   
AL627309.1   refdata-cellranger-3.1.0-Homo.sapiens.GRCh38.91            2   
AL627309.3   refdata-cellr