# Notebook to create anndata object from matrix files of heart data from '10XGenomics' 

**Developed by** :Srivalli Kolla

**Created on** : 28 June, 2024

**Last modified** : 28 June, 2024

**Würzburg Institute for Systems Immunology & Julius-Maximilian-Universität Würzburg**

# Import packages

In [1]:
import scanpy as sc
import pandas as pd
import scipy.io
import os
import lamindb as ln

In [2]:
!lamin init --storage ./10x_ln --schema bionty

💡 connected lamindb: srivalli/10x_ln


In [3]:
ln.setup.init(storage= '10x_ln')

💡 connected lamindb: srivalli/10x_ln


In [4]:
# copy-pasted identifiers for your notebook or script
ln.settings.transform.stem_uid = "FPnfDtJz8qbE"  # <-- auto-generated by running ln.track()
ln.settings.transform.version = "1.0"  # <-- auto-generated by running ln.track()

# track the execution of your notebook or script
run = ln.track()

# see your currently running transform
run.transform

💡 notebook imports: lamindb==0.74.1 pandas==2.2.2 scanpy==1.10.1 scipy==1.12.0
💡 saved: Transform(uid='FPnfDtJz8qbE6xfK', version='1.0', name='Notebook to create anndata object from matrix files of heart data from '10XGenomics'', key='anndata_creation_10x_28062024', type='notebook', created_by_id=1, updated_at='2024-07-01 13:34:12 UTC')
💡 saved: Run(uid='YdFX2mDjbkMZxqlh3waR', transform_id=2, created_by_id=1)


Transform(uid='FPnfDtJz8qbE6xfK', version='1.0', name='Notebook to create anndata object from matrix files of heart data from '10XGenomics'', key='anndata_creation_10x_28062024', type='notebook', created_by_id=1, updated_at='2024-07-01 13:34:12 UTC')

# Load files

In [5]:
# Define the file paths
data_dir = 'data' 
matrix_file = os.path.join(data_dir, 'matrix.mtx.gz')
features_file = os.path.join(data_dir, 'features.tsv.gz')
barcodes_file = os.path.join(data_dir, 'barcodes.tsv.gz')

# Load the matrix data
matrix = scipy.io.mmread(matrix_file).tocsr()

# Load the features and barcodes
features = pd.read_csv(features_file, header=None, sep='\t')
barcodes = pd.read_csv(barcodes_file, header=None, sep='\t')

# Anndata creation

In [6]:
# Create the AnnData object
adata = sc.AnnData(X=matrix.T)  # Transpose the matrix to have cells as rows and genes as columns

# Set the observation names (cell barcodes) and variable names (genes)
adata.obs_names = barcodes[0].values
adata.var_names = features[1].values
adata.var['gene_ids'] = features[0].values
adata.var['feature_types'] = features[2].values

# Save the AnnData object to an .h5ad file
output_file = os.path.join(data_dir, 'heart_sp_vi_10x.h5ad')
adata.write(output_file)

print(f"AnnData object created and saved to {output_file}")

AnnData object created and saved to data/heart_sp_vi_10x.h5ad


In [7]:
heart = sc.read_h5ad('data/heart_sp_vi_10x.h5ad')

  utils.warn_names_duplicates("var")


In [8]:
heart

AnnData object with n_obs × n_vars = 4247 × 36601
    var: 'gene_ids', 'feature_types'

In [9]:
heart.obs

AAACAAGTATCTCCCA-1
AAACACCAATAACTGC-1
AAACAGAGCGACTCCT-1
AAACAGCTTTCAGAAG-1
AAACAGGGTCTATATT-1
...
TTGTTGTGTGTCAAGA-1
TTGTTTCACATCCAGG-1
TTGTTTCATTAGTCTA-1
TTGTTTCCATACAACT-1
TTGTTTGTGTAAATTC-1


In [10]:
heart.var

Unnamed: 0,gene_ids,feature_types
MIR1302-2HG,ENSG00000243485,Gene Expression
FAM138A,ENSG00000237613,Gene Expression
OR4F5,ENSG00000186092,Gene Expression
AL627309.1,ENSG00000238009,Gene Expression
AL627309.3,ENSG00000239945,Gene Expression
...,...,...
AC141272.1,ENSG00000277836,Gene Expression
AC023491.2,ENSG00000278633,Gene Expression
AC007325.1,ENSG00000276017,Gene Expression
AC007325.4,ENSG00000278817,Gene Expression


In [11]:
artifact_anndata = ln.Artifact.from_anndata(heart, description= '10x_anndata')
artifact_anndata.save()

💡 returning existing artifact with same hash: Artifact(uid='MAKYDY6WTqIudoQoQdiQ', description='10x_anndata', suffix='.h5ad', type='dataset', accessor='AnnData', size=102391001, hash='QBftfcesWHpe-Agns_v602', hash_type='sha1-fl', visibility=1, key_is_virtual=True, created_by_id=1, storage_id=1, transform_id=1, run_id=1, updated_at='2024-07-01 10:53:02 UTC')


Artifact(uid='MAKYDY6WTqIudoQoQdiQ', description='10x_anndata', suffix='.h5ad', type='dataset', accessor='AnnData', size=102391001, hash='QBftfcesWHpe-Agns_v602', hash_type='sha1-fl', visibility=1, key_is_virtual=True, created_by_id=1, storage_id=1, transform_id=2, run_id=2, updated_at='2024-07-01 13:34:15 UTC')

In [12]:
artifact_folder = ln.Artifact('../10x', description = 'Folder having 10x data and scripts')

In [13]:
ln.Artifact.df()

Unnamed: 0_level_0,uid,version,description,key,suffix,type,accessor,size,hash,hash_type,n_objects,n_observations,visibility,key_is_virtual,storage_id,transform_id,run_id,created_by_id,updated_at
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,MAKYDY6WTqIudoQoQdiQ,,10x_anndata,,.h5ad,dataset,AnnData,102391001,QBftfcesWHpe-Agns_v602,sha1-fl,,,1,True,1,2,2,1,2024-07-01 13:34:15.647346+00:00
4,Xq45Ea1q6AKrGyxCW3MA,,Folder having 10x data and scripts,,,dataset,,48683713239,S02omivXPviZbLKYHJ-8aQ,md5-d,1573.0,,1,True,1,1,1,1,2024-07-01 11:14:35.007862+00:00


In [14]:
artifact_folder.transform

Transform(uid='FPnfDtJz8qbE6xfK', version='1.0', name='Notebook to create anndata object from matrix files of heart data from '10XGenomics'', key='anndata_creation_10x_28062024', type='notebook', created_by_id=1, updated_at='2024-07-01 13:34:12 UTC')

In [15]:
artifact_folder.run

Run(uid='YdFX2mDjbkMZxqlh3waR', started_at='2024-07-01 13:34:12 UTC', is_consecutive=True, transform_id=2, created_by_id=1)

In [17]:
ln.finish()

✅ cell execution numbers increase consecutively
