## Notebook for Pham 2022 anndata file creation 
### Developed by: Anna Maguza

### Institute of Computational Biology - Computational Health Centre - Hemlholtz Munich

### 13 February 2022

#### Load packages

In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as an
import h5py
from scipy.io import mmread
from scipy.sparse import coo_matrix
import matplotlib.pyplot as plt
import scipy as sci
import loompy

#### Setup Cells

In [3]:
%matplotlib inline

In [4]:
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

  from .autonotebook import tqdm as notebook_tqdm


scanpy==1.9.1 anndata==0.8.0 umap==0.5.3 numpy==1.21.5 scipy==1.7.3 pandas==1.5.1 scikit-learn==1.1.3 statsmodels==0.13.2 pynndescent==0.5.8


#### Data Upload

In [79]:
# Read csv files
naiveTSC_annotation = pd.read_csv('/lustre/groups/talaveralopez/datasets/Anna_Maguza_Master_2023/Pham_2022/GSE204779_scRNA_seq_timecourse_naiveTSC_annotation.csv', sep=',', header=0, index_col=0)
naiveTSC_count_matrix = pd.read_csv('/lustre/groups/talaveralopez/datasets/Anna_Maguza_Master_2023/Pham_2022/GSE204779_scRNA_seq_timecourse_naiveTSC_count_matrix.csv', sep=',', header=0, index_col=0)

In [80]:
naiveTSC_count_matrix.head()

Unnamed: 0,AAACCCATCTTCGCTG.1,AAACGCTAGACCTTTG.1,AAACGCTTCACCTCGT.1,AAAGAACTCCCTCTAG.1,AAAGGATTCTCGCTCA.1,AAAGGGCAGTTCCGGC.1,AAAGGGCGTCATCTAG.1,AAAGGTAAGGAACTCG.1,AAAGGTAGTATCGGTT.1,AAAGTCCTCGGAACTT.1,...,TTTGACTGTTTCACTT.1,TTTGGAGCAAGTTTGC.1,TTTGGAGCAATGCAAA.1,TTTGGAGGTGGTAACG.1,TTTGGTTAGCATGAAT.1,TTTGGTTGTCGGCACT.1,TTTGGTTTCACCATCC.1,TTTGGTTTCGCTCTAC.1,TTTGGTTTCTCAGTCC.1,TTTGTTGGTGTGACCC.1
SCYL3,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
C1orf112,1,0,0,0,0,0,0,1,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FGR,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CFH,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
STPG1,0,1,0,0,2,0,0,0,2,1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [81]:
#Create a list with the names of the genes
barcodes = pd.DataFrame(naiveTSC_count_matrix.columns).astype(str)
genes = pd.DataFrame(naiveTSC_count_matrix.index).astype(str)

#Rename barcode column to 'cell_id'
barcodes.columns = ['cell_id']
genes.columns = ['gene_name']

In [82]:
#Delete columns names and index names from the count matrix
naiveTSC_count_matrix = naiveTSC_count_matrix.values

In [83]:
# Create AnnData object
naiveTSC_adata = an.AnnData(X = np.transpose(naiveTSC_count_matrix), obs = barcodes, var = genes)

  naiveTSC_adata = an.AnnData(X = np.transpose(naiveTSC_count_matrix), obs = barcodes, var = genes)


In [84]:
naiveTSC_adata.obs

Unnamed: 0,cell_id
0,AAACCCATCTTCGCTG.1
1,AAACGCTAGACCTTTG.1
2,AAACGCTTCACCTCGT.1
3,AAAGAACTCCCTCTAG.1
4,AAAGGATTCTCGCTCA.1
...,...
12973,TTTGGTTGTCGGCACT.1
12974,TTTGGTTTCACCATCC.1
12975,TTTGGTTTCGCTCTAC.1
12976,TTTGGTTTCTCAGTCC.1


In [85]:
#Make column 0 as index in anndata object
naiveTSC_adata.obs_names = naiveTSC_adata.obs['cell_id']

In [88]:
naiveTSC_annotation.head()

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,cell,Samples,anno.cluster
AAACCCATCTTCGCTG.1,timecourse_naive2TSC,29982,6390,AAACCCATCTTCGCTG.1,D0,naive
AAACGCTAGACCTTTG.1,timecourse_naive2TSC,58427,7486,AAACGCTAGACCTTTG.1,D0,naive
AAACGCTTCACCTCGT.1,timecourse_naive2TSC,37718,6780,AAACGCTTCACCTCGT.1,D0,naive
AAAGAACTCCCTCTAG.1,timecourse_naive2TSC,12857,3977,AAAGAACTCCCTCTAG.1,D0,naive
AAAGGATTCTCGCTCA.1,timecourse_naive2TSC,34149,6625,AAAGGATTCTCGCTCA.1,D0,naive


In [86]:
#Change nCount_RNA to str type in naiveTSC_annotation
naiveTSC_annotation['nCount_RNA'] = naiveTSC_annotation['nCount_RNA'].astype(str)
naiveTSC_annotation['nFeature_RNA'] = naiveTSC_annotation['nFeature_RNA'].astype(str)


In [87]:
# Add metadata
naiveTSC_adata.obs['cell_type'] = naiveTSC_annotation['anno.cluster']
naiveTSC_adata.obs['orig.ident'] = naiveTSC_annotation['orig.ident']
naiveTSC_adata.obs['nCount_RNA'] = naiveTSC_annotation['nCount_RNA']
naiveTSC_adata.obs['nFeature_RNA'] = naiveTSC_annotation['nFeature_RNA']
naiveTSC_adata.obs['Samples'] = naiveTSC_annotation['Samples']

In [89]:
naiveTSC_adata.obs

Unnamed: 0_level_0,cell_id,cell_type,orig.ident,nCount_RNA,nFeature_RNA,Samples
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAACCCATCTTCGCTG.1,AAACCCATCTTCGCTG.1,naive,timecourse_naive2TSC,29982,6390,D0
AAACGCTAGACCTTTG.1,AAACGCTAGACCTTTG.1,naive,timecourse_naive2TSC,58427,7486,D0
AAACGCTTCACCTCGT.1,AAACGCTTCACCTCGT.1,naive,timecourse_naive2TSC,37718,6780,D0
AAAGAACTCCCTCTAG.1,AAAGAACTCCCTCTAG.1,naive,timecourse_naive2TSC,12857,3977,D0
AAAGGATTCTCGCTCA.1,AAAGGATTCTCGCTCA.1,naive,timecourse_naive2TSC,34149,6625,D0
...,...,...,...,...,...,...
TTTGGTTGTCGGCACT.1,TTTGGTTGTCGGCACT.1,late.EXMC,timecourse_naive2TSC,14036,3876,D70
TTTGGTTTCACCATCC.1,TTTGGTTTCACCATCC.1,late.EXMC,timecourse_naive2TSC,23015,5405,D70
TTTGGTTTCGCTCTAC.1,TTTGGTTTCGCTCTAC.1,late.EXMC,timecourse_naive2TSC,22274,4500,D70
TTTGGTTTCTCAGTCC.1,TTTGGTTTCTCAGTCC.1,late.EXMC,timecourse_naive2TSC,12856,4786,D70


In [90]:
naiveTSC_adata.var

Unnamed: 0,gene_name
0,SCYL3
1,C1orf112
2,FGR
3,CFH
4,STPG1
...,...
1155,PDIA6
1156,RHOB
1157,ATP6V1C2
1158,HNRNPLL


In [91]:
#Write AnnData object to file
naiveTSC_adata.write('/lustre/groups/talaveralopez/datasets/Anna_Maguza_Master_2023/Pham_2022/Anndata/Pham_2022_naiveTSC_adata.h5ad')