# Williams periodontitis data
make object from 10x matrices downloaded from DCP: https://data.humancellatlas.org/explore/projects/50151324-f3ed-4358-98af-ec352a940a61/project-matrices

In [1]:
import scipy.io
import gzip
import pandas as pd
import io
import anndata

# Define a list of sample IDs and corresponding patient IDs
sample_info = [
    {"sample_id": "GSM5005058_PD134", "patient_id": "PD134"},
    {"sample_id": "GSM5005059_PD153", "patient_id": "PD153"},  
    {"sample_id": "GSM5005060_PD161", "patient_id": "PD161"},  
    {"sample_id": "GSM5005061_PD161b", "patient_id": "PD161b"},  
    {"sample_id": "GSM5005062_PD164", "patient_id": "PD164"},  
    {"sample_id": "GSM5177042_PD164b", "patient_id": "PD164b"},  
    {"sample_id": "GSM5177043_PD164c", "patient_id": "PD164c"},  
    {"sample_id": "GSM5177044_PD170", "patient_id": "PD170"} 
]

# Initialize a list to store the AnnData objects for each sample
adata_list = []

# Loop through the sample info
for sample_data in sample_info:
    sample_id = sample_data["sample_id"]
    patient_id = sample_data["patient_id"]

    # Load the X matrix
    X = scipy.io.mmread(f"/nfs/team205/ao15/Megagut/Williams_data/data/{sample_id}_matrix.mtx.gz").T
    
    # Load the obs (barcodes) data
    obs_file_path = f"/nfs/team205/ao15/Megagut/Williams_data/data/{sample_id}_barcodes.tsv"
    with gzip.open(obs_file_path, 'rb') as obs_file:
        obs_content = obs_file.read()
    obs = pd.read_csv(io.StringIO(obs_content.decode('utf-8')), index_col=0, delimiter='\t', header=None)
    
    # Load the var (features) data
    var_file_path = f"/nfs/team205/ao15/Megagut/Williams_data/data/{sample_id}_features.tsv"
    with gzip.open(var_file_path, 'rb') as var_file:
        var_content = var_file.read()
    var = pd.read_csv(io.StringIO(var_content.decode('utf-8')), index_col=0, delimiter='\t', header=None)

    

    # Append the patient ID to the cell barcodes
    obs.index = patient_id + "_" + obs.index 

    # Create an AnnData object for each sample
    adata = anndata.AnnData(X, var=var, obs=obs)

    # Append the adata object to the list
    adata_list.append(adata)

# Concatenate the list of AnnData objects into a single AnnData object
combined_adata = anndata.concat(adata_list, axis=0)  # axis=0 for concatenating vertically (along samples)

# You now have a single combined AnnData object containing data from all samples with patient IDs in cell barcodes
print(combined_adata)

  adata = anndata.AnnData(X, var=var, obs=obs)


AnnData object with n_obs × n_vars = 45712 × 33694


In [2]:
combined_adata

AnnData object with n_obs × n_vars = 45712 × 33694

In [3]:
import scanpy as sc
annot_adata = sc.read_h5ad('/nfs/team205/ao15/Megagut/Williams_data/gingiva_Health-Disease.h5ad')

In [4]:
annot_adata

AnnData object with n_obs × n_vars = 87240 × 33694
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'project', 'percent.mt', 'S.Score', 'G2M.Score', 'paperLabels'
    var: 'name'
    obsm: 'X_pca', 'X_umap'

In [5]:
annot_adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,project,percent.mt,S.Score,G2M.Score,paperLabels
GM136_AAACCTGAGACAGAGA-1,GM136,6410.0,2249,GM,2.589704,-0.089278,-0.067567,P.Fib 1.1
GM136_AAACCTGAGTCCGTAT-1,GM136,12523.0,2860,GM,5.621656,0.025254,0.195841,P.Epi 1
GM136_AAACCTGGTTCACCTC-1,GM136,4711.0,1752,GM,2.717045,-0.026435,-0.014840,P.VEC 1.4
GM136_AAACGGGAGACCGGAT-1,GM136,2917.0,1452,GM,2.913953,-0.025327,-0.067934,P.VEC 1.4
GM136_AAACGGGGTCCGACGT-1,GM136,1827.0,844,GM,10.892173,-0.007948,-0.042850,P.VEC 1.1
...,...,...,...,...,...,...,...,...
PD170_TTTGTTGAGTCGGCAA-1,PD170,3933.0,1961,PD,3.839308,-0.021114,-0.020469,P.VEC 1.4
PD170_TTTGTTGAGTTAACAG-1,PD170,10938.0,3284,PD,5.558603,-0.016251,-0.119871,P.Fib 1.1
PD170_TTTGTTGCAGGCACTC-1,PD170,1277.0,712,PD,4.463587,-0.021221,0.029080,abT (CD4)
PD170_TTTGTTGGTTGCATCA-1,PD170,1437.0,772,PD,5.845511,-0.028199,-0.009766,abT (CD4)


In [6]:
combined_adata.obs

PD134_AAACCTGAGAATCTCC-1
PD134_AAACCTGAGGGAAACA-1
PD134_AAACCTGCAAAGTCAA-1
PD134_AAACCTGCAGCTGCAC-1
PD134_AAACCTGTCCCTCAGT-1
...
PD170_TTTGTTGAGTCGGCAA-1
PD170_TTTGTTGAGTTAACAG-1
PD170_TTTGTTGCAGGCACTC-1
PD170_TTTGTTGGTTGCATCA-1
PD170_TTTGTTGTCGCTTACC-1


In [34]:
combined_adata.obs['donorID_original'] = annot_adata.obs['orig.ident'].astype(str)

In [35]:
combined_adata.obs['annot_original'] = annot_adata.obs['paperLabels'].astype(str)

In [36]:
combined_adata.obs['disease'] = annot_adata.obs['project'].astype(str)

In [10]:
combined_adata

AnnData object with n_obs × n_vars = 45712 × 33694
    obs: 'donorID_original', 'annot_original', 'disease'

In [11]:
combined_adata.var

ENSG00000243485
ENSG00000237613
ENSG00000186092
ENSG00000238009
ENSG00000239945
...
ENSG00000277856
ENSG00000275063
ENSG00000271254
ENSG00000277475
ENSG00000268674


In [12]:
combined_adata.X.max()

45267.0

In [13]:
annot_adata.X.max()

9.068303

In [17]:
annot_adata.var

Unnamed: 0,name
RP11-34P13.3,RP11-34P13.3
FAM138A,FAM138A
OR4F5,OR4F5
RP11-34P13.7,RP11-34P13.7
RP11-34P13.8,RP11-34P13.8
...,...
AC233755.2,AC233755.2
AC233755.1,AC233755.1
AC240274.1,AC240274.1
AC213203.1,AC213203.1


In [18]:
annot_adata.var['ensemblID']

ValueError: cannot reindex from a duplicate axis

In [19]:
file_path = "/nfs/team205/ao15/Megagut/Williams_data/data/GSM5005058_PD134_features.tsv"

with gzip.open(file_path, 'rb') as file:
    content = file.read()

var = pd.read_csv(io.StringIO(content.decode('utf-8')), index_col=0, delimiter='\t',header=None)

In [20]:
var

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
ENSG00000243485,RP11-34P13.3
ENSG00000237613,FAM138A
ENSG00000186092,OR4F5
ENSG00000238009,RP11-34P13.7
ENSG00000239945,RP11-34P13.8
...,...
ENSG00000277856,AC233755.2
ENSG00000275063,AC233755.1
ENSG00000271254,AC240274.1
ENSG00000277475,AC213203.1


In [22]:
combined_adata.var['names'] = var

In [23]:
combined_adata.var

Unnamed: 0_level_0,names
0,Unnamed: 1_level_1
ENSG00000243485,RP11-34P13.3
ENSG00000237613,FAM138A
ENSG00000186092,OR4F5
ENSG00000238009,RP11-34P13.7
ENSG00000239945,RP11-34P13.8
...,...
ENSG00000277856,AC233755.2
ENSG00000275063,AC233755.1
ENSG00000271254,AC240274.1
ENSG00000277475,AC213203.1


In [24]:
combined_adata.var = combined_adata.var.set_index('names')

In [25]:
combined_adata.var

RP11-34P13.3
FAM138A
OR4F5
RP11-34P13.7
RP11-34P13.8
...
AC233755.2
AC233755.1
AC240274.1
AC213203.1
FAM231B


In [26]:
annot_adata.X

<87240x33694 sparse matrix of type '<class 'numpy.float32'>'
	with 107853002 stored elements in Compressed Sparse Column format>

In [27]:
combined_adata.X

<45712x33694 sparse matrix of type '<class 'numpy.float32'>'
	with 51414147 stored elements in Compressed Sparse Row format>

In [28]:
annot_adata

AnnData object with n_obs × n_vars = 87240 × 33694
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'project', 'percent.mt', 'S.Score', 'G2M.Score', 'paperLabels'
    var: 'name'
    obsm: 'X_pca', 'X_umap'

In [37]:
combined_adata

AnnData object with n_obs × n_vars = 45712 × 33694
    obs: 'donorID_original', 'annot_original', 'disease'

In [41]:
combined_adata.obs.index.name = None

In [42]:
combined_adata.obs.annot_original

PD134_AAACCTGAGAATCTCC-1     Monocyte
PD134_AAACCTGAGGGAAACA-1    abT (CD4)
PD134_AAACCTGCAAAGTCAA-1          NaN
PD134_AAACCTGCAGCTGCAC-1          NaN
PD134_AAACCTGTCCCTCAGT-1          NaN
                              ...    
PD170_TTTGTTGAGTCGGCAA-1    P.VEC 1.4
PD170_TTTGTTGAGTTAACAG-1    P.Fib 1.1
PD170_TTTGTTGCAGGCACTC-1    abT (CD4)
PD170_TTTGTTGGTTGCATCA-1    abT (CD4)
PD170_TTTGTTGTCGCTTACC-1     Monocyte
Name: annot_original, Length: 45712, dtype: category
Categories (29, object): ['B', 'MAIT', 'Mast', 'Monocyte', ..., 'abT (CD8)', 'gd T', 'mDC', 'pDC']

In [43]:
combined_adata.write_h5ad('/nfs/team205/ao15/Megagut/Williams_data/Williams2021_PD_data_GSE164241_rawcounts.h5ad')

In [44]:
fib = sc.read_h5ad('/nfs/team205/ao15/Megagut/Williams_data/Williams2021_periodontitis_fibroblasts.h5ad')

In [45]:
fib

AnnData object with n_obs × n_vars = 4777 × 33694
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'project', 'percent.mt', 'S.Score', 'G2M.Score', 'paperLabels'
    var: 'name'
    uns: 'paperLabels_colors'
    obsm: 'X_pca', 'X_umap'

In [48]:
combined_adata.obs.annot_original.value_counts()

Plasma        6973
abT (CD8)     2751
abT (CD4)     2592
P.Fib 1.1     2306
P.VEC 1.2     2025
P.SMC         1827
P.VEC 1.1     1684
P.VEC 1.3     1472
P.Fib 1.2     1243
Monocyte      1155
B             1140
Treg          1106
gd T           992
mDC            985
P.VEC 1.4      974
P.Epi 1        715
P.Fib 1.4      675
Mast           653
P.Fib 1.3      542
Th17           539
NK             500
MAIT           392
pDC            202
Neutrophil     170
P.Epi 2        166
P.LEC          151
P.Epi 3         41
P.Mel           23
P.Fib 1.5       11
Name: annot_original, dtype: int64

In [50]:
#subset and export fibroblasts to map onto atlas
fib_raw = combined_adata[combined_adata.obs.annot_original.isin(['P.Fib 1.1','P.Fib 1.2','P.Fib 1.3','P.Fib 1.4','P.Fib 1.5'])].copy()

  utils.warn_names_duplicates("var")


In [51]:
fib_raw

AnnData object with n_obs × n_vars = 4777 × 33694
    obs: 'donorID_original', 'annot_original', 'disease'

In [52]:
fib.layers['counts'] = fib_raw.X

In [53]:
fib

AnnData object with n_obs × n_vars = 4777 × 33694
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'project', 'percent.mt', 'S.Score', 'G2M.Score', 'paperLabels'
    var: 'name'
    uns: 'paperLabels_colors'
    obsm: 'X_pca', 'X_umap'
    layers: 'counts'

In [55]:
fib.write_h5ad('/nfs/team205/ao15/Megagut/Williams_data/Williams2021_periodontitis_fibroblasts_withrawcounts.20231108.h5ad')