### Notebook for sample identification based on hashtag oligos (HTOs) in the fetal gut data from the Fawkner-Corbett study
- **Developed by:** Anna Maguza
- **Place:** Wuerzburg Institute for System Immunology
- **Date:** 13th March 2024

### Import packages

In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import anndata

from scipy import sparse

### Set up the notebook environment

In [2]:
%matplotlib inline

In [3]:
# Scanpy verbosity level 3, followed by a printout of the package versions used in this run.
sc.settings.verbosity = 3
sc.logging.print_versions()

# Figure defaults for the whole notebook.
sc.settings.set_figure_params(
    dpi=180,
    color_map='magma_r',
    dpi_save=300,
    vector_friendly=True,
    format='svg',
)

-----
anndata     0.9.2
scanpy      1.9.5
-----
PIL                         10.0.1
anyio                       NA
arrow                       1.3.0
asttokens                   NA
attr                        23.1.0
attrs                       23.1.0
babel                       2.13.0
backcall                    0.2.0
certifi                     2023.07.22
cffi                        1.16.0
charset_normalizer          3.3.0
colorama                    0.4.6
comm                        0.1.4
cycler                      0.10.0
cython_runtime              NA
dateutil                    2.8.2
debugpy                     1.8.0
decorator                   5.1.1
defusedxml                  0.7.1
executing                   2.0.0
fastjsonschema              NA
fqdn                        NA
h5py                        3.9.0
idna                        3.4
igraph                      0.11.2
ipykernel                   6.25.2
ipywidgets                  8.1.1
isoduration                 NA
jedi   

In [4]:
def X_is_raw(adata):
    """Return True when every value stored in ``adata.X`` is a whole number.

    The check is done element-wise. Comparing per-column sums with their
    integer cast is not sufficient: non-integer entries can add up to an
    integer (e.g. two values of 0.5 in the same column) and would be
    reported as raw counts.

    Parameters
    ----------
    adata
        Object exposing the expression matrix as ``.X``; either a dense
        array or a SciPy sparse matrix.

    Returns
    -------
    bool
        True if all entries are integer-valued (an empty matrix counts as
        raw), False otherwise, including when NaN values are present.
    """
    X = adata.X
    if sparse.issparse(X):
        # Only explicitly stored entries can be non-integer; implicit zeros are integers.
        # CSR/CSC/COO keep the stored values in a flat ``.data`` array; convert other formats first.
        values = (X if X.format in ("csr", "csc", "coo") else X.tocsr()).data
    else:
        values = np.asarray(X)
    return bool(np.all(np.mod(values, 1) == 0))

+ Load the final AnnData object

In [5]:
# Absolute path of the .h5ad file to load (machine-specific; adjust when running elsewhere).
final_anndata_path = '/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/anndata_files/Fawkner_Corbett_GEX_raw.h5ad'
# Read the file into an AnnData object.
final_adata = sc.read_h5ad(final_anndata_path)

  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")


+ Load the sample info table

In [6]:
# Sample overview table from the supplementary data; the CSV is semicolon-separated.
sample_info = pd.read_csv('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/Supplementary_Mendeley_data/1. Sample Overview-Table 1.csv', sep =';')

+ Load the AnnData objects with HTO info and concatenate them

In [7]:
# Path to the text file listing the folder names (one folder per line)
folder_names_file = '/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/files_1.txt'

# Base directory where the folders are located
base_dir = '/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2'

# Destination of the concatenated object (written directly into base_dir)
concatenated_path = os.path.join(base_dir, 'HTO_counts_concatenated_adata.h5ad')

# Read the folder names, dropping blank lines and surrounding whitespace.
# An empty name would make os.path.join() resolve to base_dir itself, and the loop below
# would then load the concatenated output of a previous run as if it were an input sample.
with open(folder_names_file, 'r') as f:
    folder_names = [line.strip() for line in f if line.strip()]

# Load one AnnData object per folder and remember which folder it came from
adatas = []
for folder_name in folder_names:
    folder_path = os.path.join(base_dir, folder_name)

    # os.listdir() returns entries in arbitrary order; sort so the same file is picked on every run
    h5ad_files = sorted(file for file in os.listdir(folder_path) if file.endswith('.h5ad'))
    if not h5ad_files:
        print(f"No .h5ad file found in {folder_path}; folder skipped.")
        continue
    if len(h5ad_files) > 1:
        print(f"{len(h5ad_files)} .h5ad files found in {folder_path}; using {h5ad_files[0]}.")

    file_path = os.path.join(folder_path, h5ad_files[0])
    adata = sc.read_h5ad(file_path)
    adata.obs['Folder'] = folder_name  # Add new column with folder name
    adatas.append(adata)

# Concatenate all AnnData objects into one.
# NOTE: AnnData.concatenate() is deprecated in favour of anndata.concat(); it is kept here
# because the two differ in how `var`/`varm` are merged, and switching needs to be checked
# against the downstream cells first.
if adatas:
    concatenated_adata = adatas[0].concatenate(adatas[1:], index_unique=None, join='outer')
else:
    concatenated_adata = None
    print("No AnnData objects were loaded.")

# Save the concatenated AnnData object
if concatenated_adata is not None:
    concatenated_adata.write(concatenated_path)



See the tutorial for concat at: https://anndata.readthedocs.io/en/latest/concatenation.html
  if pd.api.types.is_categorical_dtype(dtype):
  utils.warn_names_duplicates("obs")
  if pd.api.types.is_categorical_dtype(dtype):
  utils.warn_names_duplicates("obs")


In [8]:
concatenated_adata.obs

Unnamed: 0_level_0,0,HTO1,HTO2,HTO3,HTO4,HTO5,HTO6,HTO7,HTO8,HTO9,...,ADT1,ADT2,ADT3,ADT4,ADT5,ADT6,ADT7,ADT8,ADT9,batch
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TACTGCCAGACCTTTG,TACTGCCAGACCTTTG,15.0,8.0,7.0,2.0,4.0,77.0,0.0,1.0,3.0,...,,,,,,,,,,0
CTGTGAACACTGAGGA,CTGTGAACACTGAGGA,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,,,,,,,,,,0
CAAGGGACAGGTGACA,CAAGGGACAGGTGACA,27.0,13.0,82.0,2.0,4.0,6.0,1.0,3.0,1.0,...,,,,,,,,,,0
GGGCCATAGTCACTAC,GGGCCATAGTCACTAC,22.0,12.0,20.0,2900.0,4.0,136.0,5.0,2.0,3.0,...,,,,,,,,,,0
AATTTCCCACTACCGG,AATTTCCCACTACCGG,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTCGATTTCGTGTGAT,TTCGATTTCGTGTGAT,23.0,51.0,46.0,79.0,10.0,37.0,4.0,12.0,0.0,...,,,,,,,,,,11
TTCTAGTAGAGTCACG,TTCTAGTAGAGTCACG,32.0,57.0,67.0,418.0,33.0,53.0,11.0,17.0,0.0,...,,,,,,,,,,11
TGAGGTTCACTATCCC,TGAGGTTCACTATCCC,31.0,45.0,85.0,3574.0,18.0,37.0,11.0,24.0,0.0,...,,,,,,,,,,11
TTCAGGAGTTCAGGAG,TTCAGGAGTTCAGGAG,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,11


In [9]:
# Folders listed here are labelled 'In-house'; every other folder is labelled 'TotalSeq-tag'.
in_house_folders = ['GSM4808359_ADT1', 'GSM4808360_ADT2', 'GSM4808361_ADT3']


def _tag_type_for(folder):
    """Return the tag-type label for a single 'Folder' value."""
    if folder in in_house_folders:
        return 'In-house'
    return 'TotalSeq-tag'


concatenated_adata.obs['Tag_type'] = concatenated_adata.obs['Folder'].apply(_tag_type_for)

In [20]:
# Per-cell annotation table (obs) of the concatenated object, bound to `df` for the next steps.
df = pd.DataFrame(concatenated_adata.obs)

In [21]:
# Restrict df to the annotation columns listed below (order as given).
obs_columns_to_keep = [
    'most_likely_hypothesis',
    'cluster_feature',
    'negative_hypothesis_probability',
    'singlet_hypothesis_probability',
    'doublet_hypothesis_probability',
    'Classification',
    'Digestion_Condition',
    'GEO_Accession',
    'Folder',
    'Tag_type',
]
df = df[obs_columns_to_keep]

+ Reformat the SRA Run table: one row per run and post-conceptual week

In [20]:
# SRA run metadata table (comma-separated text file).
SraRunTable = pd.read_csv('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/SraRunTable.txt', sep =',')

In [21]:
# Turn 'post-conceptual_weeks' (several weeks per run, separated by ',' or ' and ',
# optionally followed by ' PCW') into one integer week per row.
#
# The cleaning works on a derived Series and leaves SraRunTable itself unmodified, so the cell
# gives the same result when it is re-run. Overwriting the column with lists made a second run
# fail: the .str methods return NaN for list values and the following string operations raise.
pcw_column = 'post-conceptual_weeks'

pcw_lists = (
    SraRunTable[pcw_column]
    .str.replace(' and ', ',', regex=False)  # standardise the separator to a comma
    .str.replace(' PCW', '', regex=False)    # drop the unit suffix
    .str.replace(' ', '', regex=False)       # remove any remaining spaces
    .str.split(',')                          # e.g. '8,9' -> ['8', '9']
)

# Explode to one row per (run, week). The original row index is kept, so rows that stem
# from the same run share an index label.
SraRunTable_exploded = SraRunTable.assign(**{pcw_column: pcw_lists}).explode(pcw_column)

# Convert 'post-conceptual_weeks' to integers
SraRunTable_exploded[pcw_column] = SraRunTable_exploded[pcw_column].astype(int)

In [22]:
SraRunTable_exploded.head()

Unnamed: 0,Run,Assay Type,AvgSpotLen,Bases,BioProject,BioSample,Bytes,cell_type,Center Name,Consent,...,LibrarySource,Organism,Platform,post-conceptual_weeks,ReleaseDate,create_date,version,Sample Name,source_name,SRA Study
0,SRR12735687,RNA-Seq,300,1476707400,PRJNA666217,SAMN16284014,560528715,EPCAM+ intestinal epithelial cells,GEO,public,...,TRANSCRIPTOMIC,Homo sapiens,ILLUMINA,8,2021-01-04T00:00:00Z,2020-09-28T19:21:00Z,1,GSM4808339,Small and large fetal intestine,SRP285688
0,SRR12735687,RNA-Seq,300,1476707400,PRJNA666217,SAMN16284014,560528715,EPCAM+ intestinal epithelial cells,GEO,public,...,TRANSCRIPTOMIC,Homo sapiens,ILLUMINA,9,2021-01-04T00:00:00Z,2020-09-28T19:21:00Z,1,GSM4808339,Small and large fetal intestine,SRP285688
0,SRR12735687,RNA-Seq,300,1476707400,PRJNA666217,SAMN16284014,560528715,EPCAM+ intestinal epithelial cells,GEO,public,...,TRANSCRIPTOMIC,Homo sapiens,ILLUMINA,13,2021-01-04T00:00:00Z,2020-09-28T19:21:00Z,1,GSM4808339,Small and large fetal intestine,SRP285688
0,SRR12735687,RNA-Seq,300,1476707400,PRJNA666217,SAMN16284014,560528715,EPCAM+ intestinal epithelial cells,GEO,public,...,TRANSCRIPTOMIC,Homo sapiens,ILLUMINA,20,2021-01-04T00:00:00Z,2020-09-28T19:21:00Z,1,GSM4808339,Small and large fetal intestine,SRP285688
1,SRR12735688,RNA-Seq,300,903216900,PRJNA666217,SAMN16284014,342383725,EPCAM+ intestinal epithelial cells,GEO,public,...,TRANSCRIPTOMIC,Homo sapiens,ILLUMINA,8,2021-01-04T00:00:00Z,2020-09-28T19:21:00Z,1,GSM4808339,Small and large fetal intestine,SRP285688


In [24]:
# Translate the SRA 'cell_type' labels into the 'Digestion Condition' vocabulary of sample_info
digestion_map = {
    'EPCAM+ intestinal epithelial cells': 'EpCAM+',
    'EPCAM- intestinal stromal cells': 'EpCAM-',
    'intestinal stromal and epithelial cells': 'Bulk digest'
}
SraRunTable_exploded['Digestion Condition'] = SraRunTable_exploded['cell_type'].map(digestion_map)

# The merge keys must have the same dtype on both sides, so weeks are compared as strings
SraRunTable_exploded['post-conceptual_weeks'] = SraRunTable_exploded['post-conceptual_weeks'].astype(str)
sample_info['PCW'] = sample_info['PCW'].astype(str)

# Attach every SRA row with the same week and digestion condition to each sample row
merge_keys = ['PCW', 'Digestion Condition']
merged_info = pd.merge(
    sample_info,
    SraRunTable_exploded,
    left_on=merge_keys,
    right_on=['post-conceptual_weeks', 'Digestion Condition'],
    how='left',
)

# Collapse the matched SRA 'Sample Name' values (the GSM... identifiers) into one
# comma-separated string per (PCW, Digestion Condition); groups without a match get ''
aggregated_info = merged_info.groupby(merge_keys, as_index=False).agg(
    {'Sample Name': lambda x: ', '.join(x.dropna().unique())}
)

# Add the aggregated identifiers to sample_info
final_sample_info = pd.merge(sample_info, aggregated_info, on=merge_keys, how='left')

# NOTE: the identifiers are stored in the 'Sample Name' column. The former rename of
# 'GEO_Accession_x' was removed because no column of that name exists after the merge,
# so it never had any effect. Rename 'Sample Name' explicitly if a 'GEO_Accession'
# column is wanted.

In [25]:
final_sample_info.head()

Unnamed: 0,Sample ID,Method,PCW,Gender / Genotype,Location,Digestion Condition,Run number,10x reaction number,"HTO number (TotalSeq, Biolegend)","HTO number (In house, Stoeckius et al, 2018)",HTO pool D70X_s,ADT Pool RPI,Sample Name
0,AAE,scRNA-seq,19,XX,Distal colon,EpCAM+,1.0,1.0,1.0,,1.0,,
1,AAE,scRNA-seq,19,XX,Distal colon,EpCAM-,1.0,1.0,2.0,,1.0,,
2,AAF,scRNA-seq,12,XY,Hindgut,EpCAM+,1.0,1.0,3.0,,1.0,,"GSM4808341, GSM4808351, GSM4808361"
3,AAF,scRNA-seq,12,XY,Hindgut,EpCAM-,1.0,1.0,4.0,,1.0,,"GSM4808344, GSM4808354"
4,AAB,scRNA-seq,17,XX,Distal colon,EpCAM+,1.0,1.0,5.0,,1.0,,"GSM4808340, GSM4808350, GSM4808360"
