### Notebook for creating an AnnData object from the Fawkner-Corbett_2021 Visium data

- **Developed by:** Anna Maguza
- **Place:** Wuerzburg Institute for Systems Immunology
- **Date:** 16th January 2024

#### Import packages

In [1]:
import pandas as pd
import anndata
import os
import scanpy as sc

+ Concatenate anndata objects from all samples

In [None]:
# Read the accession list: one sample name per line of the text file
with open('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/sra_accessions.txt', 'r') as accession_file:
    sample_names = accession_file.read().splitlines()

In [3]:
# Root directory of the dataset; the per-sample anndata files are looked up
# in sub-directories beneath it
data_dir = '/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/'

In [4]:
# Collect one AnnData object per sample, each tagged with its sample name
adatas = []

for sra_id in sample_names:
    # Per-sample layout: <data_dir>/<sample>/counts_unfiltered/adata.h5ad
    adata = anndata.read_h5ad(
        os.path.join(data_dir, sra_id, 'counts_unfiltered', 'adata.h5ad')
    )

    # Record the sample of origin on every observation
    adata.obs['SRA_sample'] = sra_id

    adatas.append(adata)

# Stack all samples along obs; join='outer' keeps the union of variables.
# NOTE(review): no index_unique is passed, so identical barcodes from
# different samples stay identical in the result (anndata warns about
# duplicated obs names here) - confirm that this is acceptable downstream.
final_adata = anndata.concat(adatas, join='outer')


  utils.warn_names_duplicates("obs")


+ Correct adata.var

In [6]:
# Load the tab-separated transcript-to-gene table as a dataframe.
# NOTE(review): no header argument is given, so pandas uses the first line of
# the file as column names; the preview of this table suggests that line is
# actually a data record - confirm against the file.
t2g_path = '/mnt/LaCIE/annaM/human_reference_genome/index_file_bustool/transcripts_to_genes.txt'
transcripts_to_genes = pd.read_csv(t2g_path, sep='\t')

In [7]:
# Preview the first rows to check how the table was parsed
transcripts_to_genes.head()

Unnamed: 0,TCONS_00023415,XLOC_006846,DDX11L1,Unnamed: 3,chr1,11869,14409,+
0,TCONS_00023416,XLOC_006846,DDX11L1,,chr1,12010,13670,+
1,TCONS_00023419,XLOC_006847,MIR1302-2,,chr1,30366,30503,+
2,TCONS_00023417,XLOC_006847,MIR1302-2,,chr1,29554,31097,+
3,TCONS_00023418,XLOC_006847,MIR1302-2,,chr1,30267,31109,+
4,TCONS_00023420,XLOC_006848,OR4G4P,,chr1,52473,53312,+


In [8]:
# The current column names appear to be the values of the file's first record
# (it was consumed as the header). Re-insert them as a data row, stored under
# the index label -1, so that record is not lost.
transcripts_to_genes.loc[-1] = transcripts_to_genes.columns

In [9]:
# Inspect the last five rows of the table
transcripts_to_genes.tail()

Unnamed: 0,TCONS_00023415,XLOC_006846,DDX11L1,Unnamed: 3,chr1,11869,14409,+
273740,TCONS_00009814,XLOC_002628,ENSG00000277196,,KI270734.1,138082,161750,-
273741,TCONS_00009815,XLOC_002628,ENSG00000277196,,KI270734.1,138082,161852,-
273742,TCONS_00009816,XLOC_002629,U6,,KI270744.1,51009,51114,-
273743,TCONS_00009817,XLOC_002630,U1,,KI270750.1,148668,148843,+
-1,TCONS_00023415,XLOC_006846,DDX11L1,Unnamed: 3,chr1,11869,14409,+


In [10]:
# Replace the data-derived column names with descriptive ones:
#   TCONS_00023415 -> transcript_id, XLOC_006846 -> locus_id,
#   DDX11L1 -> gene_id, chr1 -> chromosome,
#   11869 -> start_position, 14409 -> end_position
# Columns that are not listed here keep their current names.
column_renames = {
    'TCONS_00023415': 'transcript_id',
    'XLOC_006846': 'locus_id',
    'DDX11L1': 'gene_id',
    'chr1': 'chromosome',
    '11869': 'start_position',
    '14409': 'end_position',
}
transcripts_to_genes.rename(columns=column_renames, inplace=True)

In [11]:
# Preview the table again to verify the new column names
transcripts_to_genes.head()

Unnamed: 0,transcript_id,locus_id,gene_id,Unnamed: 3,chromosome,start_position,end_position,+
0,TCONS_00023416,XLOC_006846,DDX11L1,,chr1,12010,13670,+
1,TCONS_00023419,XLOC_006847,MIR1302-2,,chr1,30366,30503,+
2,TCONS_00023417,XLOC_006847,MIR1302-2,,chr1,29554,31097,+
3,TCONS_00023418,XLOC_006847,MIR1302-2,,chr1,30267,31109,+
4,TCONS_00023420,XLOC_006848,OR4G4P,,chr1,52473,53312,+


In [12]:
# Label the var index 'locus_id' so that, once reset into a column, it can be
# matched against the locus_id column of transcripts_to_genes; then display
# var to check the result
final_adata.var.index.name = 'locus_id'
final_adata.var

XLOC_006846
XLOC_006847
XLOC_006848
XLOC_006849
XLOC_006850
...
XLOC_002626
XLOC_002627
XLOC_002628
XLOC_002629
XLOC_002630


In [13]:
# Keep a single annotation row per locus so the left merge below cannot
# multiply rows. drop_duplicates retains the first row found for each locus_id.
if not transcripts_to_genes['locus_id'].is_unique:
    transcripts_to_genes = transcripts_to_genes.drop_duplicates(subset='locus_id')

# Left merge on 'locus_id' (turned into a column via reset_index) so every
# variable of final_adata is kept, in order, whether or not it is annotated
merged_data = pd.merge(final_adata.var.reset_index(), transcripts_to_genes, on='locus_id', how='left')

# Only install the merged table if the row count still matches the object it
# is assigned to. The comparison must use final_adata: `adata` is just the
# last single sample left over from the loading loop, not the concatenated
# object being annotated.
if merged_data.shape[0] == final_adata.var.shape[0]:
    final_adata.var = merged_data.set_index('locus_id')
else:
    print("Merge operation introduced extra rows, please check your data.")


In [14]:
# Keep the locus identifiers as a regular column before the index is replaced
final_adata.var['locus_id'] = final_adata.var.index

# Use gene_id as the var index; the former index survives in 'locus_id'.
# NOTE(review): nothing here enforces unique gene_id values, and reloading the
# saved file reports duplicated var names - confirm whether the names should
# be made unique before downstream use.
final_adata.var.set_index('gene_id', inplace=True)

In [15]:
# Inspect var after it has been re-indexed by gene_id
final_adata.var

Unnamed: 0_level_0,transcript_id,Unnamed: 3,chromosome,start_position,end_position,+,locus_id
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
DDX11L1,TCONS_00023416,,chr1,12010,13670,+,XLOC_006846
MIR1302-2,TCONS_00023419,,chr1,30366,30503,+,XLOC_006847
OR4G4P,TCONS_00023420,,chr1,52473,53312,+,XLOC_006848
OR4G11P,TCONS_00023422,,chr1,62949,63887,+,XLOC_006849
OR4F5,TCONS_00023423,,chr1,65419,71585,+,XLOC_006850
...,...,...,...,...,...,...,...
ENSG00000276017,TCONS_00009812,,KI270734.1,72411,74814,+,XLOC_002626
ENSG00000278817,TCONS_00009813,,KI270734.1,131494,137392,+,XLOC_002627
ENSG00000277196,TCONS_00009814,,KI270734.1,138082,161750,-,XLOC_002628
U6,TCONS_00009816,,KI270744.1,51009,51114,-,XLOC_002629


In [16]:
# Inspect obs: one row per barcode, labelled with its SRA_sample
final_adata.obs

Unnamed: 0_level_0,SRA_sample
barcode,Unnamed: 1_level_1
AAACAAGTATCTCCCA,SRR12685794
AAACACCAATAACTGC,SRR12685794
AAACAGAGCGACTCCT,SRR12685794
AAACAGCTTTCAGAAG,SRR12685794
AAACAGGGTCTATATT,SRR12685794
...,...
TTGTGGTATAGGTATG,SRR12685801
TTGTGGTGGTACTAAG,SRR12685801
TTGTTGTGTGTCAAGA,SRR12685801
TTGTTTGTGTAAATTC,SRR12685801


In [18]:
# Cast every var column to str.
# NOTE(review): presumably needed so the mixed-type annotation columns can be
# written to h5ad; missing values turn into literal strings as a side effect -
# confirm this is intended.
final_adata.var = final_adata.var.astype(str)

In [19]:
# Write the concatenated object, with its annotated var table, to disk
raw_h5ad_path = '/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/Fawkner_Corbett_2021_raw_all_samples_unprocesses.h5ad'
final_adata.write_h5ad(raw_h5ad_path)

### Add Metadata

In [20]:
# Reload the concatenated object from disk. From this point on `adata` refers
# to the full multi-sample dataset read from import_path.
import_path = '/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/Fawkner_Corbett_2021_raw_all_samples_unprocesses.h5ad'
adata = sc.read_h5ad(import_path)

  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")


In [21]:
# Inspect obs of the reloaded object
adata.obs

Unnamed: 0_level_0,SRA_sample
barcode,Unnamed: 1_level_1
AAACAAGTATCTCCCA,SRR12685794
AAACACCAATAACTGC,SRR12685794
AAACAGAGCGACTCCT,SRR12685794
AAACAGCTTTCAGAAG,SRR12685794
AAACAGGGTCTATATT,SRR12685794
...,...
TTGTGGTATAGGTATG,SRR12685801
TTGTGGTGGTACTAAG,SRR12685801
TTGTTGTGTGTCAAGA,SRR12685801
TTGTTTGTGTAAATTC,SRR12685801


+ Add metadata

In [22]:
# Pair each SRA run accession with its slide label (note: there is no 'A5')
sample_mapping = dict(zip(
    [
        'SRR12685794', 'SRR12685795', 'SRR12685796', 'SRR12685797',
        'SRR12685798', 'SRR12685799', 'SRR12685800', 'SRR12685801',
    ],
    ['A1', 'A2', 'A3', 'A4', 'A6', 'A7', 'A8', 'A9'],
))

# Translate every observation's accession into its slide label
sra_labels = adata.obs['SRA_sample']
adata.obs['Sample_ID'] = sra_labels.map(sample_mapping)

In [23]:
# Protocol annotations that are identical for every observation.
# NOTE(review): the column name is spelled 'Sequincing_protocol'; it is kept
# as-is here - confirm whether downstream code expects this spelling.
for protocol_column, protocol_value in (
    ('Sequincing_protocol', 'sp_NextSeq-spatial'),
    ('Library_preparation_protocol', 'lp_10xVisium'),
):
    adata.obs[protocol_column] = protocol_value

In [24]:
# Slide-level annotations, each keyed by Sample_ID

# Age group of the donor
Age_category = {
    'A1': 'Adult', 'A2': 'Adult',
    'A3': 'Fetal', 'A4': 'Fetal', 'A6': 'Fetal',
    'A7': 'Fetal', 'A8': 'Fetal', 'A9': 'Fetal',
}

# Age as a free-text label
Age = {
    'A1': '66 years', 'A2': '66 years',
    'A3': '12PCW', 'A4': '19PCW', 'A6': '12PCW',
    'A7': '12PCW', 'A8': '12PCW', 'A9': '12PCW',
}

# Donor each slide belongs to
Donor_ID = {
    'A1': 'A', 'A2': 'A',
    'A3': 'B',
    'A4': 'C',
    'A6': 'D', 'A7': 'D', 'A8': 'D', 'A9': 'D',
}

# Tissue section placed on each slide
Anatomical_region = {
    'A1': 'colon slide',
    'A2': 'colon slide',
    'A3': 'whole colon section slide',
    'A4': 'whole colon section slide',
    'A6': 'whole small intestine section slide',
    'A7': 'whole small intestine section slide',
    'A8': 'whole colon section slide',
    'A9': 'whole colon section slide',
}

# Create one obs column per lookup table by mapping the Sample_ID column
for obs_column, slide_lookup in (
    ('Age_category', Age_category),
    ('Age', Age),
    ('Donor_ID', Donor_ID),
    ('Anatomical_region', Anatomical_region),
):
    adata.obs[obs_column] = adata.obs['Sample_ID'].map(slide_lookup)

In [25]:
# Donor-level annotation keyed by Donor_ID: all four donors map to 'Male'
Sex = dict.fromkeys(['A', 'B', 'C', 'D'], 'Male')

# Derive the 'Sex' column from each observation's Donor_ID
donor_of_obs = adata.obs['Donor_ID']
adata.obs['Sex'] = donor_of_obs.map(Sex)

In [26]:
# Donor-level description keyed by Donor_ID. Only donor 'A' has an entry, so
# observations from every other donor get a missing value in the new column.
biomaterial = {'A': 'inflammatory bowel disease||colorectal cancer'}

donor_column = adata.obs['Donor_ID']
adata.obs['Biomaterial_description'] = donor_column.map(biomaterial)

In [27]:
# Count observations per donor as a sanity check on the Donor_ID mapping
adata.obs['Donor_ID'].value_counts()

Donor_ID
D    10488
A     6280
B     2782
C     2267
Name: count, dtype: int64

In [29]:
# Remove the 'Unnamed: 3' column inherited from the transcripts_to_genes
# table (it shows no values in the previews).
# The membership check keeps this cell re-runnable: the save at the end of the
# notebook overwrites the file this section loads, so on a second pass the
# column is already gone and an unconditional `del` would raise KeyError.
if 'Unnamed: 3' in adata.var.columns:
    del adata.var['Unnamed: 3']

+ Save adata

In [31]:
# Save the annotated object.
# NOTE: this is the same path the object was loaded from at the start of the
# metadata section, so the earlier file is overwritten in place.
adata.write_h5ad('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/Fawkner_Corbett_2021_raw_all_samples_unprocesses.h5ad')