### Notebook for the creation of anndata object with Fawkner-Corbett_2021 Visium data 

- **Developed by:** Anna Maguza
- **Place:** Wuerzburg Institute for System Immunology
- **Date:** 16th January 2024

#### Import packages

In [1]:
import pandas as pd
import anndata
import os
import scanpy as sc

+ Concatenate anndata objects from all samples

In [2]:
# Load the list of sample names
with open('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/sra_accessions.txt', 'r') as file:
    sample_names = file.read().splitlines()

In [5]:
# Directory where the anndata files are stored
data_dir = '/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/'

In [6]:
# Initialize an empty list to hold the anndata objects
adatas = []

for sample in sample_names:
    # Construct the file path for the anndata file
    file_path = os.path.join(data_dir, sample, 'counts_unfiltered', 'adata.h5ad')

    # Load the anndata object
    adata = anndata.read_h5ad(file_path)

    # Add the 'SRA_sample' column to the obs dataframe
    adata.obs['SRA_sample'] = sample

    # Append the modified anndata object to the list
    adatas.append(adata)

# Concatenate all anndata objects
final_adata = anndata.concat(adatas, join='outer')


  if pd.api.types.is_categorical_dtype(dtype):
  utils.warn_names_duplicates("obs")


+ Correct adata.var

In [22]:
#import transcripts_to_genes.txt as dataframe
transcripts_to_genes = pd.read_csv('/mnt/LaCIE/annaM/human_reference_genome/index_file_bustool/transcripts_to_genes.txt', sep='\t')

In [23]:
transcripts_to_genes.head()

Unnamed: 0,TCONS_00023415,XLOC_006846,DDX11L1,Unnamed: 3,chr1,11869,14409,+
0,TCONS_00023416,XLOC_006846,DDX11L1,,chr1,12010,13670,+
1,TCONS_00023419,XLOC_006847,MIR1302-2,,chr1,30366,30503,+
2,TCONS_00023417,XLOC_006847,MIR1302-2,,chr1,29554,31097,+
3,TCONS_00023418,XLOC_006847,MIR1302-2,,chr1,30267,31109,+
4,TCONS_00023420,XLOC_006848,OR4G4P,,chr1,52473,53312,+


In [24]:
# copy the column names as a last row
transcripts_to_genes.loc[-1] = transcripts_to_genes.columns

In [25]:
# see how last 5 looks like
transcripts_to_genes.tail()

Unnamed: 0,TCONS_00023415,XLOC_006846,DDX11L1,Unnamed: 3,chr1,11869,14409,+
273740,TCONS_00009814,XLOC_002628,ENSG00000277196,,KI270734.1,138082,161750,-
273741,TCONS_00009815,XLOC_002628,ENSG00000277196,,KI270734.1,138082,161852,-
273742,TCONS_00009816,XLOC_002629,U6,,KI270744.1,51009,51114,-
273743,TCONS_00009817,XLOC_002630,U1,,KI270750.1,148668,148843,+
-1,TCONS_00023415,XLOC_006846,DDX11L1,Unnamed: 3,chr1,11869,14409,+


In [26]:
# change column names: TCONS_00023416 to transcript_id, XLOC_006846 to locus_id, DDX11L1 to gene_id, chr1 to chromosome, 12010 to start_position, 13670 to end_position
transcripts_to_genes.rename(columns={'TCONS_00023415':'transcript_id', 'XLOC_006846':'locus_id', 'DDX11L1':'gene_id', 'chr1':'chromosome', '11869':'start_position', '14409':'end_position'}, inplace=True)

In [27]:
transcripts_to_genes.head()

Unnamed: 0,transcript_id,locus_id,gene_id,Unnamed: 3,chromosome,start_position,end_position,+
0,TCONS_00023416,XLOC_006846,DDX11L1,,chr1,12010,13670,+
1,TCONS_00023419,XLOC_006847,MIR1302-2,,chr1,30366,30503,+
2,TCONS_00023417,XLOC_006847,MIR1302-2,,chr1,29554,31097,+
3,TCONS_00023418,XLOC_006847,MIR1302-2,,chr1,30267,31109,+
4,TCONS_00023420,XLOC_006848,OR4G4P,,chr1,52473,53312,+


In [None]:
# name index column in adata.var as locus_id
adata.var.index.name = 'locus_id'
adata.var

In [31]:
# Check uniqueness in transcripts_to_genes
if not transcripts_to_genes['locus_id'].is_unique:
    # Handle duplicates here. For example, you might want to aggregate or drop duplicates.
    transcripts_to_genes = transcripts_to_genes.drop_duplicates(subset='locus_id')

# Merge operation (with 'locus_id' now as a column in adata.var)
merged_data = pd.merge(adata.var.reset_index(), transcripts_to_genes, on='locus_id', how='left')

# Check if the merge did not introduce extra rows
if merged_data.shape[0] == adata.var.shape[0]:
    adata.var = merged_data.set_index('locus_id')
else:
    print("Merge operation introduced extra rows, please check your data.")


In [35]:
# copy index column to a new column called locus_id
adata.var['locus_id'] = adata.var.index

# make gene_id as index
adata.var.set_index('gene_id', inplace=True)

In [42]:
adata.var

Unnamed: 0_level_0,transcript_id,Unnamed: 3,chromosome,start_position,end_position,+,locus_id
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
DDX11L1,TCONS_00023416,,chr1,12010,13670,+,XLOC_006846
MIR1302-2,TCONS_00023419,,chr1,30366,30503,+,XLOC_006847
OR4G4P,TCONS_00023420,,chr1,52473,53312,+,XLOC_006848
OR4G11P,TCONS_00023422,,chr1,62949,63887,+,XLOC_006849
OR4F5,TCONS_00023423,,chr1,65419,71585,+,XLOC_006850
...,...,...,...,...,...,...,...
ENSG00000276017,TCONS_00009812,,KI270734.1,72411,74814,+,XLOC_002626
ENSG00000278817,TCONS_00009813,,KI270734.1,131494,137392,+,XLOC_002627
ENSG00000277196,TCONS_00009814,,KI270734.1,138082,161750,-,XLOC_002628
U6,TCONS_00009816,,KI270744.1,51009,51114,-,XLOC_002629


In [41]:
# Save the final anndata object if needed
adata.write_h5ad('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/Fawkner_Corbett_2021_raw_all_samples_unprocesses.h5ad')