### Notebook for the anndata file creation from Fetal Gut data from Fawkner-Corbett study
- **Developed by:** Anna Maguza
- **Place:** Wuerzburg Institute for System Immunology
- **Date:** 8th March 2024

### Import packages

In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import anndata

### Set up the cells

In [2]:
%matplotlib inline

In [3]:
sc.settings.verbosity = 3
sc.logging.print_versions()
sc.settings.set_figure_params(dpi = 180, color_map = 'magma_r', dpi_save = 300, vector_friendly = True, format = 'svg')

-----
anndata     0.9.2
scanpy      1.9.5
-----
PIL                         10.0.1
anyio                       NA
arrow                       1.3.0
asttokens                   NA
attr                        23.1.0
attrs                       23.1.0
babel                       2.13.0
backcall                    0.2.0
certifi                     2023.07.22
cffi                        1.16.0
charset_normalizer          3.3.0
colorama                    0.4.6
comm                        0.1.4
cycler                      0.10.0
cython_runtime              NA
dateutil                    2.8.2
debugpy                     1.8.0
decorator                   5.1.1
defusedxml                  0.7.1
executing                   2.0.0
fastjsonschema              NA
fqdn                        NA
h5py                        3.9.0
idna                        3.4
igraph                      0.11.2
ipykernel                   6.25.2
ipywidgets                  8.1.1
isoduration                 NA
jedi   

In [4]:
def X_is_raw(adata):
    return np.array_equal(adata.X.sum(axis=0).astype(int), adata.X.sum(axis=0))

* Create anndata file

In [5]:
# Load the list of sample names
with open('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/sra_accessions.txt', 'r') as file:
    sample_names = file.read().splitlines()

In [6]:
# Directory where the anndata files are stored
data_dir = '/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/mapped_files'

In [7]:
# Initialize an empty list to hold the anndata objects
adatas = []

for sample in sample_names:
    # Construct the file path for the anndata file
    file_path = os.path.join(data_dir, sample, 'counts_unfiltered', 'adata.h5ad')

    # Load the anndata object
    adata = anndata.read_h5ad(file_path)

    # Add the 'SRA_sample' column to the obs dataframe
    adata.obs['SRA_sample'] = sample

    # Append the modified anndata object to the list
    adatas.append(adata)

# Concatenate all anndata objects
final_adata = anndata.concat(adatas, join='outer')

  if pd.api.types.is_categorical_dtype(dtype):
  utils.warn_names_duplicates("obs")


+ Correct adata.var

In [20]:
#import transcripts_to_genes.txt as dataframe
transcripts_to_genes = pd.read_csv('/mnt/LaCIE/annaM/human_reference_genome/index_file_bustool/transcripts_to_genes.txt', sep='\t')

In [21]:
transcripts_to_genes.head()

Unnamed: 0,TCONS_00023547,XLOC_006889,DDX11L1,Unnamed: 3,chr1,11869,14409,+
0,TCONS_00023548,XLOC_006889,DDX11L1,,chr1,12010,13670,+
1,TCONS_00023550,XLOC_006890,MIR1302-2,,chr1,30267,31109,+
2,TCONS_00023551,XLOC_006890,MIR1302-2,,chr1,30366,30503,+
3,TCONS_00023549,XLOC_006890,MIR1302-2,,chr1,29554,31097,+
4,TCONS_00023552,XLOC_006891,OR4G4P,,chr1,52473,53312,+


In [22]:
# copy the column names as a last row
transcripts_to_genes.loc[-1] = transcripts_to_genes.columns

In [23]:
# change column names: TCONS_00023416 to transcript_id, XLOC_006846 to locus_id, DDX11L1 to gene_id, chr1 to chromosome, 12010 to start_position, 13670 to end_position
transcripts_to_genes.rename(columns={'TCONS_00023547':'transcript_id', 'XLOC_006889':'locus_id', 'DDX11L1':'gene_id', 'chr1':'chromosome', '11869':'start_position', '14409':'end_position'}, inplace=True)

In [24]:
transcripts_to_genes.head()

Unnamed: 0,transcript_id,locus_id,gene_id,Unnamed: 3,chromosome,start_position,end_position,+
0,TCONS_00023548,XLOC_006889,DDX11L1,,chr1,12010,13670,+
1,TCONS_00023550,XLOC_006890,MIR1302-2,,chr1,30267,31109,+
2,TCONS_00023551,XLOC_006890,MIR1302-2,,chr1,30366,30503,+
3,TCONS_00023549,XLOC_006890,MIR1302-2,,chr1,29554,31097,+
4,TCONS_00023552,XLOC_006891,OR4G4P,,chr1,52473,53312,+


In [25]:
# name index column in adata.var as locus_id
final_adata.var.index.name = 'locus_id'
final_adata.var

XLOC_006889
XLOC_006890
XLOC_006891
XLOC_006892
XLOC_006893
...
XLOC_002628
XLOC_002629
XLOC_002630
XLOC_002631
XLOC_002632


In [26]:
# Check uniqueness in transcripts_to_genes
if not transcripts_to_genes['locus_id'].is_unique:
    # Handle duplicates here. For example, you might want to aggregate or drop duplicates.
    transcripts_to_genes = transcripts_to_genes.drop_duplicates(subset='locus_id')

# Merge operation (with 'locus_id' now as a column in adata.var)
merged_data = pd.merge(final_adata.var.reset_index(), transcripts_to_genes, on='locus_id', how='left')

# Check if the merge did not introduce extra rows
if merged_data.shape[0] == adata.var.shape[0]:
    final_adata.var = merged_data.set_index('locus_id')
else:
    print("Merge operation introduced extra rows, please check your data.")

In [27]:
# copy index column to a new column called locus_id
final_adata.var['locus_id'] = final_adata.var.index

# make gene_id as index
final_adata.var.set_index('gene_id', inplace=True)

In [28]:
final_adata.var

Unnamed: 0_level_0,transcript_id,Unnamed: 3,chromosome,start_position,end_position,+,locus_id
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
DDX11L1,TCONS_00023548,,chr1,12010,13670,+,XLOC_006889
MIR1302-2,TCONS_00023550,,chr1,30267,31109,+,XLOC_006890
OR4G4P,TCONS_00023552,,chr1,52473,53312,+,XLOC_006891
OR4G11P,TCONS_00023553,,chr1,57598,64116,+,XLOC_006892
OR4F5,TCONS_00023555,,chr1,65419,71585,+,XLOC_006893
...,...,...,...,...,...,...,...
ENSG00000276017,TCONS_00009796,,KI270734.1,72411,74814,+,XLOC_002628
ENSG00000278817,TCONS_00009797,,KI270734.1,131494,137392,+,XLOC_002629
ENSG00000277196,TCONS_00009798,,KI270734.1,138082,161750,-,XLOC_002630
U6,TCONS_00009800,,KI270744.1,51009,51114,-,XLOC_002631


In [29]:
# convert var to strings
final_adata.var = final_adata.var.astype(str)

In [30]:
# Save the final anndata object if needed
final_adata.write_h5ad('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/anndata_files/Fawkner_Corbett_GEX_raw.h5ad')

+ Upload SRA table

In [56]:
adata = '/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/anndata_files/Fawkner_Corbett_GEX_raw.h5ad'
adata = sc.read_h5ad(adata)

  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")


In [57]:
SRA_run = pd.read_csv('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/SraRunTable.txt', sep=',')

In [58]:
# delete columns Assay Type, AvgSpotLen and Bases from SRA_run
SRA_run = SRA_run.drop(['Assay Type', 'AvgSpotLen', 'Bases', 'Bytes', 'Center Name', 'LibrarySource', 'Platform',
                        'ReleaseDate', 'create_date', 'version', 'Consent', 'DATASTORE filetype', 'DATASTORE provider', 'DATASTORE region'], axis=1)

In [59]:
# rename run column to SRA_sample
SRA_run.rename(columns={'Run': 'SRA_sample'}, inplace=True)

In [60]:
adata.obs['barcodes'] = adata.obs.index.copy()

In [62]:
#merge final_adata.obs and SRA_run on SRA_sample final_adata.obs and Run in SRA_run
final_adata.obs = adata.obs.merge(SRA_run, on='SRA_sample', how='left')

In [67]:
final_adata.obs.index = final_adata.obs['barcodes']

In [69]:
# Save the final anndata object if needed
final_adata.write_h5ad('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/anndata_files/Fawkner_Corbett_GEX_raw.h5ad')