## Notebook for Wang-2020 anndata object creation 
### Developed by: Anna Maguza

### Institute of Computational Biology - Computational Health Centre - Helmholtz Munich

### 12 February 2023

#### Load packages

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as an
#import scrublet
import h5py

#### Setup Cells

In [2]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline

In [3]:
# Global scanpy configuration for this notebook
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
# Print the versions of scanpy and its main dependencies for provenance
sc.logging.print_header()
# Default figure appearance: 80 dpi on a white background
sc.settings.set_figure_params(dpi=80, facecolor='white')

  from .autonotebook import tqdm as notebook_tqdm


scanpy==1.9.1 anndata==0.8.0 umap==0.5.3 numpy==1.23.5 scipy==1.10.0 pandas==1.5.3 scikit-learn==1.2.1 statsmodels==0.13.5 python-igraph==0.10.4 pynndescent==0.5.8


#### Data Quality Check and Preprocessing

### Colon

In [4]:
# Paths of the Wang 2020 colon AnnData files
input_anndata = '/lustre/groups/talaveralopez/datasets/Colorectal_cancer/Anndata/Wang_2020/Wang_2020_colon/wang20_colon.processed.h5ad'  # processed colon object to read (input)
output_anndata = '/lustre/groups/talaveralopez/datasets/Colorectal_cancer/Anndata/Wang_2020/Wang_2020_colon/wang20_colon.processed_output.h5ad'  # the file that will store the analysis results

In [5]:
# Read the processed colon AnnData object from disk and show its expression matrix (.X)
Wang_colon = sc.read_h5ad(input_anndata)
Wang_colon.X

<4329x17181 sparse matrix of type '<class 'numpy.float32'>'
	with 12513556 stored elements in Compressed Sparse Row format>

In [6]:
# Inspect the per-cell annotations (obs) of the colon object
Wang_colon.obs

Unnamed: 0_level_0,Sample_ID,CellType,Location,n_counts,log1p_n_counts,n_genes,log1p_n_genes,percent_mito,percent_ribo,percent_hb,percent_top50
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AAGGAGCAGTTAGGTA_Colon-1_Enterocyte,Colon-1,Enterocyte,Colon,27109.0,10.207658,3977,8.288534,19.266663,12.837066,0.003689,46.656092
ACCTTTAGTACTTGAC_Colon-1_Enterocyte,Colon-1,Enterocyte,Colon,16638.0,9.719504,2972,7.997327,28.753456,5.968265,0.000000,52.866931
ACGGGCTAGTCAAGCG_Colon-1_Enterocyte,Colon-1,Enterocyte,Colon,23970.0,10.084600,4110,8.321422,17.008760,15.640384,0.012516,42.515645
AGATCTGCAAGGACAC_Colon-1_Enterocyte,Colon-1,Enterocyte,Colon,20066.0,9.906832,3751,8.230044,16.246387,15.389215,0.014951,42.614373
ATGGGAGCAATGCCAT_Colon-1_Enterocyte,Colon-1,Enterocyte,Colon,23372.0,10.059337,4027,8.301025,21.303268,10.071881,0.017114,45.400479
...,...,...,...,...,...,...,...,...,...,...,...
TGTGGTACACCACCAG_Colon-2_Stem Cell,Colon-2,Stem Cell,Colon,21546.0,9.977992,3527,8.168486,11.073981,43.274853,0.013924,44.634735
TGTTCCGCAATGCCAT_Colon-2_Stem Cell,Colon-2,Stem Cell,Colon,7831.0,8.965973,1846,7.521318,21.210573,34.746521,0.000000,48.665560
TTAGGCAAGAGTGACC_Colon-2_Stem Cell,Colon-2,Stem Cell,Colon,11279.0,9.330787,2487,7.819234,9.752638,40.739426,0.000000,41.794485
TTATGCTTCGTCTGCT_Colon-2_Stem Cell,Colon-2,Stem Cell,Colon,8277.0,9.021357,1869,7.533694,16.261930,39.809113,0.000000,48.133382


In [7]:
# Inspect the per-gene annotations (var) of the colon object
Wang_colon.var

Unnamed: 0_level_0,mito,ribo,hb,n_counts,n_cells,n_genes,highly_variable,means,dispersions,dispersions_norm
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
RP11-34P13.7,False,False,False,9.0,10,9,False,0.001402,-0.223694,-0.396330
FO538757.2,False,False,False,620.0,637,572,False,0.092447,-0.075833,0.128244
AP006222.2,False,False,False,307.0,326,296,False,0.040123,-0.342289,-0.817075
RP4-669L17.10,False,False,False,3.0,3,3,False,0.000246,-1.017381,-3.212137
RP11-206L10.9,False,False,False,120.0,130,118,False,0.018567,-0.113123,-0.004051
...,...,...,...,...,...,...,...,...,...,...
CH507-154B10.2,False,False,False,7.0,7,7,False,0.000980,-0.424253,-1.107865
CH507-254M2.1,False,False,False,3.0,3,3,False,0.000386,-0.504515,-1.392615
AP000431.1,False,False,False,4.0,4,4,False,0.000723,0.047152,0.564565
AP000265.1,False,False,False,5.0,8,5,False,0.001004,0.362146,1.682085


### Ileum

In [8]:
# Paths of the Wang 2020 ileum AnnData files
input_ileum_anndata = '/lustre/groups/talaveralopez/datasets/Colorectal_cancer/Anndata/Wang_2020/Wang_2020_ileum/wang20_ileum.processed.h5ad'  # processed ileum object to read (input)
output_ileum_anndata = '/lustre/groups/talaveralopez/datasets/Colorectal_cancer/Anndata/Wang_2020/Wang_2020_ileum/wang20_ileum.processed_output.h5ad'  # the file that will store the analysis results

In [9]:
# Read the processed ileum AnnData object from disk and show its expression matrix (.X)
Wang_ileum = sc.read_h5ad(input_ileum_anndata)
Wang_ileum.X

<5980x16977 sparse matrix of type '<class 'numpy.float32'>'
	with 12663073 stored elements in Compressed Sparse Row format>

In [10]:
# Inspect the per-cell annotations (obs) of the ileum object
Wang_ileum.obs

Unnamed: 0_level_0,Sample_ID,CellType,Location,n_counts,log1p_n_counts,n_genes,log1p_n_genes,percent_mito,percent_ribo,percent_hb,percent_top50
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AAATGCCCAATCTGCA_Ileum-1_Enterocyte,Ileum-1,Enterocyte,Ileum,4956.0,8.508556,1621,7.391415,12.126715,6.799839,0.020178,44.915254
AACTCTTGTCTAGTCA_Ileum-1_Enterocyte,Ileum-1,Enterocyte,Ileum,8600.0,9.059634,2303,7.742402,14.465116,7.813953,0.011628,43.825581
AAGACCTCACGGACAA_Ileum-1_Enterocyte,Ileum-1,Enterocyte,Ileum,7236.0,8.886962,1833,7.514255,9.798231,9.369818,0.000000,43.352681
AAGCCGCGTCTTGCGG_Ileum-1_Enterocyte,Ileum-1,Enterocyte,Ileum,9623.0,9.172015,2371,7.771489,18.528526,8.677128,0.020784,44.840486
AAGTCTGGTTGTCTTT_Ileum-1_Enterocyte,Ileum-1,Enterocyte,Ileum,4644.0,8.443546,1488,7.305860,11.886305,7.278208,0.000000,44.142980
...,...,...,...,...,...,...,...,...,...,...,...
TGGCCAGTCAGCTTAG_Ileum-2_Stem Cell,Ileum-2,Stem Cell,Ileum,8427.0,9.039315,1904,7.552237,21.798979,35.291325,0.011867,50.492465
TGGTTAGCATCACGAT_Ileum-2_Stem Cell,Ileum-2,Stem Cell,Ileum,9821.0,9.192380,2207,7.699842,18.470625,37.694733,0.020365,48.416658
TGTCCCAGTTAAAGAC_Ileum-2_Stem Cell,Ileum-2,Stem Cell,Ileum,6560.0,8.788898,1443,7.275172,26.935976,34.192074,0.000000,55.533537
TTCTCCTAGTGGGCTA_Ileum-2_Stem Cell,Ileum-2,Stem Cell,Ileum,15412.0,9.642966,2941,7.986845,18.096289,21.749287,0.006488,44.835193


In [11]:
# Inspect the per-gene annotations (var) of the ileum object
Wang_ileum.var

Unnamed: 0_level_0,mito,ribo,hb,n_counts,n_cells,n_genes,highly_variable,means,dispersions,dispersions_norm
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
RP11-34P13.7,False,False,False,10.0,13,10,False,0.001702,0.196076,-0.339504
FO538757.2,False,False,False,558.0,587,520,False,0.093239,0.305601,0.091138
AP006222.2,False,False,False,181.0,210,179,False,0.030459,0.244459,-0.149265
RP4-669L17.10,False,False,False,5.0,8,5,False,0.001044,0.417806,0.532314
RP11-206L10.9,False,False,False,83.0,106,83,False,0.014199,0.258878,-0.092572
...,...,...,...,...,...,...,...,...,...,...
GGT2,False,False,False,8.0,8,8,False,0.002016,0.457016,0.686485
RP3-510H16.3,False,False,False,4.0,4,4,False,0.000818,0.436569,0.606087
RP4-669P10.16,False,False,False,20.0,20,20,False,0.004075,0.390260,0.424007
BACH1-AS1,False,False,False,6.0,6,6,False,0.001167,0.265457,-0.066705


### Rectum

In [12]:
# Paths of the Wang 2020 rectum AnnData files
input_rectum_anndata = '/lustre/groups/talaveralopez/datasets/Colorectal_cancer/Anndata/Wang_2020/Wang_2020_rectum/wang20_rectum.processed.h5ad'  # processed rectum object to read (input)
output_rectum_anndata = '/lustre/groups/talaveralopez/datasets/Colorectal_cancer/Anndata/Wang_2020/Wang_2020_rectum/wang20_rectum.processed_output.h5ad'  # the file that will store the analysis results

In [13]:
# Read the processed rectum AnnData object from disk and show its expression matrix (.X)
Wang_rectum = sc.read_h5ad(input_rectum_anndata)
Wang_rectum.X

<3797x17676 sparse matrix of type '<class 'numpy.float32'>'
	with 11593261 stored elements in Compressed Sparse Row format>

In [14]:
# Inspect the per-cell annotations (obs) of the rectum object
Wang_rectum.obs

Unnamed: 0_level_0,Sample_ID,CellType,Location,n_counts,log1p_n_counts,n_genes,log1p_n_genes,percent_mito,percent_ribo,percent_hb,percent_top50
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AAACCTGAGCTGGAAC_Rectum-1_Enterocyte,Rectum-1,Enterocyte,Rectum,10500.0,9.259226,2545,7.842279,28.838095,5.447619,0.009524,49.000000
AAACCTGGTTCCCGAG_Rectum-1_Enterocyte,Rectum-1,Enterocyte,Rectum,15281.0,9.634431,3375,8.124447,16.190041,9.619789,0.032720,44.244487
AAACCTGTCAGCGATT_Rectum-1_Enterocyte,Rectum-1,Enterocyte,Rectum,6529.0,8.784163,1574,7.362011,32.409252,5.176903,0.015316,59.013631
AAAGATGGTTCAGACT_Rectum-1_Enterocyte,Rectum-1,Enterocyte,Rectum,13501.0,9.510593,2989,8.003029,24.946301,3.814532,0.014814,46.737279
AAAGTAGAGAGTAAGG_Rectum-1_Enterocyte,Rectum-1,Enterocyte,Rectum,5287.0,8.573195,1582,7.367077,29.960281,5.239266,0.037829,51.030830
...,...,...,...,...,...,...,...,...,...,...,...
TCAGGATGTTCCACAA_Rectum-2_Stem Cell,Rectum-2,Stem Cell,Rectum,17962.0,9.796069,2953,7.990915,11.897339,45.763279,0.000000,47.260884
TCCACACAGCAATATG_Rectum-2_Stem Cell,Rectum-2,Stem Cell,Rectum,21999.0,9.998797,3771,8.235361,12.205100,38.410835,0.013637,40.510932
TCGAGGCCAAGGTGTG_Rectum-2_Stem Cell,Rectum-2,Stem Cell,Rectum,17397.0,9.764111,3101,8.039802,10.766224,41.392197,0.005748,41.346209
TCTATTGTCTTCGAGA_Rectum-2_Stem Cell,Rectum-2,Stem Cell,Rectum,4892.0,8.495561,1223,7.109879,26.185608,30.498774,0.020442,58.053966


In [15]:
# Inspect the per-gene annotations (var) of the rectum object
Wang_rectum.var

Unnamed: 0_level_0,mito,ribo,hb,n_counts,n_cells,n_genes,highly_variable,means,dispersions,dispersions_norm
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
RP11-34P13.7,False,False,False,25.0,25,23,False,0.003679,-0.360232,-0.807470
FO538757.2,False,False,False,579.0,548,520,True,0.107308,0.118275,0.572867
AP006222.2,False,False,False,230.0,230,219,False,0.033655,-0.314175,-0.674611
RP4-669L17.10,False,False,False,4.0,4,4,False,0.000481,-0.686255,-1.747941
RP11-206L10.9,False,False,False,171.0,172,166,False,0.028947,-0.087199,-0.019861
...,...,...,...,...,...,...,...,...,...,...
CLDN14,False,False,False,3.0,3,3,False,0.000645,-0.009892,0.203147
AF129408.17,False,False,False,6.0,6,6,False,0.001739,0.338456,1.208017
MX2,False,False,False,4.0,4,4,False,0.001739,0.586054,1.922254
CBS,False,False,False,3.0,3,3,False,0.000536,-0.200734,-0.347372


### Raw Data Processing

In [13]:
# Load the three raw GSE125970 tab-separated tables:
# cell annotations, raw UMI counts and scaled expression data
Wang_raw_cell_info = pd.read_table(
    '/lustre/groups/talaveralopez/datasets/Anna_Maguza_Master_2023/Wang_2020_normal/Wang_raw/GSE125970_cell_info.txt'
)
Wang_raw_Umi_counts = pd.read_table(
    '/lustre/groups/talaveralopez/datasets/Anna_Maguza_Master_2023/Wang_2020_normal/Wang_raw/GSE125970_raw_UMIcounts.txt'
)
Wang_raw_scale_data = pd.read_table(
    '/lustre/groups/talaveralopez/datasets/Anna_Maguza_Master_2023/Wang_2020_normal/Wang_raw/GSE125970_scale_data.txt'
)

In [21]:
# Keep the first column of the UMI count table (the gene identifiers) as a one-column DataFrame
Genes = Wang_raw_Umi_counts.iloc[:, 0].to_frame()

In [37]:
# Remove the GENE identifier column so that only the count columns remain
Wang_raw_Umi_counts2 = Wang_raw_Umi_counts.drop(columns=['GENE'])

In [38]:
# Orient the count matrix as cells (rows) x genes (columns).
# It is derived from the raw table instead of from Wang_raw_Umi_counts2 itself, so
# re-running this cell cannot flip the matrix back to genes x cells.
Wang_raw_Umi_counts2 = Wang_raw_Umi_counts.drop(columns=['GENE']).T

In [39]:
# Preview the first rows of the transposed count table
Wang_raw_Umi_counts2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19515,19516,19517,19518,19519,19520,19521,19522,19523,19524
AAATGCCCAATCTGCA_Ileum-1_Enterocyte,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AACTCTTGTCTAGTCA_Ileum-1_Enterocyte,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
AAGACCTCACGGACAA_Ileum-1_Enterocyte,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAGCCGCGTCTTGCGG_Ileum-1_Enterocyte,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAGTCTGGTTGTCTTT_Ileum-1_Enterocyte,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
# Index the cell annotation table by UniqueCell_ID while keeping the column itself
Wang_raw_cell_info = Wang_raw_cell_info.set_index('UniqueCell_ID', drop=False)

In [42]:
# Preview the first rows of the cell annotation table
Wang_raw_cell_info.head()

Unnamed: 0_level_0,UniqueCell_ID,Sample_ID,CellType
UniqueCell_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAATGCCCAATCTGCA_Ileum-1_Enterocyte,AAATGCCCAATCTGCA_Ileum-1_Enterocyte,Ileum-1,Enterocyte
AACTCTTGTCTAGTCA_Ileum-1_Enterocyte,AACTCTTGTCTAGTCA_Ileum-1_Enterocyte,Ileum-1,Enterocyte
AAGACCTCACGGACAA_Ileum-1_Enterocyte,AAGACCTCACGGACAA_Ileum-1_Enterocyte,Ileum-1,Enterocyte
AAGCCGCGTCTTGCGG_Ileum-1_Enterocyte,AAGCCGCGTCTTGCGG_Ileum-1_Enterocyte,Ileum-1,Enterocyte
AAGTCTGGTTGTCTTT_Ileum-1_Enterocyte,AAGTCTGGTTGTCTTT_Ileum-1_Enterocyte,Ileum-1,Enterocyte


In [45]:
# Index the gene table by the GENE identifiers while keeping the column itself
Genes = Genes.set_index('GENE', drop=False)

In [46]:
# Create an AnnData object from the raw tables:
#   X   - UMI counts, cells x genes
#   obs - per-cell annotations
#   var - gene table
# The counts are cast to float32 explicitly. anndata 0.8 does this conversion
# implicitly and emits a FutureWarning about it; casting here yields the same
# float32 matrix without depending on that implicit conversion.
Wang_raw = an.AnnData(
    X=Wang_raw_Umi_counts2.astype(np.float32),
    obs=Wang_raw_cell_info,
    var=Genes,
)

  Wang_raw = an.AnnData(X = Wang_raw_Umi_counts2, obs = Wang_raw_cell_info, var = Genes)


In [49]:
# Save the raw AnnData object to disk in .h5ad format
# NOTE(review): the output path says "Wang_2022" while this notebook is titled Wang-2020 — confirm the directory and file name are intended
Wang_raw.write('/lustre/groups/talaveralopez/workspace/anna.maguza/Processed_datasets/Wang_2022/Wang_2022_raw_anndata.h5ad')