## Notebook for Ishikawa-2022 anndata file creation 
### Developed by: Anna Maguza

### Institute of Computational Biology - Computational Health Centre - Hemlholtz Munich

### 16 January 2023

#### Load packages

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as an
import h5py
from scipy.io import mmread
from scipy.sparse import coo_matrix
import matplotlib.pyplot as plt
import scipy as sci

#### Setup Cells

In [2]:
%matplotlib inline

In [3]:
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

  from .autonotebook import tqdm as notebook_tqdm


scanpy==1.9.1 anndata==0.8.0 umap==0.5.3 numpy==1.21.5 scipy==1.7.3 pandas==1.5.1 scikit-learn==1.1.3 statsmodels==0.13.2 pynndescent==0.5.8


#### Data upload

In [62]:
#Data Upload (tsv) - Barcodes
Barcodes = pd.read_csv('/lustre/groups/talaveralopez/datasets/Colorectal_cancer/Mice_intestine/Ishikawa-2022-mice/GSM6381794_barcodes.tsv', sep='\t', header=None, index_col=0)

In [63]:
#Data Upload (tsv) - Features
Features = pd.read_csv('/lustre/groups/talaveralopez/datasets/Colorectal_cancer/Mice_intestine/Ishikawa-2022-mice/GSM6381794_features.tsv', sep='\t', header=None, index_col=0)

In [64]:
#Data Upload (tsv) - Matrix
Matrix = mmread('/lustre/groups/talaveralopez/datasets/Colorectal_cancer/Mice_intestine/Ishikawa-2022-mice/GSM6381794_matrix.mtx')

In [65]:
Matrix = sci.sparse.csr_matrix(Matrix)

In [66]:
Ishikawa_anndata = an.AnnData(X=np.transpose(Matrix), var=Features, obs=Barcodes)

  Ishikawa_anndata = an.AnnData(X=np.transpose(Matrix), var=Features, obs=Barcodes)


In [67]:
Ishikawa_anndata.var

Unnamed: 0_level_0,1,2
0,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSMUSG00000051951,Xkr4,Gene Expression
ENSMUSG00000089699,Gm1992,Gene Expression
ENSMUSG00000102343,Gm37381,Gene Expression
ENSMUSG00000025900,Rp1,Gene Expression
ENSMUSG00000025902,Sox17,Gene Expression
...,...,...
ENSMUSG00000079808,AC168977.1,Gene Expression
ENSMUSG00000095041,AC149090.1,Gene Expression
ENSMUSG00000063897,CAAA01118383.1,Gene Expression
ENSMUSG00000096730,Vmn2r122,Gene Expression


In [68]:
Ishikawa_anndata.obs

AAACCCAAGAGGATGA-1
AAACCCAGTAGGTTTC-1
AAACCCATCGCGCCAA-1
AAACGAAAGACGGTCA-1
AAACGAAAGGCTCTAT-1
...
TTTGTTGCAAGTCCCG-1
TTTGTTGCATGAGATA-1
TTTGTTGGTCAGTCGC-1
TTTGTTGTCGAGGCAA-1
TTTGTTGTCTGGACCG-1


In [69]:
#Name the column in obs as cell_index
Ishikawa_anndata.obs['cell_index'] = Ishikawa_anndata.obs.index
#Change anndata index to numeric index
Ishikawa_anndata.obs.index = np.arange(0, len(Ishikawa_anndata.obs.index))
#Change anndata column names to string
Ishikawa_anndata.obs.columns = Ishikawa_anndata.obs.columns.astype(str)
Ishikawa_anndata.obs

Unnamed: 0,cell_index
0,AAACCCAAGAGGATGA-1
1,AAACCCAGTAGGTTTC-1
2,AAACCCATCGCGCCAA-1
3,AAACGAAAGACGGTCA-1
4,AAACGAAAGGCTCTAT-1
...,...
3633,TTTGTTGCAAGTCCCG-1
3634,TTTGTTGCATGAGATA-1
3635,TTTGTTGGTCAGTCGC-1
3636,TTTGTTGTCGAGGCAA-1


In [70]:
#Name the column in var as gene_id
Ishikawa_anndata.var['gene_id'] = Ishikawa_anndata.var.index
#Change anndata index to numeric index
Ishikawa_anndata.var.index = np.arange(0, len(Ishikawa_anndata.var.index))
#Change anndata column names to string
Ishikawa_anndata.var.columns = Ishikawa_anndata.var.columns.astype(str)
Ishikawa_anndata.var

Unnamed: 0,1,2,gene_id
0,Xkr4,Gene Expression,ENSMUSG00000051951
1,Gm1992,Gene Expression,ENSMUSG00000089699
2,Gm37381,Gene Expression,ENSMUSG00000102343
3,Rp1,Gene Expression,ENSMUSG00000025900
4,Sox17,Gene Expression,ENSMUSG00000025902
...,...,...,...
31048,AC168977.1,Gene Expression,ENSMUSG00000079808
31049,AC149090.1,Gene Expression,ENSMUSG00000095041
31050,CAAA01118383.1,Gene Expression,ENSMUSG00000063897
31051,Vmn2r122,Gene Expression,ENSMUSG00000096730


In [71]:
#Name the columns properly
Ishikawa_anndata.var.rename(columns={"1": "gene_name"}, inplace=True)
Ishikawa_anndata.var.rename(columns={"2": "type"}, inplace=True)
Ishikawa_anndata.var

Unnamed: 0,gene_name,type,gene_id
0,Xkr4,Gene Expression,ENSMUSG00000051951
1,Gm1992,Gene Expression,ENSMUSG00000089699
2,Gm37381,Gene Expression,ENSMUSG00000102343
3,Rp1,Gene Expression,ENSMUSG00000025900
4,Sox17,Gene Expression,ENSMUSG00000025902
...,...,...,...
31048,AC168977.1,Gene Expression,ENSMUSG00000079808
31049,AC149090.1,Gene Expression,ENSMUSG00000095041
31050,CAAA01118383.1,Gene Expression,ENSMUSG00000063897
31051,Vmn2r122,Gene Expression,ENSMUSG00000096730


In [72]:
Ishikawa_anndata.X

<3638x31053 sparse matrix of type '<class 'numpy.float32'>'
	with 8586005 stored elements in Compressed Sparse Column format>

In [73]:
#Save the anndata object
Ishikawa_anndata.write('/lustre/groups/talaveralopez/datasets/Colorectal_cancer/Mice_intestine/Ishikawa-2022-mice/GSM6381794_anndata_raw-2.h5ad')