## Notebook for Ayyaz-2019 anndata file creation 
### Developed by: Anna Maguza

### Institute of Computational Biology - Computational Health Centre - Helmholtz Munich

### 9 December 2022

#### Load packages

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as an
import h5py
from scipy.io import mmread
from scipy.sparse import coo_matrix
import matplotlib.pyplot as plt
import scipy as sci

KeyboardInterrupt: 

#### Setup Cells

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

KeyboardInterrupt: 

In [None]:
# Scanpy session configuration.
# Verbosity levels: errors (0), warnings (1), info (2), hints (3).
sc.settings.verbosity = 3
# Print the package-version header so the software environment is recorded in the output.
sc.logging.print_header()
# Figure defaults: white background, 80 dpi.
sc.settings.set_figure_params(
    facecolor='white',
    dpi=80,
)

  from .autonotebook import tqdm as notebook_tqdm


scanpy==1.9.1 anndata==0.8.0 umap==0.5.3 numpy==1.21.5 scipy==1.7.3 pandas==1.5.1 scikit-learn==1.1.3 statsmodels==0.13.2 pynndescent==0.5.8


## GSM3308717_C04

#### Upload Data

In [None]:
# Data upload (tsv) - Barcodes
# Tab-separated file without a header row; the first column becomes the index.
#
# NOTE(review): the previous path ended at
# '.../Mice_intestine/Ayyaz-2019/Raw_data/GSM3308717_C04', i.e. it named no file and
# used a different root than the Genes and Counts cells, where 'GSM3308717_C04' is a
# directory. The path below follows the layout of those two cells; the file name
# 'GSM3308717_C04_barcodes.tsv' is assumed by analogy with
# 'GSM3308717_C04_genes.tsv' - confirm it against the files on disk.
Barcodes = pd.read_csv(
    '/lustre/groups/talaveralopez/datasets/Colorectal_cancer/Mice_intestine_stem_cells/Raw_data/GSM3308717_C04/GSM3308717_C04_barcodes.tsv',
    sep='\t',
    header=None,
    index_col=0,
)

In [None]:
# Data upload (tsv) - Genes
# Tab-separated file read without a header row, so columns get default integer labels.
Genes = pd.read_csv(
    '/lustre/groups/talaveralopez/datasets/Colorectal_cancer/Mice_intestine_stem_cells/Raw_data/GSM3308717_C04/GSM3308717_C04_genes.tsv',
    header=None,
    sep='\t',
)

In [12]:
# Data upload (mtx) - Counts
# Read the Matrix Market (.mtx) file for sample GSM3308717_C04.
Counts = mmread(
    '/lustre/groups/talaveralopez/datasets/Colorectal_cancer/Mice_intestine_stem_cells/Raw_data/GSM3308717_C04/GSM3308717_C04.mtx'
)

In [17]:
# Convert the matrix returned by mmread into SciPy's CSR (compressed sparse row) format.
matrix1 = sci.sparse.csr_matrix(Counts)

In [18]:
# Data upload (csv) - processed table from GSE123515
# Comma-separated file; the first row supplies the column labels and the
# first column supplies the row index.
Processed = pd.read_csv(
    '/lustre/groups/talaveralopez/datasets/Colorectal_cancer/Mice_intestine_stem_cells/Processed/GSE123515_processed_data.csv',
    index_col=0,
    header=0,
    sep=',',
)

In [20]:
# Preview the first five rows of the processed table.
Processed.head(n=5)

Unnamed: 0,A01__Plate4,A02__Plate3,A02__Plate4,A03__Plate4,A04__Plate3,A04__Plate4,A05__Plate3,A05__Plate4,A06__Plate3,A06__Plate4,...,H07__Plate4,H08__Plate3,H08__Plate4,H09__Plate3,H09__Plate4,H10__Plate3,H10__Plate4,H11__Plate3,H11__Plate4,H12__Plate4
0610009B22Rik,0.0,0.0,0.0,0.0,1.686444,11.302245,0.0,4.366853,0.0,0.0,...,3.536888,1.947951,0.0,2.621282,0.0,0.0,0.0,0.0,0.0,0.0
0610010F05Rik,0.0,3.468714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.55235,0.0,0.0,0.0,3.892427,0.0,0.0,0.0,0.0,0.0
0610030E20Rik,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0610031O16Rik,0.0,0.0,0.0,0.0,0.0,2.176532,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.215932,0.0,0.0,0.0
0610039K10Rik,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Data upload (csv) - processed whole-epithelium table from GSE117783
# Comma-separated file; the first row supplies the column labels and the
# first column supplies the row index.
Processed_whole_epithelium = pd.read_csv(
    '/lustre/groups/talaveralopez/datasets/Colorectal_cancer/Mice_intestine_stem_cells/Processed/GSE117783_Whole_epithelium_processed.csv',
    index_col=0,
    header=0,
    sep=',',
)

In [22]:
# Preview the first five rows of the whole-epithelium table.
Processed_whole_epithelium.head(n=5)

Unnamed: 0,C04_RF_AAACCTGCACAGGAGT,C04_RF_AAACCTGCACCGTTGG,C04_RF_AAACCTGGTACAGACG,C04_RF_AAACCTGGTCGAGATG,C04_RF_AAACCTGGTGATGCCC,C04_RF_AAACCTGGTTTACTCT,C04_RF_AAACCTGTCCACGCAG,C04_RF_AAACGGGAGACCCACC,C04_RF_AAACGGGAGATGGGTC,C04_RF_AAACGGGCAAACGTGG,...,C06_RF_TTTGTCAAGAGTCGGT,C06_RF_TTTGTCACAATGGAAT,C06_RF_TTTGTCACACAGACAG,C06_RF_TTTGTCACATGGGACA,C06_RF_TTTGTCAGTAAGTAGT,C06_RF_TTTGTCAGTAGCTCCG,C06_RF_TTTGTCAGTTTGACTG,C06_RF_TTTGTCATCAAACAAG,C06_RF_TTTGTCATCAATAAGG,C06_RF_TTTGTCATCGGAGCAA
Xkr4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Sox17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Mrpl15,0.0,0.511226,0.0,0.0,0.0,0.0,0.999649,0.894722,1.108849,0.471045,...,0.0,0.0,0.0,0.0,0.0,0.0,0.84829,0.0,0.0,0.0
Lypla1,0.696666,1.300011,0.0,0.0,2.009269,0.0,0.0,1.153726,0.0,0.789971,...,1.38885,0.0,0.0,0.0,1.319113,0.0,1.300545,0.0,0.0,0.0
Gm37988,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
