https://www.youtube.com/watch?v=Ee0PQUwVH8Q&t=496s

In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import random

folder_path = "C:/Users/heung/OneDrive/Documents/scData/14. GSE136001_RAW/"

# Load data

In [49]:
# Load file
adata = sc.read(folder_path + "annotated.h5ad")
adata

AnnData object with n_obs × n_vars = 37255 × 16739
    obs: 'sample', 'sex', 'condition', 'n_genes', 'total_counts', 'pct_counts_mito', 'pct_counts_ribo', '_scvi_batch', '_scvi_labels', 'leiden', '_scvi_raw_norm_scaling', 'cell_type'
    var: 'n_cells'
    uns: '_scvi_manager_uuid', '_scvi_uuid', 'cell_type_colors', 'condition_colors', 'dendrogram_cell_type', 'leiden', 'leiden_colors', 'log1p', 'markers', 'neighbors', 'pca', 'rank_genes_groups', 'sample_colors', 'scvi_markers', 'umap'
    obsm: 'X_pca', 'X_scVI', 'X_umap', '_scvi_extra_categorical_covs', '_scvi_extra_continuous_covs'
    varm: 'PCs'
    layers: 'counts', 'scvi_normalized'
    obsp: 'connectivities', 'distances'

In [50]:
cell_subset = adata[adata.obs['cell_type'].isin(['Microglia', 'Activated microglia'])].copy()
cell_subset.obs.tail()

Unnamed: 0,sample,sex,condition,n_genes,total_counts,pct_counts_mito,pct_counts_ribo,_scvi_batch,_scvi_labels,leiden,_scvi_raw_norm_scaling,cell_type
TTTGTCAGTCATACTG-1,m-tumor-2,male,tumor,949,1681.0,2.260559,6.127306,0,0,2,5.94884,Microglia
TTTGTCAGTGCGCTTG-1,m-tumor-2,male,tumor,1001,2119.0,1.557338,7.597923,0,0,3,4.719207,Activated microglia
TTTGTCAGTTCACCTC-1,m-tumor-2,male,tumor,2293,6668.0,0.539892,10.662868,0,0,7,1.4997,Activated microglia
TTTGTCATCCTATTCA-1-1,m-tumor-2,male,tumor,956,1857.0,1.938611,6.946688,0,0,3,5.385029,Activated microglia
TTTGTCATCTGTTTGT-1,m-tumor-2,male,tumor,956,1783.0,1.682558,9.197981,0,0,0,5.608525,Microglia


In [51]:
cell_subset.obs.groupby(['condition', 'sample']).size()

  cell_subset.obs.groupby(['condition', 'sample']).size()


condition  sample   
normal     f-ctrl-1     4544
           f-ctrl-2     4219
           f-tumor-1       0
           f-tumor-2       0
           m-ctrl-1     4136
           m-ctrl-2     4472
           m-tumor-1       0
           m-tumor-2       0
tumor      f-ctrl-1        0
           f-ctrl-2        0
           f-tumor-1    3051
           f-tumor-2    3852
           m-ctrl-1        0
           m-ctrl-2        0
           m-tumor-1    2316
           m-tumor-2    3284
dtype: int64

In [52]:
adata.X.max()

8.663969

# Making pseudo bulk file

In [58]:
pbs = []
for sample in cell_subset.obs['sample'].unique():
    samp_cell_subset = cell_subset[cell_subset.obs['sample'] == sample]

    rep_adata = sc.AnnData(
        X = samp_cell_subset.X.sum(axis = 0),
        var = samp_cell_subset.var[[]])
    
    rep_adata.obs_names = [sample]
    rep_adata.obs['condition'] = samp_cell_subset.obs['condition'].iloc[0]
    
    pbs.append(rep_adata)

In [59]:
pb = sc.concat(pbs)
pb.obs

Unnamed: 0,condition
f-ctrl-1,normal
f-ctrl-2,normal
f-tumor-1,tumor
f-tumor-2,tumor
m-ctrl-1,normal
m-ctrl-2,normal
m-tumor-1,tumor
m-tumor-2,tumor


# Making 3 pseudo replicates per sample

In [67]:
pbs = []
for sample in cell_subset.obs['sample'].unique():
    samp_cell_subset = cell_subset[cell_subset.obs['sample'] == sample]

    # Splitting the cells into 3 random groups -> 3 pseudo replicates
    indices = list(samp_cell_subset.obs_names)
    random.shuffle(indices)
    indices = np.array_split(np.array(indices), 3)

    # Creating pseudo replicates
    for i, pseudo_rep in enumerate(indices):
        rep_adata = sc.AnnData(
            X = samp_cell_subset.X.sum(axis = 0),
            var = samp_cell_subset.var[[]])
        
        rep_adata.obs_names = [sample]
        rep_adata.obs['condition'] = samp_cell_subset.obs['condition'].iloc[0]
        rep_adata.obs['replicate'] = i
        
        pbs.append(rep_adata)

In [68]:
pb = sc.concat(pbs)
pb.obs

  utils.warn_names_duplicates("obs")


Unnamed: 0,condition,replicate
f-ctrl-1,normal,0
f-ctrl-1,normal,1
f-ctrl-1,normal,2
f-ctrl-2,normal,0
f-ctrl-2,normal,1
f-ctrl-2,normal,2
f-tumor-1,tumor,0
f-tumor-1,tumor,1
f-tumor-1,tumor,2
f-tumor-2,tumor,0


# DEseq

In [62]:
from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats

In [64]:
counts = pd.DataFrame(pb.X, columns = pb.var_names)
counts.head()

Unnamed: 0,Sox17,Mrpl15,Lypla1,Gm37988,Tcea1,Rgs20,Atp6v1h,Rb1cc1,4732440D04Rik,St18,...,CR974586.5,Csprs,AC132444.6,AC125149.3,AC125149.2,AC168977.2,AC168977.1,AC149090.1,CAAA01118383.1,CAAA01147332.1
0,0.0,957.320618,748.307861,0.0,1427.360352,3.597847,679.845032,752.281372,67.37941,0.0,...,0.0,0.0,3.811548,3.604362,0.0,0.0,3.361282,5594.480957,523.530029,13.202333
1,0.0,871.911316,792.684204,0.0,1360.673584,0.0,686.552246,766.941833,93.05603,0.0,...,0.0,1.422774,0.0,4.961704,0.0,0.0,1.54428,5647.447266,520.149353,14.070037
2,0.0,633.163513,480.285828,0.0,890.045898,0.0,450.596313,411.49585,47.808067,0.0,...,1.066726,11.447237,0.0,33.003674,0.0,0.866491,1.179816,2914.951172,327.008636,2.308375
3,0.0,807.924072,640.46228,0.0,1121.771973,1.145522,596.738953,535.628479,58.374336,0.0,...,0.0,15.097595,0.0,22.969788,0.0,2.134902,2.604242,4165.07666,448.633331,8.662447
4,2.057444,851.843201,698.285278,1.444667,1256.575928,1.590821,610.802917,720.442688,83.699104,0.0,...,0.0,0.0,1.984241,3.674593,1.619757,0.0,1.327301,4549.992188,433.150116,12.491718


> **이하 오류남**

In [65]:
dds = DeseqDataSet(
    counts = counts,
    clinical = pb.obs,
    design_factors = 'condition'
)

TypeError: DeseqDataSet.__init__() got an unexpected keyword argument 'clinical'