## Write out data in .mtx format 

_24 August 2021_

## Importing

### Modules

In [1]:
import numpy as np
import scipy as sp
import scanpy as sc
import pandas as pd
import pickle as pkl
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib import colors
import seaborn as sb
from collections import defaultdict
from numpy import asarray as ar
from collections import Counter

#sklearn <- machine learning
#statsmodels

# Set scanpy's logging verbosity and print the scanpy version and run date for provenance.
sc.settings.verbosity = 1
sc.logging.print_version_and_date()
# Load the autoreload extension so edited modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

Running Scanpy 1.8.1, on 2021-11-28 15:42.


In [2]:
from IPython.core.interactiveshell import InteractiveShell

# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# Show every column when displaying DataFrames (None lifts the limit).
# The option name is spelled out in full: abbreviated names only resolve through
# pandas' pattern matching and stop working if another option ever matches too.
pd.set_option('display.max_columns', None)

In [4]:
# Limit DataFrame display to 100 rows.
pd.set_option('display.max_rows', 100)

In [5]:
from datetime import date

# Run date as an ISO string (YYYY-MM-DD); appended to the names of the exported files.
today = date.today().isoformat()

In [6]:
# Output directory for the exported files. The trailing slash matters: later cells
# build paths by appending the file name directly (f"{res_folder}SKM_...").
res_folder = "/nfs/team205/vk8/processed_data/muscle/forDE_LMM/"

## Save human2mouse data

In [7]:
# Load the human-to-mouse integration object (.h5ad) that the following cells relabel, normalize and export.
adata = sc.read("/nfs/team205/vk8/processed_data/muscle/data_v3/SKM_human2mouse_integration_v6.0_2021-10-19.h5ad")

In [14]:
# List the distinct values of the categorical 'Source' column.
adata.obs['Source'].cat.categories

Index(['Giordani_Molc_Cell', 'Li_EMBO_Journal', 'Micheli_Skeletal_Muscle',
       'Rubenstein_SR', 'Sanger_Zhang', 'Sanger_Zhang_human', 'Tabula_Muris',
       'elife'],
      dtype='object')

In [15]:
# Cast the combined labels to plain strings before the label rewrites below.
# NOTE(review): presumably the column is categorical on load, where assigning a
# label outside the existing categories would fail — confirm.
adata.obs['Species_annotation'] = adata.obs['Species_annotation'].astype(str)

In [16]:
# Harmonise the myofiber labels so the cell-type part uses an underscore ("MF_...")
# instead of a hyphen. The second entry previously mapped 'human-MF_Ifg' onto itself
# (a no-op); the hyphenated spelling is what the neighbouring entries rewrite.
adata.obs.loc[adata.obs['Species_annotation'] == 'human-MF-IIfg', 'Species_annotation'] = 'human-MF_IIfg'
adata.obs.loc[adata.obs['Species_annotation'] == 'human-MF-Ifg', 'Species_annotation'] = 'human-MF_Ifg'
adata.obs.loc[adata.obs['Species_annotation'] == 'mouse-MF-IIfg', 'Species_annotation'] = 'mouse-MF_IIfg'

In [17]:
# Split each label at its first hyphen only (n=1) into a two-column frame:
# column 0 holds the text before the hyphen, column 1 everything after it.
cell_type_anno = adata.obs['Species_annotation'].str.split("-", n=1, expand = True)

In [18]:
# Keep the part after the first hyphen as the cell-type label.
adata.obs['celltype'] = cell_type_anno[1]

### Normalize and log-transform data

In [19]:
# Dimensions of the full integrated object.
adata.shape

(346271, 11232)

In [15]:
# Normalize every cell to 1e4 total counts, then log1p-transform.
# Both calls modify `adata` directly, so re-running this cell applies the
# transforms a second time to already transformed values.
# NOTE(review): scanpy documents normalize_per_cell as deprecated in favour of
# sc.pp.normalize_total; the two are not drop-in identical, so confirm outputs before switching.
sc.pp.normalize_per_cell(adata, counts_per_cell_after = 1e4)
sc.pp.log1p(adata)

In [16]:
import scipy.io as spio

# Export the log-normalized matrix in Matrix Market format together with the cell
# metadata (obs) and the feature table (var). All three files share the run-date suffix.
spio.mmwrite(f"{res_folder}SKM_human2mouse_raw_log_norm_{today}.mtx", adata.X)

adata.obs.to_csv(f"{res_folder}SKM_human2mouse_raw_metadata_{today}.csv")
adata.var.to_csv(f"{res_folder}SKM_human2mouse_raw_features_{today}.csv")

# Read the export back as a round-trip check. The paths are built exactly like the
# ones written above; previously the matrix used a fixed date and the two CSV names
# had an empty date suffix, so they did not point at the files this cell produces.
sp_matrix = spio.mmread(f"{res_folder}SKM_human2mouse_raw_log_norm_{today}.mtx")

obs = pd.read_csv(f"{res_folder}SKM_human2mouse_raw_metadata_{today}.csv", index_col = 0)
var = pd.read_csv(f"{res_folder}SKM_human2mouse_raw_features_{today}.csv", index_col = 0)

# Save human data separately

In [20]:
# Keep only the rows whose 'Species' is "human" (all columns), copied into a new object.
adata_human = adata[adata.obs['Species'] == "human",:].copy()

In [21]:
# Dimensions of the human subset.
adata_human.shape

(219893, 11232)

In [22]:
# Sanity check: only "human" should remain after the filter.
adata_human.obs['Species'].unique()

['human']
Categories (1, object): ['human']

In [19]:
# Export the human subset: matrix (.mtx), cell metadata and feature table,
# each suffixed with the run date.
spio.mmwrite(f"{res_folder}SKM_human_cells_broad_anno_raw_log_norm_{today}.mtx", adata_human.X)

adata_human.obs.to_csv(f"{res_folder}SKM_human_cells_broad_anno_raw_metadata_{today}.csv")
adata_human.var.to_csv(f"{res_folder}SKM_human_cells_broad_anno_raw_features_{today}.csv")

# Save mouse data separately

In [23]:
# Keep only the rows whose 'Species' is "mouse" (all columns), copied into a new object.
adata_mouse = adata[adata.obs['Species'] == "mouse",:].copy()

In [25]:
# Dimensions of the mouse subset.
adata_mouse.shape

(126378, 11232)

In [24]:
# Sanity check: only "mouse" should remain after the filter.
adata_mouse.obs['Species'].unique()

['mouse']
Categories (1, object): ['mouse']

In [20]:
# Export the mouse subset: matrix (.mtx), cell metadata and feature table,
# each suffixed with the run date.
spio.mmwrite(f"{res_folder}SKM_mouse_cells_broad_anno_raw_log_norm_{today}.mtx", adata_mouse.X)

adata_mouse.obs.to_csv(f"{res_folder}SKM_mouse_cells_broad_anno_raw_metadata_{today}.csv")
adata_mouse.var.to_csv(f"{res_folder}SKM_mouse_cells_broad_anno_raw_features_{today}.csv")

# Save human nuclei data (granular annotation)

In [7]:
# Load the annotated, filtered human nuclei object (.h5ad) for the granular-annotation export.
adata_nuclei = sc.read("/nfs/team205/vk8/processed_data/muscle/data_v3/SKM_nuclei_anno_filt_raw_cbender2021-09-01.h5ad")

In [15]:
# Copy the level-1 nuclei annotation into 'celltype', the column name also used for the other exports.
adata_nuclei.obs['celltype'] = adata_nuclei.obs['cell_type(nuclei)_level1']

In [17]:
# Normalize every nucleus to 1e4 total counts, then log1p-transform.
# Both calls modify `adata_nuclei` directly, so re-running this cell applies the
# transforms a second time to already transformed values.
# NOTE(review): scanpy documents normalize_per_cell as deprecated in favour of
# sc.pp.normalize_total; the two are not drop-in identical, so confirm outputs before switching.
sc.pp.normalize_per_cell(adata_nuclei, counts_per_cell_after = 1e4)
sc.pp.log1p(adata_nuclei)

In [18]:
# Peek at the stored matrix values to confirm they are no longer integer counts.
adata_nuclei.X.data

array([1.4811257, 1.4811257, 1.4811257, ..., 1.4000249, 1.4000249,
       2.3190393], dtype=float32)

In [20]:
import scipy.io as spio

# Export the normalized nuclei object: matrix (.mtx), cell metadata and feature
# table, each suffixed with the run date.
# NOTE(review): the scVI myonuclei export later in this notebook writes to these same
# "nuclei_level1" file names, so running both on one date overwrites these files — confirm intended.
spio.mmwrite(f"{res_folder}SKM_human_nuclei_nuclei_level1_anno_raw_log_norm_{today}.mtx", adata_nuclei.X)

adata_nuclei.obs.to_csv(f"{res_folder}SKM_human_nuclei_nuclei_level1_anno_raw_metadata_{today}.csv")
adata_nuclei.var.to_csv(f"{res_folder}SKM_human_nuclei_nuclei_level1_anno_raw_features_{today}.csv")

# Save scvi human nuclei data (granular annotation)

In [7]:
# Load the re-annotated myonuclei object (.h5ad). This rebinds `adata_nuclei`,
# replacing the nuclei object loaded in the previous section.
adata_nuclei = sc.read('/nfs/team205/vk8/processed_data/muscle/data_v3/SKM_myonuclei_re-anno_cleaned_scvi_2021-10-31.h5ad')

In [16]:
# Copy the level-1 myonuclei annotation into the 'celltype' column.
adata_nuclei.obs['celltype'] = adata_nuclei.obs['cell_type(myonuclei)_level1']

In [17]:
# List the granular cell-type labels that will be exported.
adata_nuclei.obs['celltype'].cat.categories

Index(['MF_typeI', 'MF_typeII', 'MF_typeI(cytoplasmic)',
       'MF_typeII(cytoplasmic)', 'MF_typeI-II(hybrid)', 'MF_typeI-FAM189A2',
       'MF_typeII-FAM189A2', 'MF_typeI-OTUD1', 'MF_typeII-OTUD1',
       'MF_typeII-TNFRSF12A-high', 'NMJ', 'MTJ', 'MF_type-SORBS2'],
      dtype='object')

# Write only the metadata, under a fixed 2021-11-12 date suffix rather than the run date.
# NOTE(review): the next cell writes the same table with the `today` suffix — confirm
# whether this fixed-date file is still needed.
adata_nuclei.obs.to_csv(f"{res_folder}SKM_human_nuclei_nuclei_level1_anno_raw_metadata_2021-11-12.csv")

In [16]:
import scipy.io as spio

# Export the myonuclei object with granular labels: matrix (.mtx), cell metadata
# and feature table, each suffixed with the run date.
# NOTE(review): these file names are identical to the ones used by the earlier
# nuclei export ("nuclei_level1"), so same-day runs overwrite each other — confirm intended.
# NOTE(review): no normalization is applied in this section although the matrix file
# is named "log_norm"; presumably X is already normalized in the loaded file — TODO confirm.
spio.mmwrite(f"{res_folder}SKM_human_nuclei_nuclei_level1_anno_raw_log_norm_{today}.mtx", adata_nuclei.X)

adata_nuclei.obs.to_csv(f"{res_folder}SKM_human_nuclei_nuclei_level1_anno_raw_metadata_{today}.csv")
adata_nuclei.var.to_csv(f"{res_folder}SKM_human_nuclei_nuclei_level1_anno_raw_features_{today}.csv")

# Save scvi human nuclei data (broad annotation)

In [7]:
# Reload the same myonuclei file as the granular section; this discards the
# 'celltype' column assigned there so it can be set from the broad annotation.
adata_nuclei = sc.read('/nfs/team205/vk8/processed_data/muscle/data_v3/SKM_myonuclei_re-anno_cleaned_scvi_2021-10-31.h5ad')

In [8]:
# List the labels of the 'cell_type(nuclei)_level0' column.
# NOTE(review): the assignment that follows copies 'cell_type(myonuclei)_level0',
# a different column from the one inspected here — confirm which is intended.
adata_nuclei.obs['cell_type(nuclei)_level0'].cat.categories

Index(['Hybrid', 'MF_type-NCAM1', 'MF_type-SORBS2', 'MTJ', 'Myofiber_typeI',
       'Myofiber_typeII', 'NMJ', 'high_mito'],
      dtype='object')

In [13]:
# Copy the level-0 myonuclei annotation into the 'celltype' column.
# NOTE(review): the categories displayed just before this came from
# 'cell_type(nuclei)_level0', not this column — confirm the right source column.
adata_nuclei.obs['celltype'] = adata_nuclei.obs['cell_type(myonuclei)_level0']

# Write the metadata table with the broad labels, suffixed with the run date.
# The export cell below writes this same file again along with the matrix and features.
adata_nuclei.obs.to_csv(f"{res_folder}SKM_human_nuclei_nuclei_level0_anno_raw_metadata_"+today+".csv")

In [11]:
import scipy.io as spio

# Export the myonuclei object with broad labels: matrix (.mtx), cell metadata and
# feature table, each suffixed with the run date.
# NOTE(review): no normalization is applied in this section although the matrix file
# is named "log_norm"; presumably X is already normalized in the loaded file — TODO confirm.
spio.mmwrite(f"{res_folder}SKM_human_nuclei_nuclei_level0_anno_raw_log_norm_{today}.mtx", adata_nuclei.X)

adata_nuclei.obs.to_csv(f"{res_folder}SKM_human_nuclei_nuclei_level0_anno_raw_metadata_{today}.csv")
adata_nuclei.var.to_csv(f"{res_folder}SKM_human_nuclei_nuclei_level0_anno_raw_features_{today}.csv")