In [1]:
import pandas as pd
import numpy as np
import scanpy as sc
import os
from matplotlib.pyplot import rc_context

import sys
import warnings

import scvi
import anndata
import matplotlib.pyplot as plt

Global seed set to 0


In [2]:
#set up directory
large_root = r"../03-Out"
results_file = large_root + "/results_file.h5ad"

In [3]:
sc.set_figure_params(dpi=100, color_map = 'viridis_r')
sc.settings.verbosity = 1
sc.logging.print_header()

scanpy==1.9.2 anndata==0.8.0 umap==0.5.3 numpy==1.23.5 scipy==1.10.0 pandas==1.5.3 scikit-learn==1.2.1 statsmodels==0.13.5 python-igraph==0.10.4 pynndescent==0.5.4


In [4]:
# download data (GSE120575) from here:  
# "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE120575&format=file&file=GSE120575%5FSade%5FFeldman%5Fmelanoma%5Fsingle%5Fcells%5FTPM%5FGEO%2Etxt%2Egz"

data_path = "../01-data/GSE120575_Sade_Feldman_melanoma_single_cells_TPM_GEO.txt"

index2 = pd.read_csv(data_path, delimiter="\t",skiprows=[0], nrows=0).T

print(index2.shape)
index2 = index2.iloc[1:,:]
print(index2.shape)
index2 = index2.reset_index()
index2.rename(columns = {'index':'index2'}, inplace = True)

index2.head()

(16292, 0)
(16291, 0)


Unnamed: 0,index2
0,Pre_P1
1,Pre_P1.1
2,Pre_P1.2
3,Pre_P1.3
4,Pre_P1.4


In [6]:
index3 = pd.read_csv(data_path, delimiter="\t",skiprows=[1], nrows=0).T

print(index3.shape)
index3 = index3.iloc[1:,:]
print(index3.shape)
index3 = index3.reset_index()
index3.rename(columns = {'index':'index3'}, inplace = True)

index3.head()
# adata = adata.iloc[1:,:].copy()
# adata.head()

(16292, 0)
(16291, 0)


Unnamed: 0,index3
0,A10_P3_M11
1,A11_P1_M11
2,A11_P3_M11
3,A11_P4_M11
4,A12_P3_M11


In [7]:
index3

Unnamed: 0,index3
0,A10_P3_M11
1,A11_P1_M11
2,A11_P3_M11
3,A11_P4_M11
4,A12_P3_M11
...,...
16286,H5_P5_M67_L001_T_enriched
16287,H6_P5_M67_L001_T_enriched
16288,H7_P5_M67_L001_T_enriched
16289,H8_P5_M67_L001_T_enriched


In [8]:
index3_list = index3.index3.to_list()
len(index3_list)
index3_list = index3_list[:-1]
len(index3_list)

16290

In [10]:
combo_index = pd.concat([index2, index3], axis="columns")
combo_index = combo_index.iloc[:-1,:]

In [11]:
combo_index['full_sample_name'] = combo_index['index3'] + "-" +combo_index['index2']
combo_index = combo_index.drop('index2', axis = 1)
combo_index = combo_index.drop('index3', axis = 1)
combo_index = combo_index.set_index("full_sample_name")
combo_index.head()

A10_P3_M11-Pre_P1
A11_P1_M11-Pre_P1.1
A11_P3_M11-Pre_P1.2
A11_P4_M11-Pre_P1.3
A12_P3_M11-Pre_P1.4


In [12]:
cell_index = []
for i in range(16290):
    cell_index.append('Sample '+str(i+1))
cell_index
len(cell_index)

combo_index2 = combo_index.reset_index()
combo_index2['sample_name'] = cell_index
combo_index2 = combo_index2.set_index('sample_name')
combo_index2

Unnamed: 0_level_0,full_sample_name
sample_name,Unnamed: 1_level_1
Sample 1,A10_P3_M11-Pre_P1
Sample 2,A11_P1_M11-Pre_P1.1
Sample 3,A11_P3_M11-Pre_P1.2
Sample 4,A11_P4_M11-Pre_P1.3
Sample 5,A12_P3_M11-Pre_P1.4
...,...
Sample 16286,H4_P5_M67_L001_T_enriched-Post_P6_T_enriched.86
Sample 16287,H5_P5_M67_L001_T_enriched-Post_P6_T_enriched.87
Sample 16288,H6_P5_M67_L001_T_enriched-Post_P6_T_enriched.88
Sample 16289,H7_P5_M67_L001_T_enriched-Post_P6_T_enriched.89


In [13]:
combo_index.shape

(16290, 0)

In [14]:
cell_names_list = combo_index.index.to_list()
print(len(cell_names_list))
# cell_names_list = cell_names_list[:-1]
print(len(cell_names_list))
cell_names_list

16290
16290


['A10_P3_M11-Pre_P1',
 'A11_P1_M11-Pre_P1.1',
 'A11_P3_M11-Pre_P1.2',
 'A11_P4_M11-Pre_P1.3',
 'A12_P3_M11-Pre_P1.4',
 'A12_P6_M11-Pre_P1.5',
 'A2_P1_M11-Pre_P1.6',
 'A2_P4_M11-Pre_P1.7',
 'A3_P1_M11-Pre_P1.8',
 'A3_P3_M11-Pre_P1.9',
 'A4_P3_M11-Pre_P1.10',
 'A4_P4_M11-Pre_P1.11',
 'A4_P6_M11-Pre_P1.12',
 'A5_P4_M11-Pre_P1.13',
 'A5_P5_M11-Pre_P1.14',
 'A6_P1_M11-Pre_P1.15',
 'A6_P4_M11-Pre_P1.16',
 'A6_P6_M11-Pre_P1.17',
 'A7_P2_M11-Pre_P1.18',
 'A7_P6_M11-Pre_P1.19',
 'A8_P1_M11-Pre_P1.20',
 'A8_P6_M11-Pre_P1.21',
 'A9_P1_M11-Pre_P1.22',
 'A9_P4_M11-Pre_P1.23',
 'B10_P1_M11-Pre_P1.24',
 'B10_P3_M11-Pre_P1.25',
 'B10_P4_M11-Pre_P1.26',
 'B10_P6_M11-Pre_P1.27',
 'B11_P6_M11-Pre_P1.28',
 'B12_P3_M11-Pre_P1.29',
 'B12_P4_M11-Pre_P1.30',
 'B1_P2_M11-Pre_P1.31',
 'B1_P4_M11-Pre_P1.32',
 'B2_P1_M11-Pre_P1.33',
 'B2_P5_M11-Pre_P1.34',
 'B2_P6_M11-Pre_P1.35',
 'B3_P1_M11-Pre_P1.36',
 'B3_P5_M11-Pre_P1.37',
 'B4_P1_M11-Pre_P1.38',
 'B4_P4_M11-Pre_P1.39',
 'B4_P6_M11-Pre_P1.40',
 'B5_P1_M11-Pre

In [16]:
gene_names = pd.read_csv(data_path, delimiter="\t", usecols=[0])

gene_names.rename(columns = {'Unnamed: 0':'gene_symbols'}, inplace = True)
gene_names = gene_names.iloc[1:,:]

gene_names_list = gene_names.gene_symbols.to_list()

print(gene_names_list[0])
print(gene_names.shape)
print(len(gene_names_list))

TSPAN6
(55737, 1)
55737


In [17]:
# read data
adata = pd.read_csv(data_path, delimiter="\t",skiprows=[1], usecols=index3_list, header=0).T

print(adata.shape)

(16290, 55737)


In [18]:
bdata = adata.to_numpy()
print(bdata.shape)
bdata

(16290, 55737)


array([[0.  , 0.  , 9.24, ..., 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 0.  , 7.99, ..., 0.  , 0.  , 0.  ],
       ...,
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ]])

In [19]:
# create sparse matrix using csr_matrix()
  
# Import required package
from scipy.sparse import csr_matrix
from scipy import sparse
  
# creating sparse matrix
sparseMatrix = sparse.csr_matrix(bdata)
              
# print the sparse matrix
print(sparseMatrix)

  (0, 2)	9.24
  (0, 8)	1.01
  (0, 17)	5.8
  (0, 19)	6.65
  (0, 23)	5.36
  (0, 42)	9.62
  (0, 45)	9.5
  (0, 53)	9.81
  (0, 75)	9.61
  (0, 91)	9.65
  (0, 100)	6.5
  (0, 107)	9.55
  (0, 116)	9.46
  (0, 118)	8.78
  (0, 152)	9.31
  (0, 170)	10.36
  (0, 184)	10.02
  (0, 206)	4.38
  (0, 214)	8.69
  (0, 219)	9.24
  (0, 227)	12.16
  (0, 235)	8.89
  (0, 237)	8.47
  (0, 250)	8.99
  (0, 269)	8.51
  :	:
  (16289, 51902)	7.96
  (16289, 51920)	4.49
  (16289, 52154)	4.44
  (16289, 52220)	5.99
  (16289, 52298)	6.67
  (16289, 52341)	5.47
  (16289, 52409)	7.49
  (16289, 52678)	4.79
  (16289, 52854)	6.14
  (16289, 52877)	7.56
  (16289, 52889)	1.5
  (16289, 53130)	6.49
  (16289, 53363)	5.26
  (16289, 53508)	9.75
  (16289, 53537)	5.52
  (16289, 53575)	11.86
  (16289, 53618)	8.83
  (16289, 53971)	4.26
  (16289, 54247)	6.08
  (16289, 54613)	6.32
  (16289, 54897)	1.96
  (16289, 54966)	2.88
  (16289, 55093)	3.7
  (16289, 55105)	3.35
  (16289, 55468)	7.38


In [20]:
type(sparseMatrix)
sparse.isspmatrix(sparseMatrix)

True

In [21]:
import scipy.io as sio
sio.mmwrite("../01-data/matrix/matrix.mtx",sparseMatrix)

In [22]:
gene_names

Unnamed: 0,gene_symbols
1,TSPAN6
2,TNMD
3,DPM1
4,SCYL3
5,C1orf112
...,...
55733,RP4-621B10.8
55734,RP11-114I8.4
55735,RP11-180C16.1
55736,AP000230.1


In [23]:
gene_names.to_csv('../01-data/matrix/genes.tsv', index=False)

In [24]:
combo_index2.to_csv('../01-data/matrix/barcodes.tsv')