In [1]:
import anndata
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re

### Preprocess Liver Cancer datasets

In [2]:
def find_csv_filenames(path_to_dir, string_in_filename):
    filenames = os.listdir(path_to_dir)
    return [filename for filename in filenames if all([
        filename.endswith('.csv'),
        string_in_filename in filename
    ])]

# 示例用法
path = 'Liver'
string = 'cell_by_gene'
csv_files = sorted(find_csv_filenames(path, string))
print(csv_files)
metadata_files = sorted(find_csv_filenames(path, 'metadata'))
print(metadata_files)

['1003_region_0_cell_by_gene.csv', '1012_region_0_cell_by_gene.csv', '1012_region_1_cell_by_gene.csv', '1014_region_0_cell_by_gene.csv', '1014_region_1_cell_by_gene.csv', '1017_region_0_cell_by_gene.csv', '1029_region_0_cell_by_gene.csv', '122_region_0_cell_by_gene.csv', '122_region_1_cell_by_gene.csv', '63_region_0_cell_by_gene.csv']
['1003_region_0_cell_metadata.csv', '1012_region_0_cell_metadata.csv', '1012_region_1_cell_metadata.csv', '1014_region_0_cell_metadata.csv', '1014_region_1_cell_metadata.csv', '1017_region_0_cell_metadata.csv', '1029_region_0_cell_metadata.csv', '122_region_0_cell_metadata.csv', '122_region_1_cell_metadata.csv', '63_region_0_cell_metadata.csv']


In [32]:
exp_matrix.sum(axis=0)

ADH1B        30656.0
ASPN          5523.0
BAAT        133007.0
BGN          50177.0
BTG3          4233.0
              ...   
Blank-11       371.0
TCL1A         1003.0
Blank-14       355.0
CCR6           314.0
DIO2           443.0
Length: 461, dtype: float64

In [8]:
exp_matrix = pd.read_csv(os.path.join(path, csv_files[i]), index_col=0)
exp_matrix

Unnamed: 0_level_0,ACTA2,ACTG2,ADAM12,ADAM28,ADGRE5,ADH1B,CCL21,AIM2,ANKRD55,AREG,...,Blank-51,Blank-52,Blank-53,Blank-54,Blank-55,Blank-56,Blank-57,Blank-58,Blank-59,Blank-60
cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,4.0,7.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,1.0,0.0,0.0,5.0,9.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131421,1.0,0.0,0.0,0.0,4.0,2.0,15.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
131422,0.0,0.0,0.0,0.0,3.0,1.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
131423,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
131424,0.0,0.0,0.0,0.0,3.0,0.0,10.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
regex = re.compile(r'(\d+)_region_(\d+)')
# 读取数据
pseudo_bulk = {}
for i in range(len(csv_files)):
    exp_matrix = pd.read_csv(os.path.join(path, csv_files[i]), index_col=0)
    if i == 0:
       gene_features = exp_matrix.columns 
    exp_matrix = exp_matrix.loc[:, gene_features]
    print(exp_matrix.columns)
    metadata = pd.read_csv(os.path.join(path, metadata_files[i]), index_col=0)
    match = regex.search(csv_files[i])
    if match:
        adata_filename = f'{match.group(1)}_region_{match.group(2)}.h5ad'
        print(adata_filename)
    adata = anndata.AnnData(X=exp_matrix.values, obs=metadata)
    print(adata)
    pseudo_bulk[f'{match.group(1)}_{match.group(2)}'] = adata.X.sum(axis=0)
    adata.write_h5ad(os.path.join(path, adata_filename))

Index(['ACTA2', 'ACTG2', 'ADAM12', 'ADAM28', 'ADGRE5', 'ADH1B', 'CCL21',
       'AIM2', 'ANKRD55', 'AREG',
       ...
       'Blank-51', 'Blank-52', 'Blank-53', 'Blank-54', 'Blank-55', 'Blank-56',
       'Blank-57', 'Blank-58', 'Blank-59', 'Blank-60'],
      dtype='object', length=461)
1003_region_0.h5ad
AnnData object with n_obs × n_vars = 369399 × 461
    obs: 'fov', 'volume', 'center_x', 'center_y', 'min_x', 'min_y', 'max_x', 'max_y', 'barcodeCount'




Index(['ACTA2', 'ACTG2', 'ADAM12', 'ADAM28', 'ADGRE5', 'ADH1B', 'CCL21',
       'AIM2', 'ANKRD55', 'AREG',
       ...
       'Blank-51', 'Blank-52', 'Blank-53', 'Blank-54', 'Blank-55', 'Blank-56',
       'Blank-57', 'Blank-58', 'Blank-59', 'Blank-60'],
      dtype='object', length=461)
1012_region_0.h5ad
AnnData object with n_obs × n_vars = 283525 × 461
    obs: 'fov', 'volume', 'center_x', 'center_y', 'min_x', 'min_y', 'max_x', 'max_y', 'barcodeCount'




Index(['ACTA2', 'ACTG2', 'ADAM12', 'ADAM28', 'ADGRE5', 'ADH1B', 'CCL21',
       'AIM2', 'ANKRD55', 'AREG',
       ...
       'Blank-51', 'Blank-52', 'Blank-53', 'Blank-54', 'Blank-55', 'Blank-56',
       'Blank-57', 'Blank-58', 'Blank-59', 'Blank-60'],
      dtype='object', length=461)
1012_region_1.h5ad
AnnData object with n_obs × n_vars = 131426 × 461
    obs: 'fov', 'volume', 'center_x', 'center_y', 'min_x', 'min_y', 'max_x', 'max_y', 'barcodeCount'




Index(['ACTA2', 'ACTG2', 'ADAM12', 'ADAM28', 'ADGRE5', 'ADH1B', 'CCL21',
       'AIM2', 'ANKRD55', 'AREG',
       ...
       'Blank-51', 'Blank-52', 'Blank-53', 'Blank-54', 'Blank-55', 'Blank-56',
       'Blank-57', 'Blank-58', 'Blank-59', 'Blank-60'],
      dtype='object', length=461)
1014_region_0.h5ad
AnnData object with n_obs × n_vars = 207552 × 461
    obs: 'fov', 'volume', 'center_x', 'center_y', 'min_x', 'min_y', 'max_x', 'max_y', 'barcodeCount'




Index(['ACTA2', 'ACTG2', 'ADAM12', 'ADAM28', 'ADGRE5', 'ADH1B', 'CCL21',
       'AIM2', 'ANKRD55', 'AREG',
       ...
       'Blank-51', 'Blank-52', 'Blank-53', 'Blank-54', 'Blank-55', 'Blank-56',
       'Blank-57', 'Blank-58', 'Blank-59', 'Blank-60'],
      dtype='object', length=461)
1014_region_1.h5ad
AnnData object with n_obs × n_vars = 25449 × 461
    obs: 'fov', 'volume', 'center_x', 'center_y', 'min_x', 'min_y', 'max_x', 'max_y', 'barcodeCount'




Index(['ACTA2', 'ACTG2', 'ADAM12', 'ADAM28', 'ADGRE5', 'ADH1B', 'CCL21',
       'AIM2', 'ANKRD55', 'AREG',
       ...
       'Blank-51', 'Blank-52', 'Blank-53', 'Blank-54', 'Blank-55', 'Blank-56',
       'Blank-57', 'Blank-58', 'Blank-59', 'Blank-60'],
      dtype='object', length=461)
1017_region_0.h5ad
AnnData object with n_obs × n_vars = 61404 × 461
    obs: 'fov', 'volume', 'center_x', 'center_y', 'min_x', 'min_y', 'max_x', 'max_y', 'barcodeCount'




Index(['ACTA2', 'ACTG2', 'ADAM12', 'ADAM28', 'ADGRE5', 'ADH1B', 'CCL21',
       'AIM2', 'ANKRD55', 'AREG',
       ...
       'Blank-51', 'Blank-52', 'Blank-53', 'Blank-54', 'Blank-55', 'Blank-56',
       'Blank-57', 'Blank-58', 'Blank-59', 'Blank-60'],
      dtype='object', length=461)
1029_region_0.h5ad
AnnData object with n_obs × n_vars = 120434 × 461
    obs: 'fov', 'volume', 'center_x', 'center_y', 'min_x', 'min_y', 'max_x', 'max_y', 'barcodeCount'




Index(['ACTA2', 'ACTG2', 'ADAM12', 'ADAM28', 'ADGRE5', 'ADH1B', 'CCL21',
       'AIM2', 'ANKRD55', 'AREG',
       ...
       'Blank-51', 'Blank-52', 'Blank-53', 'Blank-54', 'Blank-55', 'Blank-56',
       'Blank-57', 'Blank-58', 'Blank-59', 'Blank-60'],
      dtype='object', length=461)
122_region_0.h5ad
AnnData object with n_obs × n_vars = 91860 × 461
    obs: 'fov', 'volume', 'center_x', 'center_y', 'min_x', 'min_y', 'max_x', 'max_y', 'barcodeCount'




Index(['ACTA2', 'ACTG2', 'ADAM12', 'ADAM28', 'ADGRE5', 'ADH1B', 'CCL21',
       'AIM2', 'ANKRD55', 'AREG',
       ...
       'Blank-51', 'Blank-52', 'Blank-53', 'Blank-54', 'Blank-55', 'Blank-56',
       'Blank-57', 'Blank-58', 'Blank-59', 'Blank-60'],
      dtype='object', length=461)
122_region_1.h5ad




AnnData object with n_obs × n_vars = 151647 × 461
    obs: 'fov', 'volume', 'center_x', 'center_y', 'min_x', 'min_y', 'max_x', 'max_y', 'barcodeCount'
Index(['ACTA2', 'ACTG2', 'ADAM12', 'ADAM28', 'ADGRE5', 'ADH1B', 'CCL21',
       'AIM2', 'ANKRD55', 'AREG',
       ...
       'Blank-51', 'Blank-52', 'Blank-53', 'Blank-54', 'Blank-55', 'Blank-56',
       'Blank-57', 'Blank-58', 'Blank-59', 'Blank-60'],
      dtype='object', length=461)
63_region_0.h5ad
AnnData object with n_obs × n_vars = 228679 × 461
    obs: 'fov', 'volume', 'center_x', 'center_y', 'min_x', 'min_y', 'max_x', 'max_y', 'barcodeCount'




In [4]:
pseudo_bulk = pd.DataFrame(pseudo_bulk)
pseudo_bulk.index = exp_matrix.columns
pseudo_bulk

Unnamed: 0,1003_0,1012_0,1012_1,1014_0,1014_1,1017_0,1029_0,122_0,122_1,63_0
ACTA2,270227.0,434886.0,201380.0,59374.0,9884.0,32705.0,106966.0,38842.0,22886.0,13156.0
ACTG2,22833.0,17338.0,5980.0,3665.0,554.0,1138.0,5193.0,3040.0,3080.0,1697.0
ADAM12,16661.0,20374.0,3441.0,1697.0,399.0,3404.0,3379.0,5146.0,6441.0,2016.0
ADAM28,35283.0,13956.0,12717.0,1170.0,88.0,1554.0,3660.0,4583.0,3822.0,1905.0
ADGRE5,402208.0,346434.0,87069.0,41693.0,7982.0,42169.0,47220.0,60694.0,35784.0,29150.0
...,...,...,...,...,...,...,...,...,...,...
Blank-56,5819.0,2860.0,1556.0,388.0,52.0,214.0,555.0,520.0,652.0,279.0
Blank-57,5225.0,3626.0,1504.0,408.0,58.0,502.0,607.0,736.0,625.0,489.0
Blank-58,5817.0,2875.0,1964.0,427.0,54.0,253.0,797.0,1120.0,821.0,419.0
Blank-59,17939.0,7644.0,3699.0,888.0,75.0,538.0,1721.0,726.0,594.0,675.0


In [5]:
pseudo_bulk.to_csv(os.path.join(path, 'pseudo_bulk.csv'))

In [25]:
with open('Liver/genenames.txt', 'w') as f:
    for gene in exp_matrix.columns:
        f.write(f'{gene}\n')

In [14]:
exp_matrix.shape
metadata.shape

(120434, 9)

In [37]:
clinical = pd.read_table('TCGA/TCGA-LIHC.GDC_phenotype.tsv')
clinical.columns

Index(['submitter_id.samples',
       'adjacent_hepatic_tissue_inflammation_extent_type',
       'age_at_initial_pathologic_diagnosis', 'albumin_result_lower_limit',
       'albumin_result_specified_value', 'albumin_result_upper_limit',
       'batch_number', 'bcr', 'bcr_followup_barcode', 'bcr_followup_uuid',
       ...
       'days_to_collection.samples', 'days_to_sample_procurement.samples',
       'initial_weight.samples', 'is_ffpe.samples', 'oct_embedded.samples',
       'preservation_method.samples', 'sample_type.samples',
       'sample_type_id.samples', 'state.samples', 'tissue_type.samples'],
      dtype='object', length=119)

In [38]:
clinical[['Pathologic']]

Unnamed: 0,age_at_initial_pathologic_diagnosis,albumin_result_lower_limit,albumin_result_specified_value,albumin_result_upper_limit,bilirubin_lower_limit,bilirubin_upper_limit,cancer_first_degree_relative,creatinine_lower_level,creatinine_upper_limit,creatinine_value_in_mg_dl,...,days_to_last_follow_up.diagnoses,year_of_diagnosis.diagnoses,bmi.exposures,height.exposures,weight.exposures,days_to_collection.samples,days_to_sample_procurement.samples,initial_weight.samples,preservation_method.samples,sample_type_id.samples
count,468.0,369.0,378.0,369.0,376.0,386.0,171.0,382.0,384.0,384.0,...,329.0,466.0,417.0,422.0,433.0,466.0,0.0,469.0,0.0,469.0
mean,60.264957,14.130623,18.081481,19.265583,0.129521,1.035233,1.77193,1.699215,3.157031,2.614323,...,842.914894,2008.845494,26.634508,167.635071,74.094688,1272.504292,,265.162047,,2.904051
std,13.834533,197.656825,267.283631,265.264388,0.096917,1.770143,1.418585,7.324469,12.876252,11.335129,...,776.564828,4.285227,9.646802,11.528937,19.758092,1349.031291,,331.229204,,3.923053
min,16.0,0.3,0.2,0.5,0.0,0.1,0.0,0.0,0.9,0.4,...,0.0,1995.0,14.526644,64.0,40.0,7.0,,1.0,,1.0
25%,52.0,3.5,3.5,5.0,0.1,0.5,1.0,0.6,1.2,0.8,...,347.0,2007.0,21.877551,162.0,61.0,170.75,,80.0,,1.0
50%,62.0,3.5,4.0,5.0,0.1,0.7,1.0,0.7,1.4,0.9,...,615.0,2010.0,24.983563,168.0,71.0,777.5,,160.0,,1.0
75%,70.0,3.8,4.3,5.2,0.2,1.0,2.0,0.8,1.5,1.1,...,1145.0,2012.0,29.320988,174.0,84.0,2025.5,,290.0,,1.0
max,90.0,3800.0,5200.0,5100.0,1.0,19.0,9.0,62.0,120.0,124.0,...,3675.0,2013.0,131.835938,196.0,172.0,6014.0,,2190.0,,11.0


In [17]:
expression_data = pd.read_csv('/raid1/YiDingcheng/BulkPheno/Cleaned_data/Liver/TCGA-LIHC.htseq_counts_clean_normalized_tumor.csv', index_col=0)
label = pd.read_csv('/raid1/YiDingcheng/BulkPheno/Cleaned_data/Liver/TCGA-LIHC.survival_clean_tumor.csv', index_col=0)

In [20]:
from sklearn.model_selection import train_test_split
train_exp, test_exp, train_label, test_label = train_test_split(expression_data, label, test_size=0.2, random_state=42)
train_exp.to_csv('Cleaned_data/Liver/train_exp.csv')
test_exp.to_csv('Cleaned_data/Liver/test_exp.csv')
train_label.to_csv('Cleaned_data/Liver/train_label.csv')
test_label.to_csv('Cleaned_data/Liver/test_label.csv')