In [21]:
import scanpy as sc
import pandas as pd
import numpy as np
import os
from scipy.io import mmread
from scipy.sparse import csr_matrix
from anndata import AnnData

# Root directory holding one sub-folder per study. Defaults to the original
# local path; set the CAT_DATA_DIR environment variable to run the notebook
# on another machine without editing this cell.
# os.path.join(..., "") guarantees a trailing separator, which the
# f"{data_dir}<study>/..." paths built from this value rely on.
data_dir = os.path.join(os.environ.get("CAT_DATA_DIR") or "/Users/wsun/research/CAT/", "")


## Read in data

In [22]:
# Gene-name arrays collected per study, keyed by study label.
genes = {}

# Chen_2024: gzipped, tab-separated features table without a header row.
chen_features_path = f"{data_dir}Chen_2024/GSE236581_features.tsv.gz"
features = pd.read_csv(chen_features_path, compression='gzip', sep='\t', header=None)
print(features.shape)
print(features.head())

# Column 1 is the one used as the gene name for this study.
chen_symbols = features[1]
genes['Chen_2024'] = np.array(chen_symbols.tolist())

# Report every gene name that occurs more than once (first occurrence excluded).
duplicates = chen_symbols[chen_symbols.duplicated()]
if duplicates.empty:
    print("No duplicate gene names found.")
else:
    print(f"Found {len(duplicates)} duplicate gene names:")
    print(duplicates.unique())


(36027, 3)
             0            1                2
0  MIR1302-2HG  MIR1302-2HG  Gene Expression
1      FAM138A      FAM138A  Gene Expression
2        OR4F5        OR4F5  Gene Expression
3   AL627309.1   AL627309.1  Gene Expression
4   AL627309.3   AL627309.3  Gene Expression
No duplicate gene names found.


In [23]:
# Chow_2023: gene names come from the var_names of the study's .h5ad file.
chow_h5ad_path = f"{data_dir}Chow_2023/GSE212217_all_samples_with_metadata.h5ad"
adata = sc.read_h5ad(chow_h5ad_path)
print(adata)
print(adata.var.head())
genes['Chow_2023'] = np.array(adata.var_names)

# Build the Series once and reuse it for both the mask and the selection.
var_name_series = pd.Series(adata.var_names)
duplicates = var_name_series[var_name_series.duplicated()]
if duplicates.empty:
    print("No duplicate gene names found.")
else:
    print(f"Found {len(duplicates)} duplicate gene names:")
    print(duplicates.unique())

# Quick look at the first few stored names.
print(genes['Chow_2023'][:6])

AnnData object with n_obs × n_vars = 259834 × 36601
    obs: 'orig.ident', 'index', 'nCount_RNA', 'nFeature_RNA', 'propMt', 'nCount_SCT', 'nFeature_SCT', 'integrated_snn_res.0.9', 'seurat_clusters', 'timepoint', 'timepointBinary', 'patient', 'clinical', 'finalIdent', 'sample'
Empty DataFrame
Columns: []
Index: [MIR1302-2HG, FAM138A, OR4F5, AL627309.1, AL627309.3]
No duplicate gene names found.
['MIR1302-2HG' 'FAM138A' 'OR4F5' 'AL627309.1' 'AL627309.3' 'AL627309.2']


In [24]:
# Liu 2022: plain-text feature list, one entry per line.
# Read inside a context manager so the file handle is closed as soon as the
# lines have been consumed, instead of lingering until garbage collection.
with open(f"{data_dir}Liu_2022/features.txt") as features_file:
    features = [line.strip() for line in features_file]
print(len(features))
genes['Liu_2022'] = np.array(features)

# Check for duplicates (the Series is built once and reused for the mask).
feature_series = pd.Series(features)
duplicates = feature_series[feature_series.duplicated()]
if not duplicates.empty:
    print(f"Found {len(duplicates)} duplicate gene names:")
    print(duplicates.unique())
else:
    print("No duplicate gene names found.")
print(genes['Liu_2022'][:6])

19790
No duplicate gene names found.
['OR4F5' 'OR4F29' 'OR4F16' 'SAMD11' 'NOC2L' 'KLHL17']


In [25]:
# Liu 2025: CSV with a header row; the 'geneSymbol' column holds the gene names.
liu_2025_genes_path = f"{data_dir}Liu_2025/GSE243013_genes.csv.gz"
features = pd.read_csv(liu_2025_genes_path)
print(features.shape)
print(features.head())

gene_symbols = features['geneSymbol']
genes['Liu_2025'] = np.array(gene_symbols.values)

# Report every symbol that occurs more than once (first occurrence excluded).
duplicates = gene_symbols[gene_symbols.duplicated()]
if duplicates.empty:
    print("No duplicate gene names found.")
else:
    print(f"Found {len(duplicates)} duplicate gene names:")
    print(duplicates.unique())
print(genes['Liu_2025'][:6])


(31831, 1)
    geneSymbol
0  MIR1302-2HG
1   AL627309.1
2   AL627309.3
3   AL627309.2
4   AL627309.4
No duplicate gene names found.
['MIR1302-2HG' 'AL627309.1' 'AL627309.3' 'AL627309.2' 'AL627309.4'
 'AL732372.1']


In [26]:
# Zheng 2022: gzipped, tab-separated counts table; its first column becomes the
# row index and that index is used as the gene-name list.
# NOTE(review): the label here and the dict key say 2022 while the folder is
# named "Zheng_2021" — confirm which year is intended.
df_path = f"{data_dir}Zheng_2021/GSE156728/GSE156728_BC_10X.CD8.counts.txt.gz"
df = pd.read_csv(df_path, compression="gzip", sep="\t", index_col=0)
print(df.shape)
print(df.iloc[:6, :3])

gene_index = df.index
genes['Zheng_2022'] = np.array(gene_index)

# Report every index label that occurs more than once (first occurrence excluded).
duplicates = gene_index[gene_index.duplicated()]
if duplicates.empty:
    print("No duplicate gene names found.")
else:
    print(f"Found {len(duplicates)} duplicate gene names:")
    print(duplicates.unique())
print(genes['Zheng_2022'][:6])


(24148, 4291)
               AAAGATGAGGTGTGGT.1  AAAGCAAAGTACGCGA.1  AAAGCAAGTGTGAATA.1
RP11-34P13.3                    0                   0                   0
RP11-34P13.7                    0                   0                   0
FO538757.3                      0                   0                   0
FO538757.2                      0                   0                   0
AP006222.2                      0                   0                   0
RP4-669L17.10                   0                   0                   0
No duplicate gene names found.
['RP11-34P13.3' 'RP11-34P13.7' 'FO538757.3' 'FO538757.2' 'AP006222.2'
 'RP4-669L17.10']


In [27]:
# Find common genes: intersect the gene-name sets of every study in `genes`.
common_genes = set.intersection(*(set(g) for g in genes.values()))
print(f"Number of common genes across all studies: {len(common_genes)}")

# Sort once; the same ordering feeds both the output file and the preview.
sorted_common_genes = sorted(common_genes)

# Save to file, one gene per line. Create the output folder first so a fresh
# working directory (no data/ folder yet) does not fail with FileNotFoundError.
os.makedirs("data", exist_ok=True)
with open("data/common_genes.txt", "w") as f:
    f.writelines(f"{gene}\n" for gene in sorted_common_genes)
print(sorted_common_genes[:20])



Number of common genes across all studies: 16323
['A1BG', 'A1CF', 'A2M', 'A2ML1', 'A4GALT', 'AAAS', 'AACS', 'AADAC', 'AADAT', 'AAGAB', 'AAK1', 'AAMDC', 'AAMP', 'AANAT', 'AAR2', 'AARD', 'AARS', 'AARS2', 'AARSD1', 'AASDH']
