## Download data

In [1]:
# wget "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE252nnn/GSE252432/suppl/GSE252432_RAW.tar"
# wget "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE253nnn/GSE253173/suppl/GSE253173_single_cell_DREAM.h5ad.gz"
# gunzip GSE253173_single_cell_DREAM.h5ad.gz

# import tarfile

# # Path to .tar file
# tar_path = 'GSE252432_RAW.tar'

# # Extract to this directory
# extract_path = './GSE252432_RAW'

# # Open and extract
# with tarfile.open(tar_path, 'r') as tar:
#     tar.extractall(path=extract_path)

# print(f"Extracted to {extract_path}")

## Read data and find matches

In [2]:
import os
import pandas as pd

folder_path = './GSE252432_RAW'
dataframes = {}

# Load every .csv.gz file in the folder into a DataFrame, keyed by file name.
# os.listdir() returns entries in arbitrary order, so sort them: this makes
# the dict's insertion order (and anything derived from iterating over it)
# reproducible across machines and re-runs.
for file_name in sorted(os.listdir(folder_path)):
    if file_name.endswith('.csv.gz'):
        full_path = os.path.join(folder_path, file_name)
        df = pd.read_csv(full_path, compression='gzip')
        dataframes[file_name] = df  # Store in dict with filename as key

# Example: show one of them
dataframes['GSM8001307_KH_48_TCR.csv.gz'].head()

Unnamed: 0,barcode,is_cell,contig_id,high_confidence,length,chain,v_gene,d_gene,j_gene,c_gene,...,fwr3_nt,cdr3,cdr3_nt,fwr4,fwr4_nt,reads,umis,raw_clonotype_id,raw_consensus_id,exact_subclonotype_id
0,AAACCTGCATCCTTGC-1,True,AAACCTGCATCCTTGC-1_contig_1,True,479,TRB,TRBV5-5,,TRBJ1-1,TRBC1,...,AGAGGAAGAGGAAACTTCCCTGATCGATTCTCAGCTCGCCAGTTCC...,CASSLGLNTEAFF,TGTGCCAGCAGCTTGGGGCTAAACACTGAAGCTTTCTTT,GQGTRLTVV,GGACAAGGCACCAGACTCACAGTTGTAG,2891,3,clonotype69,clonotype69_consensus_1,1
1,AAACGGGGTCCGTGAC-1,True,AAACGGGGTCCGTGAC-1_contig_1,True,568,TRA,TRAV19,,TRAJ44,TRAC,...,GAAATAAGTGGTCGGTATTCTTGGAACTTCCAGAAATCCACCAGTT...,CALSEDTGTASKLTF,TGTGCTCTGAGTGAGGATACCGGCACTGCCAGTAAACTCACCTTT,GTGTRLQVTL,GGGACTGGAACAAGACTTCAGGTCACGCTCG,801,2,clonotype195,clonotype195_consensus_2,1
2,AAACGGGGTCCGTGAC-1,True,AAACGGGGTCCGTGAC-1_contig_2,True,517,TRB,TRBV14,,TRBJ2-1,TRBC2,...,CAGGATGAGTCCGGTATGCCCAACAATCGATTCTTAGCTGAAAGGA...,CASSLLTPTDDNEQFF,TGTGCCAGCAGCCTCCTTACACCGACGGACGACAATGAGCAGTTCTTC,GPGTRLTVL,GGGCCAGGGACACGGCTCACCGTGCTAG,23208,10,clonotype195,clonotype195_consensus_1,1
3,AAAGTAGCAATCCGAT-1,True,AAAGTAGCAATCCGAT-1_contig_1,True,502,TRB,TRBV7-9,,TRBJ1-2,TRBC1,...,CTAGAAAAATCAAGGCTGCTCAGTGATCGGTTCTCTGCAGAGAGGC...,CASSLDGTNYGYTF,TGTGCCAGCAGCTTAGACGGCACTAACTATGGCTACACCTTC,GSGTRLTVV,GGTTCGGGGACCAGGTTAACCGTTGTAG,9293,3,clonotype16,clonotype16_consensus_1,1
4,AAAGTAGTCACCCGAG-1,True,AAAGTAGTCACCCGAG-1_contig_1,True,556,TRA,TRAV12-1,,TRAJ40,TRAC,...,AATGAAGATGGAAGGTTTACAGCACAGCTCAATAGAGCCAGCCAGT...,CVVNMGGPGTYKYIF,TGTGTGGTGAACATGGGGGGCCCAGGAACCTACAAATACATCTTT,GTGTRLKVLA,GGAACAGGCACCAGGCTGAAGGTTTTAGCAA,6908,3,clonotype88,clonotype88_consensus_3,1


In [3]:
import scanpy as sc
# Read the AnnData object from the local .h5ad file (the gunzipped GSE253173
# download listed in the first cell) and print its summary.
adata_all = sc.read_h5ad("GSE253173_single_cell_DREAM.h5ad")
print(adata_all)

AnnData object with n_obs × n_vars = 390373 × 21906
    obs: 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'predicted.celltype.l2', 'LibraryName', 'PFS_6M', 'Timepoint'
    var: 'gene'
    obsm: 'X_ref.umap'


In [4]:
# Based on barcode, match single cell and TCR data
from collections import defaultdict

def match_barcodes_stop_early(df, adata, threshold=100):
    """
    Return the first library whose matched-cell count exceeds `threshold`.

    Each unique, non-missing value of df["barcode"] has everything after its
    last '-' removed (e.g. the trailing '-1') and is then searched for as a
    literal substring of every entry in adata.obs.index. Every matching row
    adds one to the running count of that row's adata.obs["LibraryName"].
    The scan stops as soon as any single library's count exceeds `threshold`.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a "barcode" column of strings; missing values are ignored.
    adata : object
        Any object exposing `.obs`, a DataFrame with a string index and a
        "LibraryName" column (e.g. an AnnData object).
    threshold : int, default 100
        A library is returned once its count is strictly greater than this.

    Returns
    -------
    The name of the first library to exceed `threshold`, or None if no
    library ever does.
    """
    library_counts = defaultdict(int)

    # Loop-invariant lookups: fetch them once instead of once per barcode.
    obs_names = adata.obs.index
    library_names = adata.obs["LibraryName"]

    # dropna(): a missing barcode is not a string and has nothing to match.
    for bc in df["barcode"].dropna().unique():
        # Remove trailing '-1' if present
        pattern = bc.rsplit('-', 1)[0]

        # An empty pattern is a substring of every cell name; counting it
        # would credit all libraries at once and distort the result.
        if not pattern:
            continue

        # Find matching rows in adata.obs based on partial string match
        mask = obs_names.str.contains(pattern, regex=False)

        # Increment counts for each LibraryName of the matching rows
        for lib in library_names[mask]:
            library_counts[lib] += 1
            if library_counts[lib] > threshold:
                # As soon as a single library's count passes threshold, return it
                return lib

    # If we never exceeded 'threshold', return None
    return None


# Map each library name to the TCR file whose barcodes matched it; files for
# which no library passed the threshold contribute nothing to the map.
tcr_file_map = {}
for file_name, df in dataframes.items():
    max_lib = match_barcodes_stop_early(df, adata_all, threshold=100)
    if max_lib is None:
        continue
    tcr_file_map[max_lib] = file_name

In [5]:
# Show which TCR file was assigned to each library. A library is absent from
# tcr_file_map when no file pushed its match count past the threshold, so use
# .get() to report that explicitly instead of stopping at the first KeyError.
for sample in adata_all.obs['LibraryName'].unique():
    print(sample, tcr_file_map.get(sample, 'NO MATCHING TCR FILE'))

KH_1 GSM8001307_KH_48_TCR.csv.gz
KH_10 GSM8001317_KH_58_TCR.csv.gz
KH_11 GSM8001318_KH_59_TCR.csv.gz
KH_12 GSM8001319_KH_60_TCR.csv.gz
KH_13 GSM8001320_KH_61_TCR.csv.gz
KH_14 GSM8001321_KH_62_TCR.csv.gz
KH_15 GSM8001322_KH_63_TCR.csv.gz
KH_2 GSM8001308_KH_49_TCR.csv.gz
KH_26 GSM8001323_KH_64_TCR.csv.gz
KH_27 GSM8001324_KH_65_TCR.csv.gz
KH_28 GSM8001325_KH_66_TCR.csv.gz
KH_29 GSM8001326_KH_67_TCR.csv.gz
KH_3 GSM8001309_KH_50_TCR.csv.gz
KH_30 GSM8001327_KH_68_TCR.csv.gz
KH_31 GSM8001328_KH_69_TCR.csv.gz
KH_32 GSM8001330_KH_71_TCR.csv.gz
KH_33 GSM8001331_KH_72_TCR.csv.gz
KH_34 GSM8001332_KH_73_TCR.csv.gz
KH_35 GSM8001333_KH_74_TCR.csv.gz
KH_36 GSM8001334_KH_75_TCR.csv.gz
KH_37 GSM8001335_KH_76_TCR.csv.gz
KH_38 GSM8001336_KH_77_TCR.csv.gz
KH_39 GSM8001337_KH_78_TCR.csv.gz
KH_4 GSM8001311_KH_52_TCR.csv.gz
KH_40 GSM8001338_KH_79_TCR.csv.gz
KH_41 GSM8001355_KH_96_TCR.csv.gz
KH_42 GSM8001356_KH_97_TCR.csv.gz
KH_43 GSM8001357_KH_98_TCR.csv.gz
KH_44 GSM8001310_KH_51_TCR.csv.gz
KH_46 GSM8001358_K

In [6]:
import pickle

# Write the library -> TCR file mapping to disk as a pickle
with open('tcr_file_map.pkl', mode='wb') as out_file:
    pickle.dump(tcr_file_map, out_file)