In [1]:
import pandas as pd
import glob
import os

def process_tcr_files(folder_path):
    # Define the search pattern for the TCR files.
    # This will find all files in the specified folder that end with the given suffix.
    file_pattern = os.path.join(folder_path, '*scTCR_filtered_contig_annotations.csv.gz')
    
    # Use glob to find all files matching the pattern.
    tcr_files = glob.glob(file_pattern)
    
    if not tcr_files:
        print(f"No files found matching the pattern: {file_pattern}")
        return None

    # A list to hold the individual, processed DataFrames.
    all_tcr_data = []

    # Loop through each file found.
    for f in tcr_files:
        # Extract sample identifiers from the filename.
        # e.g., from 'GEMM14190_PEM1C1_filtered_contig_annotations.csv.gz'
        # sample -> 'GEMM14190'
        # ident -> 'PEM1C1'
        filename = os.path.basename(f)
        parts = filename.split('_')
        sample = parts[0]
        ident = parts[1]
        
        print(f"→ Processing {sample} ({ident}) from {filename}")

        # Read the gzipped CSV file into a pandas DataFrame.
        # Pandas handles the '.gz' decompression automatically.
        try:
            df = pd.read_csv(f)
        except Exception as e:
            print(f"  Could not read file {filename}. Error: {e}")
            continue

        # Create the composite cell ID by combining the barcode and the identifier.
        # This mimics the logic 'barcode + "_" + orig.ident' from your example.
        # We check if the 'barcode' column exists before proceeding.
        if 'barcode' in df.columns:
            df['cell_id'] = df['barcode'] + '_' + ident
        else:
            print(f"  'barcode' column not found in {filename}. Skipping cell_id creation.")
            continue
            
        # Add the sample and ident as new columns for metadata.
        df['sample'] = sample
        df['orig.ident'] = ident
        
        # Add the processed DataFrame to our list.
        all_tcr_data.append(df)

    # Concatenate all the DataFrames in the list into a single DataFrame.
    if not all_tcr_data:
        print("No data was processed.")
        return None
        
    print("\nConcatenating all processed files...")
    concatenated_df = pd.concat(all_tcr_data, ignore_index=True)
    
    return concatenated_df



In [2]:
data_folder = './' 

# Run the processing function
final_tcr_data = process_tcr_files(data_folder)

→ Processing GSM6514154 (PEM3C3) from GSM6514154_PEM3C3_scTCR_filtered_contig_annotations.csv.gz
→ Processing GSM6514178 (PEM12C3) from GSM6514178_PEM12C3_scTCR_filtered_contig_annotations.csv.gz
→ Processing GSM6514192 (PEM19C1) from GSM6514192_PEM19C1_scTCR_filtered_contig_annotations.csv.gz
→ Processing GSM6514182 (PEM14C1) from GSM6514182_PEM14C1_scTCR_filtered_contig_annotations.csv.gz
→ Processing GSM6514162 (PEM7C1) from GSM6514162_PEM7C1_scTCR_filtered_contig_annotations.csv.gz
→ Processing GSM6514181 (PEM13C5) from GSM6514181_PEM13C5_scTCR_filtered_contig_annotations.csv.gz
→ Processing GSM6514197 (PEM22C5) from GSM6514197_PEM22C5_scTCR_filtered_contig_annotations.csv.gz
→ Processing GSM6514157 (PEM5C3) from GSM6514157_PEM5C3_scTCR_filtered_contig_annotations.csv.gz
→ Processing GSM6514185 (PEM15C5) from GSM6514185_PEM15C5_scTCR_filtered_contig_annotations.csv.gz
→ Processing GSM6514166 (PEM8C3) from GSM6514166_PEM8C3_scTCR_filtered_contig_annotations.csv.gz
→ Processing GSM65

In [3]:
final_tcr_data

Unnamed: 0,barcode,is_cell,contig_id,high_confidence,length,chain,v_gene,d_gene,j_gene,c_gene,...,productive,cdr3,cdr3_nt,reads,umis,raw_clonotype_id,raw_consensus_id,cell_id,sample,orig.ident
0,AAACCTGAGGATGGAA-1,True,AAACCTGAGGATGGAA-1_contig_1,True,703,TRB,TRBV7-9,TRBD1,TRBJ1-4,TRBC1,...,True,CASSLGTGANEKLFF,TGTGCCAGCAGCTTGGGGACAGGCGCCAATGAAAAACTGTTTTTT,122394,27,clonotype40,clonotype40_consensus_2,AAACCTGAGGATGGAA-1_PEM3C3,GSM6514154,PEM3C3
1,AAACCTGAGGATGGAA-1,True,AAACCTGAGGATGGAA-1_contig_2,True,517,TRA,TRAV13-1,,TRAJ48,TRAC,...,True,CAARIGNEKLTF,TGTGCAGCAAGGATTGGAAATGAGAAATTAACCTTT,34504,7,clonotype40,clonotype40_consensus_1,AAACCTGAGGATGGAA-1_PEM3C3,GSM6514154,PEM3C3
2,AAACCTGTCGAATGGG-1,True,AAACCTGTCGAATGGG-1_contig_1,True,723,TRA,TRAV5,,TRAJ42,TRAC,...,True,CAESENYGGSQGNLIF,TGTGCAGAGAGTGAAAATTATGGAGGAAGCCAAGGAAATCTCATCTTT,16219,3,clonotype86,clonotype86_consensus_2,AAACCTGTCGAATGGG-1_PEM3C3,GSM6514154,PEM3C3
3,AAACCTGTCGAATGGG-1,True,AAACCTGTCGAATGGG-1_contig_2,True,952,Multi,,,TRAJ5,TRAC,...,,,,3252,1,clonotype86,,AAACCTGTCGAATGGG-1_PEM3C3,GSM6514154,PEM3C3
4,AAACCTGTCGAATGGG-1,True,AAACCTGTCGAATGGG-1_contig_5,True,866,TRB,TRBV24-1,TRBD2,TRBJ2-1,TRBC2,...,True,CATSDFGMNNEQFF,TGTGCCACCAGTGATTTCGGGATGAACAATGAGCAGTTCTTC,62097,15,clonotype86,clonotype86_consensus_1,AAACCTGTCGAATGGG-1_PEM3C3,GSM6514154,PEM3C3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
402736,TTTGTCAGTGAGCGAT-1,True,TTTGTCAGTGAGCGAT-1_contig_2,True,460,TRB,TRBV28,,TRBJ1-1,TRBC1,...,True,CASIRTTEAFF,TGTGCCAGTATCCGGACCACTGAAGCTTTCTTT,8263,4,clonotype2169,clonotype2169_consensus_1,TTTGTCAGTGAGCGAT-1_PEM13C1,GSM6514180,PEM13C1
402737,TTTGTCAGTGAGCGAT-1,True,TTTGTCAGTGAGCGAT-1_contig_3,True,310,TRB,,,TRBJ2-5,TRBC2,...,False,,,4302,1,clonotype2169,,TTTGTCAGTGAGCGAT-1_PEM13C1,GSM6514180,PEM13C1
402738,TTTGTCATCTCACATT-1,True,TTTGTCATCTCACATT-1_contig_1,True,465,TRA,TRAV13-1,,TRAJ53,TRAC,...,True,CAASGGGGSNYKLTF,TGTGCAGCAAGTGGGGGTGGAGGTAGCAACTATAAACTGACATTT,3136,2,clonotype2170,clonotype2170_consensus_2,TTTGTCATCTCACATT-1_PEM13C1,GSM6514180,PEM13C1
402739,TTTGTCATCTCACATT-1,True,TTTGTCATCTCACATT-1_contig_2,True,496,TRB,TRBV3-1,,TRBJ1-6,TRBC1,...,True,CASSQGGLGSPLHF,TGTGCCAGCAGCCAAGGCGGGTTGGGGTCACCCCTCCACTTT,13472,7,clonotype2170,clonotype2170_consensus_1,TTTGTCATCTCACATT-1_PEM13C1,GSM6514180,PEM13C1


In [4]:
final_tcr_data.set_index('cell_id', inplace=True)
final_tcr_data

Unnamed: 0_level_0,barcode,is_cell,contig_id,high_confidence,length,chain,v_gene,d_gene,j_gene,c_gene,full_length,productive,cdr3,cdr3_nt,reads,umis,raw_clonotype_id,raw_consensus_id,sample,orig.ident
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
AAACCTGAGGATGGAA-1_PEM3C3,AAACCTGAGGATGGAA-1,True,AAACCTGAGGATGGAA-1_contig_1,True,703,TRB,TRBV7-9,TRBD1,TRBJ1-4,TRBC1,True,True,CASSLGTGANEKLFF,TGTGCCAGCAGCTTGGGGACAGGCGCCAATGAAAAACTGTTTTTT,122394,27,clonotype40,clonotype40_consensus_2,GSM6514154,PEM3C3
AAACCTGAGGATGGAA-1_PEM3C3,AAACCTGAGGATGGAA-1,True,AAACCTGAGGATGGAA-1_contig_2,True,517,TRA,TRAV13-1,,TRAJ48,TRAC,True,True,CAARIGNEKLTF,TGTGCAGCAAGGATTGGAAATGAGAAATTAACCTTT,34504,7,clonotype40,clonotype40_consensus_1,GSM6514154,PEM3C3
AAACCTGTCGAATGGG-1_PEM3C3,AAACCTGTCGAATGGG-1,True,AAACCTGTCGAATGGG-1_contig_1,True,723,TRA,TRAV5,,TRAJ42,TRAC,True,True,CAESENYGGSQGNLIF,TGTGCAGAGAGTGAAAATTATGGAGGAAGCCAAGGAAATCTCATCTTT,16219,3,clonotype86,clonotype86_consensus_2,GSM6514154,PEM3C3
AAACCTGTCGAATGGG-1_PEM3C3,AAACCTGTCGAATGGG-1,True,AAACCTGTCGAATGGG-1_contig_2,True,952,Multi,,,TRAJ5,TRAC,False,,,,3252,1,clonotype86,,GSM6514154,PEM3C3
AAACCTGTCGAATGGG-1_PEM3C3,AAACCTGTCGAATGGG-1,True,AAACCTGTCGAATGGG-1_contig_5,True,866,TRB,TRBV24-1,TRBD2,TRBJ2-1,TRBC2,True,True,CATSDFGMNNEQFF,TGTGCCACCAGTGATTTCGGGATGAACAATGAGCAGTTCTTC,62097,15,clonotype86,clonotype86_consensus_1,GSM6514154,PEM3C3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCAGTGAGCGAT-1_PEM13C1,TTTGTCAGTGAGCGAT-1,True,TTTGTCAGTGAGCGAT-1_contig_2,True,460,TRB,TRBV28,,TRBJ1-1,TRBC1,True,True,CASIRTTEAFF,TGTGCCAGTATCCGGACCACTGAAGCTTTCTTT,8263,4,clonotype2169,clonotype2169_consensus_1,GSM6514180,PEM13C1
TTTGTCAGTGAGCGAT-1_PEM13C1,TTTGTCAGTGAGCGAT-1,True,TTTGTCAGTGAGCGAT-1_contig_3,True,310,TRB,,,TRBJ2-5,TRBC2,False,False,,,4302,1,clonotype2169,,GSM6514180,PEM13C1
TTTGTCATCTCACATT-1_PEM13C1,TTTGTCATCTCACATT-1,True,TTTGTCATCTCACATT-1_contig_1,True,465,TRA,TRAV13-1,,TRAJ53,TRAC,True,True,CAASGGGGSNYKLTF,TGTGCAGCAAGTGGGGGTGGAGGTAGCAACTATAAACTGACATTT,3136,2,clonotype2170,clonotype2170_consensus_2,GSM6514180,PEM13C1
TTTGTCATCTCACATT-1_PEM13C1,TTTGTCATCTCACATT-1,True,TTTGTCATCTCACATT-1_contig_2,True,496,TRB,TRBV3-1,,TRBJ1-6,TRBC1,True,True,CASSQGGLGSPLHF,TGTGCCAGCAGCCAAGGCGGGTTGGGGTCACCCCTCCACTTT,13472,7,clonotype2170,clonotype2170_consensus_1,GSM6514180,PEM13C1


In [5]:
final_tcr_data.to_csv('vdj.csv',index = True)