In [1]:
import os
import pandas as pd

def _row_label(filename, taken):
    """Return the output row label for a TSV file.

    The label is the 3rd through 10th '_'-separated parts of the filename.
    When that slice is empty (fewer than three parts) or already used by
    another file, the full filename is used instead so that no file's row is
    silently overwritten.
    """
    label = "_".join(filename.split("_")[2:10])
    if not label or label in taken:
        return filename
    return label


def _sum_counts_by_cell_line(tsv_data, lookup_dict):
    """Sum uniqueMoleculeCount per matched cell line for one TSV table.

    Rows whose aaSeqCDR3 is not a key of `lookup_dict` (including missing
    sequences) are accumulated under the 'None' key.
    """
    # A missing count contributes nothing; without this a single NaN would
    # turn the whole running sum into NaN.
    counts = tsv_data['uniqueMoleculeCount'].fillna(0)

    file_counts = {}
    unmatched_count = 0
    for sequence, unique_count in zip(tsv_data['aaSeqCDR3'], counts):
        cell_line = lookup_dict.get(sequence)
        # Compare against None explicitly so falsy labels still count as matches.
        if cell_line is None:
            unmatched_count += unique_count
        else:
            file_counts[cell_line] = file_counts.get(cell_line, 0) + unique_count

    file_counts['None'] = unmatched_count
    return file_counts


def process_files_with_none(input_file, folder_path, output_file):
    """
    Sum uniqueMoleculeCount per cell line for every TSV file in a folder.

    Each aaSeqCDR3 value in a TSV file is looked up by exact equality against
    the CDR3aa values of the input CSV (dictionary lookup; this is not a
    substring search). Counts of sequences without a match are reported in
    an extra 'None' column.

    Parameters:
        input_file (str): Path to the input CSV file with 'cell_line' and 'CDR3aa' columns.
        folder_path (str): Path to the folder containing the '.tsv' files. A file whose
            name equals the basename of output_file is skipped.
        output_file (str): Path of the tab-separated result file (one row per input
            file, one column per cell line plus 'None').

    Returns:
        pandas.DataFrame: The table that was written to output_file.

    Raises:
        ValueError: If the input CSV lacks the 'CDR3aa' or 'cell_line' column.
    """
    file1_data = pd.read_csv(input_file, sep=",")

    if 'CDR3aa' not in file1_data.columns or 'cell_line' not in file1_data.columns:
        raise ValueError("Input file must contain 'CDR3aa' and 'cell_line' columns.")

    # Map each CDR3aa to the first cell line it is listed with. Rows with a
    # missing sequence or cell line cannot identify anything and are ignored.
    reference = file1_data.dropna(subset=['CDR3aa', 'cell_line'])
    lookup_dict = {}
    for cdr3aa, cell_line in zip(reference['CDR3aa'], reference['cell_line']):
        lookup_dict.setdefault(cdr3aa, cell_line)

    output_name = os.path.basename(output_file)
    results = {}

    # Sorted so the row order of the output does not depend on the
    # arbitrary order in which os.listdir returns entries.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".tsv") or filename == output_name:
            continue

        filepath = os.path.join(folder_path, filename)
        tsv_data = pd.read_csv(filepath, sep="\t", usecols=['aaSeqCDR3', 'uniqueMoleculeCount'])

        results[_row_label(filename, results)] = _sum_counts_by_cell_line(tsv_data, lookup_dict)

        print(f"Processed file: {filename}")

    # Cell lines absent from a file get 0; transpose so files are rows.
    result_df = pd.DataFrame(results).fillna(0).T

    result_df.to_csv(output_file, sep="\t", index_label="Filename")
    return result_df

In [3]:

# Parameters for processing (machine-specific absolute paths; edit before running elsewhere)
input_file = "/home/rittika/Workprojects/TakaraBio-MiXCR/BCR-SEQC_nine_cell_lines.csv"  # CSV with 'cell_line' and 'CDR3aa' columns
folder_path = "/home/rittika/Workprojects/NEB"        # Folder containing the .tsv files to process
output_file = "/home/rittika/Workprojects/NEB/NEB-ShortReads.tsv"    # Result table; skipped as an input because the name matches

# Execute the function
process_files_with_none(input_file, folder_path, output_file)
print("All files are done.")

Processed file: MiXCR_IMGT_Exp2-RNA-D1-NEB-L1-4a75bb_S3.clones_IGL.tsv
Processed file: MiXCR_IMGT_Exp2-RNA-D1-NEB-L1-4a75bb_S3.clones_IGH.tsv
Processed file: MiXCR_IMGT_Exp2-RNA-D1-NEB-L2-d681bd_S10.clones_IGK.tsv
Processed file: MiXCR_IMGT_Exp2-RNA-D1-NEB-L2-d681bd_S10.clones_IGL.tsv
Processed file: MiXCR_IMGT_Exp2-RNA-D1-NEB-L2-d681bd_S10.clones_IGH.tsv
Processed file: MiXCR_IMGT_Exp2-RNA-D1-NEB-L3-2626b4_S17.clones_IGH.tsv
Processed file: MiXCR_IMGT_Exp2-RNA-D1-NEB-L3-2626b4_S17.clones_IGK.tsv
Processed file: MiXCR_IMGT_Exp2-RNA-D1-NEB-L3-2626b4_S17.clones_IGL.tsv
Processed file: MiXCR_IMGT_Exp2-RNA-D1-NEB-L1-4a75bb_S3.clones_IGK.tsv
All files are done.
