In [1]:
import os
import pandas as pd

In [12]:
import os
import pandas as pd

def process_files_with_none(input_file, folder_path, output_file):
    """
    Sum uniqueMoleculeCount per reference cell line for every clonotype table in a folder.

    Each aaSeqCDR3 value found in a TSV file is looked up in the reference table
    (CDR3aa -> cell_line) by exact, whole-string equality.
    NOTE(review): this function was previously described as matching CDR3aa as a
    *substring* of aaSeqCDR3; the lookup implemented here is an exact dictionary
    lookup, not a substring search -- confirm which one is intended.

    Files are expected to be named ``<sample>_<chain>.tsv`` with chain in
    IGH / IGK / IGL. For each sample, all IGK and IGL files are concatenated,
    written to ``<sample>_IGLC.tsv`` inside ``folder_path`` and counted as one
    table; every IGH file is counted on its own. UMIs of sequences that do not
    map to one of the listed cell lines are reported in the 'None' column, and
    'TotaluniqUMI' holds the row total.

    A header-only output file is written when the folder contains no usable
    tables.

    Parameters:
        input_file (str): Path to the input CSV file with cell line and CDR3aa information.
        folder_path (str): Path to the folder containing TSV files.
        output_file (str): Path to save the resultant output file (tab-separated).

    Raises:
        ValueError: If the input CSV lacks the 'CDR3aa' or 'cell_line' column.
    """
    # Desired column order of the output table.
    cell_line_order = [
        'MEC-1', 'JK-6', 'DAUDI', 'RI-1', 'OCI-LY1',
        'EHEB', 'LB5871-LYMP', 'WSU-DLCL', 'WSU-NHL', 'None', 'TotaluniqUMI'
    ]
    # 'None' and 'TotaluniqUMI' are bookkeeping columns, never valid match targets;
    # a reference row carrying one of these labels must not leak into them.
    known_cell_lines = [name for name in cell_line_order if name not in ('None', 'TotaluniqUMI')]
    count_columns = known_cell_lines + ['None']
    tsv_columns = ['aaSeqCDR3', 'uniqueMoleculeCount']

    # Read the input CSV file
    file1_data = pd.read_csv(input_file, sep=",")
    if 'CDR3aa' not in file1_data.columns or 'cell_line' not in file1_data.columns:
        raise ValueError("Input file must contain 'CDR3aa' and 'cell_line' columns.")

    # CDR3aa -> cell_line. Rows without a sequence are dropped so that a NaN key
    # can never pair up with missing aaSeqCDR3 values; for duplicated sequences
    # the last row wins.
    reference = file1_data.dropna(subset=['CDR3aa'])
    lookup_dict = dict(zip(reference['CDR3aa'], reference['cell_line']))

    # Sorted so that the row order of the output does not depend on the
    # filesystem's directory listing order.
    files = sorted(
        f for f in os.listdir(folder_path)
        if f.endswith(".tsv") and not f.startswith('.') and f != os.path.basename(output_file)
    )

    # Group files by sample name and chain type
    sample_files = {}
    for filename in files:
        parts = os.path.splitext(filename)[0].split('_')
        if len(parts) < 2:
            continue  # Skip files with unexpected naming convention
        chain = parts[-1]
        if chain not in ('IGH', 'IGK', 'IGL'):
            continue  # e.g. a previously written <sample>_IGLC.tsv
        sample_name = "_".join(parts[:-1])
        sample_files.setdefault(sample_name, {'IGH': [], 'IGK': [], 'IGL': []})[chain].append(filename)

    results = {}
    for sample_name, chains_dict in sample_files.items():
        tables = []  # (file name, DataFrame) pairs to count for this sample

        # Combine IGK & IGL files into a single IGLC table (also saved to disk).
        light_chain_files = chains_dict['IGK'] + chains_dict['IGL']
        if light_chain_files:
            combined_df = pd.concat(
                [
                    pd.read_csv(os.path.join(folder_path, f), sep="\t", usecols=tsv_columns)
                    for f in light_chain_files
                ],
                ignore_index=True,
            )
            combined_name = f"{sample_name}_IGLC.tsv"
            combined_df.to_csv(os.path.join(folder_path, combined_name), sep="\t", index=False)
            tables.append((combined_name, combined_df))

        # Keep IGH files as is
        for f in chains_dict['IGH']:
            tables.append((f, pd.read_csv(os.path.join(folder_path, f), sep="\t", usecols=tsv_columns)))

        for filename, tsv_data in tables:
            # Row label: underscore-separated fields 2..9 of the file name.
            simplified_name = "_".join(filename.split("_")[2:10])
            # Fall back to the full name rather than silently overwriting an
            # earlier row when the shortened label is empty or already taken.
            if not simplified_name or simplified_name in results:
                simplified_name = filename
            results[simplified_name] = _count_umis_by_cell_line(tsv_data, lookup_dict, known_cell_lines)

            print(f"Processed file: {filename}")

    # One row per table; passing the columns explicitly keeps the frame
    # well-formed (header only) when nothing was processed.
    result_df = pd.DataFrame(list(results.values()), index=list(results.keys()), columns=count_columns)
    result_df['TotaluniqUMI'] = result_df.sum(axis=1)

    # Reorder the columns
    result_df = result_df[cell_line_order]

    # Save the output as a TSV file
    result_df.to_csv(output_file, sep="\t", index_label="Filename")


def _count_umis_by_cell_line(tsv_data, lookup_dict, known_cell_lines):
    """Return {cell line: summed UMIs, ..., 'None': UMIs of unmatched sequences} for one table."""
    # A missing count contributes nothing instead of turning every sum into NaN.
    umi_counts = tsv_data['uniqueMoleculeCount'].fillna(0)
    # Exact lookup; sequences absent from the reference map to NaN.
    assigned = tsv_data['aaSeqCDR3'].map(lookup_dict)
    per_line = umi_counts.groupby(assigned).sum()  # NaN keys are dropped by groupby

    file_counts = {name: per_line.get(name, 0) for name in known_cell_lines}
    # Everything not assigned to a listed cell line counts as unmatched.
    file_counts['None'] = umi_counts[~assigned.isin(known_cell_lines)].sum()
    return file_counts

def calculate_percentages(input_counts_file, output_percentages_file):
    """
    Turn a per-file counts table into a per-file percentage table.

    The counts table is read with 'Filename' as its index. Every column other
    than 'TotaluniqUMI' (the 'None' column included) is divided by that row's
    'TotaluniqUMI' and multiplied by 100; 'TotaluniqUMI' itself is then written
    back as a constant 100 in the last column.

    Parameters:
        input_counts_file (str): Path to the tab-separated counts file.
        output_percentages_file (str): Path of the tab-separated percentage file to write.
    """
    counts = pd.read_csv(input_counts_file, sep="\t", index_col="Filename")

    # pop() hands back the totals and leaves only the per-cell-line columns behind.
    row_totals = counts.pop('TotaluniqUMI')

    percentages = (
        counts
        .div(row_totals, axis=0)     # row-wise share of the total
        .mul(100)
        .assign(TotaluniqUMI=100)    # appended as the final column
    )

    percentages.to_csv(output_percentages_file, sep="\t", index_label="Filename")


In [15]:
def process_nested_folders(root_folder, input_file="BCR-SEQC_nine_cell_lines.csv"):
    """
    Walk a directory tree and process every folder that contains .tsv files.

    For each such folder, process_files_with_none() writes a counts table and
    calculate_percentages() derives the percentage table from it. Both output
    files are named after the folder path with its separators replaced by
    underscores, so they are created relative to the current working
    directory, not inside the folder being processed.

    Parameters:
        root_folder (str): Top of the directory tree to scan.
        input_file (str): Path to the reference CSV with 'CDR3aa' and 'cell_line'
            columns. The default is the file name used so far, resolved against
            the current working directory.

    Raises:
        NotADirectoryError: If root_folder does not exist or is not a directory
            (os.walk would otherwise silently process nothing).
    """
    if not os.path.isdir(root_folder):
        raise NotADirectoryError(f"Root folder does not exist or is not a directory: {root_folder}")

    for folder_path, _subdirs, files in os.walk(root_folder):
        # Only folders that directly contain .tsv files are processed.
        if not any(name.endswith(".tsv") for name in files):
            continue

        # normpath keeps the label meaningful when the path ends with a separator.
        folder_name = os.path.basename(os.path.normpath(folder_path))
        print(f"Processing folder: {folder_name}")

        # Flatten the folder path into a file-name prefix.
        short_string = folder_path.replace("/", "_")
        if os.sep != "/":
            short_string = short_string.replace(os.sep, "_")

        # Generate output file names based on the folder path
        output_file = f"{short_string}_clonotypes_with_filenames_and_umi.tsv"
        percent_output_file = f"{short_string}_clonotypes_percentage.tsv"
        print(percent_output_file)

        # Process the folder with the provided input file
        process_files_with_none(input_file, folder_path, output_file)

        # Calculate percentages
        calculate_percentages(output_file, percent_output_file)
        print(f"Completed processing for folder: {folder_name}")


# Root of the directory tree to scan. This is a machine-specific absolute path:
# replace it with your own root folder before running the notebook elsewhere.
root_folder = "/home/rittika/Workprojects/AbHelix-data/CDR3-trials"  # Replace with the path to your root folder

# Walk root_folder and, for every folder that contains .tsv files, write a
# counts table and a percentage table. The output names are derived from the
# folder path, and the files are created relative to the current working directory.
process_nested_folders(root_folder)

Processing folder: CDR3-trials
_home_rittika_Workprojects_AbHelix-data_CDR3-trials_clonotypes_percentage.tsv
Processed file: MiXCR_IMGT_Exp2_RNA_MME_ABH_L7_NextSeq2k.clones_IGH.tsv
Processed file: MiXCR_IMGT_Exp2_RNA_MME_ABH_L1_NextSeq2k.clones_IGH.tsv
Processed file: MiXCR_IMGT_Exp2_RNA_MME_ABH_L4_NextSeq2k.clones_IGH.tsv
Completed processing for folder: CDR3-trials


In [11]:
   
# # Parameters for processing
# input_file = "/home/rittika/Workprojects/TakaraBio-MiXCR/BCR-SEQC_nine_cell_lines.csv"  # Replace with the actual input file path
# folder_path = "/home/rittika/Workprojects/TakaraBio-MiXCR/MiXCR-TSV/"        # Replace with the actual folder path
# output_file = "/home/rittika/Workprojects/TakaraBio-MiXCR/RNA-Takara-500ng-full-output-new.csv"    # Replace with the desired output file path
# percent_output_file = "/home/rittika/Workprojects/TakaraBio-MiXCR/RNA-Takara-500ng-percent-output-new.csv"    # Replace with the desired output file path

# # process_files_with_none("input.csv", "path_to_folder", "counts_output.tsv")
# # calculate_percentages("counts_output.tsv", "percentages_output.tsv")

# # Execute the function
# process_files_with_none(input_file, folder_path, output_file)
# calculate_percentages(output_file, percent_output_file)
# print(f"All files are done.")