# In [None]:
import os
import pandas as pd

def _save_chain_table(frames, output_file, description):
    """Combine per-file clonotype tables, write them to CSV and return the result.

    Args:
        frames: list of per-file DataFrames with the columns
            'nSeqCDR3', 'Count', 'TotalUMI' and 'Filename'.
        output_file: path of the CSV file to write.
        description: chain label used in the confirmation message.

    Returns:
        The combined DataFrame, or None (nothing is written) when `frames`
        is empty.
    """
    if not frames:
        return None

    combined = pd.concat(frames, ignore_index=True)
    combined = combined.rename(columns={'nSeqCDR3': 'Clonotype'})
    combined = combined[['Filename', 'Clonotype', 'Count', 'TotalUMI']]
    combined.to_csv(output_file, index=False)
    print(f"{description} data saved to {output_file}")
    return combined


def process_clonotypes_with_umi(folder_path, heavy_output_file, light_output_file, summary_file):
    """
    Aggregate clonotype counts and UMI totals from the .tsv files under a folder.

    Every .tsv file found in `folder_path` (subfolders included) is read as a
    tab-separated table. Rows are grouped by 'nSeqCDR3'; for each group the
    number of rows ('Count') and the sum of 'uniqueMoleculeCount' ('TotalUMI')
    are computed. Files are assigned to a chain by their file name: names
    containing 'IGH' are heavy chain, names containing 'IGK' or 'IGL' are
    light chain (reported together as 'IGLC').

    Files that are empty, lack the required columns, or match no chain are
    reported on stdout and left out of all three outputs, so the summary
    only describes files that were actually aggregated.

    Args:
        folder_path: directory to scan recursively for .tsv files.
        heavy_output_file: CSV path for the combined heavy-chain table.
        light_output_file: CSV path for the combined light-chain table.
        summary_file: TSV path for the per-file summary.

    Returns:
        A tuple (heavy_df, light_df, summary_df). Each element is None when
        there was no data for it, in which case the corresponding file is
        not written.
    """
    required_columns = ('nSeqCDR3', 'uniqueMoleculeCount')
    heavy_data = []
    light_data = []
    summary_stats = []

    for root, dirs, files in os.walk(folder_path):
        # os.walk yields entries in filesystem order; sorting keeps the
        # traversal (and therefore the output row order) reproducible.
        dirs.sort()
        for filename in sorted(files):
            if not filename.endswith(".tsv"):
                continue

            file_path = os.path.join(root, filename)
            print(f"Processing file: {file_path}")

            try:
                data = pd.read_csv(file_path, sep='\t')
            except pd.errors.EmptyDataError:
                # A zero-byte file has no header to parse; skip it rather
                # than aborting the whole run.
                print(f"Warning: {filename} is empty and was skipped")
                continue

            if not all(column in data.columns for column in required_columns):
                print(f"Warning: Required columns ('nSeqCDR3', 'uniqueMoleculeCount') not found in {filename}")
                continue

            # Decide the chain first so that unrecognized files are never
            # recorded in the summary under a wrong chain label.
            if 'IGH' in filename:
                chain_type, target = 'IGH', heavy_data
            elif 'IGK' in filename or 'IGL' in filename:
                chain_type, target = 'IGLC', light_data
            else:
                print(f"Unrecognized file type (not IGH/IGK/IGL): {filename}")
                continue

            filtered_data = data[list(required_columns)].dropna()
            grouped = filtered_data.groupby('nSeqCDR3').agg(
                Count=('nSeqCDR3', 'size'),
                TotalUMI=('uniqueMoleculeCount', 'sum')
            ).reset_index()
            grouped['Filename'] = filename

            target.append(grouped)
            summary_stats.append({
                'Filename': filename,
                'ChainType': chain_type,
                'UniqueClonotypes': grouped.shape[0],
                'TotalUMI': grouped['TotalUMI'].sum(),
            })

    heavy_df = _save_chain_table(heavy_data, heavy_output_file, "Heavy chain (IGH)")
    light_df = _save_chain_table(light_data, light_output_file, "Light chain (IGLC = IGK+IGL)")

    if summary_stats:
        summary_df = pd.DataFrame(summary_stats)
        summary_df = summary_df[['Filename', 'ChainType', 'UniqueClonotypes', 'TotalUMI']]
        summary_df.to_csv(summary_file, sep='\t', index=False)
        print(f"Summary statistics saved to {summary_file}")
    else:
        summary_df = None

    return heavy_df, light_df, summary_df

# In [None]:
def process_nested_folders(root_folder):
    """
    Walk `root_folder` and aggregate clonotypes for every folder holding .tsv files.

    For each folder that directly contains at least one .tsv file,
    process_clonotypes_with_umi is called with three output paths built from
    the folder path, with '/' and '\\' replaced by '_'. The paths carry no
    directory component, so the files are created relative to the current
    working directory:
        - <prefix>_IGH_heavychain.csv      per-clonotype heavy-chain table,
        - <prefix>_IGLC_lightchain.csv     per-clonotype light-chain table,
        - <prefix>_clonotype_summary.tsv   per-file summary.

    No plots are generated by this function, and the DataFrames returned by
    process_clonotypes_with_umi are not used.

    NOTE(review): process_clonotypes_with_umi walks subfolders itself, so the
    outputs of a parent folder also include the .tsv files of its nested
    folders, which are then processed again on their own. Confirm that this
    overlap is intended.

    Args:
        root_folder: top-level directory to scan.
    """
    for folder_path, _, files in os.walk(root_folder):
        # Only folders that directly contain .tsv files are processed.
        if not any(name.endswith(".tsv") for name in files):
            continue

        print(f"Processing folder: {os.path.basename(folder_path)}")

        # Flatten the folder path into a string usable as a file-name prefix.
        prefix = folder_path.replace("/", "_").replace("\\", "_")

        process_clonotypes_with_umi(
            folder_path,
            f"{prefix}_IGH_heavychain.csv",
            f"{prefix}_IGLC_lightchain.csv",
            f"{prefix}_clonotype_summary.tsv",
        )


# Top-level directory whose nested folders are scanned for .tsv files
root_folder = "MGI"  # Replace with the path to your root folder

# Run the per-folder aggregation over the whole directory tree
process_nested_folders(root_folder=root_folder)
