In [1]:
import pandas as pd
import os

def process_clonotypes_in_folder(folder_path, output_file):
    """Count clonotypes per file for a folder of ``*_airr.tsv`` files.

    Every file in ``folder_path`` whose name ends with ``_airr.tsv`` is read
    as a tab-separated table. The non-null values of its ``junction_aa``
    column are counted, and one row per (file, clonotype) pair is written to
    ``output_file`` as CSV with the columns ``Filename``, ``Clonotype`` and
    ``Count``.

    Files without a ``junction_aa`` column are reported with a warning and
    skipped. Files are processed in sorted name order so the output is
    reproducible between runs.

    Args:
        folder_path: Directory to scan (not recursive).
        output_file: Path of the CSV file to write.

    Returns:
        None. The result is written to ``output_file``.
    """
    output_columns = ['Filename', 'Clonotype', 'Count']

    # One dict per (file, clonotype) pair, collected across all files.
    all_clonotype_data = []

    # os.listdir() returns names in arbitrary order; sort them so the row
    # order of the output does not depend on the filesystem.
    for filename in sorted(os.listdir(folder_path)):
        # Only files carrying the "_airr.tsv" suffix are processed.
        if not filename.endswith("_airr.tsv"):
            continue

        file_path = os.path.join(folder_path, filename)
        print(f"Processing file: {filename}")

        data = pd.read_csv(file_path, sep='\t')

        if 'junction_aa' not in data.columns:
            print(f"Warning: 'junction_aa' column not found in {filename}")
            continue

        # Missing junction sequences are dropped before counting;
        # value_counts() yields clonotypes ordered by descending count.
        clonotype_counts = data['junction_aa'].dropna().value_counts()

        all_clonotype_data.extend(
            {'Filename': filename, 'Clonotype': clonotype, 'Count': count}
            for clonotype, count in clonotype_counts.items()
        )

    # Pin the columns explicitly: with no collected rows the DataFrame would
    # otherwise have no columns and the CSV would be written without a header.
    clonotype_df = pd.DataFrame(all_clonotype_data, columns=output_columns)

    clonotype_df.to_csv(output_file, index=False)
    print(f"Clonotype data saved to {output_file}")

# Input directory scanned for TRUST4 "*_airr.tsv" files, and the CSV that
# receives the per-file clonotype counts. Adjust both to your own layout.
folder_path = "/home/rittika/Workprojects/trust4-run/takara-bio500ng"
output_file = "/home/rittika/Workprojects/trust4-run/trust4_TKB-RNA_unique_clonotypes.csv"

# Run the aggregation over the configured folder.
process_clonotypes_in_folder(folder_path=folder_path, output_file=output_file)

Processing file: TRUST4_Exp2-RNA-D2-TKB-L2-NextSeq2k-Heavy-Chain-500ng_merged_L001_airr.tsv
Clonotype data saved to /home/rittika/Workprojects/trust4-run/trust4_TKB-RNA_unique_clonotypes.csv
