In [None]:
import os
import csv
from Bio import ExPASy, SwissProt
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# For multiple multimer FASTA files provided in the .csv file
- In the following code, I exclude protein pairs whose combined amino acid length (chain A + chain B) is greater than 3600. If you want to generate the FASTA files regardless, just delete that length-check block.

In [None]:
# --- Configuration ---------------------------------------------------------
# UniProt accession of the fixed partner (chain A) that is paired with every
# protein listed in the CSV file.
sequences = {
    "chainA": "Q9GZQ8",  # MAPLC3B
}

# Main output folder; one sub-folder per protein pair is created inside it.
folder_name = "All_multimer_MAPLC3B"  # Updated folder name
os.makedirs(folder_name, exist_ok=True)

# Input CSV file with chainB UniProt IDs and Gene Symbols
csv_file = "SL18_Live_pq_153 modified.csv"

# CSV column index configuration (0-based).
# Expected row layout: EPPK1__P58107, P58107, EPPK1, Epiplakin, ...
UNIPROT_COLUMN = 1  # column holding the UniProt ID (2nd column)
GENE_SYMBOL_COLUMN = 2  # column holding the gene symbol (3rd column)

# --- Fetch the chain A (MAPLC3B) sequence once ------------------------------
try:
    handle_A = ExPASy.get_sprot_raw(sequences["chainA"])
    try:
        record_A = SwissProt.read(handle_A)
    finally:
        # Always release the network handle, even if parsing fails.
        handle_A.close()
    seq_A = record_A.sequence
    entry_name_A = record_A.entry_name
    print(f"Successfully retrieved MAPLC3B sequence: {entry_name_A}")
except Exception as e:
    print(f"Failed to fetch MAPLC3B sequence: {e}")
    # Raise SystemExit explicitly instead of calling exit(1): inside an
    # IPython/Jupyter kernel `exit` is an autocall object that does not raise,
    # so execution would continue without `seq_A` being defined. As a plain
    # script this still terminates with exit status 1.
    raise SystemExit(1) from e

# --- Processing statistics (updated while the CSV rows are processed) -------
processed_count = 0  # pairs whose FASTA file was written
failed_count = 0  # rows that raised an error while being processed
skipped_count = 0  # pairs skipped because the combined length was too large
created_folders = []  # names of the per-pair sub-folders that were created

# Process each chainB UniProt ID listed in the CSV file.
#
# For every data row: fetch the chain B sequence from ExPASy, skip the pair if
# the combined length is too large, otherwise write "<seq_A>:<seq_B>" as a
# single FASTA record into its own sub-folder of `folder_name`.

# Pairs whose combined length (chain A + chain B) exceeds this are skipped.
MAX_COMBINED_LENGTH = 3600

# Minimum number of columns a row needs so both configured indexes are valid.
required_columns = max(UNIPROT_COLUMN, GENE_SYMBOL_COLUMN) + 1

# Folder names handed out during THIS run (stored lower-cased so the check is
# also safe on case-insensitive filesystems). Tracking names per run, rather
# than probing the disk, keeps duplicate gene symbols apart while making a
# re-run reuse the same folders instead of creating new "_1", "_2", ... ones.
used_folder_names = set()

# newline="" is the csv-module recommended way to open files for csv.reader.
with open(csv_file, "r", encoding="utf-8-sig", newline="") as file:
    reader = csv.reader(file)
    header = next(reader, None)  # Get header line (None if the file is empty)
    if header is None:
        print(f"Error: CSV file '{csv_file}' is empty.")
        raise SystemExit(1)
    print(f"CSV header: {header}")

    # Validate that the configured column indexes exist in the header.
    if len(header) < required_columns:
        print(f"Error: CSV doesn't have enough columns. Expected at least {required_columns} columns.")
        # SystemExit is raised explicitly because exit(1) does not stop a
        # running cell inside an IPython/Jupyter kernel.
        raise SystemExit(1)

    print(f"Using column {UNIPROT_COLUMN} for UniProt ID: {header[UNIPROT_COLUMN]}")
    print(f"Using column {GENE_SYMBOL_COLUMN} for Gene Symbol: {header[GENE_SYMBOL_COLUMN]}")

    # Data rows are numbered from 2 because row 1 is the header.
    for row_num, row in enumerate(reader, start=2):
        if len(row) < required_columns:
            print(f"Row {row_num}: Insufficient columns")
            continue

        chainB_id = row[UNIPROT_COLUMN].strip()  # UniProt ID
        gene_symbol = row[GENE_SYMBOL_COLUMN].strip()  # Gene Symbol

        if not chainB_id:
            print(f"Row {row_num}: Empty UniProt ID")
            continue

        if not gene_symbol:
            print(f"Row {row_num}: Empty Gene Symbol for {chainB_id}")
            continue

        try:
            # Fetch chainB sequence
            print(f"Processing {chainB_id} ({gene_symbol})...")
            handle_B = ExPASy.get_sprot_raw(chainB_id)
            try:
                record_B = SwissProt.read(handle_B)
            finally:
                # Release the network handle for every row, success or not.
                handle_B.close()
            seq_B = record_B.sequence

            # Skip if sequence length exceeds limit
            combined_length = len(seq_A) + len(seq_B)
            if combined_length > MAX_COMBINED_LENGTH:
                print(f"⚠️  Skipping {gene_symbol} (combined length {combined_length} > {MAX_COMBINED_LENGTH})")
                skipped_count += 1
                continue

            # Combine sequences with colon
            combined_seq = f"{seq_A}:{seq_B}"
            combined_record = SeqRecord(
                Seq(combined_seq),
                id=f"MAPLC3B_{gene_symbol}",  # Using gene symbol
                description=f"MAPLC3B-{gene_symbol} multimer complex"
            )

            # Pick a sub-folder name for this pair; a numeric suffix is added
            # when the same gene symbol already appeared earlier in this run.
            base_folder_name = f"MAPLC3B_{gene_symbol}"
            pair_folder_name = base_folder_name
            counter = 1
            while pair_folder_name.lower() in used_folder_names:
                pair_folder_name = f"{base_folder_name}_{counter}"
                counter += 1
            used_folder_names.add(pair_folder_name.lower())

            pair_folder_path = os.path.join(folder_name, pair_folder_name)
            os.makedirs(pair_folder_path, exist_ok=True)
            created_folders.append(pair_folder_name)

            # Write FASTA file into the subfolder using gene symbol
            output_filename = f"pair_MAPLC3B_{gene_symbol}.fasta"
            output_path = os.path.join(pair_folder_path, output_filename)

            with open(output_path, "w") as output_handle:
                SeqIO.write(combined_record, output_handle, "fasta")

            print(f"✓ Wrote: {output_path}")
            processed_count += 1

        except Exception as e:
            # Best-effort batch: report the failure and move on to the next row.
            print(f"✗ Failed to process {chainB_id} ({gene_symbol}): {e}")
            failed_count += 1

# Write input_folders.txt: one relative pair-folder path per line.
input_folders_file = "input_folders.txt"
created_folders.sort()  # in-place sort so the listing order is deterministic

with open(input_folders_file, 'w') as list_file:
    # Each entry is the pair folder's path relative to the working directory.
    list_file.writelines(
        f"{os.path.join(folder_name, name)}\n" for name in created_folders
    )

# Run summary, printed line by line.
summary_lines = (
    "\n=== Processing Complete ===",
    f"✅ Successfully processed: {processed_count}",
    f"⚠️  Skipped (length > 3600): {skipped_count}",
    f"❌ Failed: {failed_count}",
    f"📁 Total folders created: {len(created_folders)}",
    f"📄 Files saved in '{folder_name}' folder",
    f"📋 Created folder list: {input_folders_file}",
)
for summary_line in summary_lines:
    print(summary_line)

# Show the first few created folders.
if created_folders:
    print("\n📋 First 5 created folders:")
    preview = created_folders[:5]
    for position, name in enumerate(preview, start=1):
        print(f"  {position}. {name}")
    remaining = len(created_folders) - len(preview)
    if remaining > 0:
        print(f"  ... and {remaining} more")

# Read input_folders.txt back to verify what was written.
print(f"\n🔍 Verifying {input_folders_file}:")
with open(input_folders_file, 'r') as f:
    lines = f.readlines()

print(f"  Total entries: {len(lines)}")
if lines:
    print(f"  First entry: {lines[0].strip()}")
    if len(lines) > 1:
        print(f"  Last entry: {lines[-1].strip()}")

# Hints for regenerating / using the folder list from the shell.
print(f"\n💡 You can now use: find {folder_name} -maxdepth 1 -type d -not -path '{folder_name}' > input_folders_verification.txt")
print(f"💡 Or directly use the generated: {input_folders_file}")