In [96]:
import pandas as pd
from Bio import SeqIO
import re
IUPAC_CODES = {
    frozenset(["A", "G"]): "R",
    frozenset(["C", "T"]): "Y",
    frozenset(["A", "C"]): "M",
    frozenset(["G", "T"]): "K",
    frozenset(["G", "C"]): "S",
    frozenset(["A", "T"]): "W"
}

def load_reference_genome(fasta_file):
    """Load reference genome from FASTA file."""
    record = SeqIO.read(fasta_file, "fasta")
    return list(str(record.seq))

def rightmost_position(reference, pos, variant, ref_base):
    """
    Finds the rightmost position where an insertion/deletion should be placed based on repeat structure.
    """
    pos = int(pos) - 1  # Convert to zero-based index
    ref_base = reference[pos] if pos < len(reference) else ref_base
    # print(reference[pos])
    
    # Move the insertion to the rightmost position in case of repeated bases
    while pos + 1 < len(reference) and ''.join(reference[pos + 1]) == variant:
        pos += 1

    return pos + 1  # Convert back to 1-based index

def rightmost_position_segment_ins(reference, pos, segment):
    """
    Finds the rightmost position where an insertion or deletion should be placed
    based on repeated sequences.
    """
    pos = int(pos) - 1  # Convert to zero-based index
    # print(pos)
    # print(reference[pos + 1: pos + 1 + len(segment)])
    # Move position to the rightmost occurrence of the repeated segment
    while pos + len(segment) < len(reference) and ''.join(reference[pos + 1: pos + 1 + len(segment)]) == segment:
        pos += len(segment)
        # print(pos)
        # print(reference[pos + 1: pos + 1 + len(segment)])
    # print(pos)
    return pos+len(segment)  # Convert back to 1-based index

def rightmost_position_segment_del(reference, pos, segment):
    """
    Finds the rightmost position where an insertion or deletion should be placed
    based on repeated sequences.
    """
    pos = int(pos) -1 # Convert to zero-based index
    # print(reference[pos + 1: pos + 1 + len(segment)])
    # print(segment)
        
    # if ''.join(reference[pos + 1: pos + 1 + len(segment)]) == segment:
    #     print("TRUE")
    # else:
    #     print("FALSE")
    # Move position to the rightmost occurrence of the repeated segment
    while pos + len(segment) < len(reference) and ''.join(reference[pos + 1: pos + 1 + len(segment)]) == segment:
        pos += len(segment)
        # print(pos)
        # print(reference[pos + 1: pos + 1 + len(segment)])
    # print(pos)
    return pos  # Convert back to 1-based index


def shift_insertion_right(reference, pos, inserted_segment):
    """
    Shifts the insertion right if the first base of inserted_segment matches 
    the next base after the rightmost position of the insertion.
    """
    rightmost_pos = rightmost_position_segment_ins(reference, pos, inserted_segment)
    # print(rightmost_pos)
    # print(inserted_segment)
    for _ in range(len(inserted_segment) - 1):
        if rightmost_pos + len(inserted_segment) < len(reference) and ''.join(reference[rightmost_pos +1 : rightmost_pos + 1 + len(inserted_segment)]) == inserted_segment[0]:
            # Shift right by moving first base to last position
            inserted_segment = inserted_segment[1:] + inserted_segment[0]
            rightmost_pos += 1
        else:
            break  # Stop shifting if no more matches

    return rightmost_pos, inserted_segment


def shift_deletion_right(reference, pos, deleted_segment):
    """
    Shifts the deletion right if the first base of deleted_segment matches 
    the next base after the rightmost position of the deletion.
    """
    # print(pos)
    # print(deleted_segment)
    rightmost_pos = rightmost_position_segment_del(reference, pos, deleted_segment)
    # print(rightmost_pos)
    # print(len(deleted_segment))

    # if ''.join(reference[rightmost_pos + len(deleted_segment)]) == deleted_segment[0]:
    #     print("TRUE")
    # else:
    #     print("FALSE")
    
    for _ in range(len(deleted_segment) - 1):
        if rightmost_pos + len(deleted_segment) < len(reference) and ''.join(reference[rightmost_pos + len(deleted_segment)]) == deleted_segment[0]:
            # Shift right by moving first base to last position
            deleted_segment = deleted_segment[1:] + deleted_segment[0]
            rightmost_pos += 1
            # print(rightmost_pos)
        else:
            break  # Stop shifting if no more matches
    
    print(deleted_segment)
    print(rightmost_pos)
    return rightmost_pos, deleted_segment

def format_variant(row, modified_reference):
    """
    Formats variants according to EMPOP standards.
    """
    pos, ref, var, var_level, var_type = row["Pos"], row["Ref"], row["Variant"], row["VariantLevel"], row["Type"]
    pos = int(pos)  # Convert position to integer

    if var_type == "SNP":  # SNPs
        modified_reference[pos - 1] = var
        if var_level >= 0.925:
            return f"{ref}{pos}{var}"
        else:
            iupac_code = IUPAC_CODES.get(frozenset([ref, var]), f"{ref}/{var}")
            return f"{ref}{pos}{iupac_code}"

    elif var_type == "INDEL":  # Indels
        if len(ref) + 1 == len(var):  # Single-base Insertion
            inserted_base = var[len(ref):]  # Extract inserted base
            rightmost_pos = rightmost_position(modified_reference, pos, inserted_base, ref[-1])  # Find rightmost position
            if var_level >= 0.925:
                return f"-{rightmost_pos}.1{inserted_base}"  # Example: T315.1C
            else: 
                return f"-{rightmost_pos}.1{inserted_base.lower()}" 

        elif len(ref) - 1 == len(var):  # Single-base Deletion
            if modified_reference[pos] != ref[-1]:  
                corrected_ref_base = modified_reference[pos - 1]  # Store corrected reference base
            else:
                corrected_ref_base = ref[-1]
            deleted_base = corrected_ref_base  # Extract deleted base
            rightmost_pos = rightmost_position(modified_reference, pos, deleted_base, corrected_ref_base)  # Find rightmost position
            
            # del modified_reference[rightmost_pos:rightmost_pos + len(deleted_base)]
            # return " ".join(modified_reference[max(0, rightmost_pos - 10): rightmost_pos + 10])
            return f"{deleted_base}{rightmost_pos}-"  # Example: C524-

        elif len(ref) < len(var):  # Multi-base Insertion
            # print(pos)
            inserted_segment = var[len(ref):]  # Extract inserted bases
            rightmost_pos, adjusted_segment = shift_insertion_right(modified_reference, pos, inserted_segment)  # Find rightmost repeat
            # print(rightmost_pos)
            # print(adjusted_segment)
            # Apply insertion to modified reference
            # print(adjusted_segment)
            if var_level >= 0.925:
                return " ".join([f"-{rightmost_pos}.{i+1}{adjusted_segment[i]}" for i in range(len(inserted_segment))])
            else: 
                return " ".join([f"-{rightmost_pos}.{i+1}{adjusted_segment[i].lower()}" for i in range(len(inserted_segment))])
        
        elif len(ref) > len(var):  # Multi-base Deletion
           
            deleted_segment = ref[len(var):] 
            # print(deleted_segment) # Extract deleted bases
            # print(deleted_segment)
            
            rightmost_pos, adjusted_segment = shift_deletion_right(modified_reference, pos, deleted_segment)  # Find rightmost repeat
            # print(modified_reference[rightmost_pos-10:rightmost_pos + len(adjusted_segment)+15])
            # Apply deletion to modified reference
            # del modified_reference[rightmost_pos:rightmost_pos + len(adjusted_segment)]
            
            return " ".join([f"{adjusted_segment[i]}{rightmost_pos + i}-" for i in range(len(adjusted_segment))])

    return "N/A"  # If type does not match expected cases
    
def extract_numeric_value(empop_variant):
    """
    Extracts the first numeric value from the EMPOP_Variant string.
    Example: "C16193-" → 16193
    """
    match = re.search(r'\d+', empop_variant)  # Find first number in the string
    return int(match.group()) if match else float('inf')  # Convert to int, default to large value if missing


def process_variants(input_file, output_file, ref_fasta):
    """Reads variant file, processes variants into EMPOP format, and saves to output file."""
    print(1)
    modified_reference = load_reference_genome(ref_fasta)  # Mutable reference copy
    print("successfully loaded ref genome")
    print(len(modified_reference))

    
    df = pd.read_csv(input_file, sep="\t")
    # print(df)
    df["VariantLevel"] = pd.to_numeric(df["VariantLevel"], errors="coerce").fillna(1.0)

    corrected_variants = []

    # Process from bottom to top to prevent shifting issues
    for index in reversed(df.index):
        # print(index)
        row = df.loc[index] 
        corrected_variants.append(format_variant(row, modified_reference))

    # Reverse back to original order
    df["EMPOP_Variant"] = corrected_variants[::-1]
    # print(df["EMPOP_Variant"])
    # Sort by extracted numerical values from "EMPOP_Variant"
    df["SortKey"] = df["EMPOP_Variant"].apply(extract_numeric_value)  # Extract numbers
    df = df.sort_values(by="SortKey").drop(columns=["SortKey"])  # Sort numerically and remove temporary column

    # Save sorted output
    df.to_csv(output_file, sep="\t", index=False)
    print(f"Processed file saved to {output_file}")

# Example Usage
# process_variants("variants.txt", "variants_empop.txt", "reference.fasta")


In [97]:
# Example Usage
process_variants("variants_ins.txt", "variants_empop.txt", "rCRS2.fasta")


1
successfully loaded ref genome
16623
Processed file saved to variants_empop.txt


In [98]:
# Example Usage
process_variants("TraceR2-E1_S4_L001_wo_scb_merged_trimmed.rtn.vcf.filtered.txt", "TraceR2-E1_S4_L001_wo_scb_merged_trimmed.rtn.vcf.filtered_empop.txt", "rCRS2.fasta")


1
successfully loaded ref genome
16623
AC
523
Processed file saved to TraceR2-E1_S4_L001_wo_scb_merged_trimmed.rtn.vcf.filtered_empop.txt


In [131]:
#!/usr/bin/env python3

import pandas as pd
import argparse
from Bio import SeqIO
import re
import sys

# IUPAC codes for heteroplasmy
IUPAC_CODES = {
    frozenset(["A", "G"]): "R",
    frozenset(["C", "T"]): "Y",
    frozenset(["A", "C"]): "M",
    frozenset(["G", "T"]): "K",
    frozenset(["G", "C"]): "S",
    frozenset(["A", "T"]): "W"
}

def load_reference_genome(fasta_file):
    record = SeqIO.read(fasta_file, "fasta")
    return list(str(record.seq))

def extract_numeric_value(empop_variant):
    match = re.search(r'\d+', str(empop_variant))
    return int(match.group()) if match else float('inf')

# ========== SNPs ==========
def format_snp_variant(pos, ref, var, var_level, reference, min_variant_frequency):
    if len(ref) == len(var) > 1: #special case of Multi-nucleotide variant (MNV)
        formatted = []
        for i, (r, v) in enumerate(zip(ref, var)):
            sub_pos = pos + i
            reference[sub_pos - 1] = v
            if var_level >= 1 - min_variant_frequency:
                formatted.append(f"{r}{sub_pos}{v}")
            else:
                code = IUPAC_CODES.get(frozenset([r, v]), f"{r}/{v}")
                formatted.append(f"{r}{sub_pos}{code}")
        return " ".join(formatted)
    
    # Standard SNP
    reference[pos - 1] = var
    if var_level >= 1 - min_variant_frequency:
        return f"{ref}{pos}{var}"
    else:
        code = IUPAC_CODES.get(frozenset([ref, var]), f"{ref}/{var}")
        return f"{ref}{pos}{code}"

# ========== Position Shifting ==========
def rightmost_position(reference, pos, insertion):
    pos -= 1
    while pos + 1 < len(reference) and reference[pos + 1] == insertion:
        pos += 1
    # print(pos)
    return pos + 1
    

def rightmost_segment(reference, pos, segment):
    pos -= 1
    while pos + len(segment) < len(reference) and ''.join(reference[pos + 1: pos + 1 + len(segment)]) == segment:
        pos += len(segment)
    return pos + len(segment)

def shift_deletion(reference, pos, segment):
    rightmost = pos - 1
    while rightmost + len(segment) < len(reference) and ''.join(reference[rightmost + 1: rightmost + 1 + len(segment)]) == segment:
        rightmost += len(segment)
    return rightmost, segment

def shift_insertion(reference, pos, segment):
    rightmost = rightmost_segment(reference, pos, segment)
    for _ in range(len(segment) - 1):
        if rightmost + len(segment) < len(reference) and reference[rightmost + len(segment)] == segment[0]:
            segment = segment[1:] + segment[0]
            rightmost += 1
        else:
            break
    return rightmost, segment

# ========== INDELs ==========
def format_indel_variant(row, reference, min_variant_frequency):
    pos, ref, var, level = int(row["Pos"]), row["Ref"], row["Variant"], row["VariantLevel"]
    # print(pos)
    if len(ref) + 1 == len(var):  # Single-base Insertion
        inserted = var[len(ref):]
        rightmost = rightmost_position(reference, pos, inserted)
        base = inserted.lower() if level < 1-min_variant_frequency else inserted
        return f"-{rightmost}.1{base}"

    elif len(ref) - 1 == len(var):  # Single-base Deletion
        # print(pos)
        base = ref[-1]
        print(ref)
        # print(var)
        print(base)
        rightmost = rightmost_position(reference, pos, base)
        return f"{base}{rightmost}-"

    elif len(ref) < len(var):  # Multi-base Insertion
        segment = var[len(ref):]
        rightmost, segment = shift_insertion(reference, pos, segment)
        return " ".join([
            f"-{rightmost}.{i + 1}{(b.lower() if level < 1-min_variant_frequency else b)}"
            for i, b in enumerate(segment)
        ])

    elif len(ref) > len(var):  # Multi-base Deletion
        segment = ref[len(var):]
        rightmost, segment = shift_deletion(reference, pos, segment)
        return " ".join([
            f"{b}{rightmost + i}-" for i, b in enumerate(segment)
        ])

    return "N/A"

# ========== Variant Dispatcher ==========
def format_variant(row, reference, min_variant_frequency):
    # print(row)
    if row["Type"] == "SNP":
        return format_snp_variant(int(row["Pos"]), row["Ref"], row["Variant"], row["VariantLevel"], reference, min_variant_frequency)
    elif row["Type"] == "INDEL":
        return format_indel_variant(row, reference, min_variant_frequency)
    return "N/A"

# ========== Processing Logic ==========
def process_variants(input_file, output_file, ref_fasta):
    reference = load_reference_genome(ref_fasta)
    print(len(reference))
    min_variant_frequency = 0.05
    df = pd.read_csv(input_file, sep="\t")
    
    df["VariantLevel"] = pd.to_numeric(df["VariantLevel"], errors="coerce").fillna(1.0)

    corrected = []
    for idx in reversed(df.index):
        corrected.append(format_variant(df.loc[idx], reference, min_variant_frequency)) #format_variant is being called

    
    df["EMPOP_Variant"] = corrected[::-1]
    df["SortKey"] = df["EMPOP_Variant"].apply(extract_numeric_value)
    df = df.sort_values(by="SortKey").drop(columns="SortKey")
    df.to_csv(output_file, sep="\t", index=False)
    print(f"Processed file saved to {output_file}")

# ========== CLI Entry ==========
# def main():
#     parser = argparse.ArgumentParser(description="Format mitochondrial variants into EMPOP format.")
#     parser.add_argument("input_file", help="Input TSV file with variants.")
#     parser.add_argument("output_file", help="Output TSV file.")
#     parser.add_argument("ref_fasta", help="Reference FASTA file.")
#     args = parser.parse_args()

#     try:
#         process_variants(args.input_file, args.output_file, args.ref_fasta)
#     except Exception as e:
#         print(f" Error: {e}", file=sys.stderr)
#         sys.exit(1)

# if __name__ == "__main__":
#     main()


In [132]:
# Example Usage
process_variants("TraceR2-E1_S4_L001_wo_scb_merged_trimmed.rtn.vcf.filtered.txt", "TraceR2-E1_S4_L001_wo_scb_merged_trimmed.rtn.vcf.filtered_empop.txt", "rCRS2.fasta")
# Example Usage
# process_variants("variants_ins.txt", "variants_empop.txt", "rCRS2.fasta")


16623
GC
C
Processed file saved to TraceR2-E1_S4_L001_wo_scb_merged_trimmed.rtn.vcf.filtered_empop.txt


In [153]:
#!/usr/bin/env python3

import pandas as pd
from Bio import SeqIO
import sys

IUPAC_CODES = {
    frozenset(["A", "G"]): "R",
    frozenset(["C", "T"]): "Y",
    frozenset(["A", "C"]): "M",
    frozenset(["G", "T"]): "K",
    frozenset(["G", "C"]): "S",
    frozenset(["A", "T"]): "W"
}

def load_reference_sequence(fasta_path):
    """Load reference sequence as mutable list of bases."""
    record = SeqIO.read(fasta_path, "fasta")
    return list(str(record.seq))

def format_snp_variant(pos, ref, alt, level, reference, threshold=0.925):
    """Format a SNP according to EMPOP rules and update the reference."""
    pos_index = pos - 1 # 0-index
    reference[pos_index] = alt

    if level >= threshold:
        return f"{ref}{pos}{alt}"
    else:
        iupac = IUPAC_CODES.get(frozenset([ref, alt]), f"{ref}/{alt}")
        return f"{ref}{pos}{iupac}"

def process_snp_dataframe(df, reference, threshold=0.925):
    """Process SNPs in reverse order and return EMPOP-formatted strings."""
    formatted = []
    for idx in reversed(df.index):
        row = df.loc[idx]
        pos = int(row["Pos"])
        ref = row["Ref"]
        alt = row["Variant"]
        level = float(row["VariantLevel"])
        formatted.append(format_snp_variant(pos, ref, alt, level, reference, threshold))
    return formatted[::-1]  # Return to original order

def process_snp_variants(input_file, output_file, reference_fasta, threshold=0.925):
    """Load input, process SNPs, and save EMPOP-formatted output."""
    df = pd.read_csv(input_file, sep="\t")
    df["VariantLevel"] = pd.to_numeric(df["VariantLevel"], errors="coerce").fillna(1.0)

    reference = load_reference_sequence(reference_fasta)
    # print(reference)
    df["EMPOP_Variant"] = process_snp_dataframe(df, reference, threshold)

    # print(reference)

    df.to_csv(output_file, sep="\t", index=False)
    print(f"Processed SNPs saved to {output_file}")



In [154]:
# Example Usage
process_snp_variants(
    "TraceR2-E1_S4_L001_wo_scb_merged_trimmed.rtn.vcf.filtered.txt",
    "TraceR2-E1_S4_L001_wo_scb_merged_trimmed.rtn.vcf.filtered_empop.txt",
    "rCRS2.fasta"
)# Example Usage
# process_variants("variants_ins.txt", "variants_empop.txt", "rCRS2.fasta")


Processed SNPs saved to TraceR2-E1_S4_L001_wo_scb_merged_trimmed.rtn.vcf.filtered_empop.txt


In [211]:
#!/usr/bin/env python3

import pandas as pd
from Bio import SeqIO
import re
import sys

IUPAC_CODES = {
    frozenset(["A", "G"]): "R",
    frozenset(["C", "T"]): "Y",
    frozenset(["A", "C"]): "M",
    frozenset(["G", "T"]): "K",
    frozenset(["G", "C"]): "S",
    frozenset(["A", "T"]): "W"
}

def load_reference(fasta_path):
    """Load reference genome as a mutable list of bases."""
    record = SeqIO.read(fasta_path, "fasta")
    return list(str(record.seq))

def rightmost_repeat_position(reference, pos, segment):
    """Find rightmost position where segment can be inserted or deleted based on repeat structure."""
    pos -= 1  # 0-based
    while (
        pos + len(segment) < len(reference) and
        "".join(reference[pos + 1: pos + 1 + len(segment)]) == segment
    ):
        pos += len(segment)
    return pos + 1  # back to 1-based

def shift_insertion_right(reference, pos, segment):
    pos = rightmost_repeat_position(reference, pos, segment)
    for _ in range(len(segment) - 1):
        next_seq = reference[pos + 1: pos + 1 + len(segment)]
        if "".join(next_seq) == segment[0]:
            segment = segment[1:] + segment[0]
            pos += 1
        else:
            break
    return pos, segment

def shift_deletion_right(reference, pos, segment):
    pos = rightmost_repeat_position(reference, pos, segment) - 1
    for _ in range(len(segment) - 1):
        if reference[pos + len(segment)] == segment[0]:
            segment = segment[1:] + segment[0]
            pos += 1
        else:
            break
    return pos, segment

def apply_snp(pos, ref, var, var_level, reference, min_variant_frequency):
    formatted = []
    for i, (r, v) in enumerate(zip(ref, var)):
        sub_pos = pos + i
        reference[sub_pos - 1] = v

        if var_level >= 1 - min_variant_frequency:
            formatted.append(f"{r}{sub_pos}{v}")
            return " ".join(formatted), "SNP"
        else:
            code = IUPAC_CODES.get(frozenset([r, v]), f"{r}/{v}")
            formatted.append(f"{r}{sub_pos}{code}")
            return " ".join(formatted), "PHP"
    

def apply_insertion(pos, ref, var, var_level, reference, min_freq):
    inserted_segment = var[len(ref):]
    pos, segment = shift_insertion_right(reference, pos, inserted_segment)
    
    variant_parts = []
    for i, base in enumerate(segment):
        is_major = var_level >= 1 - min_freq
        formatted_base = base if is_major else base.lower()
        variant_parts.append(f"-{pos}.{i+1}{formatted_base}")

    updated_type = "LHP" if not is_major else "INDEL"
    return " ".join(variant_parts), updated_type

def apply_deletion(pos, ref, var, var_level, reference, min_freq):
    deleted_segment = ref[len(var):]
    pos, segment = shift_deletion_right(reference, pos, deleted_segment)
    
    is_major = var_level >= 1 - min_freq
    variant_parts = []
    for i, base in enumerate(segment):
        variant_parts.append(f"{base}{pos + i}-")

    updated_type = "INDEL" if is_major else "LHP"
    return " ".join(variant_parts), updated_type


def format_variant(row, reference):
    """Route to SNP or INDEL formatter."""
    pos = int(row["Pos"])
    ref = row["Ref"]
    var = row["Variant"]
    var_type = row["Type"]
    var_level = row["VariantLevel"]

    min_variant_frequency = 0.05
    
    if var_type == "SNP":  # SNP
        return apply_snp(pos, ref, var, var_level, reference, min_variant_frequency)
    elif var_type == "INDEL":
        if len(ref) < len(var):
            return apply_insertion(pos, ref, var, var_level, reference, min_variant_frequency)
        elif len(ref) > len(var):
            return apply_deletion(pos, ref, var, var_level, reference, min_variant_frequency)

    return "N/A"

def extract_numeric_value(empop_variant):
    match = re.search(r'\d+', empop_variant)
    return int(match.group()) if match else float('inf')

def finalize_output_table(df):
    df["EMPOP_Variant"] = df["EMPOP_Variant"].astype(str).str.split()
    df = df.explode("EMPOP_Variant").reset_index(drop=True)
    df["VariantLevel"] = pd.to_numeric(df["VariantLevel"], errors="coerce")

    def add_comma_separated_numbers(series):
        split_lists = series.dropna().astype(str).apply(lambda x: list(map(float, x.split(','))))
        if split_lists.empty:
            return ""
        summed = [sum(x) for x in zip(*split_lists)]
        return ",".join(f"{s:.4g}" for s in summed)

    group_keys = ["EMPOP_Variant"]
    numeric_agg = {
        "VariantLevel": "sum",
        "Coverage": add_comma_separated_numbers,
        "MeanBaseQuality": "first"
    }
    other_cols = [col for col in df.columns if col not in numeric_agg and col not in group_keys]
    full_agg = {**numeric_agg, **{col: "first" for col in other_cols}}

    grouped = df.groupby(group_keys, as_index=False).agg(full_agg)

    def extract_position(variant):
        match = re.search(r"(\d+\.?\d*)", str(variant))
        return float(match.group(1)) if match else float('inf')

    grouped["position"] = grouped["EMPOP_Variant"].apply(extract_position)
    grouped = grouped.sort_values(by="position").drop(columns=["position"])

    def correct_length_het_case(row):
        if "." in row["EMPOP_Variant"] and row["VariantLevel"] >= 0.92:
            return row["EMPOP_Variant"][:-1] + row["EMPOP_Variant"][-1].upper()
        return row["EMPOP_Variant"]

    grouped["EMPOP_Variant"] = grouped.apply(correct_length_het_case, axis=1)
    return grouped

def process_variants(input_file, output_file, reference_fasta):
    df = pd.read_csv(input_file, sep="\t")
    reference = load_reference(reference_fasta)

    results = []
    types = []
    for idx in reversed(df.index):
        row = df.loc[idx]
        variant, updated_type = format_variant(row, reference)
        results.append(variant)
        types.append(updated_type)

    df["EMPOP_Variant"] = results[::-1]  # maintain original order
    df["Type"] = types[::-1]             # update Type column
    
    df = finalize_output_table(df)
    df.to_csv(output_file, sep="\t", index=False)
    print(f"Processed file saved to {output_file}")



In [212]:
# Example Usage
process_variants(
    "TraceR2-E1_S4_L001_wo_scb_merged_trimmed.rtn.vcf.filtered.txt",
    "TraceR2-E1_S4_L001_wo_scb_merged_trimmed.rtn.vcf.filtered_empop.txt",
    "rCRS2.fasta"
)# Example Usage
# process_variants("variants_ins.txt", "variants_empop.txt", "rCRS2.fasta")


Processed file saved to TraceR2-E1_S4_L001_wo_scb_merged_trimmed.rtn.vcf.filtered_empop.txt


In [213]:
from pyexcelerate import Workbook

data = [["Name", "Age"], ["Alice", 30], ["Bob", 25]]
wb = Workbook()
wb.new_sheet("Sheet1", data=data)
wb.save("fast_output.xlsx")

ModuleNotFoundError: No module named 'pyexcelerate'

In [214]:
from openpyxl import Workbook

wb = Workbook()
ws = wb.active
ws["A1"] = "Hello"
wb.save("output.xlsx")