In [11]:
import pandas as pd
import re

# Step 10: Extract numeric positions from the sequence column for sorting
def extract_position(seq):
    """Return the first number found in ``seq`` as a float, for use as a sort key.

    The pattern accepts an optional decimal part, so ``"309.1C"`` yields 309.1
    and ``"A263G"`` yields 263.0.

    Parameters:
        seq: a variant or marker label. Non-string values (NaN, None, numbers)
            are converted with ``str()`` first, matching the other position
            helpers in this notebook, instead of raising ``TypeError``.

    Returns:
        The number as a float, or ``float('inf')`` when the label contains no
        digits, so such labels sort last.
    """
    match = re.search(r"(\d+\.?\d*)", str(seq))
    return float(match.group(1)) if match else float('inf')

# Define function to process FDSTools SAST TSV files
def process_fdstools_sast(file_path: str, threshold: float = 8.0,
                          heteroplasmy_threshold: float = 92.0,
                          depth_threshold: float = 10):
    """Summarise an FDSTools SAST table into one row per variant label.

    Processing order:
      1. Read the tab-separated file and make ``total`` / ``total_mp_sum`` numeric.
      2. Set aside rows whose marker occurs on exactly one row and whose
         ``total`` is below ``depth_threshold``.
      3. Flag "Other sequences" rows and rows with ``total_mp_sum`` below
         ``threshold``; sum ``total`` of the unflagged rows per marker.
      4. Split whitespace-separated ``sequence`` strings into one row per
         variant, then drop the labels in ``drop_seqs`` and sub-threshold rows.
      5. Aggregate per (marker, sequence), then per sequence across markers.
      6. Compute frequencies, sort by the number in the label, and rewrite the
         labels (deletions, insertions, IUPAC codes).
      7. Append the rows set aside in step 2, labelled with their marker name.

    Parameters:
        file_path: path to the tab-separated SAST file. The columns ``marker``,
            ``sequence``, ``total`` and ``total_mp_sum`` are read.
        threshold: minimum ``total_mp_sum`` for a row to be kept.
        heteroplasmy_threshold: ``variant_frequency_wo_OS_THR`` (percent) below
            which an insertion's last character is lower-cased and a
            substitution's alternate base is replaced by an IUPAC code.
        depth_threshold: ``total`` below which a single-row marker is set aside.

    Returns:
        The aggregated DataFrame followed by the set-aside rows. The frame is
        also printed before it is returned.
    """
    IUPAC_CODES = {
        frozenset(["A", "G"]): "R",
        frozenset(["C", "T"]): "Y",
        frozenset(["A", "C"]): "M",
        frozenset(["G", "T"]): "K",
        frozenset(["G", "C"]): "S",
        frozenset(["A", "T"]): "W"
    }

    # Everything is read as text; only the two count columns are made numeric.
    df = pd.read_csv(file_path, sep="\t", dtype=str)
    df["total_mp_sum"] = pd.to_numeric(df["total_mp_sum"], errors="coerce")
    df["total"] = pd.to_numeric(df["total"], errors="coerce")

    # Set aside markers that occur on one row only and have a low total.
    # This runs before NaN totals are zero-filled, so a NaN total is not "low".
    marker_counts = df["marker"].value_counts()
    singleton_markers = marker_counts[marker_counts == 1].index
    single_low_coverage = df[
        df["marker"].isin(singleton_markers) & (df["total"] < depth_threshold)
    ].copy()

    # These rows are reported under the marker name instead of a variant label.
    single_low_coverage["sequence"] = single_low_coverage["marker"]
    single_low_coverage = single_low_coverage.drop(columns=["marker"])

    # Missing counts are treated as zero from here on.
    df["total"] = df["total"].fillna(0)
    df["total_mp_sum"] = df["total_mp_sum"].fillna(0)

    # Flag the "Other sequences" bucket and rows below the threshold.
    df["is_noise_or_low"] = (df["sequence"].isin(["Other sequences"])) | (df["total_mp_sum"] < threshold)

    columns_to_drop = [
        "total_mp_max", "forward_pct", "forward", "forward_mp_sum",
        "forward_mp_max", "reverse", "reverse_mp_sum", "reverse_mp_max"]
    df = df.drop(columns=columns_to_drop, errors="ignore")

    # Per-marker total over the unflagged rows, merged back onto every row.
    clean_total_per_marker = (
        df[~df["is_noise_or_low"]]
        .groupby("marker")["total"]
        .sum()
        .rename("clean_marker_total_wo_OS_THR")
    )
    df = df.merge(clean_total_per_marker, on="marker", how="left")

    # Row frequency relative to the clean per-marker total.
    df["variant_frequency_wo_OS_THR"] = (df["total"] / df["clean_marker_total_wo_OS_THR"] * 100).round(2)

    # One row per variant: "A263G 315.1C" becomes two rows sharing the counts.
    df = df.assign(sequence=df["sequence"].str.split())
    df = df.explode("sequence").reset_index(drop=True)

    # "Other sequences" has been split into "Other" and "sequences" by now.
    drop_seqs = ["Other", "sequences", "REF", "N3107DEL"]
    df = df[(~df["sequence"].isin(drop_seqs)) & (df["total_mp_sum"] >= threshold)].copy()

    # Back-calculate the marker coverage from the count and its percentage.
    df["estimated_total_coverage"] = (
        df["total"] / (df["total_mp_sum"] / 100)
    ).round(0).astype("Int64")

    # Combine rows carrying the same variant within one marker.
    grouped_same_marker = df.groupby(["marker", "sequence"], as_index=False).agg(
        total=("total", "sum"),
        total_mp_sum=("total_mp_sum", "sum"),
        estimated_total_coverage=("estimated_total_coverage", "max"),
        is_noise_or_low=("is_noise_or_low", "first"),
        clean_marker_total_wo_OS_THR=("clean_marker_total_wo_OS_THR", "first"),
        variant_frequency_wo_OS_THR=("variant_frequency_wo_OS_THR", "sum")
    )

    # Combine the same variant seen in several markers.
    grouped_final = grouped_same_marker.groupby("sequence", as_index=False).agg(
        total=("total", "sum"),
        estimated_total_coverage=("estimated_total_coverage", "sum"),
        is_noise_or_low=("is_noise_or_low", "first"),
        clean_marker_total_wo_OS_THR=("clean_marker_total_wo_OS_THR", "sum"),
        num_markers=("marker", "nunique")
    )

    grouped_final["variant_frequency"] = (
        grouped_final["total"] / grouped_final["estimated_total_coverage"] * 100
    ).round(1)

    grouped_final["variant_frequency_wo_OS_THR"] = (
        grouped_final["total"] / grouped_final["clean_marker_total_wo_OS_THR"] * 100
    ).round(1)

    # Sort by the number embedded in the variant label.
    grouped_final["position"] = grouped_final["sequence"].apply(extract_position)
    grouped_final = grouped_final.sort_values(by="position").drop(columns=["position"])

    def resolve_heteroplasmy(row):
        """Rewrite one variant label according to its type and frequency."""
        seq = row['sequence']

        # Deletions: "A523DEL" -> "A523-"
        if 'DEL' in seq:
            return seq.replace('DEL', '-')

        # Labels containing "." are prefixed with "-"; below the threshold the
        # last character is lower-cased, e.g. 309.2C -> -309.2c
        if '.' in seq:
            if row['variant_frequency_wo_OS_THR'] < heteroplasmy_threshold:
                return '-' + seq[:-1] + seq[-1].lower()
            return '-' + seq

        # Substitutions below the threshold get an IUPAC code in place of the
        # alternate base when the base pair has one in IUPAC_CODES.
        if row['variant_frequency_wo_OS_THR'] < heteroplasmy_threshold:
            match = re.match(r'([ACGT])(\d+)([ACGT])', seq)
            if not match:
                return seq
            ref, pos, alt = match.groups()
            code = IUPAC_CODES.get(frozenset([ref, alt]))
            if code:
                return f"{ref}{pos}{code}"

        return seq

    # apply(axis=1) on an empty frame returns a DataFrame, which cannot be
    # assigned to a single column, so skip relabelling when nothing survived.
    if not grouped_final.empty:
        grouped_final['sequence'] = grouped_final.apply(resolve_heteroplasmy, axis=1)

    # Append the low-coverage single-row markers set aside at the start.
    grouped_final = pd.concat([grouped_final, single_low_coverage], ignore_index=False)

    print(grouped_final)

    return grouped_final

# Input paths for the cells below.
# NOTE(review): reference_sequence is not read by any code in the visible cells.
tsv_path = "s23-11298-E1_S16_L001.sast.csv"
reference_sequence = "../rCRS/rCRS2.fasta"


In [12]:
# Run the SAST post-processing on the file named by tsv_path and keep the
# resulting DataFrame for the cells below.
processed_df = process_fdstools_sast(tsv_path)

Markers with single entries and total < 10:
       marker sequence  total
499  mtNG_099  No data      0
      sequence  total  estimated_total_coverage is_noise_or_low  \
7        A263G   1853                      2037           False   
1      -309.1C    469                       588           False   
2      -309.2c    114                       588           False   
3      -315.1C    469                       588           False   
11       A523-   1478                      1802           False   
16       C524-   1478                      1802           False   
13       A750G   2298                      2479           False   
4       A1438G   1887                      2026           False   
18      G3010A   2049                      2173           False   
8       A3447M     28                       172           False   
9       A3796G   3294                      3504           False   
10      A4769G    927                       998           False   
12      A6419M    192    

In [210]:
# Write processed_df as a tab-separated file without the index column.
# NOTE(review): the file name carries sample ID s23-11303, while the only s23
# input assigned to tsv_path in the visible cells is s23-11298 — confirm that
# processed_df was produced from the matching sample before relying on this file.
output_path = "s23-11303-E1_S13_L001_processed10.txt" 
processed_df.to_csv(output_path, sep="\t", index=False)

In [183]:
import pandas as pd
import re

def process_empop_variant_table(file_path: str, length_het_threshold: float = 0.92) -> pd.DataFrame:
    """
    Processes a variant table with EMPOP-style variant annotations.
    - Splits multi-variant rows (whitespace-separated) into one row per variant
    - Sums VariantLevel and the comma-separated Coverage values per variant
    - Keeps the first value of MeanBaseQuality and of every other column
    - Sorts variants by the first number found in the variant label
    - Upper-cases the last character of labels containing "." whose summed
      VariantLevel reaches length_het_threshold

    Parameters:
    - file_path: path to the tab-separated file; the columns EMPOP_Variant,
      VariantLevel, Coverage and MeanBaseQuality are read
    - length_het_threshold: VariantLevel at or above which a label containing
      "." gets its last character upper-cased

    Returns:
    - A cleaned and sorted DataFrame with one row per variant label
    """

    # Load file
    df = pd.read_csv(file_path, sep="\t")

    # Split EMPOP_Variant column and explode into rows.
    # NOTE(review): astype(str) turns a missing variant into the literal label
    # "nan", which then survives as its own row — confirm this is intended.
    df["EMPOP_Variant"] = df["EMPOP_Variant"].astype(str).str.split()
    df = df.explode("EMPOP_Variant").reset_index(drop=True)

    # Convert numeric columns
    df["VariantLevel"] = pd.to_numeric(df["VariantLevel"], errors="coerce")

    def format_coverage_value(value: float) -> str:
        """Format one summed value without falling into scientific notation."""
        # ".4g" switches to exponent form once the rounded value reaches
        # 10,000 (12500 -> "1.25e+04"), which silently loses digits in the
        # output file. Large values are written as whole numbers instead;
        # smaller values keep the original ".4g" rendering.
        if abs(value) >= 9999.5:
            return f"{value:.0f}"
        return f"{value:.4g}"

    # Helper: sum comma-separated numbers elementwise
    def add_comma_separated_numbers(series):
        split_lists = series.dropna().astype(str).apply(lambda x: list(map(float, x.split(','))))
        if split_lists.empty:
            return ""
        # zip stops at the shortest list, so extra trailing values are dropped.
        summed = [sum(x) for x in zip(*split_lists)]
        return ",".join(format_coverage_value(s) for s in summed)

    # Aggregate
    group_keys = ["EMPOP_Variant"]
    numeric_agg = {
        "VariantLevel": "sum",
        "Coverage": add_comma_separated_numbers,
        "MeanBaseQuality": "first"
    }
    other_cols = [col for col in df.columns if col not in numeric_agg and col not in group_keys]
    full_agg = {**numeric_agg, **{col: "first" for col in other_cols}}

    grouped = df.groupby(group_keys, as_index=False).agg(full_agg)

    # Sort by numeric position extracted from variant
    def extract_position(variant):
        match = re.search(r"(\d+\.?\d*)", str(variant))
        return float(match.group(1)) if match else float('inf')

    grouped["position"] = grouped["EMPOP_Variant"].apply(extract_position)
    grouped = grouped.sort_values(by="position").drop(columns=["position"])

    def correct_length_het_case(row):
        if "." in row["EMPOP_Variant"] and row["VariantLevel"] >= length_het_threshold:
            return row["EMPOP_Variant"][:-1] + row["EMPOP_Variant"][-1].upper()
        return row["EMPOP_Variant"]

    # apply(axis=1) on an empty frame returns a DataFrame, which cannot be
    # assigned to a single column, so skip relabelling for an empty table.
    if not grouped.empty:
        grouped["EMPOP_Variant"] = grouped.apply(correct_length_het_case, axis=1)

    return grouped


In [184]:
# Group the EMPOP variant table for sample s23-11303 (one row per variant label).
input_path = "s23-11303-E1_S13_L001.rtn.vcf.mutect2_fusion.filtered.empop.txt"
grouped_variants = process_empop_variant_table(input_path)

# Save the grouped table as a tab-separated file without the index column;
# merge_variant_callers is later called with this file name as its second input.
grouped_variants.to_csv("s23-11303-E1_S13_L001.rtn.vcf.mutect2_fusion.filtered.empop_grouped.txt", sep="\t", index=False)
# exploded_df.to_csv("exploded_variants.tsv", sep="\t", index=False)


In [None]:
# df_cleaned = process_empop_variant_table("your_input_file.tsv")
# df_cleaned.to_csv("cleaned_empop_variants.tsv", sep="\t", index=False)

In [192]:
import pandas as pd
import re

def merge_variant_callers(file1: str, file2: str) -> pd.DataFrame:
    """
    Merges two variant tables from different callers on their variant column.

    The tables are outer-joined, so a variant reported by only one caller is
    kept with the other caller's columns empty. Columns present in both tables
    (other than the variant column) get the suffixes "_vc1" / "_vc2".

    Parameters:
        file1: Path to the first variant caller table (tab-separated, expects a
            'sequence' column)
        file2: Path to the second variant caller table (tab-separated, expects
            an 'EMPOP_Variant' column)

    Returns:
        A merged DataFrame with the columns 'variant', 'called_in_vc1' and
        'called_in_vc2' first, followed by all other columns, sorted by the
        first number found in the variant label.
    """
    # Load both files
    df1 = pd.read_csv(file1, sep="\t")
    df2 = pd.read_csv(file2, sep="\t")

    # Rename variant columns to common key
    df1 = df1.rename(columns={"sequence": "variant"})
    df2 = df2.rename(columns={"EMPOP_Variant": "variant"})

    # Merge on the variant column and let pandas record which side each row
    # came from. Deriving presence from a value column ("total" /
    # "VariantLevel") reports a variant as not called whenever that value is
    # missing, and breaks if the column exists in both tables and is suffixed.
    presence_col = "_caller_presence"
    merged = pd.merge(
        df1, df2,
        on="variant", how="outer",
        suffixes=("_vc1", "_vc2"),
        indicator=presence_col,
    )

    # Add flags for presence in each caller
    merged["called_in_vc1"] = merged[presence_col].isin(["left_only", "both"])
    merged["called_in_vc2"] = merged[presence_col].isin(["right_only", "both"])
    merged = merged.drop(columns=[presence_col])

    # Extract numeric position for sorting; labels without digits sort last
    def extract_position(seq):
        match = re.search(r"(\d+\.?\d*)", str(seq))
        return float(match.group(1)) if match else float('inf')

    merged["variant_position"] = merged["variant"].apply(extract_position)
    merged = merged.sort_values(by="variant_position").drop(columns=["variant_position"])

    # Reorder columns for clarity
    front = ["variant", "called_in_vc1", "called_in_vc2"]
    other = [col for col in merged.columns if col not in front]
    merged = merged[front + other]

    return merged


In [211]:
# Outer-join the processed SAST table (first caller) with the grouped EMPOP
# table (second caller) and save the result as a tab-separated file without
# the index column.
df_merged = merge_variant_callers("s23-11303-E1_S13_L001_processed10.txt","s23-11303-E1_S13_L001.rtn.vcf.mutect2_fusion.filtered.empop_grouped.txt")
df_merged.to_csv("merged_variants2.txt", sep="\t", index=False)

In [136]:
import pandas as pd
import re
import numpy as np

# Step 10: Extract numeric positions from the sequence column for sorting
def extract_position(seq):
    """Return the first number found in ``seq`` as a float, for use as a sort key.

    The pattern accepts an optional decimal part, so ``"309.1C"`` yields 309.1
    and ``"mtNG_007"`` yields 7.0.

    Parameters:
        seq: a variant or marker label. Non-string values (NaN, None, numbers)
            are converted with ``str()`` first, matching the other position
            helpers in this notebook, instead of raising ``TypeError``.

    Returns:
        The number as a float, or ``float('inf')`` when the label contains no
        digits, so such labels sort last.
    """
    match = re.search(r"(\d+\.?\d*)", str(seq))
    return float(match.group(1)) if match else float('inf')

# Step 11: Apply IUPAC codes and adjust formatting for heteroplasmies
def resolve_heteroplasmy(row, min_variant_frequency_pct, length_heteroplasmy_threshold, IUPAC_CODES):
    """Rewrite one variant label according to its type and frequency.

    Parameters:
        row: mapping with a 'sequence' label and, for labels that are not
            deletions, a 'variant_frequency' value.
        min_variant_frequency_pct: substitutions with a frequency below
            ``100 - min_variant_frequency_pct`` are candidates for an IUPAC code.
        length_heteroplasmy_threshold: frequency below which the last character
            of a label containing "." is lower-cased.
        IUPAC_CODES: mapping from a frozenset of two bases to an ambiguity code.

    Returns:
        - labels containing "DEL": the label with "DEL" replaced by "-";
        - labels containing ".": the label prefixed with "-", last character
          lower-cased when the frequency is below the length threshold;
        - ``<base><digits><base>`` labels below the point threshold: reference
          base, position and IUPAC code, when the pair has a code;
        - anything else: the label unchanged.
    """
    label = row['sequence']

    # Deletions, e.g. A523DEL -> A523-
    if 'DEL' in label:
        return label.replace('DEL', '-')

    # Labels with a decimal position, e.g. 309.2C -> -309.2c when minor
    if '.' in label:
        is_minor = row['variant_frequency'] < length_heteroplasmy_threshold
        last_char = label[-1].lower() if is_minor else label[-1]
        return '-' + label[:-1] + last_char

    # Substitutions at or above the point threshold are left alone.
    if not (row['variant_frequency'] < 100-min_variant_frequency_pct):
        return label

    parsed = re.match(r'([ACGT])(\d+)([ACGT])', label)
    if parsed is None:
        return label

    ref, pos, alt = parsed.groups()
    ambiguity_code = IUPAC_CODES.get(frozenset([ref, alt]))
    return f"{ref}{pos}{ambiguity_code}" if ambiguity_code else label

def load_marker_ranges(filepath):
    """Read the ``[genome_position]`` section of an INI-style text file.

    Each ``marker = chrom, start, end[, ...]`` line inside the section becomes
    one entry; only the first three comma-separated values are used. Reading
    stops at the next ``[section]`` header.

    Parameters:
        filepath: path to the text file.

    Returns:
        dict mapping marker name to ``"chrom:start–end"`` (en dash between
        start and end). Lines with fewer than three values are skipped.
    """
    marker_ranges = {}
    in_position_block = False
    with open(filepath, 'r') as f:
        for raw_line in f:
            line = raw_line.strip()
            if line.startswith("[genome_position]"):
                in_position_block = True
                continue
            if not in_position_block:
                continue
            if line.startswith("["):
                # Stop if another block begins
                break
            # Skip INI-style comment lines so a commented-out entry that
            # contains "=" is not read as a marker.
            if line.startswith((";", "#")):
                continue
            # Split on the first "=" only: a plain split("=") raises
            # ValueError on unpacking when the value contains another "=".
            marker, separator, values = line.partition("=")
            if not separator:
                continue
            parts = [v.strip() for v in values.split(",")]
            if len(parts) >= 3:
                chrom, start, end = parts[:3]
                marker_ranges[marker.strip()] = f"{chrom}:{start}–{end}"
    return marker_ranges

# Define function to process FDSTools SAST TSV files
def process_fdstools_sast(
    file_path: str,
    min_variant_frequency_pct: float = 5.0,
    depth_threshold: int = 10,
    length_heteroplasmy_threshold: float = 90.0,
    noise_frequency_pct: float = 3.0,
    marker_ranges_path: str = "mtNG_lib2_v211-flank.txt",
    verbose: bool = True,
):
    """Summarise an FDSTools SAST table into one row per variant label.

    Processing order:
      1. Read the tab-separated file, drop the strand-specific columns and make
         ``total`` / ``total_mp_sum`` numeric.
      2. Set aside rows whose marker occurs on exactly one row and whose
         ``total`` is below ``depth_threshold``.
      3. Split whitespace-separated ``sequence`` strings into one row per
         variant and back-calculate each row's marker coverage from
         ``total`` and ``total_mp_sum``.
      4. Aggregate per (marker, sequence), subtract the marker's "Other" count
         from the coverage (clipped to at least 1), then aggregate per sequence.
      5. Compute ``variant_frequency``, drop the bookkeeping labels and the
         variants below ``noise_frequency_pct``.
      6. Merge a deletion and a substitution that share a position into one row.
      7. Rewrite labels with ``resolve_heteroplasmy``, append the rows from
         step 2, attach the marker range and sort by the number in the marker.

    Parameters:
        file_path: path to the tab-separated SAST file. The columns ``marker``,
            ``sequence``, ``total`` and ``total_mp_sum`` are read.
        min_variant_frequency_pct: passed to ``resolve_heteroplasmy``.
        depth_threshold: ``total`` below which a single-row marker is set aside.
        length_heteroplasmy_threshold: passed to ``resolve_heteroplasmy`` and
            used to choose the case of the alternate base in merged rows.
        noise_frequency_pct: variants with ``variant_frequency`` below this
            value are removed.
        marker_ranges_path: file given to ``load_marker_ranges``.
        verbose: when True, print the intermediate tables.

    Returns:
        DataFrame with one row per retained variant plus the set-aside rows.
        When at least one deletion/substitution pair was merged, the frame has
        an extra ``variant_frequency_detail`` column.
    """
    IUPAC_CODES = {
        frozenset(["A", "G"]): "R",
        frozenset(["C", "T"]): "Y",
        frozenset(["A", "C"]): "M",
        frozenset(["G", "T"]): "K",
        frozenset(["G", "C"]): "S",
        frozenset(["A", "T"]): "W"
    }

    # Load the FDSTools SAST TSV file; everything is read as text first.
    df = pd.read_csv(file_path, sep="\t", dtype=str)

    # Drop unnecessary columns
    columns_to_drop = [
        "total_mp_max", "forward_pct", "forward", "forward_mp_sum",
        "forward_mp_max", "reverse", "reverse_mp_sum", "reverse_mp_max"]
    df = df.drop(columns=columns_to_drop, errors="ignore")

    # Convert total_mp_sum and total to numeric types
    df["total_mp_sum"] = pd.to_numeric(df["total_mp_sum"], errors="coerce")
    df["total"] = pd.to_numeric(df["total"], errors="coerce")

    # Set aside markers that occur on one row only and have a low total.
    # This runs before NaN totals are zero-filled, so a NaN total is not "low".
    marker_counts = df["marker"].value_counts()
    singleton_markers = marker_counts[marker_counts == 1].index
    single_low_coverage = df[
        df["marker"].isin(singleton_markers) & (df["total"] < depth_threshold)
    ].copy()
    if verbose:
        print(single_low_coverage)

    # Missing counts are treated as zero from here on.
    df["total"] = df["total"].fillna(0)
    df["total_mp_sum"] = df["total_mp_sum"].fillna(0)

    # One row per variant: "T152C G97T" becomes two rows sharing the counts.
    df = df.assign(sequence=df["sequence"].str.split())
    df = df.explode("sequence").reset_index(drop=True)
    if verbose:
        print(df)

    # Back-calculate the marker coverage from the count and its percentage.
    # A zero percentage is masked first: dividing by it would give inf, which
    # cannot be cast to Int64; the coverage is left missing (<NA>) instead.
    nonzero_pct = df["total_mp_sum"].where(df["total_mp_sum"] != 0)
    df["interpolated_total_coverage"] = np.ceil(df["total"] / (nonzero_pct / 100)).astype("Int64")

    # Combine rows carrying the same variant within one marker.
    grouped_same_marker = df.groupby(["marker", "sequence"], as_index=False).agg(
        total=("total", "sum"),
        total_mp_sum=("total_mp_sum", "sum"),
        interpolated_total_coverage=("interpolated_total_coverage", "max"),
    )

    # "Other sequences" was split into "Other" and "sequences"; the "Other"
    # row carries the marker's unassigned read count.
    other_per_marker = grouped_same_marker[grouped_same_marker["sequence"] == "Other"][["marker", "total"]]
    other_per_marker = other_per_marker.rename(columns={"total": "other_coverage"})
    if verbose:
        print(other_per_marker)
    grouped_same_marker = grouped_same_marker.merge(other_per_marker, on="marker", how="left")
    grouped_same_marker["other_coverage"] = grouped_same_marker["other_coverage"].fillna(0)

    # Coverage without the "Other" reads, kept at 1 or more so that the
    # frequency division below never divides by zero.
    grouped_same_marker["adjusted_coverage"] = (
        grouped_same_marker["interpolated_total_coverage"] - grouped_same_marker["other_coverage"]
    ).clip(lower=1)

    # Combine the same variant seen in several markers.
    grouped_final = grouped_same_marker.groupby("sequence", as_index=False).agg(
        marker=("marker", "first"),
        total=("total", "sum"),
        adjusted_coverage=("adjusted_coverage", "sum"),
        num_markers=("marker", "nunique")
    )

    grouped_final["variant_frequency"] = (
        grouped_final["total"] / grouped_final["adjusted_coverage"] * 100
    ).round(2)

    grouped_final["position"] = grouped_final["sequence"].apply(extract_position)

    # Remove bookkeeping labels. The copy keeps the column assignment below
    # from writing into a slice of the previous frame.
    drop_seqs = ["Other", "sequences", "REF", "N3107DEL"]
    grouped_final = grouped_final[~grouped_final["sequence"].isin(drop_seqs)].copy()

    grouped_final["is_noise_or_low_frq"] = (
        grouped_final["sequence"].isin(["Other sequences"])
    ) | (grouped_final["variant_frequency"] < noise_frequency_pct)
    grouped_final = grouped_final[~grouped_final["is_noise_or_low_frq"]]
    if verbose:
        print(grouped_final)

    # Merge a deletion and a substitution reported at the same position.
    # Only positions with exactly two remaining rows are considered.
    merged_rows = []
    merged_indices = []

    for pos, group in grouped_final.groupby("position"):
        if group.shape[0] != 2:
            continue

        is_deletion = group["sequence"].str.endswith("DEL")
        del_row = group[is_deletion]
        sub_row = group[~is_deletion]
        if del_row.empty or sub_row.empty:
            continue

        # Skip the pair when the second label is not a plain substitution.
        sub_match = re.match(r'([ACGT])(\d+)([ACGT])', sub_row["sequence"].iloc[0])
        if sub_match is None:
            continue
        ref, pos_str, alt = sub_match.groups()

        sub_freq = sub_row["variant_frequency"].iloc[0]
        del_freq = del_row["variant_frequency"].iloc[0]

        # Combined reads over the deletion row's adjusted coverage.
        # NOTE(review): the earlier per-marker "clean total" column this used
        # to read is no longer computed; adjusted_coverage is assumed to be its
        # replacement — confirm.
        total = del_row["total"].iloc[0] + sub_row["total"].iloc[0]
        coverage = del_row["adjusted_coverage"].iloc[0]
        if pd.notna(coverage) and coverage > 0:
            freq = round(float(total) / float(coverage) * 100, 2)
        else:
            freq = 0.0

        # If the deletion is the minor variant, the alternate base is lower-cased.
        deletion_is_minor = pd.notna(del_freq) and del_freq < length_heteroplasmy_threshold
        merged_seq = f"{ref}{pos_str}{alt.lower()}" if deletion_is_minor else f"{ref}{pos_str}{alt}"

        merged_rows.append({
            "sequence": merged_seq,
            "marker": sub_row["marker"].iloc[0],  # arbitrary
            "total": total,
            "adjusted_coverage": coverage,
            "num_markers": f"sub:{sub_row['num_markers'].iloc[0]} | del:{del_row['num_markers'].iloc[0]}",
            # Kept numeric: resolve_heteroplasmy compares this value against
            # its thresholds. The per-variant breakdown goes in its own column.
            # NOTE(review): an upper-case merged label whose combined frequency
            # is below the point threshold will still be given an IUPAC code by
            # resolve_heteroplasmy — confirm that is wanted.
            "variant_frequency": freq,
            "variant_frequency_detail": f"sub:{sub_freq} | del:{del_freq}",
            "is_noise_or_low_frq": False,
            "position": float(pos)
        })
        merged_indices.extend([del_row.index[0], sub_row.index[0]])

    # Replace each merged pair by its single combined row.
    grouped_final = grouped_final.drop(index=merged_indices)
    if merged_rows:
        grouped_final = pd.concat([grouped_final, pd.DataFrame(merged_rows)], ignore_index=True)

    grouped_final = grouped_final.sort_values(by="position").drop(columns=["position"])

    # apply(axis=1) on an empty frame returns a DataFrame, which cannot be
    # assigned to a single column, so skip relabelling when nothing survived.
    if not grouped_final.empty:
        grouped_final["sequence"] = grouped_final.apply(
            lambda row: resolve_heteroplasmy(
                row,
                min_variant_frequency_pct,
                length_heteroplasmy_threshold,
                IUPAC_CODES
            ),
            axis=1
        )

    # Append the low-coverage single-row markers set aside at the start.
    grouped_final = pd.concat([grouped_final, single_low_coverage], ignore_index=False)

    # Attach the genome range of each row's marker.
    marker_to_range = load_marker_ranges(marker_ranges_path)
    grouped_final["marker_range"] = grouped_final["marker"].map(marker_to_range)

    # Final order follows the number embedded in the marker name.
    grouped_final["position"] = grouped_final["marker"].apply(extract_position)
    grouped_final = grouped_final.sort_values(by="position").drop(columns=["position"])

    return grouped_final

# Input SAST table for the run in the next cell.
tsv_path = "2800M_L001.sast.csv"


In [137]:
# Run the SAST post-processing on the file named by tsv_path with the default
# thresholds; the function prints its intermediate tables while it runs.
processed_df = process_fdstools_sast(tsv_path)

Empty DataFrame
Columns: [marker, sequence, flags, total, total_mp_sum]
Index: []
        marker sequence   flags  total  total_mp_sum
0     mtNG_001    T152C  allele    492       94.8000
1     mtNG_001     G97T     NaN      4        0.7710
2     mtNG_001    T152C     NaN      4        0.7710
3     mtNG_001   C19DEL     NaN      1        0.1930
4     mtNG_001    T152C     NaN      1        0.1930
...        ...      ...     ...    ...           ...
6212  mtNG_100  C16478A     NaN      1        0.0315
6213  mtNG_100  T16519C     NaN      1        0.0315
6214  mtNG_100  T16475C     NaN      1        0.0315
6215  mtNG_100  G16516T     NaN      1        0.0315
6216  mtNG_100  T16519A     NaN      1        0.0315

[6217 rows x 5 columns]
Empty DataFrame
Columns: [marker, other_coverage]
Index: []
        marker sequence  total  total_mp_sum  interpolated_total_coverage  \
0     mtNG_001    A148T      1        0.1930                          519   
1     mtNG_001     A87G      1        0.193

In [127]:
# Display the processed table.
# NOTE(review): this cell's execution count (127) is lower than that of the
# cell defining process_fdstools_sast (136), and the saved output shows an
# interpolated_total_coverage column rather than adjusted_coverage, so the
# output appears to come from an earlier version — re-run after Restart & Run All.
processed_df

Unnamed: 0,sequence,marker,total,interpolated_total_coverage,num_markers,variant_frequency,is_noise_or_low_frq,flags,total_mp_sum,marker_range
4306,T152C,mtNG_001,1956,1958,2.0,99.9,False,,,chrM:19–155
749,A263G,mtNG_002,2663,2666,2.0,99.89,False,,,chrM:134–266
15,-315.1C,mtNG_003,1216,1227,1.0,99.1,False,,,chrM:260–368
4645,T477C,mtNG_005,618,619,1.0,99.84,False,,,chrM:431–590
1167,A750G,mtNG_006,2454,2460,1.0,99.76,False,,,chrM:573–767
428,A1438G,mtNG_010,1683,1687,2.0,99.76,False,,,chrM:1278–1442
3376,G3010A,mtNG_021,1282,1283,1.0,99.92,False,,,chrM:2925–3113
3400,G3407K,mtNG_023,19,488,1.0,3.89,False,,,chrM:3297–3487
947,A4769G,mtNG_031,388,389,1.0,99.74,False,,,chrM:4720–4916
1273,A8860G,mtNG_054,839,841,1.0,99.76,False,,,chrM:8680–8876
