In [None]:
from Bio import SeqIO
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()

In [None]:
# Names assigned to the nine tab-separated columns of a GFF record, in file order.
# Kept as a list (not a tuple) because it is also used as a DataFrame column selector.
GFF_COLS = [
    "seqname", "source", "feature",   # where the record lives and what it is
    "start", "end",                   # coordinates of the feature
    "score", "strand", "frame",
    "attribute",                      # semicolon-separated key=value pairs
]

In [None]:
# Input annotation (GFF3) whose CDS entries will be trimmed.
gff_file = (
    "Phytozome/PhytozomeV12/Ahypochondriacus/annotation/"
    "Ahypochondriacus_315_v1.0.gene.gff3"
)
# FASTA of the primary-transcript CDS sequences, used to measure the original total length.
original_cds_file = (
    "Phytozome/PhytozomeV12/Ahypochondriacus/annotation/"
    "Ahypochondriacus_315_v1.0.cds_primaryTranscriptOnly.fa"
)

## Parsing original CDS

In [None]:
# Read every CDS record of the FASTA file into memory.
with open(original_cds_file, 'r') as fin:
    original_cds_records = list(SeqIO.parse(fin, 'fasta'))

# Per-record identifiers and sequence lengths, plus the summed length over all records.
original_cds_ids = [record.id for record in original_cds_records]
original_cds_record_lengths = [len(record.seq) for record in original_cds_records]
original_length_sum = sum(original_cds_record_lengths)

# Report the number of records, then the total sequence length.
print(len(original_cds_records))
print(original_length_sum)

## Determining trimming ratio

In [None]:
# Total CDS length the trimmed annotation should add up to
# (same unit as `original_length_sum`, i.e. sequence characters).
target_length_sum = 2E7

# An empty FASTA would otherwise surface as a bare ZeroDivisionError below.
if original_length_sum <= 0:
    raise ValueError(
        f"original_length_sum must be positive, got {original_length_sum}"
    )

# Fraction of each gene's CDS length that has to be removed to reach the target.
trimming_ratio  = 1 - target_length_sum / original_length_sum

# A negative ratio (target larger than the original total) would make the
# trimming step *extend* CDS segments instead of shortening them.
if not 0 <= trimming_ratio < 1:
    raise ValueError(
        f"trimming_ratio must be in [0, 1), got {trimming_ratio} "
        f"(target_length_sum={target_length_sum}, original_length_sum={original_length_sum})"
    )
trimming_ratio

## Trimming sequences in GFF file

In [None]:
# Load the annotation as a headerless tab-separated table. The first three lines
# are skipped (presumably the leading '##' directive lines -- confirm if the
# input file changes).
df = pd.read_csv(gff_file, sep="\t", header=None, names=GFF_COLS, skiprows=3)
# keep only CDS entries; take an explicit copy so that adding a column below
# modifies an independent frame instead of a slice (avoids SettingWithCopyWarning)
df = df[df["feature"] == "CDS"].copy()
# extract the gene name: take the second ';'-separated attribute, keep what
# follows "Parent=", and drop everything from the first '.' onwards
df["gene"] = (
    df["attribute"]
    .str.split(";").str[1]
    .str.split("Parent=").str[1]
    .str.split(".").str[0]
)

In [None]:
unwanted_proteins_file = "unwanted_proteins.txt"
# One identifier per line; surrounding whitespace (including the newline) is stripped.
with open(unwanted_proteins_file, "r") as fin:
    unwanted_proteins = list(map(str.strip, fin))
# Drop every CDS row whose gene name appears in the exclusion list.
df = df.loc[~df["gene"].isin(unwanted_proteins)]

In [None]:
def trim_gene(group: pd.DataFrame, ratio=None) -> pd.DataFrame:
    """Trim the 3' end of one gene's CDS segments by a fraction of its total length.

    The segments are ordered 5' -> 3' along the transcript (ascending ``start``
    on the "+" strand, descending ``start`` otherwise). Whole segments are then
    dropped from the 3' end while they fit in the trim budget, and the first
    segment that does not fit is shortened by what is left of the budget
    (its ``end`` is lowered on "+", its ``start`` is raised otherwise).

    Parameters
    ----------
    group : pd.DataFrame
        CDS rows of a single gene; must provide ``start``, ``end`` and
        ``strand`` columns. The strand of the first row is used for all rows.
    ratio : float, optional
        Fraction of the total CDS length to remove, in ``[0, 1)``. When
        omitted, the notebook-level ``trimming_ratio`` is used, so the function
        can still be passed directly to ``groupby(...).apply``.

    Returns
    -------
    pd.DataFrame
        The kept segments in 5' -> 3' order with a fresh 0..n-1 index; the last
        row is the (possibly) shortened segment. Column dtypes are preserved.

    Raises
    ------
    ValueError
        If the ratio is outside ``[0, 1)``.

    Notes
    -----
    NOTE(review): the number of removed bases is ``int(total * ratio)`` and is
    not rounded to a multiple of three, so the trimmed CDS may end in a partial
    codon -- confirm this is acceptable downstream.
    """
    if ratio is None:
        # Fall back to the value computed earlier in the notebook.
        ratio = trimming_ratio
    if not 0 <= ratio < 1:
        # ratio < 0 would extend the CDS; ratio >= 1 would leave nothing to keep.
        raise ValueError(f"trimming ratio must be in [0, 1), got {ratio}")

    strand = group["strand"].iloc[0]
    # in case of "-" strand the transcript runs towards lower coordinates,
    # so the 5' -> 3' order is descending `start`
    ordered = group.sort_values(by="start", ascending=bool(strand == "+"))

    seg_lengths = ordered["end"] - ordered["start"] + 1

    # calculate how much to trim
    remaining_trim = int(seg_lengths.sum() * ratio)

    # iterate from the last (3'-most) CDS to the first, dropping whole segments
    # while they fit in the budget; since ratio < 1 the budget is always smaller
    # than the total length, so the loop stops on a segment that is kept
    last_kept = len(ordered) - 1
    for seg_length in seg_lengths.iloc[::-1]:
        if remaining_trim < seg_length:
            break
        remaining_trim -= seg_length
        last_kept -= 1

    # Select the kept segments by *position* in the sorted frame. Comparing the
    # original row labels instead would only be correct when the input rows
    # happen to be listed in 5' -> 3' order.
    trimmed_group = ordered.iloc[: last_kept + 1].copy()

    # shorten the last kept segment on its 3' side
    if strand == "+":
        end_pos = trimmed_group.columns.get_loc("end")
        trimmed_group.iloc[-1, end_pos] -= remaining_trim
    else:
        start_pos = trimmed_group.columns.get_loc("start")
        trimmed_group.iloc[-1, start_pos] += remaining_trim

    return trimmed_group.reset_index(drop=True)

In [None]:
# Trim every gene independently (with a progress bar), then flatten the
# per-gene results back into a single table with a plain 0..n-1 index.
trimmed_df = (
    df.groupby("gene")
    .progress_apply(trim_gene)
    .reset_index(drop=True)
)

## Exporting new GFF file

In [None]:
outfile = "Ahypochondriacus_315_v1.0.gene.cds_only.cter_trimmed.gff3"
# Write only the GFF columns (helper columns are left out), tab-separated,
# without a header row or the index.
trimmed_df[GFF_COLS].to_csv(
    outfile,
    sep="\t",
    header=False,
    index=False,
)