In [1]:
from Bio import SeqIO
from tqdm.auto import tqdm
# Register tqdm's progress-bar helpers (e.g. `progress_apply`) on pandas objects.
# None of the cells visible in this section call them.
tqdm.pandas()

## After creation of CDS file with AGAT

### Parsing new CDS file

In [2]:
# Load every record of the AGAT-generated CDS FASTA into memory.
cds_file = "Ahypochondriacus_315_v1.0.cds.fa"
with open(cds_file, 'r') as cds_handle:
    cds_records = [record for record in SeqIO.parse(cds_handle, format='fasta')]

# Collect identifiers and sequence lengths in file order.
cds_ids = []
cds_record_lengths = []
for record in cds_records:
    cds_ids.append(record.id)
    cds_record_lengths.append(len(record.seq))
length_sum = sum(cds_record_lengths)

# Summary of what was parsed.
print(f"Number of CDS sequences: {len(cds_records)}")
print(f"Cumulated length of all CDS: {length_sum}")

Number of CDS sequences: 22398
Cumulated length of all CDS: 19581078


### Checking if the new sequences are a subset of the original ones

In [3]:
# Phytozome V12 CDS FASTA (primary transcripts only, per the file name); the cells
# below parse it and use it as the reference the new CDS records are compared against.
original_cds_file = "Phytozome/PhytozomeV12/Ahypochondriacus/annotation/Ahypochondriacus_315_v1.0.cds_primaryTranscriptOnly.fa"

In [9]:
# Parse the reference CDS FASTA, then order the records by identifier.
with open(original_cds_file, 'r') as original_handle:
    original_cds_records = list(SeqIO.parse(original_handle, format='fasta'))
original_cds_records.sort(key=lambda record: record.id)

# Identifiers and sequence lengths, in the sorted order established above.
original_cds_ids = []
original_cds_record_lengths = []
for record in original_cds_records:
    original_cds_ids.append(record.id)
    original_cds_record_lengths.append(len(record.seq))

# Total number of bases across all original CDS sequences.
original_length_sum = sum(original_cds_record_lengths)

In [5]:
# Check that every new CDS record corresponds to a record of the original file.
# It's not a problem if new sequences are present, they were in the annotation file anyway,
# so missing IDs are only reported, not treated as errors.
#
# New IDs carry a suffix after the first "." (e.g. "AHYPO_000257-RA.v1.0"); only the part
# before that dot is compared against the original IDs.
# Build the set once so each lookup is O(1) instead of a scan of the whole ID list.
original_cds_id_set = set(original_cds_ids)
for cds_id in cds_ids:
    if cds_id.split(".")[0] not in original_cds_id_set:
        print(f"Record {cds_id} not found in original file")
    

Record AHYPO_000257-RA.v1.0 not found in original file
Record AHYPO_000714-RB.v1.0 not found in original file
Record AHYPO_002098-RB.v1.0 not found in original file
Record AHYPO_003994-RB.v1.0 not found in original file
Record AHYPO_004528-RA.v1.0 not found in original file
Record AHYPO_006727-RA.v1.0 not found in original file
Record AHYPO_007499-RB.v1.0 not found in original file
Record AHYPO_008341-RA.v1.0 not found in original file
Record AHYPO_008697-RA.v1.0 not found in original file
Record AHYPO_012862-RB.v1.0 not found in original file
Record AHYPO_012998-RB.v1.0 not found in original file
Record AHYPO_014835-RB.v1.0 not found in original file
Record AHYPO_015049-RB.v1.0 not found in original file
Record AHYPO_015496-RA.v1.0 not found in original file
Record AHYPO_015878-RA.v1.0 not found in original file
Record AHYPO_016548-RB.v1.0 not found in original file
Record AHYPO_018999-RB.v1.0 not found in original file
Record AHYPO_019180-RA.v1.0 not found in original file
Record AHY

### Checking if the size ratio is respected

In [10]:
# Total CDS length (in bases) the trimmed set is meant to reach.
target_length_sum = 2E7
# Fraction of the original cumulated length that the target represents.
kept_fraction = target_length_sum / original_length_sum
# Fraction that has to be removed to get from the original down to the target.
trimming_ratio = 1 - kept_fraction
trimming_ratio

0.26009715812197265

In [11]:
# Validate each new CDS record against its original counterpart:
#   1. the new sequence must be contained in the original sequence;
#   2. new length / original length must lie within `tolerance` (relative) of the
#      expected ratio derived from `trimming_ratio`.
# Raises ValueError on the first record that violates either condition.
# Records without a matching original ID are skipped here.
tolerance = 0.1
expected_length_ratio = 1 - trimming_ratio
min_ratio = expected_length_ratio * (1 - tolerance)
max_ratio = expected_length_ratio * (1 + tolerance)

# Index the original records by ID once, so each new record is matched with a
# dictionary lookup instead of a full scan over all original records.
# A list per ID keeps every original record should an ID appear more than once.
original_records_by_id = {}
for original_record in original_cds_records:
    original_records_by_id.setdefault(original_record.id, []).append(original_record)

for record in tqdm(cds_records):
    # New IDs carry a suffix after the first "."; the original IDs do not.
    base_id = record.id.split(".")[0]
    for original_record in original_records_by_id.get(base_id, ()):
        # checking if the new sequence is contained in the original one
        if record.seq not in original_record.seq:
            raise ValueError(f'Sequence of {record.id} is not included in the original one')

        # checking if the size ratio is respected
        actual_ratio = len(record.seq) / len(original_record.seq)
        if not min_ratio <= actual_ratio <= max_ratio:
            raise ValueError(f'Ratio of {record.id}: {actual_ratio}  is not between {min_ratio} and {max_ratio}')
        

  0%|          | 0/22398 [00:00<?, ?it/s]

KeyboardInterrupt: 