In [2]:
import pandas as pd

def fasta_to_dataframe(fasta_file):
    """Read a FASTA file into a DataFrame with one row per record.

    Parameters
    ----------
    fasta_file : path to the FASTA file; it is opened in text mode.

    Returns
    -------
    pandas.DataFrame
        Two columns: 'Header' (the header line without its leading '>') and
        'Sequence' (all lines of the record, whitespace-stripped and joined
        into a single string).

    Notes
    -----
    Lines that appear before the first header are discarded. A header that is
    followed by no sequence lines yields an empty-string sequence.
    """
    record_names = []
    record_seqs = []
    name = None      # header of the record currently being read
    fragments = []   # stripped sequence lines collected for that record

    with open(fasta_file, 'r') as handle:
        for raw_line in handle:
            text = raw_line.strip()

            if not text.startswith('>'):
                # Sequence (or blank) line: keep accumulating.
                fragments.append(text)
                continue

            # A new header closes the record in progress, if there is one.
            if name is not None:
                record_names.append(name)
                record_seqs.append(''.join(fragments))
            name, fragments = text[1:], []

    # The final record has no following header to close it.
    if name is not None:
        record_names.append(name)
        record_seqs.append(''.join(fragments))

    return pd.DataFrame({'Header': record_names, 'Sequence': record_seqs})

In [3]:
# Relative path to the COX1 FASTA file, resolved against the current working directory.
fasta_file = "../data/DB.COX1.fna"
# One row per FASTA record, with the raw header text in 'Header' and the
# joined sequence in 'Sequence'.
df = fasta_to_dataframe(fasta_file)

In [4]:
# Headers are laid out as "<BOLD_ID>|<Specimen_ID>|<Country>;<taxonomy>".
# Peel them apart in two passes: first at the ';', then the left half at '|'.
# n is one less than the number of target columns, so each split produces at
# most as many pieces as there are columns to fill.
for source_col, separator, target_cols in (
    ('Header', ';', ['BOLD_Metadata', 'Taxonomy']),
    ('BOLD_Metadata', '|', ['BOLD_ID', 'Specimen_ID', 'Country']),
):
    df[target_cols] = df[source_col].str.split(
        separator, n=len(target_cols) - 1, expand=True
    )

# One capture group per rank, in the same order as tax_columns. Rows whose
# taxonomy string does not match the pattern get NaN in all six columns.
tax_columns = ['Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']
df[tax_columns] = df['Taxonomy'].str.extract(
    r'p:([^,]+),c:([^,]+),o:([^,]+),f:([^,]+),g:([^,]+),s:([^;]+)',
    expand=True,
)

# The raw header and the two scratch columns have served their purpose.
df = df.drop(['Header', 'BOLD_Metadata', 'Taxonomy'], axis='columns')



                                            Sequence       BOLD_ID  \
0  TCTTTGGAATTTGGGCAGGAATAGTAGGAACTTCTTTAAGTTTATT...  BOLD:AAA7085   
1  TACTTTATATTTTATCTTTGGAATTTGGGCAGGAATAGTAGGAACT...  BOLD:AAA7085   
2  TACTTTATATTTTATCTTTGGAATTTGGGCAGGAATAGTAGGAACT...  BOLD:AAA7085   
3  TACTTTATATTTTATCTTTGGAATTTGGGCAGGAATAGTAGGAACT...  BOLD:AAA7085   
4  TACTTTATATTTTATCTTTGGAATTTGGGCAGGAATAGTAGGAACT...  BOLD:AAA7085   

    Specimen_ID     Country      Phylum    Class        Order       Family  \
0  BLPDA1021-18  Costa_Rica  Arthropoda  Insecta  Lepidoptera  Saturniidae   
1  MHMYN6354-14  Costa_Rica  Arthropoda  Insecta  Lepidoptera  Saturniidae   
2   MHASB734-07  Costa_Rica  Arthropoda  Insecta  Lepidoptera  Saturniidae   
3   MHATB310-06  Costa_Rica  Arthropoda  Insecta  Lepidoptera  Saturniidae   
4  BLPAA6663-17  Costa_Rica  Arthropoda  Insecta  Lepidoptera  Saturniidae   

     Genus                Species  
0  Lonomia  Lonomia_santarosensis  
1  Lonomia  Lonomia_santarosensis  
2 

In [5]:
# Keep only the bare identifier, e.g. 'BOLD:AAA7085' -> 'AAA7085'. Values
# without the prefix are left unchanged, so re-running this cell is harmless.
df['BOLD_ID'] = df['BOLD_ID'].str.removeprefix('BOLD:')
# Preview the first five rows of the parsed table.
print(df.head())

                                            Sequence  BOLD_ID   Specimen_ID  \
0  TCTTTGGAATTTGGGCAGGAATAGTAGGAACTTCTTTAAGTTTATT...  AAA7085  BLPDA1021-18   
1  TACTTTATATTTTATCTTTGGAATTTGGGCAGGAATAGTAGGAACT...  AAA7085  MHMYN6354-14   
2  TACTTTATATTTTATCTTTGGAATTTGGGCAGGAATAGTAGGAACT...  AAA7085   MHASB734-07   
3  TACTTTATATTTTATCTTTGGAATTTGGGCAGGAATAGTAGGAACT...  AAA7085   MHATB310-06   
4  TACTTTATATTTTATCTTTGGAATTTGGGCAGGAATAGTAGGAACT...  AAA7085  BLPAA6663-17   

      Country      Phylum    Class        Order       Family    Genus  \
0  Costa_Rica  Arthropoda  Insecta  Lepidoptera  Saturniidae  Lonomia   
1  Costa_Rica  Arthropoda  Insecta  Lepidoptera  Saturniidae  Lonomia   
2  Costa_Rica  Arthropoda  Insecta  Lepidoptera  Saturniidae  Lonomia   
3  Costa_Rica  Arthropoda  Insecta  Lepidoptera  Saturniidae  Lonomia   
4  Costa_Rica  Arthropoda  Insecta  Lepidoptera  Saturniidae  Lonomia   

                 Species  
0  Lonomia_santarosensis  
1  Lonomia_santarosensis  
2  Lo

In [6]:
# Number of records per taxonomic Order (missing values are not counted) ...
order_counts = df['Order'].value_counts()

# ... and the same tally expressed as a share of all counted records.
order_relative = df['Order'].value_counts(normalize=True)

# Place the two tallies side by side, one row per Order. Both Series carry the
# same index, so the column-wise concatenation lines them up row for row.
order_stats = pd.concat(
    [order_counts, order_relative],
    axis=1,
    keys=['Absolute Frequency', 'Relative Frequency'],
)

print(order_stats)

                 Absolute Frequency  Relative Frequency
Order                                                  
Lepidoptera                  687953        2.889169e-01
Diptera                      642639        2.698865e-01
Hymenoptera                  242091        1.016700e-01
Coleoptera                   197958        8.313563e-02
Hemiptera                    127555        5.356877e-02
...                             ...                 ...
Diplura_order                     2        8.399320e-07
Stygiomysida                      2        8.399320e-07
Mystacocaridida                   1        4.199660e-07
Platycopioida                     1        4.199660e-07
Glomeridesmida                    1        4.199660e-07

[122 rows x 2 columns]


In [9]:
# (rows, columns) of the summary table: one row per distinct Order.
order_stats.shape

(122, 2)