# Definitions

In [2]:
import pandas as pd

def fasta_to_dataframe(fasta_file):
    """Read a FASTA file into a two-column DataFrame.

    Every line is stripped of surrounding whitespace. A line starting with
    '>' opens a new record and its text after the '>' becomes the header;
    all following non-header lines are concatenated into that record's
    sequence. Lines that appear before the first header are discarded.

    Args:
        fasta_file: Path of the FASTA file to open in text mode.

    Returns:
        pandas.DataFrame with columns 'Header' and 'Sequence', one row per
        record in file order.
    """
    records = []   # completed (header, sequence) pairs
    name = None    # header of the record being collected, if any
    chunks = []    # sequence fragments of the record being collected

    with open(fasta_file, 'r') as handle:
        for raw in handle:
            text = raw.strip()
            if not text.startswith('>'):
                chunks.append(text)
                continue
            # A new header closes whatever record was open before it.
            if name is not None:
                records.append((name, ''.join(chunks)))
            name = text[1:]
            chunks = []

    # The final record has no following header to close it.
    if name is not None:
        records.append((name, ''.join(chunks)))

    headers = [header for header, _ in records]
    sequences = [sequence for _, sequence in records]
    return pd.DataFrame({'Header': headers, 'Sequence': sequences})

In [4]:
# FASTA input (DB.COX1.fna); the path is relative, so it resolves against the
# notebook's current working directory.
fasta_file = "../../data/DB.COX1.fna"
# Parse the file into a DataFrame with one row per record and the columns
# 'Header' and 'Sequence'.
df = fasta_to_dataframe(fasta_file)

## Clean Data Frame

In [28]:
# Split each FASTA header into identifier and taxonomy columns.
# Header layout implied by the parsing steps below:
#   BOLD:<id>|<specimen>|<country>;p:<phylum>,c:<class>,o:<order>,f:<family>,g:<genus>,s:<species>
tax_columns = ['Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']
parsed_columns = ['BOLD_ID', 'Specimen_ID', 'Country', *tax_columns]

if 'Header' in df.columns:
    # Work on a copy and rebind `df` once at the end, so an error midway does
    # not leave `df` with a partial set of helper columns.
    parsed = df.copy()

    # Everything before the first ';' is BOLD metadata, the rest is taxonomy.
    parsed[['BOLD_Metadata', 'Taxonomy']] = parsed['Header'].str.split(';', n=1, expand=True)
    # The metadata is three '|'-separated fields.
    parsed[['BOLD_ID', 'Specimen_ID', 'Country']] = parsed['BOLD_Metadata'].str.split('|', n=2, expand=True)
    parsed['BOLD_ID'] = parsed['BOLD_ID'].str.removeprefix('BOLD:')

    # One capture group per rank; rows whose taxonomy string does not match
    # the pattern get NaN in all six columns.
    parsed[tax_columns] = parsed['Taxonomy'].str.extract(
        r'p:([^,]+),c:([^,]+),o:([^,]+),f:([^,]+),g:([^,]+),s:([^;]+)'
    )

    # Drop the raw header and the intermediate helper columns.
    df = parsed.drop(columns=['Header', 'BOLD_Metadata', 'Taxonomy'])
else:
    # 'Header' is dropped by a successful run, so re-running this cell on an
    # already-parsed frame is a no-op. Anything else is an unexpected frame.
    missing = [column for column in parsed_columns if column not in df.columns]
    if missing:
        raise KeyError(
            f"df has no 'Header' column and is missing parsed columns {missing}"
        )

In [30]:
# Preview the first rows of the cleaned frame. Ending the cell with the bare
# expression lets Jupyter render it with the DataFrame's rich repr rather than
# the plain text produced by print().
df.head()

                                            Sequence  BOLD_ID   Specimen_ID  \
0  TCTTTGGAATTTGGGCAGGAATAGTAGGAACTTCTTTAAGTTTATT...  AAA7085  BLPDA1021-18   
1  TACTTTATATTTTATCTTTGGAATTTGGGCAGGAATAGTAGGAACT...  AAA7085  MHMYN6354-14   
2  TACTTTATATTTTATCTTTGGAATTTGGGCAGGAATAGTAGGAACT...  AAA7085   MHASB734-07   
3  TACTTTATATTTTATCTTTGGAATTTGGGCAGGAATAGTAGGAACT...  AAA7085   MHATB310-06   
4  TACTTTATATTTTATCTTTGGAATTTGGGCAGGAATAGTAGGAACT...  AAA7085  BLPAA6663-17   

      Country      Phylum    Class        Order       Family    Genus  \
0  Costa_Rica  Arthropoda  Insecta  Lepidoptera  Saturniidae  Lonomia   
1  Costa_Rica  Arthropoda  Insecta  Lepidoptera  Saturniidae  Lonomia   
2  Costa_Rica  Arthropoda  Insecta  Lepidoptera  Saturniidae  Lonomia   
3  Costa_Rica  Arthropoda  Insecta  Lepidoptera  Saturniidae  Lonomia   
4  Costa_Rica  Arthropoda  Insecta  Lepidoptera  Saturniidae  Lonomia   

                 Species  
0  Lonomia_santarosensis  
1  Lonomia_santarosensis  
2  Lo