# Definitions

In [2]:
import pandas as pd

def fasta_to_dataframe(fasta_file):
    """Parse a FASTA file into a two-column DataFrame.

    Every line starting with '>' opens a new record; the text after the '>'
    becomes the record's header. All following lines, up to the next header,
    are stripped of surrounding whitespace and concatenated into the record's
    sequence. Lines that appear before the first header are ignored.

    Parameters
    ----------
    fasta_file : path-like
        Path of the FASTA file to read (opened in text mode).

    Returns
    -------
    pandas.DataFrame
        One row per record with the columns 'Header' and 'Sequence'.
    """
    # Each entry is (header, [sequence chunks]); the chunk list is filled in
    # place while the lines belonging to that record are read.
    records = []
    with open(fasta_file, 'r') as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if text.startswith('>'):
                # New record: drop the leading '>' and start an empty sequence.
                records.append((text[1:], []))
            elif records:
                # Sequence line of the most recently opened record.
                records[-1][1].append(text)

    return pd.DataFrame({
        'Header': [header for header, _ in records],
        'Sequence': [''.join(chunks) for _, chunks in records],
    })

## Read and Format FASTA file

In [28]:
fasta_file = "../data/DB.COX1.trimmed.fna"
tax_columns = ['Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']

fasta_records = fasta_to_dataframe(fasta_file)

# Split each header at the first ';' only: the left part is parsed as
# '|'-separated metadata, the right part as the taxonomy string.
header_halves = fasta_records['Header'].str.split(';', n=1, expand=True)
header_halves.columns = ['BOLD_Metadata', 'Taxonomy']

# The metadata part is split into at most three '|'-separated fields.
bold_fields = header_halves['BOLD_Metadata'].str.split('|', n=2, expand=True)
bold_fields.columns = ['BOLD_ID', 'Specimen_ID', 'Country']
bold_fields['BOLD_ID'] = bold_fields['BOLD_ID'].str.removeprefix('BOLD:')

# Pull the six ranks out of the "p:..,c:..,o:..,f:..,g:..,s:.." pattern;
# rows that do not match the pattern get NaN in every rank column.
taxonomy = header_halves['Taxonomy'].str.extract(
    r'p:([^,]+),c:([^,]+),o:([^,]+),f:([^,]+),g:([^,]+),s:([^;]+)'
)
taxonomy.columns = tax_columns

# Keep everything except the raw header, then append the parsed columns.
cutadapt_result = pd.concat(
    [fasta_records.drop(columns=['Header']), bold_fields, taxonomy],
    axis=1,
)

## Reading my Result

In [3]:
my_result = pd.read_csv("../../data/current/primer-finder-fixed3.csv", sep=";")

# The slicing below needs integer positions and a string whose length can be
# taken, so cast the three columns involved in one step.
my_result = my_result.astype({'f_index': int, 'b_index': int, 'f_match': str})


def _region_of(row):
    """Return row['read'] sliced from f_index + len(f_match) up to b_index."""
    # NOTE(review): presumably f_index is where f_match starts in the read, so
    # the slice begins right after that match - confirm against the CSV producer.
    region_start = row['f_index'] + len(row['f_match'])
    return row['read'][region_start:row['b_index']]


my_result['region'] = my_result.apply(_region_of, axis=1)
my_result['region_length'] = my_result['region'].str.len()

         BOLD ID       Read ID     Country            Phylum      Class  \
0  >BOLD:AAA7085  BLPDA1021-18  Costa_Rica  tax=p:Arthropoda  c:Insecta   
1  >BOLD:AAA7085  MHMYN6354-14  Costa_Rica  tax=p:Arthropoda  c:Insecta   
2  >BOLD:AAA7085   MHASB734-07  Costa_Rica  tax=p:Arthropoda  c:Insecta   
3  >BOLD:AAA7085   MHATB310-06  Costa_Rica  tax=p:Arthropoda  c:Insecta   
4  >BOLD:AAA7085  BLPAA6663-17  Costa_Rica  tax=p:Arthropoda  c:Insecta   

           Order         Family      Genus                  Species  f_score  \
0  o:Lepidoptera  f:Saturniidae  g:Lonomia  s:Lonomia_santarosensis       49   
1  o:Lepidoptera  f:Saturniidae  g:Lonomia  s:Lonomia_santarosensis       49   
2  o:Lepidoptera  f:Saturniidae  g:Lonomia  s:Lonomia_santarosensis       49   
3  o:Lepidoptera  f:Saturniidae  g:Lonomia  s:Lonomia_santarosensis       49   
4  o:Lepidoptera  f:Saturniidae  g:Lonomia  s:Lonomia_santarosensis       49   

                      f_match  f_index  b_score                  b_m

## Compare Mine and Cutadapt
* Compare my perfect matches with the cutadapt result
* Filter out all of my matches that don't exist in the cutadapt result
* What else?