# Definitions

In [1]:
import pandas as pd

def fasta_to_dataframe(fasta_file):
    headers = []
    sequences = []
    with open(fasta_file, 'r') as f:
        current_sequence = []
        current_header = None
        for line in f:
            line = line.strip()
            if line.startswith('>'):  # Header line
                if current_header is not None:
                    # Save the previous sequence
                    headers.append(current_header)
                    sequences.append(''.join(current_sequence))
                current_header = line[1:]  # Remove '>'
                current_sequence = []
            else:
                current_sequence.append(line)
        # Add the last sequence
        if current_header is not None:
            headers.append(current_header)
            sequences.append(''.join(current_sequence))
    # Create a DataFrame
    return pd.DataFrame({'Header': headers, 'Sequence': sequences})

## Read and Format FASTA file

In [2]:
fasta_file = "../../data/DB.COX1.trimmed.fna"
cutadapt_result = fasta_to_dataframe(fasta_file)

cutadapt_result[['BOLD Metadata', 'Taxonomy']] = cutadapt_result['Header'].str.split(';', n=1, expand=True)
cutadapt_result[['BOLD ID', 'Read ID', 'Country']] = cutadapt_result['BOLD Metadata'].str.split('|', n=2, expand=True)
cutadapt_result['BOLD ID'] = cutadapt_result['BOLD ID'].str.removeprefix('BOLD:')

tax_columns = ['Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']
cutadapt_result[tax_columns] = cutadapt_result['Taxonomy'].str.extract(
    r'p:([^,]+),c:([^,]+),o:([^,]+),f:([^,]+),g:([^,]+),s:([^;]+)'
)

cutadapt_result = cutadapt_result.drop(columns=['Header', 'BOLD Metadata', 'Taxonomy'])
print(cutadapt_result.head())

                                            Sequence  BOLD ID       Read ID  \
0  TTTATCCTCTAACATTGCTCATGGAGGGTCTTCTGTAGATCTAGCT...  AAA7085  BLPDA1021-18   
1  TTTATCCTCTAACATTGCTCATGGAGGGTCTTCTGTAGATCTAGCT...  AAA7085  MHMYN6354-14   
2  TTTATCCTCTAACATTGCTCATGGAGGGTCTTCTGTAGATCTAGCT...  AAA7085   MHASB734-07   
3  TTTATCCTCTAACATTGCTCATGGAGGGTCTTCTGTAGATCTAGCT...  AAA7085   MHATB310-06   
4  TTTATCCTCTAACATTGCTCATGGAGGGTCTTCTGTAGATCTAGCT...  AAA7085  BLPAA6663-17   

      Country      Phylum    Class        Order       Family    Genus  \
0  Costa_Rica  Arthropoda  Insecta  Lepidoptera  Saturniidae  Lonomia   
1  Costa_Rica  Arthropoda  Insecta  Lepidoptera  Saturniidae  Lonomia   
2  Costa_Rica  Arthropoda  Insecta  Lepidoptera  Saturniidae  Lonomia   
3  Costa_Rica  Arthropoda  Insecta  Lepidoptera  Saturniidae  Lonomia   
4  Costa_Rica  Arthropoda  Insecta  Lepidoptera  Saturniidae  Lonomia   

                 Species  
0  Lonomia_santarosensis  
1  Lonomia_santarosensis  
2  Lo

## Reading my Result

In [3]:
my_result = pd.read_csv("../../data/current/primer-finder-fixed3.csv", sep=";")

my_result['f_index'] = my_result['f_index'].astype(int)
my_result['b_index'] = my_result['b_index'].astype(int)
my_result['f_match'] = my_result['f_match'].astype(str)

my_result['region'] = my_result.apply(lambda x: x['read'][(x['f_index'] + len(x['f_match'])):x['b_index']], axis=1)
my_result['region_length'] = my_result['region'].str.len()
print(my_result.head())

         BOLD ID       Read ID     Country            Phylum      Class  \
0  >BOLD:AAA7085  BLPDA1021-18  Costa_Rica  tax=p:Arthropoda  c:Insecta   
1  >BOLD:AAA7085  MHMYN6354-14  Costa_Rica  tax=p:Arthropoda  c:Insecta   
2  >BOLD:AAA7085   MHASB734-07  Costa_Rica  tax=p:Arthropoda  c:Insecta   
3  >BOLD:AAA7085   MHATB310-06  Costa_Rica  tax=p:Arthropoda  c:Insecta   
4  >BOLD:AAA7085  BLPAA6663-17  Costa_Rica  tax=p:Arthropoda  c:Insecta   

           Order         Family      Genus                  Species  f_score  \
0  o:Lepidoptera  f:Saturniidae  g:Lonomia  s:Lonomia_santarosensis       49   
1  o:Lepidoptera  f:Saturniidae  g:Lonomia  s:Lonomia_santarosensis       49   
2  o:Lepidoptera  f:Saturniidae  g:Lonomia  s:Lonomia_santarosensis       49   
3  o:Lepidoptera  f:Saturniidae  g:Lonomia  s:Lonomia_santarosensis       49   
4  o:Lepidoptera  f:Saturniidae  g:Lonomia  s:Lonomia_santarosensis       49   

                      f_match  f_index  b_score                  b_m

## Compare Mine and Cutadapt
* result of my perfect matches with cutadapt result
* filter all which dont exist in cutadapt
* what else?

#### Are there any areas which differ for perfect matches?

In [4]:
perfect_matches = my_result[~((my_result['f_score'] != 52) | (my_result['b_score'] != 46))]
filtered_df = cutadapt_result[cutadapt_result['Read ID'].isin(perfect_matches['Read ID'])]

In [5]:
print(f'Perfect matches: {perfect_matches.shape[0]}')
print(f'Perfect matches found in cutadapt result: {filtered_df.shape[0]}')

dif = perfect_matches.shape[0] - filtered_df.shape[0]
print(f'Diff: {dif}, or {dif / perfect_matches.shape[0] * 100}% were not found in the cutadapt result.\n')

Perfect matches: 898561
Perfect matches found in cutadapt result: 898561
Diff: 0, or 0.0% were not found in the cutadapt result.



In [6]:
merged_df = filtered_df[['Read ID', 'Sequence']].merge(
    perfect_matches[['Read ID', 'region']],
    on='Read ID',
    how='inner'
).rename(columns={
    'Sequence': 'ca_sequence',
    'region': 'my_sequence'
})
different_sequences = merged_df[~(merged_df['ca_sequence'] == merged_df['my_sequence'])]
print(different_sequences.shape[0])
print(different_sequences.head())

28
              Read ID                                        ca_sequence  \
153755  GBMNE23810-21  TTTATCTGGAGGTATTGCCCATGGGGGTGCTTCCGTAGATTTAGCT...   
684885    NEPTA859-13  CTTATCAGCAAATATTGCTCATAGTGGTAGATCAGTTGATTTAGCA...   
684898    NEPTA862-13  CTTATCAGCAAATATTGCTCATAGTGGTAGATCAGTTGATTTAGCA...   
696620    GCOL7710-16  TTTATCTTCTAATATTACTCATAGAGGAGCTTCTGTTGATTTAGCT...   
696621    GENHP169-11  TTTATCTTCTAATATTACTCATAGAGGAGCTTCTGTTGATTTAGCT...   

                                              my_sequence  
153755  TTTATCTGGAGGTATTGCCCATGGGGGTGCTTCCGTAGATTTAGCT...  
684885  CTTATCAGCAAATATTGCTCATAGTGGTAGATCAGTTGATTTAGCA...  
684898  CTTATCAGCAAATATTGCTCATAGTGGTAGATCAGTTGATTTAGCA...  
696620  TTTATCTTCTAATATTACTCATAGAGGAGCTTCTGTTGATTTAGCT...  
696621  TTTATCTTCTAATATTACTCATAGAGGAGCTTCTGTTGATTTAGCT...  


In [7]:
different_sequences['ca_length'] = different_sequences['ca_sequence'].str.len()
different_sequences['my_length'] = different_sequences['my_sequence'].str.len()
# 28 / 898561 * 100

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  different_sequences['ca_length'] = different_sequences['ca_sequence'].str.len()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  different_sequences['my_length'] = different_sequences['my_sequence'].str.len()


#### why do those (28) entires fail with cutadapt?

In [8]:
perfect_failed = my_result[my_result['Read ID'].isin(different_sequences['Read ID'])]
perfect_failed = different_sequences.merge(
    perfect_failed,
    on='Read ID',
    how='inner'
).drop(columns=['region', 'region_length'])

In [9]:
perfect_failed.to_csv('../../data/current/perfect_failed.csv', index=False)

#### whats up with my bad results?

In [10]:
should_be_good = my_result[~((my_result['f_score'] != 52) & (my_result['b_score'] != 46))]
filtered_df2 = cutadapt_result[cutadapt_result['Read ID'].isin(should_be_good['Read ID'])]
merged_df2 = should_be_good.merge(
    filtered_df2,
    on='Read ID',
    how='inner'
).rename(columns={
    'Sequence': 'ca_sequence',
    'region': 'my_sequence'
})

In [11]:
merged_df2['ca_length'] = merged_df2['ca_sequence'].str.len()
merged_df2['my_length'] = merged_df2['my_sequence'].str.len()

In [31]:
merged_df2 = merged_df2[merged_df2['ca_sequence'] != merged_df2['my_sequence']]
#merged_df2_buckets = merged_df2.groupby(['b_score']).size().reset_index(name='Count')
#merged_df2_buckets
merged_df2[(merged_df2['my_length'] <= 180) & (merged_df2['my_length'] >= 150)].sort_values('my_length', ascending=False)


Unnamed: 0,BOLD ID_x,Read ID,Country_x,Phylum_x,Class_x,Order_x,Family_x,Genus_x,Species_x,f_score,...,BOLD ID_y,Country_y,Phylum_y,Class_y,Order_y,Family_y,Genus_y,Species_y,ca_length,my_length
1521988,>BOLD:ACV9239,SSROC6257-15,Canada,tax=p:Arthropoda,c:Insecta,o:Hymenoptera,f:Pompilidae,g:Arachnospila,s:Arachnospila_michiganensis,52,...,ACV9239,Canada,Arthropoda,Insecta,Hymenoptera,Pompilidae,Arachnospila,Arachnospila_michiganensis,199,180
1297077,>BOLD:ACI9027,GMNWL2095-14,Norway,tax=p:Arthropoda,c:Insecta,o:Hymenoptera,f:Pteromalidae,g:Pachyneuron,s:Pachyneuron_groenlandicum,52,...,ACI9027,Norway,Arthropoda,Insecta,Hymenoptera,Pteromalidae,Pachyneuron,Pachyneuron_groenlandicum,199,180
360634,>BOLD:AAN8044,GMOPK1306-15,Canada,tax=p:Arthropoda,c:Insecta,o:Hymenoptera,f:Mymaridae,g:Anagrus,s:Anagrus_daanei,52,...,AAN8044,Canada,Arthropoda,Insecta,Hymenoptera,Mymaridae,Anagrus,Anagrus_daanei,199,180
364505,>BOLD:AAN8044,SMTPR5330-16,Canada,tax=p:Arthropoda,c:Insecta,o:Hymenoptera,f:Mymaridae,g:Anagrus,s:Anagrus_daanei,52,...,AAN8044,Canada,Arthropoda,Insecta,Hymenoptera,Mymaridae,Anagrus,Anagrus_daanei,199,180
1855469,>BOLD:ADH0609,AUCET476-13,Australia,tax=p:Arthropoda,c:Insecta,o:Coleoptera,f:Scarabaeidae,g:Eupoecila,s:Eupoecila_evanescens,52,...,ADH0609,Australia,Arthropoda,Insecta,Coleoptera,Scarabaeidae,Eupoecila,Eupoecila_evanescens,205,179
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267500,>BOLD:AAG2890,SSWLA1022-13,Canada,tax=p:Arthropoda,c:Insecta,o:Hemiptera,f:Cicadellidae,g:Populicerus,s:Populicerus_lachrymalis,52,...,AAG2890,Canada,Arthropoda,Insecta,Hemiptera,Cicadellidae,Populicerus,Populicerus_lachrymalis,151,154
712954,>BOLD:AAD3466,DRYAS18245-15,Finland,tax=p:Arthropoda,c:Insecta,o:Diptera,f:Simuliidae,g:Metacnephia,s:Metacnephia_lyra,52,...,AAD3466,Finland,Arthropoda,Insecta,Diptera,Simuliidae,Metacnephia,Metacnephia_lyra,151,154
760444,>BOLD:ACB8275,AMTPA1721-15,Germany,tax=p:Arthropoda,c:Insecta,o:Hymenoptera,f:Pteromalidae,g:Mesopolobus,s:Mesopolobus_tibialis,52,...,ACB8275,Germany,Arthropoda,Insecta,Hymenoptera,Pteromalidae,Mesopolobus,Mesopolobus_tibialis,145,153
1696937,>BOLD:AAN8184,RRSSC1533-15,Canada,tax=p:Arthropoda,c:Insecta,o:Hymenoptera,f:Encyrtidae,g:Holcencyrtus,s:Holcencyrtus_dennoi,52,...,AAN8184,Canada,Arthropoda,Insecta,Hymenoptera,Encyrtidae,Holcencyrtus,Holcencyrtus_dennoi,145,152
