# Definitions

In [1]:
import pandas as pd

def fasta_to_dataframe(fasta_file):
    headers = []
    sequences = []
    with open(fasta_file, 'r') as f:
        current_sequence = []
        current_header = None
        for line in f:
            line = line.strip()
            if line.startswith('>'):  # Header line
                if current_header is not None:
                    # Save the previous sequence
                    headers.append(current_header)
                    sequences.append(''.join(current_sequence))
                current_header = line[1:]  # Remove '>'
                current_sequence = []
            else:
                current_sequence.append(line)
        # Add the last sequence
        if current_header is not None:
            headers.append(current_header)
            sequences.append(''.join(current_sequence))
    # Create a DataFrame
    return pd.DataFrame({'Header': headers, 'Sequence': sequences})

## Read and Format FASTA file

In [2]:
fasta_file = "../../data/DB.COX1.trimmed.fna"
cutadapt_result = fasta_to_dataframe(fasta_file)

cutadapt_result[['BOLD Metadata', 'Taxonomy']] = cutadapt_result['Header'].str.split(';', n=1, expand=True)
cutadapt_result[['BOLD ID', 'Read ID', 'Country']] = cutadapt_result['BOLD Metadata'].str.split('|', n=2, expand=True)
cutadapt_result['BOLD ID'] = cutadapt_result['BOLD ID'].str.removeprefix('BOLD:')

tax_columns = ['Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']
cutadapt_result[tax_columns] = cutadapt_result['Taxonomy'].str.extract(
    r'p:([^,]+),c:([^,]+),o:([^,]+),f:([^,]+),g:([^,]+),s:([^;]+)'
)

cutadapt_result = cutadapt_result.drop(columns=['Header', 'BOLD Metadata', 'Taxonomy'])
print(cutadapt_result.head())

                                            Sequence  BOLD ID       Read ID  \
0  TTTATCCTCTAACATTGCTCATGGAGGGTCTTCTGTAGATCTAGCT...  AAA7085  BLPDA1021-18   
1  TTTATCCTCTAACATTGCTCATGGAGGGTCTTCTGTAGATCTAGCT...  AAA7085  MHMYN6354-14   
2  TTTATCCTCTAACATTGCTCATGGAGGGTCTTCTGTAGATCTAGCT...  AAA7085   MHASB734-07   
3  TTTATCCTCTAACATTGCTCATGGAGGGTCTTCTGTAGATCTAGCT...  AAA7085   MHATB310-06   
4  TTTATCCTCTAACATTGCTCATGGAGGGTCTTCTGTAGATCTAGCT...  AAA7085  BLPAA6663-17   

      Country      Phylum    Class        Order       Family    Genus  \
0  Costa_Rica  Arthropoda  Insecta  Lepidoptera  Saturniidae  Lonomia   
1  Costa_Rica  Arthropoda  Insecta  Lepidoptera  Saturniidae  Lonomia   
2  Costa_Rica  Arthropoda  Insecta  Lepidoptera  Saturniidae  Lonomia   
3  Costa_Rica  Arthropoda  Insecta  Lepidoptera  Saturniidae  Lonomia   
4  Costa_Rica  Arthropoda  Insecta  Lepidoptera  Saturniidae  Lonomia   

                 Species  
0  Lonomia_santarosensis  
1  Lonomia_santarosensis  
2  Lo

## Reading my Result

In [3]:
my_result = pd.read_csv("../../data/primer-finder-13-02.csv", sep=";")

my_result['f_index'] = my_result['f_index'].astype(int)
my_result['b_index'] = my_result['b_index'].astype(int)
my_result['f_match'] = my_result['f_match'].astype(str)

my_result['region'] = my_result.apply(lambda x: x['read'][(x['f_index'] + len(x['f_match'])):x['b_index']], axis=1)
my_result['region_length'] = my_result['region'].str.len()
# print(my_result.head())

## Compare Mine and Cutadapt
* result of my perfect matches with cutadapt result
* filter all which dont exist in cutadapt
* what else?

#### Are there any areas which differ for perfect matches?

In [4]:
perfect_matches = my_result[~((my_result['f_score'] != 52) | (my_result['b_score'] != 46))]
filtered_df = cutadapt_result[cutadapt_result['Read ID'].isin(perfect_matches['Read ID'])]

In [5]:
print(f'Perfect matches: {perfect_matches.shape[0]}')
print(f'Perfect matches found in cutadapt result: {filtered_df.shape[0]}')

dif = perfect_matches.shape[0] - filtered_df.shape[0]
print(f'Diff: {dif}, or {dif / perfect_matches.shape[0] * 100}% were not found in the cutadapt result.\n')

Perfect matches: 898561
Perfect matches found in cutadapt result: 898561
Diff: 0, or 0.0% were not found in the cutadapt result.



In [6]:
merged_df = filtered_df[['Read ID', 'Sequence']].merge(
    perfect_matches[['Read ID', 'region']],
    on='Read ID',
    how='inner'
).rename(columns={
    'Sequence': 'ca_sequence',
    'region': 'my_sequence'
})
different_sequences = merged_df[~(merged_df['ca_sequence'] == merged_df['my_sequence'])]
print(different_sequences.shape[0])
print(different_sequences.head())

28
              Read ID                                        ca_sequence  \
153755  GBMNE23810-21  TTTATCTGGAGGTATTGCCCATGGGGGTGCTTCCGTAGATTTAGCT...   
684885    NEPTA859-13  CTTATCAGCAAATATTGCTCATAGTGGTAGATCAGTTGATTTAGCA...   
684898    NEPTA862-13  CTTATCAGCAAATATTGCTCATAGTGGTAGATCAGTTGATTTAGCA...   
696620    GCOL7710-16  TTTATCTTCTAATATTACTCATAGAGGAGCTTCTGTTGATTTAGCT...   
696621    GENHP169-11  TTTATCTTCTAATATTACTCATAGAGGAGCTTCTGTTGATTTAGCT...   

                                              my_sequence  
153755  TTTATCTGGAGGTATTGCCCATGGGGGTGCTTCCGTAGATTTAGCT...  
684885  CTTATCAGCAAATATTGCTCATAGTGGTAGATCAGTTGATTTAGCA...  
684898  CTTATCAGCAAATATTGCTCATAGTGGTAGATCAGTTGATTTAGCA...  
696620  TTTATCTTCTAATATTACTCATAGAGGAGCTTCTGTTGATTTAGCT...  
696621  TTTATCTTCTAATATTACTCATAGAGGAGCTTCTGTTGATTTAGCT...  


In [7]:
different_sequences['ca_length'] = different_sequences['ca_sequence'].str.len()
different_sequences['my_length'] = different_sequences['my_sequence'].str.len()
# 28 / 898561 * 100

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  different_sequences['ca_length'] = different_sequences['ca_sequence'].str.len()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  different_sequences['my_length'] = different_sequences['my_sequence'].str.len()


#### why do those (28) entires fail with cutadapt?

In [8]:
perfect_failed = my_result[my_result['Read ID'].isin(different_sequences['Read ID'])]
perfect_failed = different_sequences.merge(
    perfect_failed,
    on='Read ID',
    how='inner'
).drop(columns=['region', 'region_length'])

In [9]:
perfect_failed.to_csv('../../data/current/perfect_failed.csv', index=False)

#### whats up with my bad results, when one result was perfect??

In [10]:
should_be_good = my_result[~((my_result['f_score'] != 52) & (my_result['b_score'] != 46))]
filtered_df2 = cutadapt_result[cutadapt_result['Read ID'].isin(should_be_good['Read ID'])]
merged_df2 = should_be_good.merge(
    filtered_df2,
    on='Read ID',
    how='inner'
).rename(columns={
    'Sequence': 'ca_sequence',
    'region': 'my_sequence'
})

merged_df2['ca_length'] = merged_df2['ca_sequence'].str.len()
merged_df2['my_length'] = merged_df2['my_sequence'].str.len()

In [11]:
merged_with_diff = merged_df2[merged_df2['ca_sequence'] != merged_df2['my_sequence']]
#merged_df2_buckets = merged_with_diff.groupby(['b_score']).size().reset_index(name='Count')
#merged_df2_buckets
merged_with_diff = merged_with_diff[(merged_with_diff['my_length'] <= 185) & (merged_with_diff['my_length'] >= 140)].sort_values('my_length', ascending=False)
merged_with_diff['remainder'] = merged_with_diff.apply(lambda x: len(x['read'][x['f_index']+231::]), axis=1)
merged_with_diff.groupby(['remainder']).size().reset_index()


Unnamed: 0,remainder,0
0,0,471
1,1,23
2,2,1
3,3,1817
4,4,213
5,5,215
6,6,5173
7,7,164
8,8,531
9,9,2216


#### does cutadapt always cut at least 3 base pairs? - yes:

In [13]:
merged_with_remainder = my_result.merge(
    cutadapt_result,
    on='Read ID',
    how='inner'
)
merged_with_remainder['remainder'] = merged_with_remainder[merged_with_remainder['Sequence'] != ''].apply(
    lambda x: len(x['read'].split(x['Sequence'])[-1]), axis=1
)

merged_with_remainder.groupby(['remainder']).size().reset_index(name='Count').sort_values(by=['remainder'], ascending=[True])

Unnamed: 0,remainder,Count
0,3.0,3332
1,4.0,462
2,5.0,409
3,6.0,9200
4,7.0,786
...,...,...
905,1029.0,1
906,1056.0,16
907,1066.0,88
908,1071.0,1


#### what is the best score, if the b-primer is outside the read?
there are regions with len 190 lol
also, with a remainder of 12 or less, there might be a b-primer match in front of the f-primer match, resulting in NO region/sequence.

In [14]:
outside = my_result
outside['remainder'] = outside.apply(lambda x: len(x['read'][x['f_index']+231::]), axis=1)
outside = outside[outside['remainder'] <= 12]

outside_no_region = outside[outside['region'] == ''].copy()
outside_no_region['region'] = outside_no_region.apply(lambda x: x['read'][(x['f_index'] + len(x['f_match'])):x['b_index']], axis=1)
outside_no_region

Unnamed: 0,BOLD ID,Read ID,Country,Phylum,Class,Order,Family,Genus,Species,f_score,f_match,f_index,b_score,b_match,b_index,read,region,region_length,remainder
13777,>BOLD:AAM1247,MAHEM349-10,Pakistan,tax=p:Arthropoda,c:Insecta,o:Hemiptera,f:Aleyrodidae,g:Bemisia,s:Bemisia_tabaci,29,GGTATCTCGTCTATTCAGATTATCCT,589,28,CTTATTTTACCAGGCTGGTATT,19,TCATCCAGAAGTTTATGTTCTTATTTTACCAGGCTTTGGTATTGTT...,,0,0
13981,>BOLD:AAM1247,MAHEM382-10,Pakistan,tax=p:Arthropoda,c:Insecta,o:Hemiptera,f:Aleyrodidae,g:Bemisia,s:Bemisia_tabaci,29,GGTATCTCGTCTATTCAGATTATCCT,589,28,CTTATTTTACCAGGCTGGTATT,19,TCATCCAGAAGTTTATGTTCTTATTTTACCAGGCTTTGGTATTGTT...,,0,9
14034,>BOLD:AAM1247,MAHEM370-10,Pakistan,tax=p:Arthropoda,c:Insecta,o:Hemiptera,f:Aleyrodidae,g:Bemisia,s:Bemisia_tabaci,29,GGTATCTCGTCTATTCAGATTATCCT,589,28,CTTATTTTACCAGGCTGGTATT,19,TCATCCAGAAGTTTATGTTCTTATTTTACCAGGCTTTGGTATTGTT...,,0,10
14071,>BOLD:AAM1247,MAHEM366-10,Pakistan,tax=p:Arthropoda,c:Insecta,o:Hemiptera,f:Aleyrodidae,g:Bemisia,s:Bemisia_tabaci,29,GGTATCTCGTCTATTCAGATTATCCT,589,28,CTTATTTTACCAGGCTGGTATT,19,TCATCCAGAAGTTTATGTTCTTATTTTACCAGGCTTTGGTATTGTT...,,0,10
14112,>BOLD:AAM1247,MAHEM361-10,Pakistan,tax=p:Arthropoda,c:Insecta,o:Hemiptera,f:Aleyrodidae,g:Bemisia,s:Bemisia_tabaci,29,GGTATCTCGTCTATTCAGATTATCCT,589,28,CTTATTTTACCAGGCTGGTATT,19,TCATCCAGAAGTTTATGTTCTTATTTTACCAGGCTTTGGTATTGTT...,,0,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2378502,>BOLD:ADC8118,GBMTG4732-16,,tax=p:Arthropoda,c:Arachnida,o:Ixodida,f:Argasidae,g:Argas,s:Argas_miniatus,29,GG---TGGTTCGGAACAAGTTATATCGT,782,30,CCGTGATTTATCAGGGAG-AATCA,588,ATTTCTTTAATAAGATATATATTATTGCTTGTTATAGTGTTGATTA...,,0,0
2379746,>BOLD:ADR7996,DISA506-19,South_Atlantic_Ocean,tax=p:Arthropoda,c:Malacostraca,o:Isopoda,f:Sphaeromatidae,g:Exosphaeroma,s:Exosphaeroma_truncatitelson,30,GAATTGAAATGACCTGTACAAACCCTC,434,29,CCCTGATATTCAAGCCGCAG-TATCCAC,38,AAAACATCGTCGGTGTCTACTTTCGCCGATAAGATCTGCCCAGTGA...,,0,0
2380340,>BOLD:ACL6603,ZSMDB022-14,Chile,tax=p:Arthropoda,c:Insecta,o:Coleoptera,f:Dytiscidae,g:Rhantus,s:Rhantus_signatus,30,GGTACGCAGGAAATAACTGATTTAATGCCT,603,33,CATGTATTAGCTCTAG-AATTAC,559,CACCCGGCAGGACGTCTCGCGGGCGTGCCAGTTTAACACCGCGGGC...,,0,0
2380872,>BOLD:AAJ5917,GBLN0644-06,,tax=p:Arthropoda,c:Insecta,o:Lepidoptera,f:Nymphalidae,g:Aemona,s:Aemona_lena,31,GGGTCAAGGAT---CTGGTATATCTACCACC,215,28,CCGAGTTTTCGATCTAGGTGACTTTTA,31,CTGCACTGTGAAGACGTGTTGGATGCGTCTGCCGAGTTTTCGATCT...,,0,0


In [22]:
cutadapt_result.merge(
    outside_no_region,
    on='Read ID',
    how='inner'
).groupby(['BOLD ID_x']).size().reset_index(name='Count')

Unnamed: 0,BOLD ID_x,Count
0,AAA2224,43
1,AAA3766,1
2,AAA7860,1
3,AAB3870,20
4,AAB5640,1
5,AAB9982,1
6,AAC3270,1
7,AAD5869,1
8,AAD8971,1
9,AAE6226,1


so there are 156 sequences for which cutadapt found a decent result, but I didn't find anything. (out of 4651, so ~ 3.3% could be salvagable?)
of these, 43 were in AAA2224, 22 in ACV4760 and 20 in AAB3870.

In [44]:
## the ones where I only found prefix of cutadapt

a = merged_with_diff[merged_with_diff.apply(lambda row: not str(row['ca_sequence']).startswith(str(row['my_sequence'])), axis=1)]
a[a['remainder'] > 0]

Unnamed: 0,BOLD ID_x,Read ID,Country_x,Phylum_x,Class_x,Order_x,Family_x,Genus_x,Species_x,f_score,...,Country_y,Phylum_y,Class_y,Order_y,Family_y,Genus_y,Species_y,ca_length,my_length,remainder
800613,>BOLD:AAC5767,GBGL4535-07,,tax=p:Arthropoda,c:Insecta,o:Lepidoptera,f:Pieridae,g:Gonepteryx,s:Gonepteryx_cleopatra,19,...,,Arthropoda,Insecta,Lepidoptera,Pieridae,Gonepteryx,Gonepteryx_cleopatra,205,185,80
1095506,>BOLD:ACF2924,GBMH1021-06,,tax=p:Arthropoda,c:Insecta,o:Hemiptera,f:Aphididae,g:Rhopalosiphum,s:Rhopalosiphum_padi,20,...,,Arthropoda,Insecta,Hemiptera,Aphididae,Rhopalosiphum,Rhopalosiphum_padi,203,185,282
1847380,>BOLD:AEY4233,GBAHF3049-19,,tax=p:Arthropoda,c:Insecta,o:Hymenoptera,f:Formicidae,g:Formica,s:Formica_frontalis,21,...,,Arthropoda,Insecta,Hymenoptera,Formicidae,Formica,Formica_frontalis,205,185,310
826245,>BOLD:AAA9222,GBGL4534-07,,tax=p:Arthropoda,c:Insecta,o:Lepidoptera,f:Pieridae,g:Gonepteryx,s:Gonepteryx_rhamni,19,...,,Arthropoda,Insecta,Lepidoptera,Pieridae,Gonepteryx,Gonepteryx_rhamni,205,185,80
1322514,>BOLD:AAU0165,BJUP398-17,China,tax=p:Arthropoda,c:Insecta,o:Lepidoptera,f:Hesperiidae,g:Ochlodes,s:Ochlodes_subhyalinus,19,...,China,Arthropoda,Insecta,Lepidoptera,Hesperiidae,Ochlodes,Ochlodes_subhyalinus,205,185,82
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486483,>BOLD:AAD4619,GBCM23378-19,United_States,tax=p:Arthropoda,c:Thecostraca,o:Sessilia,f:Chthamalidae,g:Chthamalus,s:Chthamalus_fissus,20,...,United_States,Arthropoda,Thecostraca,Sessilia,Chthamalidae,Chthamalus,Chthamalus_fissus,205,177,81
991821,>BOLD:AAF4765,GBMNE1702-21,United_States,tax=p:Arthropoda,c:Insecta,o:Hymenoptera,f:Megachilidae,g:Megachile,s:Megachile_policaris,24,...,United_States,Arthropoda,Insecta,Hymenoptera,Megachilidae,Megachile,Megachile_policaris,205,177,247
1725216,>BOLD:AAM7674,LYMAA2934-14,Canada,tax=p:Arthropoda,c:Insecta,o:Coleoptera,f:Carabidae,g:Gastrellarius,s:Gastrellarius_honestus,21,...,Canada,Arthropoda,Insecta,Coleoptera,Carabidae,Gastrellarius,Gastrellarius_honestus,205,177,76
991816,>BOLD:AAF4765,GBMNE1704-21,United_States,tax=p:Arthropoda,c:Insecta,o:Hymenoptera,f:Megachilidae,g:Megachile,s:Megachile_policaris,24,...,United_States,Arthropoda,Insecta,Hymenoptera,Megachilidae,Megachile,Megachile_policaris,205,177,247


there are 13197 sequences for which I only found a prefix of cutadapt's sequence.
179 where I found the last three bps which cutadapt trimmed.
269 where we found entirely different sections, almost always when I had a perfect b_match.


check all instances that are removed by cutadapt, if there are any usefull.

In [8]:
not_in_cutadapt = my_result[~my_result['Read ID'].isin(cutadapt_result['Read ID'])]
not_in_cutadapt

Unnamed: 0,BOLD ID,Read ID,Country,Phylum,Class,Order,Family,Genus,Species,f_score,f_match,f_index,b_score,b_match,b_index,read,region,region_length
70,>BOLD:AAA7085,MHATB313-06,Costa_Rica,tax=p:Arthropoda,c:Insecta,o:Lepidoptera,f:Saturniidae,g:Lonomia,s:Lonomia_santarosensis,49,GGAACAGGATGAACGGTATACCCTCC,319,22,ATTAC-AGCT-TTTCTTTTAC,520,TACTTTATATTTTATCTTTGGAATTTGGGCAGGAATAGTAGGAACT...,TTTATCCTCTAACATTGCTCATGGAGGGTCTTCTGTAGATCTAGCT...,175
89,>BOLD:AAG4445,CNFNQ405-14,Canada,tax=p:Arthropoda,c:Insecta,o:Coleoptera,f:Chrysomelidae,g:Altica,s:Altica_corni,49,GGAACCGGATGAACAGTTTATCCTCC,309,2,T,509,TTTCTATTTGGAATTTGAGCAGGAATAATTGGCACCTCCATAAGAC...,ACTATCATCTAATCTTGCACATAATGGCCCATCTGTTGATTTAGCT...,174
90,>BOLD:AAG4445,CNFNP2128-14,Canada,tax=p:Arthropoda,c:Insecta,o:Coleoptera,f:Chrysomelidae,g:Altica,s:Altica_corni,49,GGAACCGGATGAACAGTTTATCCTCC,318,2,T,518,ACATTATACTTTCTATTTGGAATTTGAGCAGGAATAATTGGCACCT...,ACTATCATCTAATCTTGCACACAATGGCCCATCTGTTGATTTAGCT...,174
95,>BOLD:AAG4445,SMTPB14396-13,Canada,tax=p:Arthropoda,c:Insecta,o:Coleoptera,f:Chrysomelidae,g:Altica,s:Altica_corni,49,GGAACCGGATGAACAGTTTATCCTCC,318,12,TATTA-CA,518,ACATTATACTTTCTATTTGGAATTTGAGCGGGAATAATTGGCACCT...,ACTATCATCTAATCTTGCACACAATGGCCCATCTGTTGATTTAGCT...,174
111,>BOLD:AAG4445,SMTPB16362-13,Canada,tax=p:Arthropoda,c:Insecta,o:Coleoptera,f:Chrysomelidae,g:Altica,s:Altica_corni,49,GGAACCGGATGAACAGTTTATCCTCC,306,12,TATTA-CA,506,CTATTTGGAATTTGAGCAGGAATAATTGGCACCTCCATAAGACTTT...,ACTATCATCTAATCTTGCACACAATGGCCCATCTGTTGATTTAGCT...,174
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2381261,>BOLD:ACZ8274,AMTPF6582-16,Germany,tax=p:Arthropoda,c:Insecta,o:Diptera,f:Drosophilidae,g:Drosophila,s:Drosophila_subquinaria,52,GGTACAGGATGAACTGTTTACCCCCC,309,25,CAGCTATTA-CTGTTACTTTC,514,TTTATTTTTGGTGCTTGAGCTGGAATAGTTGGAACATCTTTAAGAA...,ATTATCTGCAGGAATTGCTCACGGAGGAGCATCAGTTGATTTAGCA...,179
2381270,>BOLD:AAM4099,GBCH4360-10,Spain,tax=p:Arthropoda,c:Arachnida,o:Scorpiones,f:Buthidae,g:Buthus,s:Buthus_occitanus,41,GGTACGGGATGGACTGTGTACCCGCCC,297,37,CCTGTGTTGGCGGGTGCAATTAC,528,ATTTGAGCTTCAATGGTAGGGACAGCTTTAAGTTTGCTGATTCGGG...,TTATCTTCTTCTTTGGCGCATATAGGGGGTTCTGTGGATTTAACTA...,204
2381285,>BOLD:ACM6769,SMTPG1889-14,Canada,tax=p:Arthropoda,c:Arachnida,o:Araneae,f:Philodromidae,g:Thanatus,s:Thanatus_striatus,40,GGGGCCGGCTGGACTGTTTATCCTCC,319,33,CTTTTTTTAGCTGGTGC-ATCA,398,AACTTTATATTTAATTTTTGGTGCTTGGGCTGCAATAGTGGGAACT...,ATTAGCCTCATTGACTGGGCATGCTGGAAGTGCAGTCGATTTTGCA...,53
2381295,>BOLD:AAG6731,ANICL817-10,Australia,tax=p:Arthropoda,c:Insecta,o:Lepidoptera,f:Nolidae,g:Nola,s:Nola_biguttalis,52,GGAACAGGATGAACAGTTTACCCCCC,319,22,ATTACT-GCT-TTTCTTTTAC,520,AACATTATATTTTATTTTTGGAATTTGAGCAGGTATAGTAGGAACT...,GCTTTCATCTAATATTGCTCATAGAGGAAGATCTGTAGATTTAGCA...,175


In [19]:
not_in_cutadapt_but_good_len1 = not_in_cutadapt[230 > not_in_cutadapt['region_length']]
not_in_cutadapt_but_good_len = not_in_cutadapt_but_good_len1[not_in_cutadapt_but_good_len1['region_length'] < 180]

not_in_cutadapt_but_good_len

Unnamed: 0,BOLD ID,Read ID,Country,Phylum,Class,Order,Family,Genus,Species,f_score,f_match,f_index,b_score,b_match,b_index,read,region,region_length
70,>BOLD:AAA7085,MHATB313-06,Costa_Rica,tax=p:Arthropoda,c:Insecta,o:Lepidoptera,f:Saturniidae,g:Lonomia,s:Lonomia_santarosensis,49,GGAACAGGATGAACGGTATACCCTCC,319,22,ATTAC-AGCT-TTTCTTTTAC,520,TACTTTATATTTTATCTTTGGAATTTGGGCAGGAATAGTAGGAACT...,TTTATCCTCTAACATTGCTCATGGAGGGTCTTCTGTAGATCTAGCT...,175
89,>BOLD:AAG4445,CNFNQ405-14,Canada,tax=p:Arthropoda,c:Insecta,o:Coleoptera,f:Chrysomelidae,g:Altica,s:Altica_corni,49,GGAACCGGATGAACAGTTTATCCTCC,309,2,T,509,TTTCTATTTGGAATTTGAGCAGGAATAATTGGCACCTCCATAAGAC...,ACTATCATCTAATCTTGCACATAATGGCCCATCTGTTGATTTAGCT...,174
90,>BOLD:AAG4445,CNFNP2128-14,Canada,tax=p:Arthropoda,c:Insecta,o:Coleoptera,f:Chrysomelidae,g:Altica,s:Altica_corni,49,GGAACCGGATGAACAGTTTATCCTCC,318,2,T,518,ACATTATACTTTCTATTTGGAATTTGAGCAGGAATAATTGGCACCT...,ACTATCATCTAATCTTGCACACAATGGCCCATCTGTTGATTTAGCT...,174
95,>BOLD:AAG4445,SMTPB14396-13,Canada,tax=p:Arthropoda,c:Insecta,o:Coleoptera,f:Chrysomelidae,g:Altica,s:Altica_corni,49,GGAACCGGATGAACAGTTTATCCTCC,318,12,TATTA-CA,518,ACATTATACTTTCTATTTGGAATTTGAGCGGGAATAATTGGCACCT...,ACTATCATCTAATCTTGCACACAATGGCCCATCTGTTGATTTAGCT...,174
111,>BOLD:AAG4445,SMTPB16362-13,Canada,tax=p:Arthropoda,c:Insecta,o:Coleoptera,f:Chrysomelidae,g:Altica,s:Altica_corni,49,GGAACCGGATGAACAGTTTATCCTCC,306,12,TATTA-CA,506,CTATTTGGAATTTGAGCAGGAATAATTGGCACCTCCATAAGACTTT...,ACTATCATCTAATCTTGCACACAATGGCCCATCTGTTGATTTAGCT...,174
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2380976,>BOLD:AAZ3009,ANICL721-10,Australia,tax=p:Arthropoda,c:Insecta,o:Lepidoptera,f:Nolidae,g:Aquita,s:Aquita_plagiochyta,52,GGAACAGGATGAACAGTTTACCCCCC,319,21,ATTAC-AGCT-TTCCTTATTA,520,AACTTTATATTTTATTTTTGGAATTTGAGCTGGAATAGTAGGAACG...,ACTCTCATCTAATATTGCTCATGGAGGAAGCTCTGTAGATTTAGCT...,175
2381003,>BOLD:AEK1944,PSFOR051-13,Italy,tax=p:Arthropoda,c:Insecta,o:Coleoptera,f:Cerambycidae,g:Evodinus,s:Evodinus_clathratus,49,GGTACAGGCTGAACTGTTTATCCCCC,316,0,,514,TCTTTACTTTATTTTCGGTGCTTGAGCAGGAATGGTGGGAACATCA...,ACTTTCGTCTAATATTGCTCATAGCGGATCTTCTGTAGATTTAGCT...,118
2381261,>BOLD:ACZ8274,AMTPF6582-16,Germany,tax=p:Arthropoda,c:Insecta,o:Diptera,f:Drosophilidae,g:Drosophila,s:Drosophila_subquinaria,52,GGTACAGGATGAACTGTTTACCCCCC,309,25,CAGCTATTA-CTGTTACTTTC,514,TTTATTTTTGGTGCTTGAGCTGGAATAGTTGGAACATCTTTAAGAA...,ATTATCTGCAGGAATTGCTCACGGAGGAGCATCAGTTGATTTAGCA...,179
2381285,>BOLD:ACM6769,SMTPG1889-14,Canada,tax=p:Arthropoda,c:Arachnida,o:Araneae,f:Philodromidae,g:Thanatus,s:Thanatus_striatus,40,GGGGCCGGCTGGACTGTTTATCCTCC,319,33,CTTTTTTTAGCTGGTGC-ATCA,398,AACTTTATATTTAATTTTTGGTGCTTGGGCTGCAATAGTGGGAACT...,ATTAGCCTCATTGACTGGGCATGCTGGAAGTGCAGTCGATTTTGCA...,53
