In [2]:
# Using Python 3.12.1 (local env: HeatNSalt)

import pandas as pd
import numpy as np

## The images were sorted manually by moving the 'bad' images to a new folder via Windows explorer. This folder was scanned:
# bad_img_df = pd.DataFrame(os.listdir('E:\Biodiscover_Vault\PMR_ExStream22_Main_EPT\Filtered images'), columns=['bad_img'])
# bad_img_df.to_excel('EPT_filtered_images.xlsx')

# Minimum number of reads a sequencing run needs for its DNA result to be kept.
MIN_READS = 1000

# Load the list of manually rejected images and the two image tables.
bad_img_df = pd.read_excel('raw/EPT_filtered_images.xlsx')
IMG_df = pd.read_csv('outputs/image_data_with_DNA.csv')
All_IMGs_df = pd.read_csv('outputs/image_data_complete.csv')

print(f'Total number of images collected: {len(All_IMGs_df.index)}')
# NOTE(review): this is a plain subtraction, so it presumes every entry of
# bad_img_df matches exactly one row of All_IMGs_df -- confirm.
print(f'Number of images after quality filtering: {len(All_IMGs_df.index) - len(bad_img_df.index)}')

# Keep only the rows whose image file is not on the rejected list.
cleaning_list = IMG_df['Image File Name'].isin(bad_img_df['bad_img'])
# .copy() makes this an independent frame, so the .loc assignments below write
# to it directly rather than to a slice derived from IMG_df.
image_data_with_DNA_clean = IMG_df[~cleaning_list].copy()

# NOTE(review): no index=False here, so the row index is written as an extra
# column (the export at the end of the notebook uses index=False) -- confirm
# this is intended before changing it.
image_data_with_DNA_clean.to_csv('outputs/image_data_with_DNA_clean.csv')

# Short alias for the cleaned frame. It is the same object, so the masking
# below is also visible through image_data_with_DNA_clean.
df = image_data_with_DNA_clean

# Split the columns into three groups:
#   data_columns   -> everything without '_DNA' in its name
#   dna_columns    -> columns ending in '_DNA' (first sample)
#   repeat_columns -> columns ending in '_DNA_repeat' (repeated sample)
data_columns = [col for col in df.columns if '_DNA' not in col]
base_df = df[data_columns]
dna_columns = [col for col in df.columns if col.endswith('_DNA')]
repeat_columns = [col for col in df.columns if col.endswith('_DNA_repeat')]

# Rows whose read count is below MIN_READS. A missing read count compares as
# False, so rows without a read count are not flagged here.
low_reads_mask = df['read count_DNA'] < MIN_READS
repeat_low_reads_mask = df['read count_DNA_repeat'] < MIN_READS

# Specimen IDs affected by each filter step, kept for later inspection.
not_in_final_df = {}

print(f"Number of specimen with <{MIN_READS} reads in first samples: {df.loc[low_reads_mask, 'Specimen ID'].nunique()}")
not_in_final_df['delete_in_first'] = df.loc[low_reads_mask, 'Specimen ID'].unique()
print(f"Number of specimen with <{MIN_READS} reads in repeated samples: {df.loc[repeat_low_reads_mask, 'Specimen ID'].nunique()}")
not_in_final_df['delete_in_repeat'] = df.loc[repeat_low_reads_mask, 'Specimen ID'].unique()

# Blank out the DNA information of the affected sample only.
df.loc[low_reads_mask, dna_columns] = np.nan
df.loc[repeat_low_reads_mask, repeat_columns] = np.nan


# Delete DNA info if the DNA-based order disagrees with the morphological order.
# These masks are built after the low-read masking above, so rows already
# blanked there have a missing order and are not counted again.
non_match_mask = df['Order_DNA'].notna() & (df['Order_DNA'] != df['Order_Morpho'])
repeat_non_match_mask = df['Order_DNA_repeat'].notna() & (df['Order_DNA_repeat'] != df['Order_Morpho'])

print(f"Number of specimen where Morpho_Order =/= DNA_Order: {df.loc[non_match_mask, 'Specimen ID'].nunique()}")
not_in_final_df['Morpho_Order =/= DNA_Order'] = df.loc[non_match_mask, 'Specimen ID'].unique()
print(f"Number of specimen where Morpho_Order =/= DNA_repeat_Order: {df.loc[repeat_non_match_mask, 'Specimen ID'].nunique()}")
not_in_final_df['Morpho_Order =/= DNA_repeat_Order'] = df.loc[repeat_non_match_mask, 'Specimen ID'].unique()

df.loc[non_match_mask, dna_columns] = np.nan
df.loc[repeat_non_match_mask, repeat_columns] = np.nan

def determine_final_label(row, dna_cols=None, repeat_cols=None):
    """Choose which sample's DNA values to keep for a single row.

    The decision is based on 'read count_DNA' and 'read count_DNA_repeat':

    * both missing      -> all-NaN result
    * only one present  -> the values of that sample
    * both present      -> the sample with the strictly higher read count;
                           on a tie the repeated sample is used

    Parameters
    ----------
    row : pd.Series
        One row of the image table; must contain 'read count_DNA',
        'read count_DNA_repeat' and every label in ``dna_cols`` /
        ``repeat_cols``.
    dna_cols : list of str, optional
        Column labels of the first sample. Defaults to the notebook-level
        ``dna_columns``.
    repeat_cols : list of str, optional
        Column labels of the repeated sample. Defaults to the notebook-level
        ``repeat_columns``.

    Returns
    -------
    pd.Series
        The selected values. Values taken from the repeated sample have the
        trailing '_repeat' removed from their labels.
    """
    # Fall back to the notebook globals so that
    # df.apply(determine_final_label, axis=1) keeps working unchanged.
    if dna_cols is None:
        dna_cols = dna_columns
    if repeat_cols is None:
        repeat_cols = repeat_columns

    first_reads = row['read count_DNA']
    repeat_reads = row['read count_DNA_repeat']
    first_missing = pd.isna(first_reads)
    repeat_missing = pd.isna(repeat_reads)

    if first_missing and repeat_missing:
        return pd.Series([np.nan] * len(dna_cols), index=dna_cols)

    # The first sample is used when it is the only one available, or when it
    # has strictly more reads than the repeated sample.
    if repeat_missing or (not first_missing and first_reads > repeat_reads):
        return row[dna_cols]

    # Otherwise use the repeated sample. Only the trailing suffix is removed,
    # so a '_repeat' elsewhere in a label is left alone.
    repeat_values = row[repeat_cols].copy()
    repeat_values.index = [col.removesuffix('_repeat') for col in repeat_values.index]
    return repeat_values

# For every row, pick the first or the repeated sample based on read count.
DNA_df = df.apply(determine_final_label, axis=1)

# Fix the set and order of the DNA columns that go into the final table.
final_dna_columns = [
    'OTU_DNA',
    'Order_DNA',
    'Family_DNA',
    'Genus_DNA',
    'Species_DNA',
    'Label_DNA',
    'read count_DNA',
]
DNA_df = DNA_df.reindex(columns=final_dna_columns)

# Index-aligned join; no rows are dropped here, so rows without usable DNA
# data simply carry NaN in the DNA columns.
EPT_DL_df = base_df.join(DNA_df)

all_specimen_ids = EPT_DL_df['Specimen ID']
print(f"Number of specimen remaining: {all_specimen_ids.nunique()}")
print(f"Corresponding number of images: {all_specimen_ids.count()}")

# Restrict to rows that have a species-level DNA assignment.
EPT_DL_species_df = EPT_DL_df.dropna(subset=['Species_DNA'])
species_specimen_ids = EPT_DL_species_df['Specimen ID']
print(f"Number of OTUs (/classes): {EPT_DL_species_df['OTU_DNA'].nunique()}")
print(f"Number of specimen on species level: {species_specimen_ids.nunique()}")

# Number of distinct specimens per species, largest first.
unique_counts = (
    EPT_DL_species_df
    .groupby('Species_DNA')['Specimen ID']
    .nunique()
    .sort_values(ascending=False)
)
print(f"Corresponding number of images: {species_specimen_ids.count()}")

pd.DataFrame(unique_counts)



  IMG_df = pd.read_csv('outputs/image_data_with_DNA.csv')


Total number of images collected: 184851
Number of images after quality filtering: 170436
Number of specimen with <1000 reads in first samples: 116
Number of specimen with <1000 reads in repeated samples: 21
Number of specimen where Morpho_Order =/= DNA_Order: 0
Number of specimen where Morpho_Order =/= DNA_repeat_Order: 2
Number of specimen remaining: 815
Corresponding number of images: 159794
Number of OTUs (/classes): 20
Number of specimen on species level: 762
Corresponding number of images: 149597


Unnamed: 0_level_0,Specimen ID
Species_DNA,Unnamed: 1_level_1
Limnephilus lunatus,233
Goera pilosa,122
Potamophylax rotundipennis,91
Ephemera danica,77
Baetis rhodani,49
Chaetopteryx villosa,46
Athripsodes cinereus,35
Caenis horaria,20
Halesus radiatus,20
Anabolia nervosa,17


In [7]:
# Display the specimen IDs recorded for each filter step (dict keyed by step name,
# e.g. 'delete_in_first', 'delete_in_repeat', 'Morpho_Order =/= DNA_Order').
not_in_final_df

{'delete_in_first': array(['4_G1', '4_G3', '4_H5', '4_H7', '4_H8', '4_H10', '5_A3', '5_A4',
        '5_B4', '5_F12', '6_A1', '6_A7', '6_A8', '6_A10', '6_B10', '6_C2',
        '6_C4', '6_C6', '6_D4', '6_D9', '6_D10', '6_E8', '9_C5', '9_C6',
        '9_C12', '9_E3', '9_E5', '9_E7', '9_E12', '9_F8', '9_G10', '9_H2',
        '9_H4', '10_A4', '10_A5', '7_E10', '7_F1', '7_F5', '7_F6', '7_G8',
        '7_G12', '7_H2', '7_H3', '7_H10', '8_A6', '8_A7', '8_A10', '8_A11',
        '8_B3', '8_B7', '8_B12', '5_B7', '5_C1', '5_C3', '5_D7', '5_D8',
        '5_D11', '5_E1', '5_E11', '8_F8', '8_F12', '8_G5', '8_H3', '9_A6',
        '9_B4', '3_F12', '3_G8', '3_G10', '3_H2', '4_A11', '4_B1', '4_B5',
        '4_C1', '4_C4', '4_C5', '4_C12', '4_D1', '4_D2', '4_D4', '4_D10',
        '4_E3', '8_D1', '8_D4', '8_D12', '8_E4', '8_E5', '8_E9', '8_F2',
        '8_F3', '2_E3', '3_A4', '3_A6', '3_A9', '3_A11', '3_B7', '3_B10',
        '3_C3', '3_C8', '3_D5', '3_D8', '6_F5', '6_F9', '6_G5', '6_G11',
        '6_G12', 

In [9]:
# Specimens recorded under both 'delete_in_first' and 'delete_in_repeat'.
# sorted() gives a reproducible order; a bare set has no defined ordering.
sorted(set(not_in_final_df['delete_in_first']).intersection(not_in_final_df['delete_in_repeat']))

['9_E3',
 '8_H3',
 '6_C6',
 '3_C3',
 '9_C6',
 '8_D12',
 '7_C2',
 '7_F6',
 '8_B3',
 '5_C1',
 '6_D9',
 '4_E3']

In [48]:
# Inspect every image row of specimen '3_B12' in the unfiltered DNA table.
is_target_specimen = IMG_df['Specimen ID'].eq('3_B12')
IMG_df.loc[is_target_specimen]

Unnamed: 0,Specimen ID,Sample Name/Number,Species Name,Image File Name,Other Notes,Max Feret Diameter,Perimeter,Area,Holes,Area+Holes,...,Species_DNA,Label_DNA,read count_DNA,OTU_DNA_repeat,Order_DNA_repeat,Family_DNA_repeat,Genus_DNA_repeat,Species_DNA_repeat,Label_DNA_repeat,read count_DNA_repeat
144763,3_B12,3_B12_1,PMR_ExStream22_Main_EPT,1_3_B12_1_2023_12_07-11-06-15-214.PNG,C14_T_ohne,359,1492,45657,479,46136,...,,,,>OTU_25,Trichoptera,Hydropsychidae,Hydropsyche,Hydropsyche saxonica,Hydropsyche saxonica,267480.0
144764,3_B12,3_B12_1,PMR_ExStream22_Main_EPT,2_3_B12_1_2023_12_07-11-06-15-225.PNG,C14_T_ohne,345,1232,42782,1018,43800,...,,,,>OTU_25,Trichoptera,Hydropsychidae,Hydropsyche,Hydropsyche saxonica,Hydropsyche saxonica,267480.0
144765,3_B12,3_B12_1,PMR_ExStream22_Main_EPT,1_3_B12_1_2023_12_07-11-06-15-237.PNG,C14_T_ohne,359,1492,45657,479,46136,...,,,,>OTU_25,Trichoptera,Hydropsychidae,Hydropsyche,Hydropsyche saxonica,Hydropsyche saxonica,267480.0
144766,3_B12,3_B12_1,PMR_ExStream22_Main_EPT,2_3_B12_1_2023_12_07-11-06-15-248.PNG,C14_T_ohne,347,1272,40867,817,41684,...,,,,>OTU_25,Trichoptera,Hydropsychidae,Hydropsyche,Hydropsyche saxonica,Hydropsyche saxonica,267480.0
144767,3_B12,3_B12_1,PMR_ExStream22_Main_EPT,1_3_B12_1_2023_12_07-11-06-15-259.PNG,C14_T_ohne,367,1478,45943,723,46666,...,,,,>OTU_25,Trichoptera,Hydropsychidae,Hydropsyche,Hydropsyche saxonica,Hydropsyche saxonica,267480.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144847,3_B12,3_B12_2,PMR_ExStream22_Main_EPT,1_3_B12_2_2023_12_07-11-06-32-175.PNG,C14_T_ohne,375,1564,37000,35,37035,...,,,,>OTU_25,Trichoptera,Hydropsychidae,Hydropsyche,Hydropsyche saxonica,Hydropsyche saxonica,267480.0
144848,3_B12,3_B12_2,PMR_ExStream22_Main_EPT,2_3_B12_2_2023_12_07-11-06-32-210.PNG,C14_T_ohne,377,1978,54002,14,54016,...,,,,>OTU_25,Trichoptera,Hydropsychidae,Hydropsyche,Hydropsyche saxonica,Hydropsyche saxonica,267480.0
144849,3_B12,3_B12_2,PMR_ExStream22_Main_EPT,1_3_B12_2_2023_12_07-11-06-32-199.PNG,C14_T_ohne,372,1562,37267,42,37309,...,,,,>OTU_25,Trichoptera,Hydropsychidae,Hydropsyche,Hydropsyche saxonica,Hydropsyche saxonica,267480.0
144850,3_B12,3_B12_2,PMR_ExStream22_Main_EPT,2_3_B12_2_2023_12_07-11-06-32-233.PNG,C14_T_ohne,378,1981,53916,15,53931,...,,,,>OTU_25,Trichoptera,Hydropsychidae,Hydropsyche,Hydropsyche saxonica,Hydropsyche saxonica,267480.0


In [43]:
# Among the specimens recorded under 'delete_in_first', list those whose rows
# have no order assigned for the repeated sample.
first_run_failed = IMG_df['Specimen ID'].isin(not_in_final_df['delete_in_first'])
subset = IMG_df.loc[first_run_failed]

no_repeat_order = subset['Order_DNA_repeat'].isna()
filtered = subset.loc[no_repeat_order]

filtered['Specimen ID'].unique()


array(['9_E5', '9_F8', '8_A11', '2_E3'], dtype=object)

In [2]:
# Drop every species that is represented by fewer than 5 specimens.
rare_species = unique_counts[unique_counts < 5].index
is_rare = EPT_DL_species_df['Species_DNA'].isin(rare_species)

print('Specimen that will be removed:')
print(EPT_DL_species_df.loc[is_rare, 'Specimen ID'].unique())
EPT_14_DL_df = EPT_DL_species_df.loc[~is_rare]

dl_specimen_ids = EPT_14_DL_df['Specimen ID']
print(f"Number of specimen that will be used for deep learning: {dl_specimen_ids.nunique()}")
print(f"Number of species (/classes): {EPT_14_DL_df['Species_DNA'].nunique()}")
print(f"Corresponding number of images: {dl_specimen_ids.count()}")

# Export the final table without the row index.
EPT_14_DL_df.to_csv('outputs/EPT-14-DL.csv', index=False)

# Distinct specimens per remaining species, largest first.
unique_counts_dl = (
    EPT_14_DL_df
    .groupby('Species_DNA')['Specimen ID']
    .nunique()
    .sort_values(ascending=False)
)

pd.DataFrame(unique_counts_dl)

Specimen that will be removed:
['4_G1' '4_G2' '9_F1' '9_G7' '3_H11' '4_F5' '8_E6' '8_E8' '8_E9' '3_B12'
 '10_B1' '10_C1']
Number of specimen that will be used for deep learning: 750
Number of species (/classes): 14
Corresponding number of images: 148523


Unnamed: 0_level_0,Specimen ID
Species_DNA,Unnamed: 1_level_1
Limnephilus lunatus,233
Goera pilosa,122
Potamophylax rotundipennis,91
Ephemera danica,77
Baetis rhodani,49
Chaetopteryx villosa,46
Athripsodes cinereus,35
Caenis horaria,20
Halesus radiatus,20
Anabolia nervosa,17


Compare failed samples with the rest of the samples

In [3]:
# Summary statistics for the rows of All_IMGs_df whose specimen is NOT part of
# the deep-learning set (EPT_14_DL_df['Specimen ID']).
#
# 'Sample Name/Number' is reduced to everything before its last '_' so that it
# can be compared with 'Specimen ID'. The untouched names are kept in a
# separate column and the reduction always starts from them, so re-running
# this cell does not strip a further suffix each time.
if 'Sample Name/Number (raw)' not in All_IMGs_df.columns:
    All_IMGs_df['Sample Name/Number (raw)'] = All_IMGs_df['Sample Name/Number']
All_IMGs_df['Sample Name/Number'] = All_IMGs_df['Sample Name/Number (raw)'].str.rsplit('_', n=1).str[0]
All_IMGs_df[~All_IMGs_df['Sample Name/Number'].isin(EPT_14_DL_df['Specimen ID'])].dropna(axis=1).describe().round(2)

Unnamed: 0,Max Feret Diameter,Perimeter,Area,Holes,Area+Holes,Exposure Time (µs),Framerate (FPS),Light Intensity (%),Aperture,ROI (left),ROI (top),ROI (right),ROI (bottom)
count,26957.0,26957.0,26957.0,26957.0,26957.0,26957.0,26957.0,26957.0,26957.0,26957.0,26957.0,26957.0,26957.0
mean,207.57,670.39,17525.88,157.02,17682.89,2000.0,50.0,100.0,8.0,308.52,0.0,816.97,476.0
std,226.41,831.83,38486.16,731.49,38800.12,0.0,0.0,0.0,0.0,182.1,0.0,180.02,0.0
min,19.0,54.0,201.0,0.0,201.0,2000.0,50.0,100.0,8.0,0.0,0.0,476.0,476.0
25%,64.0,184.0,895.0,0.0,897.0,2000.0,50.0,100.0,8.0,148.0,0.0,662.0,476.0
50%,123.0,350.0,3397.0,3.0,3405.0,2000.0,50.0,100.0,8.0,313.0,0.0,832.0,476.0
75%,227.0,736.0,11167.0,40.0,11277.0,2000.0,50.0,100.0,8.0,459.0,0.0,966.0,476.0
max,1109.0,5189.0,209006.0,10461.0,209018.0,2000.0,50.0,100.0,8.0,640.0,0.0,1116.0,476.0


In [4]:
# Summary statistics for the rows of All_IMGs_df whose specimen IS part of the
# deep-learning set. Relies on 'Sample Name/Number' having been reduced to the
# part before its last '_' by an earlier cell.
all_imgs_in_dl = All_IMGs_df['Sample Name/Number'].isin(EPT_14_DL_df['Specimen ID'])
All_IMGs_df.loc[all_imgs_in_dl].dropna(axis=1).describe().round(2)

Unnamed: 0,Max Feret Diameter,Perimeter,Area,Holes,Area+Holes,Exposure Time (µs),Framerate (FPS),Light Intensity (%),Aperture,ROI (left),ROI (top),ROI (right),ROI (bottom)
count,157894.0,157894.0,157894.0,157894.0,157894.0,157894.0,157894.0,157894.0,157894.0,157894.0,157894.0,157894.0,157894.0
mean,275.92,928.65,20247.77,248.98,20496.75,2000.0,50.0,100.0,8.0,307.74,0.0,814.04,476.0
std,198.84,778.12,31815.0,1352.77,32527.46,0.0,0.0,0.0,0.0,182.44,0.0,181.95,0.0
min,18.0,53.0,201.0,0.0,201.0,2000.0,50.0,100.0,8.0,0.0,0.0,476.0,476.0
25%,147.0,453.0,4512.0,3.0,4535.0,2000.0,50.0,100.0,8.0,149.0,0.0,661.0,476.0
50%,217.0,670.0,8124.0,23.0,8182.0,2000.0,50.0,100.0,8.0,300.0,0.0,821.0,476.0
75%,330.0,1089.0,17845.0,102.0,18014.0,2000.0,50.0,100.0,8.0,465.0,0.0,973.0,476.0
max,1099.0,4957.0,221029.0,26816.0,247134.0,2000.0,50.0,100.0,8.0,640.0,0.0,1116.0,476.0


In [5]:
# Summary statistics for the rows of IMG_df whose specimen is NOT part of the
# deep-learning set; columns containing any missing value are dropped first.
dna_imgs_in_dl_mask = IMG_df['Specimen ID'].isin(EPT_14_DL_df['Specimen ID'])
IMG_df.loc[~dna_imgs_in_dl_mask].dropna(axis=1).describe().round(2)

Unnamed: 0,Max Feret Diameter,Perimeter,Area,Holes,Area+Holes,Exposure Time (µs),Framerate (FPS),Light Intensity (%),Aperture,ROI (left),ROI (top),ROI (right),ROI (bottom),Plate,Empty Tube Weight,Tube Weight with dried Specimen,Specimen Weight
count,13876.0,13876.0,13876.0,13876.0,13876.0,13876.0,13876.0,13876.0,13876.0,13876.0,13876.0,13876.0,13876.0,13876.0,13876.0,13876.0,13876.0
mean,222.41,754.48,21113.98,209.75,21323.73,2000.0,50.0,100.0,8.0,295.55,0.0,810.11,476.0,5.61,1523.67,1536.32,12.65
std,240.79,908.24,41316.37,914.22,41670.89,0.0,0.0,0.0,0.0,187.79,0.0,187.0,0.0,2.64,5.8,29.17,28.76
min,20.0,59.0,201.0,0.0,201.0,2000.0,50.0,100.0,8.0,0.0,0.0,476.0,476.0,1.0,1511.57,1512.59,-0.04
25%,70.0,199.0,1271.0,0.0,1277.0,2000.0,50.0,100.0,8.0,130.0,0.0,644.0,476.0,3.0,1518.43,1520.44,0.99
50%,122.0,350.0,3534.0,5.0,3553.5,2000.0,50.0,100.0,8.0,281.0,0.0,810.0,476.0,6.0,1524.92,1530.12,1.23
75%,234.0,825.0,13163.5,50.0,13212.5,2000.0,50.0,100.0,8.0,460.0,0.0,982.0,476.0,8.0,1529.32,1536.68,12.47
max,1109.0,5189.0,209006.0,10461.0,209018.0,2000.0,50.0,100.0,8.0,640.0,0.0,1116.0,476.0,10.0,1535.52,1675.32,147.1


In [6]:
# Summary statistics for the rows of IMG_df whose specimen IS part of the
# deep-learning set; columns containing any missing value are dropped first.
dna_imgs_used_mask = IMG_df['Specimen ID'].isin(EPT_14_DL_df['Specimen ID'])
IMG_df.loc[dna_imgs_used_mask].dropna(axis=1).describe().round(2)

Unnamed: 0,Max Feret Diameter,Perimeter,Area,Holes,Area+Holes,Exposure Time (µs),Framerate (FPS),Light Intensity (%),Aperture,ROI (left),ROI (top),ROI (right),ROI (bottom),Plate,Empty Tube Weight,Tube Weight with dried Specimen,Specimen Weight
count,157894.0,157894.0,157894.0,157894.0,157894.0,157894.0,157894.0,157894.0,157894.0,157894.0,157894.0,157894.0,157894.0,157894.0,157894.0,157894.0,157894.0
mean,275.92,928.65,20247.77,248.98,20496.75,2000.0,50.0,100.0,8.0,307.74,0.0,814.04,476.0,5.04,1525.18,1531.16,5.98
std,198.84,778.12,31815.0,1352.77,32527.46,0.0,0.0,0.0,0.0,182.44,0.0,181.95,0.0,2.74,8.13,19.16,19.0
min,18.0,53.0,201.0,0.0,201.0,2000.0,50.0,100.0,8.0,0.0,0.0,476.0,476.0,1.0,1424.94,1509.32,-0.19
25%,147.0,453.0,4512.0,3.0,4535.0,2000.0,50.0,100.0,8.0,149.0,0.0,661.0,476.0,3.0,1521.48,1523.68,0.86
50%,217.0,670.0,8124.0,23.0,8182.0,2000.0,50.0,100.0,8.0,300.0,0.0,821.0,476.0,5.0,1525.49,1527.78,1.18
75%,330.0,1089.0,17845.0,102.0,18014.0,2000.0,50.0,100.0,8.0,465.0,0.0,973.0,476.0,7.0,1529.26,1532.6,2.25
max,1099.0,4957.0,221029.0,26816.0,247134.0,2000.0,50.0,100.0,8.0,640.0,0.0,1116.0,476.0,10.0,1542.27,1740.4,211.63
