In [1]:
# Using Python 3.12.1 (local env: metabarcoding3.0)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from boldigger3.id_engine import parse_fasta

# Parse the OTU FASTA file; the result is unpacked into the sequence lookup,
# the FASTA name and the project directory.
fasta_dict, fasta_name, project_directory = parse_fasta('raw/ExStreamEPT_apscale_OTUs_filtered_done.fasta')

# Load the merged image table (';'-separated); malformed lines are reported as warnings.
image_data = pd.read_csv('outputs/image_data_merged_fixed.csv', sep=';', on_bad_lines='warn')

# Strip the fixed folder prefix from every image path.
# The prefix is written as a raw string: '\i', '\E' and '\P' are invalid escape
# sequences in a normal string literal and raise a SyntaxWarning on Python 3.12.
# regex=False makes the literal (non-regex) replacement explicit.
image_data['Image Path'] = image_data['Image Path'].str.replace(
    r'\images\Expo_2000_Ap_8\PMR_ExStream22_Main_EPT', '', regex=False
)

# Manual corrections: every change-map row that is not marked 'Delete' is applied.
changes_df = pd.read_excel('raw/EPT-14-change-map.xlsx')
changes_todo = changes_df[changes_df['Columns to change'] != 'Delete']

for index, row in changes_todo.iterrows():
    specimen_mask = image_data['Specimen ID'] == row['Specimen ID']
    new_values = row['New values']
    if isinstance(new_values, str):
        # Text cell: 'col A, col B' is paired with 'value A, value B'.
        image_data.loc[specimen_mask, row['Columns to change'].split(', ')] = new_values.split(', ')
    else:
        # Non-text cell (number or NaN): written as-is. Checking for str instead of
        # "not float" keeps int cells from being sent to .split(), which would raise.
        image_data.loc[specimen_mask, row['Columns to change']] = new_values

# Specimen weight = tube weight with dried specimen minus empty tube weight,
# rounded to 4 decimal places.
image_data['Specimen Weight'] = (
    image_data['Tube Weight with dried Specimen']
    .sub(image_data['Empty Tube Weight'])
    .round(4)
)

# Drop columns that are not needed or are reported separately because every
# row holds the same value (e.g. Species Name (used as project name),
# Exposure Time (2000 µs), FPS (50), Light Intensity (100%)).
image_data = image_data.drop(columns=[
    'Species Name',
    'Other Notes',
    'Aperture',
    'Exposure Time (µs)',
    'Framerate (FPS)',
    'Light Intensity (%)',
    'Empty Tube Weight',
    'Tube Weight with dried Specimen',
    'Plate',
    'Well',
    'Channel',
])

# Flag images: True unless the file name appears in the 'bad_img' list.
bad_img_df = pd.read_excel('raw/EPT_filtered_images.xlsx')
passed_qc = ~image_data['Image File Name'].isin(bad_img_df['bad_img'])
image_data.insert(loc=3, column='Passed image QC', value=passed_qc)

  image_data['Image Path'] = image_data['Image Path'].str.replace('\images\Expo_2000_Ap_8\PMR_ExStream22_Main_EPT', '')


In [2]:
# Blank out DNA results that have fewer than 1000 reads or whose order
# disagrees with the morphological order, and record the affected specimens.
# NOTE: df is a second name for image_data (not a copy), so every assignment
# below also changes image_data.
df = image_data

# Column groups: everything without '_DNA' in its name, the first-run DNA
# columns (suffix '_DNA') and the repeat-run DNA columns (suffix '_DNA_repeat').
data_columns = [name for name in df.columns if '_DNA' not in name]
dna_columns = [name for name in df.columns if name.endswith('_DNA')]
repeat_columns = [name for name in df.columns if name.endswith('_DNA_repeat')]
base_df = df[data_columns]

# --- Filter 1: read count below 1000 -> set that run's DNA columns to NaN ---
low_reads_first = df['read count_DNA'] < 1000
low_reads_repeat = df['read count_DNA_repeat'] < 1000

not_in_final_df = {}
print(f"Number of specimen with <1000 reads in first samples: {df.loc[low_reads_first, 'Specimen ID'].nunique()}")
not_in_final_df['delete_in_first'] = df.loc[low_reads_first, 'Specimen ID'].unique()
print(f"Number of specimen with <1000 reads in repeated samples: {df.loc[low_reads_repeat, 'Specimen ID'].nunique()}")
not_in_final_df['delete_in_repeat'] = df.loc[low_reads_repeat, 'Specimen ID'].unique()

df.loc[low_reads_first, dna_columns] = np.nan
df.loc[low_reads_repeat, repeat_columns] = np.nan

# --- Filter 2: DNA order present but different from the morphological order ---
# Evaluated after filter 1, so rows already blanked there are not counted again.
non_match_mask = df['Order_DNA'].notna() & df['Order_DNA'].ne(df['Order_Morpho'])
repeat_non_match_mask = df['Order_DNA_repeat'].notna() & df['Order_DNA_repeat'].ne(df['Order_Morpho'])

print(f"Number of specimen where Morpho_Order =/= DNA_Order: {df.loc[non_match_mask, 'Specimen ID'].nunique()}")
not_in_final_df['Morpho_Order =/= DNA_Order'] = df.loc[non_match_mask, 'Specimen ID'].unique()
print(f"Number of specimen where Morpho_Order =/= DNA_repeat_Order: {df.loc[repeat_non_match_mask, 'Specimen ID'].nunique()}")
not_in_final_df['Morpho_Order =/= DNA_repeat_Order'] = df.loc[repeat_non_match_mask, 'Specimen ID'].unique()

df.loc[non_match_mask, dna_columns] = np.nan
df.loc[repeat_non_match_mask, repeat_columns] = np.nan

def determine_final_label(row):
    """Select the DNA result (first run or repeat run) to keep for one row.

    Reads the module-level lists ``dna_columns`` and ``repeat_columns``.

    Rules, based on 'read count_DNA' and 'read count_DNA_repeat':
      * both missing        -> all-NaN Series indexed by ``dna_columns``
      * only first missing  -> repeat-run values
      * only repeat missing -> first-run values
      * both present        -> first run if its read count is strictly
                               higher, otherwise the repeat run

    Repeat-run values are returned with '_repeat' removed from their labels,
    so the result always uses first-run column names.
    """
    def _repeat_as_first():
        # Copy the repeat-run values and relabel them like first-run columns.
        picked = row[repeat_columns].copy()
        picked.index = [label.replace('_repeat', '') for label in picked.index]
        return picked

    first_reads = row['read count_DNA']
    repeat_reads = row['read count_DNA_repeat']
    first_missing = pd.isna(first_reads)
    repeat_missing = pd.isna(repeat_reads)

    if first_missing and repeat_missing:
        return pd.Series([np.nan] * len(dna_columns), index=dna_columns)
    if first_missing:
        return _repeat_as_first()
    if repeat_missing or first_reads > repeat_reads:
        return row[dna_columns]
    # Repeat run has at least as many reads as the first run (ties included).
    return _repeat_as_first()

# For every row keep either the first run or the repeated run (decided by read
# count in determine_final_label), then fix the column set and column order.
DNA_df = df.apply(determine_final_label, axis=1).reindex(
    columns=[
        'OTU_DNA',
        'Order_DNA',
        'Family_DNA',
        'Genus_DNA',
        'Species_DNA',
        'Label_DNA',
        'read count_DNA',
    ]
)

Number of specimen with <1000 reads in first samples: 132
Number of specimen with <1000 reads in repeated samples: 27
Number of specimen where Morpho_Order =/= DNA_Order: 20
Number of specimen where Morpho_Order =/= DNA_repeat_Order: 7


In [3]:
# Combine the non-DNA columns with the selected DNA result (joined on the row index).
metadata_df = base_df.join(DNA_df)

# Parse the text timestamps and place the datetime column at position 11.
parsed_dates = pd.to_datetime(
    metadata_df['Date (DD/MM/YYYY HH:MM)'],
    format='%d/%m/%Y %H:%M:%S',
)
metadata_df.insert(loc=11, column='Date (DD/MM/YYYY HH:MM:SS)', value=parsed_dates)

# Remove the original text column and order the rows chronologically.
metadata_df = (
    metadata_df
    .drop(columns=['Date (DD/MM/YYYY HH:MM)'])
    .sort_values(by='Date (DD/MM/YYYY HH:MM:SS)')
)

In [4]:
# Add the sequence of each row's OTU as column 'Seq' (position 20).
# The OTU is taken from metadata_df['OTU_DNA'], i.e. the run that was actually
# selected for the row (first or repeat). Looking it up from
# image_data['OTU_DNA'] would always use the first run and could store a
# sequence that does not belong to the OTU shown in the same row.
# Rows without an OTU (non-string value) get NaN; a leading '>' is stripped
# before the lookup in fasta_dict.
metadata_df.insert(
    loc=20,
    column='Seq',
    value=metadata_df['OTU_DNA'].apply(
        lambda otu: fasta_dict[otu.replace('>', '')].seq if isinstance(otu, str) else np.nan
    ),
)

In [None]:
# Write the final metadata table as a ';'-separated CSV without the index column.
metadata_df.to_csv(
    'outputs/EPT-14_metadata.csv',
    index=False,
    sep=';',
)