In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import boto3
import re

In [36]:
# S3 bucket and key prefix that are listed for read files.
bucket = 'modulome'
prefix = 'ylipolytic2025_repeat/'
s3 = boto3.client('s3')

# Tab-separated metadata table that the read keys are matched against.
metadata = pd.read_csv('metadata_read_maps.tsv', sep='\t')

def list_s3_files(bucket, prefix, suffix='.fq.gz', client=None):
    """Return the keys under ``prefix`` in ``bucket`` that end with ``suffix``.

    Parameters
    ----------
    bucket : str
        Name of the S3 bucket to list.
    prefix : str
        Key prefix passed to the ``list_objects_v2`` paginator.
    suffix : str or tuple of str, optional
        Only keys ending with this suffix are kept. Defaults to ``'.fq.gz'``,
        which reproduces the previous hard-coded behaviour.
    client : optional
        Object exposing ``get_paginator``. Defaults to the notebook-level
        ``s3`` client.

    Returns
    -------
    list of str
        Matching keys, in the order the paginator yields them.
    """
    if client is None:
        client = s3
    paginator = client.get_paginator('list_objects_v2')

    keys = []
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        # 'Contents' may be absent from a page, so default to no objects.
        keys.extend(
            obj['Key'] for obj in page.get('Contents', []) if obj['Key'].endswith(suffix)
        )
    return keys

# List the matching keys once; the read table in a later cell is built from this list.
s3_files = list_s3_files(bucket, prefix)


In [38]:
# One row per S3 key; 'Read' keeps the full key so it can be written out later.
maps = pd.DataFrame(s3_files, columns=['Read'])

# Normalise each key to a bare sample token: drop the S3 prefix, remove spaces,
# '-' and '_', then drop the '.fq.gz' extension. The prefix comes from the
# `prefix` variable so it cannot drift from the value used for the listing.
maps['Filtered_Read'] = (
    maps['Read']
    .str.replace(prefix, '', regex=False)
    .str.replace(r'[ _-]', '', regex=True)
    .str.replace('.fq.gz', '', regex=False)
)

# Keep only the keys that mention W29, PD or SUC. A deep copy is taken because
# the next statement overwrites a column and must not write through to `maps`.
unmapped = maps[maps['Read'].str.contains('W29|PD|SUC')].copy()

# Drop the last two characters: the 'A1' / 'A2' tail left once '_' was removed.
unmapped['Filtered_Read'] = unmapped['Filtered_Read'].str[:-2]

unmapped

Unnamed: 0,Read,Filtered_Read
100,ylipolytic2025_repeat/PD017A1S021A_1.fq.gz,PD017A1S021
101,ylipolytic2025_repeat/PD017A1S021A_2.fq.gz,PD017A1S021
102,ylipolytic2025_repeat/PD017A1S022A_1.fq.gz,PD017A1S022
103,ylipolytic2025_repeat/PD017A1S022A_2.fq.gz,PD017A1S022
104,ylipolytic2025_repeat/PD017A1S031A_1.fq.gz,PD017A1S031
...,...,...
263,ylipolytic2025_repeat/W29S07-2transA_2.fq.gz,W29S072trans
264,ylipolytic2025_repeat/W29S08-1transA_1.fq.gz,W29S081trans
265,ylipolytic2025_repeat/W29S08-1transA_2.fq.gz,W29S081trans
266,ylipolytic2025_repeat/W29S08-2transA_1.fq.gz,W29S082trans


In [39]:
# Metadata rows whose file name mentions W29, PD or SUC (missing names are skipped).
# A deep copy is taken because the next statement overwrites a column and must
# not write through to `metadata`.
unmapped_met = metadata[metadata['File name'].str.contains('W29|PD|SUC', na=False)].copy()

# Strip '#' and spaces so the names can be compared with `unmapped['Filtered_Read']`.
unmapped_met['File name'] = (
    unmapped_met['File name']
    .str.replace('#', '', regex=False)
    .str.replace(' ', '', regex=False)
)
unmapped_met

Unnamed: 0,File name,Condition,Project,Strain,Mode of operation,Media,Limitation,C-source,Growth rate,DO,pH,Temperature
54,W29S021trans,µ0,Growth_Variance_W29,W29,Continuous,Delft,Carbon,Glucose,0.00,40,6,30
55,W29S022trans,µ0,Growth_Variance_W29,W29,Continuous,Delft,Carbon,Glucose,0.00,40,6,30
56,W29S031trans,µ0.05,Growth_Variance_W29,W29,Continuous,Delft,Carbon,Glucose,0.05,40,6,30
57,W29S032trans,µ0.05,Growth_Variance_W29,W29,Continuous,Delft,Carbon,Glucose,0.05,40,6,30
58,W29S041trans,µ0.1,Growth_Variance_W29,W29,Continuous,Delft,Carbon,Glucose,0.10,40,6,30
...,...,...,...,...,...,...,...,...,...,...,...,...
111,PD019A1S052,µ0.05_DO5,O2_oscillation,W29,Continuous,Delft,Carbon,Glucose,0.05,5,6,30
112,PD019A1S061,µ0.05_DO2.5,O2_oscillation,W29,Continuous,Delft,Carbon,Glucose,0.05,2.5,6,30
113,PD019A1S062,µ0.05_DO2.5,O2_oscillation,W29,Continuous,Delft,Carbon,Glucose,0.05,2.5,6,30
114,PD019A1S071,µ0.05_DO0,O2_oscillation,W29,Continuous,Delft,Carbon,Glucose,0.05,0,6,30


In [50]:
# Index the S3 keys by normalised sample name once, instead of filtering the
# whole frame twice for every metadata row.
reads_by_sample = {}
for name, key in zip(unmapped['Filtered_Read'], unmapped['Read']):
    reads_by_sample.setdefault(name, []).append(key)

read1 = []
read2 = []
unpaired = []  # metadata names that do not have exactly two reads

for sample_name in unmapped_met['File name']:
    # Sorting puts the '_1' key ahead of the '_2' key regardless of listing order.
    pair = sorted(reads_by_sample.get(sample_name, []))
    if len(pair) != 2:
        unpaired.append(sample_name)
    # Pad to two entries so a missing mate becomes <NA> instead of an IndexError.
    first, second = (pair + [pd.NA, pd.NA])[:2]
    read1.append(first)
    read2.append(second)

if unpaired:
    print(f'{len(unpaired)} sample(s) without exactly two reads: {unpaired}')

In [53]:
# Attach the collected read keys as two new columns, then write the table out.
for column, values in (('Read1', read1), ('Read2', read2)):
    unmapped_met[column] = values

unmapped_met.to_csv('unmapped_met.csv')

In [52]:
# Show the collected second-read keys for a visual check.
read2

['ylipolytic2025_repeat/W29S02-1transA_2.fq.gz',
 'ylipolytic2025_repeat/W29S02-2transA_2.fq.gz',
 'ylipolytic2025_repeat/W29S03-1transA_2.fq.gz',
 'ylipolytic2025_repeat/W29S03-2transA_2.fq.gz',
 'ylipolytic2025_repeat/W29S04-1transA_2.fq.gz',
 'ylipolytic2025_repeat/W29S04-2transA_2.fq.gz',
 'ylipolytic2025_repeat/W29S05-1transA_2.fq.gz',
 'ylipolytic2025_repeat/W29S05-2transA_2.fq.gz',
 'ylipolytic2025_repeat/W29S06-1transA_2.fq.gz',
 'ylipolytic2025_repeat/W29S06-2transA_2.fq.gz',
 'ylipolytic2025_repeat/W29S07-1transA_2.fq.gz',
 'ylipolytic2025_repeat/W29S07-2transA_2.fq.gz',
 'ylipolytic2025_repeat/W29S08-1transA_2.fq.gz',
 'ylipolytic2025_repeat/W29S08-2transA_2.fq.gz',
 'ylipolytic2025_repeat/SUCS02-1transA_2.fq.gz',
 'ylipolytic2025_repeat/SUCS02-2transA_2.fq.gz',
 'ylipolytic2025_repeat/SUCS03-1transA_2.fq.gz',
 'ylipolytic2025_repeat/SUCS03-2transA_2.fq.gz',
 'ylipolytic2025_repeat/SUCS04-1transA_2.fq.gz',
 'ylipolytic2025_repeat/SUCS04-2transA_2.fq.gz',
 'ylipolytic2025_rep

In [21]:
# Plain list of every S3 read key; `find_regex_matches` searches this list.
read_list = maps["Read"].tolist()

# ── Function to find top 2 regex matches for each file name ─
def find_regex_matches(sample_id, reads=None):
    """Return the ``[Read1, Read2]`` keys that belong to one sample.

    A key matches only when ``sample_id`` starts at the beginning of the key
    or right after a ``/``, and is followed by an optional ``A``, then ``_1``
    or ``_2``, then ``.fq.gz`` at the end of the key. Plain substring search
    is not enough: ``S1`` is contained in ``S10A_1.fq.gz``, which also sorts
    ahead of ``S1A_1.fq.gz`` and would be returned for the wrong sample.

    Parameters
    ----------
    sample_id : str
        Sample name as written in the metadata ``File name`` column.
    reads : iterable of str, optional
        Keys to search. Defaults to the notebook-level ``read_list``.

    Returns
    -------
    list
        Always two entries, ``[mate 1 key, mate 2 key]``. A mate that is not
        found yields ``pd.NA``; so does a missing, non-string or empty
        ``sample_id``.
    """
    if reads is None:
        reads = read_list

    # NaN / empty names cannot identify a sample; report "no match" instead of raising.
    if not isinstance(sample_id, str) or not sample_id:
        return [pd.NA, pd.NA]

    # The sample id is escaped so characters such as '#' or '.' are matched literally.
    pattern = re.compile(r'(?:^|/)' + re.escape(sample_id) + r'A?_([12])\.fq\.gz$')

    mates = {}
    for key in sorted(reads):
        hit = pattern.search(key)
        if hit:
            # Keep the first key (in sorted order) seen for each mate number.
            mates.setdefault(hit.group(1), key)

    return [mates.get('1', pd.NA), mates.get('2', pd.NA)]

# ── Look up the read keys for every metadata row ──────────
matches = metadata["File name"].apply(find_regex_matches)
read_pairs = pd.DataFrame(matches.tolist(), index=metadata.index)
metadata[["Read1", "Read2"]] = read_pairs

# ── Optional: long-form table, one row per (file name, read) ─
long_form = metadata.melt(
    id_vars="File name",
    value_vars=["Read1", "Read2"],
    value_name="Read",
)
# Drop the melt bookkeeping column and the rows that have no read.
match_table = long_form.drop(columns="variable").dropna(subset=["Read"])


In [23]:
# Write the metadata table (including the Read1/Read2 columns added above) to CSV.
metadata.to_csv('mapped_metadata.csv')

In [22]:
# Rows whose 'File name' is the empty string.
# NOTE(review): the saved output lists rows with non-empty names, so it looks stale — re-run to confirm.
metadata.loc[metadata['File name'].eq('')]

Unnamed: 0,File name,Condition,Project,Strain,Mode of operation,Media,Limitation,C-source,Growth rate,DO,pH,Temperature,Read1,Read2
0,20210204-YL-CN-1,Glucose_Nlimit,Batch_Shaken,W29,Shaken culture,Delft-N limited,Nitrogen,Glucose,0.3,High,6,30,ylipolytic2025_repeat/20210204-YL-CN-1A_1.fq.gz,ylipolytic2025_repeat/20210204-YL-CN-1A_2.fq.gz
1,20210204-YL-CN-2,Glucose_Nlimit,Batch_Shaken,W29,Shaken culture,Delft-N limited,Nitrogen,Glucose,0.3,High,6,30,ylipolytic2025_repeat/20210204-YL-CN-2A_1.fq.gz,ylipolytic2025_repeat/20210204-YL-CN-2A_2.fq.gz
2,20210204-YL-fill-1,Glucose_lowDO,Batch_Shaken,W29,Shaken culture,Delft,Carbon,Glucose,0.3,Low,6,30,ylipolytic2025_repeat/20210204-YL-fill-1A_1.fq.gz,ylipolytic2025_repeat/20210204-YL-fill-1A_2.fq.gz
3,20210204-YL-fill-2,Glucose_lowDO,Batch_Shaken,W29,Shaken culture,Delft,Carbon,Glucose,0.3,Low,6,30,ylipolytic2025_repeat/20210204-YL-fill-2A_1.fq.gz,ylipolytic2025_repeat/20210204-YL-fill-2A_2.fq.gz
4,20210204-YL-ref-1,Glucose_shaken_reference,Batch_Shaken,W29,Shaken culture,Delft,Carbon,Glucose,0.3,High,6,30,ylipolytic2025_repeat/20210204-YL-ref-1A_1.fq.gz,ylipolytic2025_repeat/20210204-YL-ref-1A_2.fq.gz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,R3_S05_#2,CO2_40%,CO2_oscillation,W29,Continuous,Delft,,Glucose,0.1,25,N.A,30,ylipolytic2025_repeat/R3_S05_#2A_1.fq.gz,ylipolytic2025_repeat/R3_S05_#2A_2.fq.gz
136,R3_S06_#1,CO2_50%,CO2_oscillation,W29,Continuous,Delft,,Glucose,0.1,25,N.A,30,ylipolytic2025_repeat/R3_S06_#1A_1.fq.gz,ylipolytic2025_repeat/R3_S06_#1A_2.fq.gz
137,R3_S06_#2,CO2_50%,CO2_oscillation,W29,Continuous,Delft,,Glucose,0.1,25,N.A,30,ylipolytic2025_repeat/R3_S06_#2A_1.fq.gz,ylipolytic2025_repeat/R3_S06_#2A_2.fq.gz
138,R3_S08_#1,CO2_0%,CO2_oscillation,W29,Continuous,Delft,,Glucose,0.1,25,N.A,30,ylipolytic2025_repeat/R3_S08_#1A_1.fq.gz,ylipolytic2025_repeat/R3_S08_#1A_2.fq.gz
