In [2]:
import boto3
import pandas as pd
import numpy as np
import logging
from io import StringIO
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
import plotly.io as pio
import os


def read_csv_from_s3(bucket_name, key, s3_client=None):
    """
    Reads a CSV file from S3 and returns it as a pandas DataFrame.

    Parameters:
        bucket_name (str): Name of the S3 bucket
        key (str): Full path to the CSV file in the bucket
        s3_client (optional): Object exposing ``get_object(Bucket=..., Key=...)``
            whose result has a readable ``'Body'``. When omitted, a new
            ``boto3.client('s3')`` is created for this call, exactly as before.
            Passing an existing client avoids rebuilding one per file.

    Returns:
        pd.DataFrame: Loaded DataFrame

    Raises:
        Whatever ``get_object`` raises (e.g. missing key / access denied),
        ``UnicodeDecodeError`` if the object is not valid UTF-8, and any
        pandas parsing error from ``pd.read_csv``.
    """
    client = boto3.client('s3') if s3_client is None else s3_client
    response = client.get_object(Bucket=bucket_name, Key=key)

    body = response['Body']
    try:
        # The whole object is read into memory and decoded as UTF-8 text.
        csv_content = body.read().decode('utf-8')
    finally:
        # Close the body even when reading or decoding fails so the
        # underlying connection is not left open.
        body.close()

    return pd.read_csv(StringIO(csv_content))


In [None]:
# Set up S3 client
s3 = boto3.client('s3')
bucket_name = "cellprofiler-resuts"
base_folder_path = "IRIC/CellSeedingDensity_1000VS2000_202505/NL150-1000"
# Time-point identifiers; each one is a sub-folder of base_folder_path in S3
times = ['3h',"6h", "12h", "24h", "48h", "72h"]
# NOTE(review): absolute, user-specific local path — adjust before running elsewhere.
output_folder='/Users/dcamacho/Documents/Pfizer_Josiane/QCDM/CellSeeding/NL150-1000'

os.makedirs(output_folder, exist_ok=True)

# QC feature settings, keyed by column-name prefix.
#   color        : histogram colour for every column that starts with the prefix
#   threshold    : 'iqr'  -> flag values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
#                  other  -> flag values >= settings['fixed_thresh']
#   no 'threshold' key    -> the feature is only plotted; no QC column is written
feature_groups = {
    'ImageQuality_Power': {'color': 'orange', 'threshold': 'iqr'},
    'ImageQuality_PercentMax': {'color': 'blue', 'threshold': 'fixed', 'fixed_thresh': 0.001},
    'Count_Nuclei': {'color': 'red'}
}

for time in times:
    print(f"📦 Processing kit: {time}")
    try:
        # Locate Image.csv directly under this time point's folder
        # (Delimiter='/' keeps the listing from descending into sub-folders).
        response = s3.list_objects_v2(
            Bucket=bucket_name,
            Prefix=f"{base_folder_path}/{time}/",
            Delimiter='/'
        )
        matching_files = [
            obj['Key']
            for obj in response.get('Contents', [])
            if obj['Key'].endswith("Image.csv")
        ]
        if not matching_files:
            raise FileNotFoundError(f"No matching Image.csv found for {time}")

        features_key = matching_files[0]
        df = read_csv_from_s3(bucket_name, features_key)
        print(f"✅ Loaded {features_key}")

    except Exception as e:
        # Best effort: report the problem and move on to the next time point.
        print(f"❌ Error: {e}")
        continue

    # One entry per plotted feature: a list of traces and the matching subplot title.
    all_traces = []
    all_titles = []

    for group_prefix, settings in feature_groups.items():
        matching_columns = [col for col in df.columns if col.startswith(group_prefix)]
        for feature_col in matching_columns:
            values = df[feature_col].dropna()
            wells = df.loc[values.index, 'Metadata_Well']
            site = df.loc[values.index, 'Metadata_Site'].astype(str)

            # Reset for every feature. Without this, a group that has no
            # 'threshold' setting would reuse the thresholds of the previously
            # processed feature (or hit a NameError if none was processed yet).
            lower_thresh = None
            upper_thresh = None

            if 'threshold' in settings:
                # Compute thresholds
                if settings['threshold'] == 'iqr':
                    Q1 = values.quantile(0.25)
                    Q3 = values.quantile(0.75)
                    IQR = Q3 - Q1
                    lower_thresh = Q1 - 1.5 * IQR
                    upper_thresh = Q3 + 1.5 * IQR
                    fail_mask = (df[feature_col] < lower_thresh) | (df[feature_col] > upper_thresh)
                else:
                    # Fixed cut-off: values at or above it are flagged.
                    lower_thresh = settings['fixed_thresh']
                    upper_thresh = None
                    fail_mask = df[feature_col] >= lower_thresh

                # Save QC results to table; rows with a missing measurement
                # get NaN instead of a True/False flag.
                qc_col = f"ImageQC_{feature_col}"
                df[qc_col] = fail_mask.where(~df[feature_col].isna(), np.nan)

            # Plot: histogram
            histogram = go.Histogram(
                x=values,
                nbinsx=100,
                marker_color=settings['color'],
                opacity=0.5,
                showlegend=False
            )

            # Plot: hover — one marker per image so the well/site can be read
            # off by hovering over the value on the x axis.
            scatter = go.Scatter(
                x=values,
                y=[0.1] * len(values),
                mode='markers',
                marker=dict(color='lightgrey'),
                customdata=np.stack([wells + '_S' + site], axis=-1),
                hovertemplate="Well: %{customdata[0]}<br>Value: %{x:.2f}<extra></extra>",
                showlegend=False
            )
            feature_traces = [histogram, scatter]

            # Plot: thresholds
            if upper_thresh is not None:
                feature_traces.append(go.Scatter(
                    x=[upper_thresh, upper_thresh],
                    y=[0, 100],
                    mode='lines',
                    line=dict(color='red', width=2, dash='dot'),
                    name='Upper Threshold',
                    showlegend=False
                ))
            # The lower line is drawn for the ImageQuality_Power group only.
            if lower_thresh is not None and group_prefix == 'ImageQuality_Power':
                feature_traces.append(go.Scatter(
                    x=[lower_thresh, lower_thresh],
                    y=[0, 100],
                    mode='lines',
                    line=dict(color='red', width=2, dash='dot'),
                    name='Lower Threshold',
                    showlegend=False
                ))

            all_traces.append(feature_traces)
            all_titles.append(f"{feature_col} / {time}")

    if not all_traces:
        # No column matched any prefix: make_subplots(rows=0) would raise and
        # no QC column was added, so there is nothing to plot or upload.
        print(f"⚠️ No QC feature columns found for {time}; skipping plots and upload")
        continue

    # Create subplot: one row per feature
    fig = make_subplots(rows=len(all_traces), cols=1, subplot_titles=all_titles)
    for i, trace_group in enumerate(all_traces):
        for trace in trace_group:
            fig.add_trace(trace, row=i + 1, col=1)

    fig.update_layout(
        height=430 * len(all_traces),
        width=800,
        title_text=f"Feature Distributions with QC for {time}",
        template='simple_white',
        barmode='overlay'
    )

    # Save HTML
    html_path = f"{output_folder}/qc_plots_{time}.html"
    pio.write_html(fig, file=html_path, auto_open=False)
    print(f"💾 HTML saved: {html_path}")

    # Save modified CSV to S3.
    # NOTE: this writes to the same key the table was read from, i.e. the
    # original Image.csv is overwritten with the version containing QC columns.
    csv_buffer = StringIO()
    df.to_csv(csv_buffer, index=False)
    s3.put_object(
        Bucket=bucket_name,
        Key=features_key,
        Body=csv_buffer.getvalue()
    )
    print(f"✅ Updated CSV with QC columns uploaded: {features_key}")


In [5]:
# Display whatever DataFrame is currently bound to `df` in the kernel.
# NOTE(review): presumably the table left over from the processing loop (the
# last time point that loaded successfully) — confirm after Restart & Run All.
df

Unnamed: 0,Count_Cells,Count_Cytoplasm,Count_Nuclei,ExecutionTime_01LoadData,ExecutionTime_02ImageMath,ExecutionTime_03MeasureImageQuality,ExecutionTime_04IdentifyPrimaryObjects,ExecutionTime_05IdentifySecondaryObjects,ExecutionTime_06IdentifyTertiaryObjects,ExecutionTime_07MeasureColocalization,...,Width_DNA,ImageQC_ImageQuality_PowerLogLogSlope_CL488R,ImageQC_ImageQuality_PowerLogLogSlope_CL488Y,ImageQC_ImageQuality_PowerLogLogSlope_CL640,ImageQC_ImageQuality_PowerLogLogSlope_DNA,ImageQC_ImageQuality_PercentMaximal_CL488R,ImageQC_ImageQuality_PercentMaximal_CL488Y,ImageQC_ImageQuality_PercentMaximal_CL640,ImageQC_ImageQuality_PercentMaximal_DNA,ImageQC_Count_Nuclei
0,75.0,75.0,75.0,3.86,0.06,21.97,3.01,2.25,0.42,3.99,...,2000,False,False,False,False,False,False,False,False,False
1,78.0,78.0,78.0,1.61,0.02,23.09,3.27,2.69,0.46,4.80,...,2000,False,False,False,False,False,False,False,False,False
2,16.0,16.0,16.0,1.69,0.03,22.89,2.97,1.77,0.37,2.00,...,2000,False,False,False,False,False,False,False,False,False
3,127.0,127.0,127.0,1.14,0.02,23.49,3.98,4.33,0.41,3.75,...,2000,False,False,False,False,False,False,False,False,True
4,47.0,47.0,47.0,0.75,0.02,20.19,2.80,1.94,0.42,3.06,...,2000,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1227,103.0,103.0,103.0,0.84,0.01,18.97,2.60,2.54,0.45,4.97,...,2000,False,False,False,False,False,False,False,False,False
1228,87.0,87.0,87.0,0.71,0.02,18.34,2.45,1.90,0.38,3.31,...,2000,False,False,False,False,False,False,False,False,False
1229,60.0,60.0,60.0,0.69,0.02,18.67,2.40,1.71,0.35,2.49,...,2000,False,False,False,False,False,False,False,False,False
1230,61.0,61.0,61.0,0.75,0.02,18.89,2.40,1.83,0.36,2.97,...,2000,False,False,False,False,False,False,False,False,False


In [12]:
# ImageNumber of every row where at least one column whose name contains
# 'ImageQC_' evaluates as true.
qc_flag_columns = df.filter(like='ImageQC_')
has_any_qc_flag = qc_flag_columns.any(axis=1)
df.loc[has_any_qc_flag, 'ImageNumber']

15        16
16        17
17        18
18        19
21        22
        ... 
1660    1661
1661    1662
1662    1663
1663    1664
1664    1665
Name: ImageNumber, Length: 205, dtype: int64