In [26]:
import boto3
import pandas as pd
import numpy as np
import logging
from io import StringIO
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
import plotly.io as pio


def read_csv_from_s3(bucket_name, key):
    """
    Download one CSV object from S3 and parse it into a pandas DataFrame.

    A new S3 client is created on every call. The object body is read in
    full, decoded as UTF-8, and handed to ``pd.read_csv`` with default
    parsing options.

    Parameters:
        bucket_name (str): Bucket that holds the object
        key (str): Object key (full path inside the bucket) of the CSV file

    Returns:
        pd.DataFrame: Parsed contents of the CSV
    """
    body = boto3.client('s3').get_object(Bucket=bucket_name, Key=key)['Body']
    text = body.read().decode('utf-8')
    return pd.read_csv(StringIO(text))


In [53]:
# Location of the CellProfiler outputs in S3
bucket_name = "cellprofiler-resuts"
base_folder_path = "IRIC/CQDM_CTL_Plate_Validation_202501/Plate_1"

# Time-point folder names; each is used as a sub-folder of base_folder_path
# when listing S3 and as the suffix of the generated HTML file name.
times = ["6h", "12h", "18h", "24h_2", "48h_2", "72h_2"]

# QC rules, keyed by column-name prefix (applied to every column starting with it):
#   color        - histogram colour for the matching features
#   threshold    - 'iqr'   : flag values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
#                  'fixed' : flag values >= fixed_thresh
#   fixed_thresh - cut-off used only by the 'fixed' rule
feature_groups = {
    'ImageQuality_Power': dict(color='orange', threshold='iqr'),
    'ImageQuality_PercentMax': dict(color='blue', threshold='fixed', fixed_thresh=0.001),
    'Count_Nuclei': dict(color='red', threshold='iqr'),
}

# S3 client shared by the listing and upload calls below
s3 = boto3.client('s3')

# For every time point: load its Image.csv from S3, flag QC outliers per feature,
# write an HTML report of the feature distributions, and upload the CSV (with the
# added ImageQC_* columns) back to the same S3 key.
#
# A paginator is used for the listing because a single list_objects_v2 call only
# returns one page of keys; Image.csv could otherwise be missed in a large folder.
paginator = s3.get_paginator('list_objects_v2')

for time in times:
    print(f"📦 Processing kit: {time}")
    try:
        # Locate Image.csv among the direct children of this time point's folder
        matching_files = []
        for response in paginator.paginate(
            Bucket=bucket_name,
            Prefix=f"{base_folder_path}/{time}/",
            Delimiter='/'
        ):
            matching_files.extend(
                obj['Key']
                for obj in response.get('Contents', [])
                if obj['Key'].endswith("Image.csv")
            )
        if not matching_files:
            raise FileNotFoundError(f"No matching Image.csv found for {time}")

        features_key = matching_files[0]
        df = read_csv_from_s3(bucket_name, features_key)
        print(f"✅ Loaded {features_key}")

    except Exception as e:
        # Best effort: report the problem and move on to the next time point
        print(f"❌ Error: {e}")
        continue

    # One entry per feature column: all_traces[i] is the list of traces drawn in
    # subplot row i + 1, all_titles[i] is that row's title.
    all_traces = []
    all_titles = []

    for group_prefix, settings in feature_groups.items():
        matching_columns = [col for col in df.columns if col.startswith(group_prefix)]
        for feature_col in matching_columns:
            values = df[feature_col].dropna()
            wells = df.loc[values.index, 'Metadata_Well']
            site = df.loc[values.index, 'Metadata_Site'].astype(str)

            # Compute thresholds
            if settings['threshold'] == 'iqr':
                # Tukey fences: anything beyond 1.5 * IQR from the quartiles fails
                Q1 = values.quantile(0.25)
                Q3 = values.quantile(0.75)
                IQR = Q3 - Q1
                lower_thresh = Q1 - 1.5 * IQR
                upper_thresh = Q3 + 1.5 * IQR
                fail_mask = (df[feature_col] < lower_thresh) | (df[feature_col] > upper_thresh)
            else:
                # Fixed rule: values at or above the configured cut-off fail
                lower_thresh = settings['fixed_thresh']
                upper_thresh = None
                fail_mask = df[feature_col] >= lower_thresh

            # Save QC results to table (True = failed QC); rows with a missing
            # feature value get NaN instead of a pass/fail flag.
            qc_col = f"ImageQC_{feature_col}"
            df[qc_col] = fail_mask.where(df[feature_col].notna(), np.nan)

            # Plot: histogram
            histogram = go.Histogram(
                x=values,
                nbinsx=100,
                marker_color=settings['color'],
                opacity=0.5,
                showlegend=False
            )
            all_traces.append([histogram])

            # Plot: hover. One marker per image, all at y=0.1, so hovering shows
            # which well/site produced a given value. customdata is stacked into
            # a single-column 2-D array so that %{customdata[0]} resolves.
            scatter = go.Scatter(
                x=values,
                y=[0.1] * len(values),
                mode='markers',
                marker=dict(color='lightgrey'),
                customdata=np.stack([wells + '_S' + site], axis=-1),
                hovertemplate="Well: %{customdata[0]}<br>Value: %{x:.2f}<extra></extra>",
                showlegend=False
            )
            all_traces[-1].append(scatter)

            # Plot: thresholds
            # NOTE: the vertical lines span a fixed y range of 0-100; they do not
            # follow the histogram's actual peak height.
            if upper_thresh is not None:
                all_traces[-1].append(go.Scatter(
                    x=[upper_thresh, upper_thresh],
                    y=[0, 100],
                    mode='lines',
                    line=dict(color='red', width=2, dash='dot'),
                    name='Upper Threshold',
                    showlegend=False
                ))
            # No threshold line is drawn for the ImageQuality_PercentMax group.
            if lower_thresh is not None and group_prefix != 'ImageQuality_PercentMax':
                all_traces[-1].append(go.Scatter(
                    x=[lower_thresh, lower_thresh],
                    y=[0, 100],
                    mode='lines',
                    line=dict(color='red', width=2, dash='dot'),
                    name='Lower Threshold',
                    showlegend=False
                ))

            all_titles.append(f"{feature_col} / {time}")

    # make_subplots rejects rows=0, so a table without any matching feature
    # column would abort the whole run. Nothing was added to df in that case,
    # so there is nothing to plot or upload either.
    if not all_traces:
        print(f"⚠️ No QC feature columns found for {time}; skipping plots and upload")
        continue

    # Create subplot: one row per feature column
    fig = make_subplots(rows=len(all_traces), cols=1, subplot_titles=all_titles)
    for i, trace_group in enumerate(all_traces):
        for trace in trace_group:
            fig.add_trace(trace, row=i + 1, col=1)

    fig.update_layout(
        height=430 * len(all_traces),
        width=800,
        title_text=f"Feature Distributions with QC for {time}",
        template='simple_white',
        barmode='overlay'
    )

    # Save HTML (written to the current working directory)
    html_path = f"qc_plots_{time}.html"
    pio.write_html(fig, file=html_path, auto_open=False)
    print(f"💾 HTML saved: {html_path}")

    # Save modified CSV to S3.
    # NOTE: this overwrites the Image.csv object that was just read.
    csv_buffer = StringIO()
    df.to_csv(csv_buffer, index=False)
    s3.put_object(
        Bucket=bucket_name,
        Key=features_key,
        # Encode explicitly so the upload mirrors the UTF-8 decode done on read
        Body=csv_buffer.getvalue().encode('utf-8')
    )
    print(f"✅ Updated CSV with QC columns uploaded: {features_key}")


📦 Processing kit: 6h
✅ Loaded IRIC/CQDM_CTL_Plate_Validation_202501/Plate_1/6h/Image.csv
💾 HTML saved: qc_plots_6h.html
✅ Updated CSV with QC columns uploaded: IRIC/CQDM_CTL_Plate_Validation_202501/Plate_1/6h/Image.csv
📦 Processing kit: 12h
✅ Loaded IRIC/CQDM_CTL_Plate_Validation_202501/Plate_1/12h/Image.csv
💾 HTML saved: qc_plots_12h.html
✅ Updated CSV with QC columns uploaded: IRIC/CQDM_CTL_Plate_Validation_202501/Plate_1/12h/Image.csv
📦 Processing kit: 18h
✅ Loaded IRIC/CQDM_CTL_Plate_Validation_202501/Plate_1/18h/Image.csv
💾 HTML saved: qc_plots_18h.html
✅ Updated CSV with QC columns uploaded: IRIC/CQDM_CTL_Plate_Validation_202501/Plate_1/18h/Image.csv
📦 Processing kit: 24h_2
✅ Loaded IRIC/CQDM_CTL_Plate_Validation_202501/Plate_1/24h_2/Image.csv
💾 HTML saved: qc_plots_24h_2.html
✅ Updated CSV with QC columns uploaded: IRIC/CQDM_CTL_Plate_Validation_202501/Plate_1/24h_2/Image.csv
📦 Processing kit: 48h_2
✅ Loaded IRIC/CQDM_CTL_Plate_Validation_202501/Plate_1/48h_2/Image.csv
💾 HTML sa

np.int64(0)