In [1]:
import pandas as pd 
import numpy as np
from scipy import stats
from sklearn.cross_decomposition import PLSRegression

import matplotlib.pyplot as plt
%matplotlib inline

import mahalanobis as maha

In [2]:
# Parameters from the analysis in `Covid Confirm.ipynb`

# HIGHEST_CONC: rows with `conc` at or above this value are dropped right after loading.
HIGHEST_CONC = 30   # [muM] – the highest concentration in the confirm experiments
# N_COMPONENTS: used as the chi-squared degrees of freedom for the "Non-infected" bands in the plot.
N_COMPONENTS = 18   # effective dimension for the non-infected distribution
# THRESH: a compound is kept only if its smallest distance does not exceed this value.
THRESH = 15 # distance to non-infected from DMSO

In [3]:
# Load the semicolon-separated table of measurements. Later cells read its
# `conc`, `name`, `batch_id` and `distance` columns.
# NOTE(review): presumably written by `Covid Confirm.ipynb` — confirm the provenance.
df = pd.read_csv('output/Covid_confirm_ranked.csv', sep=';')

Remove measurements at the highest dose (30 µM).

In [4]:
# Keep only the rows measured strictly below the top concentration.
df = df.loc[df['conc'].lt(HIGHEST_CONC)]

Remove irrelevant items:
- Non-infected
- DMSO
- Antiviral control (Remdesivir, Nirmatrelvir, GC376)
- CP control
- Single-dose measurements (?) — not removed at present: the corresponding filter in the code below is commented out

In [5]:
# Load the compound annotations: only the batch id and the arm are read, and
# both columns are renamed on the fly. Per the original note, the arm encodes
# the type of drug (primary hit, secondary hit, or analog).
subset = pd.read_csv(
    "data/metadata_annotations.csv",
    usecols=["CBCSbatch_id", "arm"],
).rename(columns={"CBCSbatch_id": "batch_id", "arm": "type"})

# Batch ids belonging to each of the two control arms
antiviral_control_id = subset.loc[subset['type'].eq('antiviral_control'), 'batch_id'].to_list()
cp_control_id = subset.loc[subset['type'].eq('cp_control'), 'batch_id'].to_list()

In [6]:
# Rows identified by name that are not candidate compounds
irrelevant_names = ['non-inf', 'DMSO']

# One combined mask: drop the named reference rows and every row whose batch id
# belongs to either control arm (antiviral or CP).
df_filtered = df[
    ~df['name'].isin(irrelevant_names)
    & ~df['batch_id'].isin(antiviral_control_id + cp_control_id)
]

# Single dose measurements are NOT removed; the filter below is disabled.
# df_filtered = df[df.duplicated('batch_id')].copy()

In [7]:
# Build the pool of selected measurements, one batch (compound) at a time.
#
# For each batch the rows are ordered by concentration and the dose with the
# smallest distance is located. If even that distance exceeds THRESH the batch
# is skipped; otherwise the best dose is kept together with the dose directly
# below it (when there is one).
#
# A batch with a single measurement needs no special case: its only row is the
# minimum and has no lower neighbour. The previous special case tested
# `np.count_nonzero(with_id) == 1`, which counts non-zero *cells* of the whole
# frame rather than rows, so it never matched a real single-row batch.
#
# The pieces are collected in a list and concatenated once, instead of growing
# a DataFrame inside the loop. `groupby(..., sort=False)` visits the batches in
# order of first appearance, like iterating over `.unique()`; rows with a
# missing `batch_id` are left out by groupby.
selected_parts = []

for id_, with_id in df_filtered.groupby('batch_id', sort=False):
    with_id = with_id.sort_values(by='conc')

    sorted_distances = with_id['distance'].to_numpy()
    argmin_dist = int(np.argmin(sorted_distances))
    if sorted_distances[argmin_dist] > THRESH:
        continue

    # Positional slice: the neighbouring lower dose (if any) plus the best dose.
    first_kept = max(argmin_dist - 1, 0)
    selected_parts.append(with_id.iloc[first_kept:argmin_dist + 1])

# Fall back to an empty frame that still has the input columns, so later cells
# that read `selected['batch_id']` etc. keep working when nothing passes.
selected = pd.concat(selected_parts) if selected_parts else df_filtered.iloc[0:0]

In [8]:
import plotly.express as px
import plotly.graph_objects as go


def plot_compounds_interactive(compounds, dmso=None):
    """Show an interactive plot of distance against concentration, one line per compound name.

    Both axes are logarithmic. Shaded bands for the non-infected reference are
    drawn from the chi-squared distribution with ``N_COMPONENTS`` degrees of
    freedom (a module-level constant), and an optional grey band marks the DMSO
    reference.

    Parameters
    ----------
    compounds : pandas.DataFrame
        Must contain the columns ``batch_id``, ``name``, ``conc``, ``distance``,
        ``distance_q_75`` and ``count_nuclei``. If ``distance_q_25`` is present
        it is used for the lower error bar. Rows whose ``batch_id`` or ``name``
        is ``'non-inf'`` or ``'DMSO'`` are not drawn as lines.
    dmso : pandas.DataFrame, optional
        If given and non-empty, its first row supplies ``distance``,
        ``distance_q_25`` and ``distance_q_75`` for the DMSO band.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    # Reference rows are recognised by either identifier column; elsewhere in
    # this notebook they are filtered by `name`, so checking `batch_id` alone
    # could let them through.
    reference_labels = ['non-inf', 'DMSO']
    is_reference = (
        compounds['batch_id'].isin(reference_labels)
        | compounds['name'].isin(reference_labels)
    )
    drugs = compounds[~is_reference]

    # Legend order: compound names in order of first appearance, as a real list.
    name_order = list(dict.fromkeys(drugs['name']))

    # Asymmetric error bars: upper from the q_75 column, lower from the q_25
    # column, matching how the DMSO band below is built. Fall back to q_75 when
    # the table has no q_25 column.
    # NOTE(review): assumes both columns hold offsets from `distance` rather
    # than absolute quantiles (the DMSO band adds/subtracts them) — confirm.
    lower_error = 'distance_q_25' if 'distance_q_25' in drugs.columns else 'distance_q_75'

    fig = px.line(
        drugs.sort_values(by="conc"), x="conc", y="distance",
        color='name',
        markers=True,
        width=1200, height=400,
        error_y='distance_q_75', error_y_minus=lower_error,
        category_orders={"name": name_order},
        hover_name="name",
        hover_data={"name": False, "count_nuclei": ':.1f', "conc": True, "distance": ':.3f'},
    )

    # Healthy cells - confidence intervals
    # The chi-squared interval is square-rooted before plotting.
    # NOTE(review): presumably because the squared distance of non-infected
    # cells is modelled as chi-squared — confirm.
    for i, conf in enumerate([0.5, 0.75, 0.95]):
        # Only the first (narrowest) band carries the text label.
        label = dict(
            text="Non-infected", font=dict(size=15, color="black"), textposition="middle left"
        ) if i == 0 else None

        left, right = stats.chi2.interval(confidence=conf, df=N_COMPONENTS)
        fig.add_hrect(
            y0=np.sqrt(left), y1=np.sqrt(right),
            # wider intervals are drawn fainter
            fillcolor="royalblue", opacity=0.8*(1-conf)**0.5,
            layer="below", line_width=0, label=label
        )

    # DMSO
    if dmso is not None and len(dmso) > 0:
        median = dmso['distance'].iloc[0]
        perc_25 = dmso['distance_q_25'].iloc[0]
        perc_75 = dmso['distance_q_75'].iloc[0]
        fig.add_hrect(
            y0=median-perc_25, y1=median+perc_75,
            fillcolor="grey", opacity=0.25,
            layer="below", line_width=0,
            label=dict(
                text="DMSO", font=dict(size=15, color="black"), textposition="middle left"
            )
        )

    # One tick per concentration that occurs in the input table.
    conc = np.unique(compounds['conc'])
    fig.update_xaxes(title_text="concentration", gridcolor='lightgrey', type="log", tickvals=conc)
    fig.update_yaxes(title_text="distance", gridcolor='lightgrey', type="log", tickvals=[1, 5, 10, 100])
    fig.update_layout(
        title_text="Distance to non-infected cell distribution",
        plot_bgcolor='white'
    )

    fig.show()


In [9]:
# Plot the selected measurements; `dmso` is not passed, so no DMSO band is drawn.
plot_compounds_interactive(selected)

In [10]:
# Write the selected pool to disk (semicolon-separated, without the index column)
selected.to_csv("output/initial_design/pool.csv", sep=';', index=False)

# Summary: number of rows, then number of distinct batch ids among them
print(f"Items in total: {selected.shape[0]},")
print("of which unique compounds:", selected['batch_id'].nunique())

Items in total: 942,
of which unique compounds: 599
