In [9]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import numpy as np
from scipy.optimize import curve_fit
import warnings

In [None]:
Harvard_df = pd.read_csv('data/HarvardDataSet.csv')

In [21]:
Harvard_df

Unnamed: 0,Cell HMS LINCS ID,Cell Name,Small Molecule HMS LINCS ID,Small Molecule Name,Small Mol Concentration (uM),Primary Target,Pathway,Mean Normalized Growth Rate Inhibition Value,Increased Fraction Dead
0,50211-2,HCC1806,10390-103-1,Abemaciclib,0.001000,CDK4/6,Cell cycle,0.9779,0.00362
1,50211-2,HCC1806,10390-103-1,Abemaciclib,0.003162,CDK4/6,Cell cycle,0.9667,-0.00301
2,50211-2,HCC1806,10390-103-1,Abemaciclib,0.010000,CDK4/6,Cell cycle,0.9168,0.00484
3,50211-2,HCC1806,10390-103-1,Abemaciclib,0.031623,CDK4/6,Cell cycle,0.7658,0.01650
4,50211-2,HCC1806,10390-103-1,Abemaciclib,0.100000,CDK4/6,Cell cycle,0.7132,0.01025
...,...,...,...,...,...,...,...,...,...
10705,51083-2,SUM159PT,10194-106-1,Cabozantinib,0.100000,VEGFR2/MET,RTK,1.0115,-0.00152
10706,51083-2,SUM159PT,10194-106-1,Cabozantinib,0.316230,VEGFR2/MET,RTK,0.9965,0.00568
10707,51083-2,SUM159PT,10194-106-1,Cabozantinib,1.000000,VEGFR2/MET,RTK,0.9307,0.03130
10708,51083-2,SUM159PT,10194-106-1,Cabozantinib,3.162300,VEGFR2/MET,RTK,0.7480,0.08743


In [27]:
Harvard_subdf_IC50 = Harvard_df[["Cell Name", "Small Molecule Name", "Small Mol Concentration (uM)", "Mean Normalized Growth Rate Inhibition Value"]]

In [29]:
Harvard_subdf_IC50

Unnamed: 0,Cell Name,Small Molecule Name,Small Mol Concentration (uM),Mean Normalized Growth Rate Inhibition Value
0,HCC1806,Abemaciclib,0.001000,0.9779
1,HCC1806,Abemaciclib,0.003162,0.9667
2,HCC1806,Abemaciclib,0.010000,0.9168
3,HCC1806,Abemaciclib,0.031623,0.7658
4,HCC1806,Abemaciclib,0.100000,0.7132
...,...,...,...,...
10705,SUM159PT,Cabozantinib,0.100000,1.0115
10706,SUM159PT,Cabozantinib,0.316230,0.9965
10707,SUM159PT,Cabozantinib,1.000000,0.9307
10708,SUM159PT,Cabozantinib,3.162300,0.7480


In [51]:
Harvard_subdf_IC50.nunique()

Cell Name                                         35
Small Molecule Name                               34
Small Mol Concentration (uM)                      14
Mean Normalized Growth Rate Inhibition Value    6808
dtype: int64

In [53]:
Harvard_subdf_IC50["Cell Name"].value_counts()

Cell Name
HCC1806          306
HCC1143          306
HCC1419          306
HCC1500          306
HCC1937          306
HCC1954          306
CAL-120          306
CAL-85-1         306
PDXHCI002        306
CAL-51           306
SUM149PT         306
T47D             306
CAMA-1           306
HCC1428          306
HME1             306
SUM1315MO2       306
HCC1395          306
MDA-MB-468       306
Hs 578T          306
HCC38            306
MCF 10A          306
MCF7             306
MDA-MB-231       306
SK-BR-3          306
BT-20            306
BT-549           306
HCC70            306
MDA-MB-453       306
PDX1258          306
PDX1328          306
MDA-MB-134-VI    306
MDA-MB-157       306
MDA-MB-361       306
MDA-MB-436       306
SUM159PT         306
Name: count, dtype: int64

In [39]:
import pandas as pd
import numpy as np
from scipy.optimize import curve_fit
import warnings
from typing import Tuple, Optional
from tqdm import tqdm

class BatchEC50Calculator:
    def __init__(self):
        pass

    @staticmethod
    def four_param_logistic(x, bottom, top, logEC50, hill_slope):
        """4-parameter logistic function (sigmoid)."""
        return bottom + (top - bottom) / (1 + 10**((logEC50 - np.log10(x)) * hill_slope))

    def calculate_ec50_single(self, concentrations, responses) -> Tuple[Optional[float], list]:
        """
        Fit a 4PL curve to a single concentration-response series.
        Returns:
            ec50: float (in µM) or None if fit fails
            popt: list of fitted parameters
        """
        concentrations = np.array(concentrations)
        responses = np.array(responses)

        if np.std(responses) < 1e-3 or len(np.unique(responses)) < 4:
            return None, [None] * 4

        p0 = [min(responses), max(responses), np.log10(np.median(concentrations)), 1.0]

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            try:
                popt, _ = curve_fit(self.four_param_logistic, concentrations, responses, p0=p0, maxfev=10000)
                ec50 = 10 ** popt[2]  # logEC50
            except RuntimeError:
                return None, [None] * 4

        return ec50, popt


    def calculate_all_ec50(self, df: pd.DataFrame, min_points: int = 4) -> pd.DataFrame:
        """
        Calculate EC50 for each (Cell Name, Small Molecule Name) pair in the dataframe.
        min_points: Minimum number of data points required to fit the curve.
        Returns:
            DataFrame with columns: Cell Name, Small Molecule Name, EC50
        """
        results = []

        grouped = df.groupby(["Cell Name", "Small Molecule Name"])
        print(f"Total unique cell-drug pairs: {len(grouped)}")

        for (cell, drug), group in tqdm(grouped, desc="Fitting curves"):
            concentrations = group["Small Mol Concentration (uM)"].values
            responses = group["Mean Normalized Growth Rate Inhibition Value"].values

            # Drop NaNs or invalid values
            mask = ~np.isnan(concentrations) & ~np.isnan(responses)
            concentrations = concentrations[mask]
            responses = responses[mask]

            if len(concentrations) < min_points:
                continue

            ec50, _ = self.calculate_ec50_single(concentrations, responses)
            results.append({
                "Cell Name": cell,
                "Small Molecule Name": drug,
                "EC50 (uM)": ec50,
                "N Points": len(concentrations),
            })

        return pd.DataFrame(results)

In [43]:
Harvard_subdf_IC50["Cell Name"] = Harvard_subdf_IC50["Cell Name"].str.strip()
Harvard_subdf_IC50["Small Molecule Name"] = Harvard_subdf_IC50["Small Molecule Name"].str.strip()

print("Expected:", 35 * 34)
print("Actual:", Harvard_subdf_IC50.groupby(["Cell Name", "Small Molecule Name"]).ngroups)

Expected: 1190
Actual: 1190


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Harvard_subdf_IC50["Cell Name"] = Harvard_subdf_IC50["Cell Name"].str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Harvard_subdf_IC50["Small Molecule Name"] = Harvard_subdf_IC50["Small Molecule Name"].str.strip()


In [45]:
# Assuming your dataframe is named `df`:
# Columns: Cell Name, Small Molecule Name, Small Mol Concentration (uM), Mean Normalized Growth Rate Inhibition Value

calculator = BatchEC50Calculator()
ec50_results = calculator.calculate_all_ec50(Harvard_subdf_IC50)

print(ec50_results.head())


Total unique cell-drug pairs: 1190


Fitting curves: 100%|██████████████████████| 1190/1190 [00:06<00:00, 185.19it/s]

  Cell Name Small Molecule Name     EC50 (uM)  N Points
0     BT-20           A-1210477  5.487963e-03         9
1     BT-20             ABT-737  1.078987e+09         9
2     BT-20             AZD7762  1.650602e+00         9
3     BT-20         Abemaciclib  4.558122e-64         9
4     BT-20           Alpelisib  1.743564e+08         9





In [47]:
ec50_results

Unnamed: 0,Cell Name,Small Molecule Name,EC50 (uM),N Points
0,BT-20,A-1210477,5.487963e-03,9
1,BT-20,ABT-737,1.078987e+09,9
2,BT-20,AZD7762,1.650602e+00,9
3,BT-20,Abemaciclib,4.558122e-64,9
4,BT-20,Alpelisib,1.743564e+08,9
...,...,...,...,...
1185,T47D,Topotecan,6.966630e-03,9
1186,T47D,Torin2,4.774638e-03,9
1187,T47D,Trametinib,5.604863e-03,9
1188,T47D,Volasertib,3.321583e-02,9
