# USE THIS INSTEAD:
# CLASSIFICATION - Choosing the best chemical per patient.

# LIMITATION NOTES TO EXPRESS TO AUDIENCE:

PROBLEM: 
- We have only 1 IC50 value per cell line x drug, so there is no variation in IC50 values within a cell line x drug pair. This means demographic information has little effect on our results.

FIX:
- CELL LINE IS A PROXY FOR THE PATIENT.
- <mark>IF WE ASSUME THE CELL LINE REPRESENTS THE PATIENT, AND WANT PATIENT DEMOGRAPHICS TO INFLUENCE DRUG RECOMMENDATION, THEN PATIENT DEMOGRAPHICS SHOULD BE THE MOST IMPORTANT INPUTS WHEN IMPUTING THE CELL LINE.</mark>
- CONSIDER OTHER FEATURES TO IMPROVE ACCURACY: BMI, MENOPAUSE STATUS

In [49]:
import pandas as pd
import random
# Load the drug-response table and the merged patient table.
# Both paths are relative, so they resolve against the current working directory.
# NOTE(review): the saved output echoes the final_merged.csv read line, which looks like a
# pandas warning (possibly DtypeWarning for mixed-type columns) — confirm and, if so,
# consider passing an explicit dtype for the affected columns.
valid_IC50s, merged_df = (
    pd.read_csv(csv_path)
    for csv_path in ("valid_IC50s_within_range.csv", "final_merged.csv")
)

  merged_df = pd.read_csv("final_merged.csv")


In [51]:
# Remove 'Unnamed: 0' (presumably a saved index column) and 'N Points' from the IC50 table.
# Rebinding the name instead of using inplace=True, together with errors='ignore', keeps
# this cell idempotent: running it a second time no longer raises KeyError because the
# columns are already gone.
valid_IC50s = valid_IC50s.drop(columns=['Unnamed: 0', 'N Points'], errors='ignore')

In [53]:
import numpy as np

# Impute missing 'Race' values by sampling from the distribution of the observed values,
# so the filled-in rows follow roughly the same category proportions as the known ones.

# Boolean mask (despite the name) marking rows whose race is missing
missing_indices = merged_df['Race'].isna()

# Relative frequency of each observed race category (value_counts skips NaN by default)
race_dist = merged_df['Race'].value_counts(normalize=True)

# One weighted random draw per missing row
imputed_values = np.random.choice(
    a=race_dist.index,
    p=race_dist.values,
    size=missing_indices.sum(),
)

# Write the draws into the missing positions
merged_df.loc[missing_indices, 'Race'] = imputed_values


In [55]:
def T_stage_by_size(size):
    """Translate a tumor size into a T-stage number.

    Size 0 maps to stage 0; sizes in (0, 20] map to 1; sizes in (20, 50] map to 2;
    anything above 50 maps to 3. The unit is whatever 'Tumor Size' is recorded in
    (presumably millimetres — TODO confirm).

    Returns None when the size is NaN or negative, i.e. when no stage can be derived.
    """
    if size == 0:
        return 0
    # NaN compares False against everything, so this guard catches NaN as well as negatives.
    if not size > 0:
        return None
    # Walk the inclusive upper bounds in increasing order; the first one that fits wins.
    for stage, upper_bound in ((1, 20), (2, 50)):
        if size <= upper_bound:
            return stage
    return 3

In [57]:
def _t_stage_or_size_estimate(row):
    """Return the row's recorded 'T Stage', or a size-derived stage when it is null."""
    recorded_stage = row['T Stage']
    if pd.notnull(recorded_stage):
        return recorded_stage
    return T_stage_by_size(row['Tumor Size'])


# Combined stage column: the recorded stage wins, tumor size is only the fallback.
merged_df['T_stage_by_size'] = merged_df.apply(_t_stage_or_size_estimate, axis=1)

In [59]:
# Count rows where the combined stage column is still null, i.e. there was no recorded
# 'T Stage' and T_stage_by_size() could not derive one from 'Tumor Size'.
merged_df['T_stage_by_size'].isna().sum()

134

In [61]:
# Patient-level columns carried forward into the patient table.
columns = ['Patient ID', 'Age', 'Race and Ethnicity', 'T_stage_by_size']
# Column subset of merged_df; rows are taken as-is (no de-duplication happens here).
patients_df = merged_df[columns]

In [63]:
# Preview the selected patient columns.
patients_df

Unnamed: 0,Patient ID,Age,Race and Ethnicity,T_stage_by_size
0,Breast_MRI_001,41,2.0,2.0
1,Breast_MRI_001,41,2.0,2.0
2,Breast_MRI_002,38,2.0,2.0
3,Breast_MRI_003,62,1.0,2.0
4,Breast_MRI_003,62,1.0,2.0
...,...,...,...,...
9217,,69,,4.0
9218,,69,,4.0
9219,,69,,4.0
9220,,69,,4.0


In [65]:
# Preview the drug-response table (cell line, small molecule, 'EC50 (uM)').
valid_IC50s

Unnamed: 0,Cell Name,Small Molecule Name,EC50 (uM)
0,BT-20,A-1210477,0.005488
1,BT-20,AZD7762,1.650602
2,BT-20,Bleomycin,0.754695
3,BT-20,Buparlisib,1.336570
4,BT-20,Cabozantinib,3.789538
...,...,...,...
821,T47D,Topotecan,0.006967
822,T47D,Torin2,0.004775
823,T47D,Trametinib,0.005605
824,T47D,Volasertib,0.033216


In [67]:
# Seed both random number generators for reproducibility.
# The random draws in this notebook go through np.random (np.random.choice and
# np.random.randint), which random.seed() does not affect, so NumPy's global generator
# has to be seeded explicitly as well.
# NOTE: cells executed before this one (e.g. the 'Race' imputation) are not covered by
# this seed; move this cell to the top if those draws must be reproducible too.
random.seed(42)
np.random.seed(42)

In [69]:
# ASK LUCAS FOR THIS DATA
# Work on an independent copy so adding a column does not touch merged_df.
patients_df = patients_df.copy()

# Assign a random cell-line code to each row. Codes run from 1 to the number of distinct
# cell lines in valid_IC50s (35 in the current data), which is the same 1-based numbering
# the notebook later stores in 'Cell_Name_Mapped'. Deriving the bound from the data
# replaces the hard-coded 36 and keeps the two in sync if the cell-line list changes.
n_cell_lines = len(valid_IC50s['Cell Name'].unique())
patients_df['cell_line'] = np.random.randint(1, n_cell_lines + 1, size=len(patients_df))

In [71]:
# Preview the patient table with the randomly assigned 'cell_line' column.
patients_df

Unnamed: 0,Patient ID,Age,Race and Ethnicity,T_stage_by_size,cell_line
0,Breast_MRI_001,41,2.0,2.0,13
1,Breast_MRI_001,41,2.0,2.0,5
2,Breast_MRI_002,38,2.0,2.0,4
3,Breast_MRI_003,62,1.0,2.0,18
4,Breast_MRI_003,62,1.0,2.0,31
...,...,...,...,...,...
9217,,69,,4.0,1
9218,,69,,4.0,21
9219,,69,,4.0,12
9220,,69,,4.0,9


In [73]:
# ASK AUSTIN FOR THIS DATA
# NOTE: this value_counts() result is not shown, because only the last expression of a
# cell is displayed.
patients_df['Race and Ethnicity'].value_counts()

# Rows with no recorded race/ethnicity code
missing_mask = patients_df['Race and Ethnicity'].isna()

# Fill each missing entry with a uniformly random integer code from 1 to 7
# (the upper bound 8 is exclusive).
random_codes = np.random.randint(low=1, high=8, size=missing_mask.sum())
patients_df.loc[missing_mask, 'Race and Ethnicity'] = random_codes

In [75]:
# TODO: ask the team to help impute the missing T-stage values, or drop those rows.
# TODO: maybe ask Austin whether the workflow used for race can be reused for T-stage.
# Number of null values in each column of the patient table.
patients_df.isna().sum()

Patient ID            1750
Age                      0
Race and Ethnicity       0
T_stage_by_size        134
cell_line                0
dtype: int64

In [77]:
# Distinct cell-line names, in order of first appearance
unique_cell_lines = valid_IC50s['Cell Name'].unique()

# Number the cell lines consecutively starting at 1 (name -> integer code)
cell_line_map = dict(zip(unique_cell_lines, range(1, len(unique_cell_lines) + 1)))

# Preview the result
print(cell_line_map)

# Attach the integer code to every row of the IC50 table
valid_IC50s['Cell_Name_Mapped'] = valid_IC50s['Cell Name'].map(cell_line_map)

{'BT-20': 1, 'BT-549': 2, 'CAL-120': 3, 'CAL-51': 4, 'CAL-85-1': 5, 'CAMA-1': 6, 'HCC1143': 7, 'HCC1395': 8, 'HCC1419': 9, 'HCC1428': 10, 'HCC1500': 11, 'HCC1806': 12, 'HCC1937': 13, 'HCC1954': 14, 'HCC38': 15, 'HCC70': 16, 'HME1': 17, 'Hs 578T': 18, 'MCF 10A': 19, 'MCF7': 20, 'MDA-MB-134-VI': 21, 'MDA-MB-157': 22, 'MDA-MB-231': 23, 'MDA-MB-361': 24, 'MDA-MB-436': 25, 'MDA-MB-453': 26, 'MDA-MB-468': 27, 'PDX1258': 28, 'PDX1328': 29, 'PDXHCI002': 30, 'SK-BR-3': 31, 'SUM1315MO2': 32, 'SUM149PT': 33, 'SUM159PT': 34, 'T47D': 35}


In [79]:
# Confirm the 'Cell_Name_Mapped' column was added to the IC50 table.
valid_IC50s

Unnamed: 0,Cell Name,Small Molecule Name,EC50 (uM),Cell_Name_Mapped
0,BT-20,A-1210477,0.005488,1
1,BT-20,AZD7762,1.650602,1
2,BT-20,Bleomycin,0.754695,1
3,BT-20,Buparlisib,1.336570,1
4,BT-20,Cabozantinib,3.789538,1
...,...,...,...,...
821,T47D,Topotecan,0.006967,35
822,T47D,Torin2,0.004775,35
823,T47D,Trametinib,0.005605,35
824,T47D,Volasertib,0.033216,35


In [81]:
# Pair every patient row with all drug rows of its assigned cell line
# (patients_df['cell_line'] == valid_IC50s['Cell_Name_Mapped']). The inner join drops
# rows on either side that have no match.
patient_drug_df = patients_df.merge(
    valid_IC50s,
    how='inner',
    left_on='cell_line',
    right_on='Cell_Name_Mapped',
)

# Display the merged dataframe
patient_drug_df

Unnamed: 0,Patient ID,Age,Race and Ethnicity,T_stage_by_size,cell_line,Cell Name,Small Molecule Name,EC50 (uM),Cell_Name_Mapped
0,Breast_MRI_001,41,2.0,2.0,13,HCC1937,A-1210477,5.938855,13
1,Breast_MRI_001,41,2.0,2.0,13,HCC1937,AZD7762,0.182102,13
2,Breast_MRI_001,41,2.0,2.0,13,HCC1937,Bleomycin,0.143103,13
3,Breast_MRI_001,41,2.0,2.0,13,HCC1937,Buparlisib,2.540692,13
4,Breast_MRI_001,41,2.0,2.0,13,HCC1937,Cisplatin,1.138559,13
...,...,...,...,...,...,...,...,...,...
217253,,69,3.0,4.0,33,SUM149PT,Tivantinib,0.321572,33
217254,,69,3.0,4.0,33,SUM149PT,Torin2,7.532754,33
217255,,69,3.0,4.0,33,SUM149PT,Trametinib,0.006241,33
217256,,69,3.0,4.0,33,SUM149PT,Volasertib,0.013809,33


In [87]:
# Sanity check: one patient / cell-line pair should yield one row per drug available
# for that cell line.
is_first_patient = patient_drug_df['Patient ID'].eq('Breast_MRI_001')
on_cell_line_13 = patient_drug_df['cell_line'].eq(13)
condition = is_first_patient & on_cell_line_13

selected_rows = patient_drug_df[condition]
print(selected_rows.shape)
selected_rows

(23, 9)


Unnamed: 0,Patient ID,Age,Race and Ethnicity,T_stage_by_size,cell_line,Cell Name,Small Molecule Name,EC50 (uM),Cell_Name_Mapped
0,Breast_MRI_001,41,2.0,2.0,13,HCC1937,A-1210477,5.938855,13
1,Breast_MRI_001,41,2.0,2.0,13,HCC1937,AZD7762,0.182102,13
2,Breast_MRI_001,41,2.0,2.0,13,HCC1937,Bleomycin,0.143103,13
3,Breast_MRI_001,41,2.0,2.0,13,HCC1937,Buparlisib,2.540692,13
4,Breast_MRI_001,41,2.0,2.0,13,HCC1937,Cisplatin,1.138559,13
5,Breast_MRI_001,41,2.0,2.0,13,HCC1937,Dasatinib,0.097868,13
6,Breast_MRI_001,41,2.0,2.0,13,HCC1937,Dinaciclib,0.021468,13
7,Breast_MRI_001,41,2.0,2.0,13,HCC1937,Doxorubicin,0.012733,13
8,Breast_MRI_001,41,2.0,2.0,13,HCC1937,Etoposide,0.493682,13
9,Breast_MRI_001,41,2.0,2.0,13,HCC1937,Everolimus,0.008601,13


### Each patient needs a one hot encoded version of the chemical for the cell line they represent...

In [91]:
# Model: Recommend drugs based on ONLY patient's information:

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
import pandas as pd

# NOTE(review): patient_drug_df pairs every patient row with every drug of its cell line,
# so identical feature vectors carry many different drug labels. The accuracy recorded in
# the saved output (~0.0005) is consistent with that. If the goal is "best drug per
# patient", the label probably needs to be derived from EC50 — confirm the intended target.

# Drop rows missing any model input. .copy() makes df an independent frame, so adding the
# 'drug_id' column below cannot raise SettingWithCopyWarning.
df = patient_drug_df.dropna(
    subset=['Age', 'Race and Ethnicity', 'cell_line', 'Small Molecule Name']
).copy()

# Encode drugs as labels (drug name -> numeric).
# OrdinalEncoder returns a 2-D float array; flatten and cast it so the labels are plain
# integer class ids.
drug_encoder = OrdinalEncoder()
df['drug_id'] = drug_encoder.fit_transform(df[['Small Molecule Name']]).ravel().astype(int)

# Features (excluding drug name)
features = ['Age', 'Race and Ethnicity', 'cell_line']
target = 'drug_id'

X = df[features]
y = df[target]

# Split BEFORE scaling so the scaler's mean/variance are estimated from training rows only
# and no information from the test rows leaks into preprocessing.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training split, then apply the same transform to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so any later cell that still expects the fully scaled feature matrix keeps working.
X_scaled = scaler.transform(X)

# Train a classifier (e.g., RandomForest)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Evaluate model performance
print("Model Accuracy:", clf.score(X_test, y_test))


Model Accuracy: 0.00046027800791678176


In [103]:
def recommend_multiple_drugs(patient_features, model, scaler, drug_encoder, top_n=5):
    """Return the most probable drugs for one patient, best first.

    Parameters
    ----------
    patient_features : sequence of numbers
        Feature values for a single patient, in the same order the scaler and model
        were fitted on.
    model : fitted classifier
        Must expose ``predict_proba`` and a ``classes_`` array.
    scaler : fitted transformer
        Must expose ``transform``; applied to the patient features before prediction.
    drug_encoder : fitted encoder
        Must expose ``inverse_transform`` mapping a 2-D array of drug ids to drug names.
    top_n : int, default 5
        Maximum number of recommendations. Values larger than the number of classes
        known to the model are clamped; values <= 0 yield an empty list.

    Returns
    -------
    list of (drug_name, probability) tuples, ordered by descending probability.
    """
    # Standardize the input data (patient features)
    X_input = scaler.transform([patient_features])

    # Predict probabilities for all drugs the model was trained on
    prob_predictions = model.predict_proba(X_input)[0]  # Shape: (num_classes,)

    # Never request more entries than there are classes; the original indexed past the
    # end of the result (IndexError) when top_n exceeded the class count.
    n_select = max(0, min(top_n, len(prob_predictions)))
    if n_select == 0:
        return []

    # Positions of the highest probabilities, best first
    top_indices = prob_predictions.argsort()[::-1][:n_select]

    # Column j of predict_proba belongs to model.classes_[j], which is not necessarily j
    # (e.g. when a drug id never appeared in the training split). Translate positions to
    # the real encoded drug ids before decoding them to names.
    top_class_ids = model.classes_[top_indices]

    # inverse_transform expects a 2-D array with one column
    top_drugs = drug_encoder.inverse_transform(top_class_ids.reshape(-1, 1))

    # Pair each drug name with its predicted probability
    return [
        (top_drugs[rank][0], prob_predictions[position])
        for rank, position in enumerate(top_indices)
    ]


In [105]:
# Example input: [Age, Race, Cell Line] — the same order as the model's feature columns
new_patient_features = [45, 2, 12]  # Replace with actual patient data

recommended_drugs = recommend_multiple_drugs(
    patient_features=new_patient_features,
    model=clf,
    scaler=scaler,
    drug_encoder=drug_encoder,
    top_n=5,
)

# One line per recommendation: drug name and predicted probability (4 decimals)
print("\nTop 5 recommended drugs for the patient:")
for recommendation in recommended_drugs:
    drug, prob = recommendation
    print(f"{drug}: {prob:.4f}")



Top 5 recommended drugs for the patient:
Taxol: 0.0532
Topotecan: 0.0523
Cisplatin: 0.0519
Doxorubicin: 0.0517
ABT-737: 0.0506




# LIMITATION NOTES TO EXPRESS TO AUDIENCE:

PROBLEM: 
- We have only 1 IC50 value per cell line x drug, so there is no variation in IC50 values within a cell line x drug pair. This means demographic information has little effect on our results.

FIX:
- CELL LINE IS A PROXY FOR THE PATIENT.
- <mark>IF WE ASSUME THE CELL LINE REPRESENTS THE PATIENT, AND WANT PATIENT DEMOGRAPHICS TO INFLUENCE DRUG RECOMMENDATION, THEN PATIENT DEMOGRAPHICS SHOULD BE THE MOST IMPORTANT INPUTS WHEN IMPUTING THE CELL LINE.</mark>
- INCLUDE THE FOLLOWING FEATURES TO IMPROVE ACCURACY: BMI, MENOPAUSE STATUS
