# CLASSIFICATION - Choosing the best chemical per patient

In [194]:
import pandas as pd

# Load the two input tables from CSV files in the working directory:
# the drug-response table first, then the merged patient table.
# NOTE(review): the saved output shows a warning pointing at the
# final_merged.csv read; if it is a DtypeWarning, consider passing explicit
# dtype=... for the affected columns -- confirm against the full warning text.
valid_IC50s, merged_df = (
    pd.read_csv(csv_path)
    for csv_path in ("valid_IC50s_within_range.csv", "final_merged.csv")
)

  merged_df = pd.read_csv("final_merged.csv")


In [196]:
# Remove 'Unnamed: 0' (presumably an index column written by an earlier
# to_csv -- confirm) and 'N Points', which no cell shown here reads.
# Reassigning instead of inplace=True, together with errors='ignore', keeps
# this cell safe to re-run: a second execution no longer raises KeyError
# because the columns have already been dropped.
valid_IC50s = valid_IC50s.drop(columns=['Unnamed: 0', 'N Points'], errors='ignore')

In [198]:
import numpy as np

# Impute missing 'Race' values by sampling from the observed category
# frequencies, so imputed rows follow the same distribution as known rows.

# A dedicated, seeded generator makes the imputation reproducible under
# Restart Kernel -> Run All (the global np.random state is unseeded here).
race_rng = np.random.default_rng(42)

# Observed categories with their relative frequencies (NaN is excluded)
race_dist = merged_df['Race'].value_counts(normalize=True)

# Boolean mask of the rows where race is missing
missing_indices = merged_df['Race'].isna()

# Draw one category per missing row, weighted by the observed distribution
imputed_values = race_rng.choice(race_dist.index, size=missing_indices.sum(), p=race_dist.values)

# Assign the sampled values to the missing positions
merged_df.loc[missing_indices, 'Race'] = imputed_values


In [200]:
def T_stage_by_size(size):
    """Bucket a numeric tumor size into a stage code 0-3.

    Returns 0 for a size of exactly 0, 1 for (0, 20], 2 for (20, 50] and
    3 for anything above 50. Sizes that match no bucket (negative values,
    NaN) yield None.

    NOTE(review): the unit of `size` is not stated in this notebook; the
    20/50 cut-offs look like millimetres -- confirm against the data source.
    """
    if size == 0:
        return 0

    # (exclusive lower bound, inclusive upper bound, stage code)
    brackets = (
        (0, 20, 1),
        (20, 50, 2),
        (50, float("inf"), 3),
    )
    for lower, upper, stage in brackets:
        if lower < size <= upper:
            return stage

    # No bucket matched; comparisons with NaN are always False.
    return None

In [202]:
def _resolve_t_stage(row):
    """Return the row's recorded 'T Stage', or derive one from 'Tumor Size' when it is null."""
    recorded_stage = row['T Stage']
    if pd.isnull(recorded_stage):
        return T_stage_by_size(row['Tumor Size'])
    return recorded_stage


# Row-wise: prefer the recorded stage and fall back to the size-based bucket.
# Note the new column shares its name with the T_stage_by_size function.
merged_df['T_stage_by_size'] = merged_df.apply(_resolve_t_stage, axis=1)

In [204]:
# Number of rows that still have no T stage after the tumor-size fallback.
merged_df['T_stage_by_size'].isna().sum()

134

In [206]:
# Keep only the identifier and the patient-level features used downstream.
# NOTE(review): the imputation earlier in this notebook fills 'Race', but the
# column selected here is 'Race and Ethnicity' -- confirm which is intended.
columns = ['Patient ID', 'Age', 'Race and Ethnicity', 'T_stage_by_size']

# .copy() makes patients_df an independent frame right away, so adding or
# filling columns on it later cannot trigger SettingWithCopyWarning or be
# confused with a view of merged_df.
patients_df = merged_df[columns].copy()

In [208]:
# Preview the first rows of the patient feature table.
patients_df.head()

Unnamed: 0,Patient ID,Age,Race and Ethnicity,T_stage_by_size
0,Breast_MRI_001,41,2.0,2.0
1,Breast_MRI_001,41,2.0,2.0
2,Breast_MRI_002,38,2.0,2.0
3,Breast_MRI_003,62,1.0,2.0
4,Breast_MRI_003,62,1.0,2.0


In [210]:
# Display the drug-response table (one row per cell name / small molecule pair in the output below).
valid_IC50s

Unnamed: 0,Cell Name,Small Molecule Name,EC50 (uM)
0,BT-20,A-1210477,0.005488
1,BT-20,AZD7762,1.650602
2,BT-20,Bleomycin,0.754695
3,BT-20,Buparlisib,1.336570
4,BT-20,Cabozantinib,3.789538
...,...,...,...
821,T47D,Topotecan,0.006967
822,T47D,Torin2,0.004775
823,T47D,Trametinib,0.005605
824,T47D,Volasertib,0.033216


In [212]:
# ASK LUCAS FOR THIS DATA
# Placeholder: real patient-to-cell-line assignments are not available yet.
# NOTE(review): values are drawn per row, so repeated rows of the same
# Patient ID receive different cell lines; valid_IC50s identifies cell lines
# by name (e.g. 'BT-20'), not by integer -- a mapping will be needed.
patients_df = patients_df.copy()

# Seeded generator so the placeholder assignment is identical on every run
# instead of changing with each kernel restart.
cell_line_rng = np.random.default_rng(42)

# Assign a random integer between 1 and 34 (inclusive) for each row
patients_df['cell_line'] = cell_line_rng.integers(1, 35, size=len(patients_df))

In [214]:
# Display patients_df, which now includes the 'cell_line' column.
patients_df

Unnamed: 0,Patient ID,Age,Race and Ethnicity,T_stage_by_size,cell_line
0,Breast_MRI_001,41,2.0,2.0,28
1,Breast_MRI_001,41,2.0,2.0,20
2,Breast_MRI_002,38,2.0,2.0,2
3,Breast_MRI_003,62,1.0,2.0,20
4,Breast_MRI_003,62,1.0,2.0,21
...,...,...,...,...,...
9217,,69,,4.0,7
9218,,69,,4.0,10
9219,,69,,4.0,29
9220,,69,,4.0,26


In [216]:
# ASK AUSTIN FOR THIS DATA
# Placeholder imputation for 'Race and Ethnicity' until real values arrive.
# Missing entries are sampled from the observed code frequencies (the same
# approach this notebook uses for 'Race') rather than uniformly, so the
# filled rows do not distort the column's distribution.

# Seeded generator: the fill is reproducible across kernel restarts.
race_eth_rng = np.random.default_rng(42)

# Identify missing entries
missing_mask = patients_df['Race and Ethnicity'].isna()

# Relative frequency of each observed code (NaN is excluded by value_counts)
race_eth_dist = patients_df['Race and Ethnicity'].value_counts(normalize=True)

if race_eth_dist.empty:
    # Nothing observed to sample from: fall back to uniform integers from 1 to 7
    fill_values = race_eth_rng.integers(1, 8, size=missing_mask.sum())
else:
    fill_values = race_eth_rng.choice(
        race_eth_dist.index, size=missing_mask.sum(), p=race_eth_dist.values
    )

# Write the sampled codes into the missing positions only
patients_df.loc[missing_mask, 'Race and Ethnicity'] = fill_values

In [222]:
# TODO: ask the team to help impute the missing T-stage values, or drop those rows.
# TODO: maybe ask Austin whether the workflow used for Race can be reused for T-stage.
# Per-column count of missing values remaining in patients_df.
patients_df.isna().sum()

Patient ID            1750
Age                      0
Race and Ethnicity       0
T_stage_by_size        134
cell_line                0
dtype: int64

### Each patient needs a one-hot encoded version of the chemical for the cell line they represent...

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Train a classifier that predicts, per patient, the chemical with the lowest
# IC50 for the patient's cell line.
#
# NOTE(review): this cell has no execution count and does not match the frames
# built earlier in the notebook: `df_drug` is not defined in the cells shown,
# and the column names used here ('patient_id', 'IC50', 'chemical', 'gender',
# 'age') differ from those displayed earlier ('Patient ID', 'EC50 (uM)',
# 'Small Molecule Name', 'Age'; no gender column is shown). Confirm the
# intended schema before running.

# Step 1: Merge drug data into patient data
df = patients_df.merge(df_drug, on='cell_line')

# Step 2: For each patient, find the chemical with the lowest IC50
df['IC50'] = df['IC50'].astype(float)
best_rows = df.groupby('patient_id')['IC50'].idxmin()
best_chems = df.loc[best_rows, ['patient_id', 'chemical']]

# Step 3: Create the labeled dataset
# NOTE(review): patients_df shows several rows per patient ID, so this merge
# can put rows of one patient on both sides of the train/test split -- consider
# de-duplicating or a grouped split.
df_labeled = patients_df.merge(best_chems, on='patient_id', how='inner')

# Step 4: Encode categorical variables
le_gender = LabelEncoder()
le_cell = LabelEncoder()
le_chem = LabelEncoder()

df_labeled['gender_enc'] = le_gender.fit_transform(df_labeled['gender'])
df_labeled['cell_line_enc'] = le_cell.fit_transform(df_labeled['cell_line'])
df_labeled['chemical_enc'] = le_chem.fit_transform(df_labeled['chemical'])  # Target

# Step 5: Build features and target
# .copy() so X is an independent frame rather than a slice of df_labeled.
X = df_labeled[['age', 'gender_enc', 'cell_line_enc']].copy()
y = df_labeled['chemical_enc']

# Step 6: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = X_train.copy()
X_test = X_test.copy()

# Optional: Standardize age
# The scaler is fitted on the training rows only and then applied to the test
# rows; fitting on all of X before the split would leak test-set statistics.
scaler = StandardScaler()
X_train['age'] = scaler.fit_transform(X_train[['age']]).ravel()
X_test['age'] = scaler.transform(X_test[['age']]).ravel()

# Step 7: Train classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Step 8: Evaluate model
y_pred = clf.predict(X_test)

# Pass the full list of encoded labels explicitly: without `labels`,
# classification_report raises when a class is absent from the test split
# because the number of target_names no longer matches the classes it finds.
all_labels = list(range(len(le_chem.classes_)))
print(classification_report(y_test, y_pred, labels=all_labels, target_names=le_chem.classes_))
