<a href="https://colab.research.google.com/github/PutriBalqis134/FYP-HNSCC/blob/main/FYP_Balqis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# %pip install -q google-generativeai  # uncomment only if the package is missing from the runtime

In [2]:
import google.generativeai as genai
from google.colab import userdata # Import userdata to access Colab Secrets

# Read the API key from Colab Secrets (so it is never hardcoded in the
# notebook) and pass it to the google-generativeai client configuration.
# NOTE(review): no later cell visible here uses `genai` — confirm this setup is still needed.
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split # Ensure this is imported
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import re # Import regex for string cleaning


In [3]:
# Mount Google Drive first: every dataset path below lives under /content/drive,
# and on a fresh Colab runtime nothing is mounted yet, so the read would fail
# with FileNotFoundError. Calling mount on an already-mounted drive only prints
# a notice.
from google.colab import drive
drive.mount('/content/drive')

# Load mRNA expression data.
# The first CSV column becomes the row index and is labelled 'DepMap_ID' so that
# later cells can merge on that name.
# NOTE(review): assumes the first column holds the ACH-xxxxxx cell-line IDs, as
# the displayed head suggests — confirm against the source file.
expression_df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/Datasets/CCLE_expression.csv',
    index_col=0,
)
expression_df.index.name = 'DepMap_ID'
print("mRNA Expression Data Head:")
display(expression_df.head())
print(f"mRNA Expression Data Shape: {expression_df.shape}\n")

mRNA Expression Data Head:


Unnamed: 0_level_0,TSPAN6 (7105),TNMD (64102),DPM1 (8813),SCYL3 (57147),C1orf112 (55732),FGR (2268),CFH (3075),FUCA2 (2519),GCLC (2729),NFYA (4800),...,H3C2 (8358),H3C3 (8352),AC098582.1 (8916),DUS4L-BCAP29 (115253422),C8orf44-SGK3 (100533105),ELOA3B (728929),NPBWR1 (2831),ELOA3D (100506888),ELOA3 (162699),CDR1 (1038)
DepMap_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ACH-001113,4.331992,0.0,7.364397,2.792855,4.470537,0.028569,1.226509,3.042644,6.499686,4.739848,...,2.689299,0.189034,0.201634,2.130931,0.555816,0.0,0.275007,0.0,0.0,0.0
ACH-001289,4.566815,0.584963,7.106537,2.543496,3.50462,0.0,0.189034,3.813525,4.221104,3.481557,...,1.286881,1.049631,0.321928,1.464668,0.632268,0.0,0.014355,0.0,0.0,0.0
ACH-001339,3.15056,0.0,7.379032,2.333424,4.227279,0.056584,1.31034,6.687061,3.682573,3.273516,...,0.594549,1.097611,0.831877,2.946731,0.475085,0.0,0.084064,0.0,0.0,0.042644
ACH-001538,5.08534,0.0,7.154109,2.545968,3.084064,0.0,5.868143,6.165309,4.489928,3.956986,...,0.214125,0.632268,0.298658,1.641546,0.443607,0.0,0.028569,0.0,0.0,0.0
ACH-000242,6.729145,0.0,6.537607,2.456806,3.867896,0.799087,7.208381,5.569856,7.127014,4.568032,...,1.117695,2.358959,0.084064,1.910733,0.0,0.0,0.464668,0.0,0.0,0.0


mRNA Expression Data Shape: (1406, 19221)



In [None]:


# Load drug response data (logIC50)
drug_response_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Datasets/PANCANCER_IC_Fri Oct 24 08_57_14 2025.csv')
print("Drug Response Data Head:")
display(drug_response_df.head())
print(f"Drug Response Data Shape: {drug_response_df.shape}\n")

# Load sample info for cell line ID mapping
sample_info_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Datasets/sample_info.csv')
print("Sample Info Data Head:")
display(sample_info_df.head())
print(f"Sample Info Data Shape: {sample_info_df.shape}\n")

# Rename 'Cell Line Name' to 'CCLE_Name' in drug_response_df and 'IC50' to 'logIC50'
drug_response_df = drug_response_df.rename(columns={'Cell Line Name': 'CCLE_Name', 'IC50': 'logIC50'})


def _standardize_cell_line_name(name):
    """Return `name` upper-cased with every non-alphanumeric character removed.

    Missing values, and names that are empty after cleaning, map to None so
    they can be dropped before merging. (A plain ``str(x)`` would turn a missing
    name into the literal key 'NAN', which could match spuriously.)
    """
    if pd.isna(name):
        return None
    cleaned = re.sub(r'[^a-zA-Z0-9]', '', str(name)).upper()
    return cleaned or None


# --- Standardization for merging ---
# Both name columns are reduced to the same alphanumeric, upper-case form.
drug_response_df['CCLE_Name_Standardized'] = (
    drug_response_df['CCLE_Name'].map(_standardize_cell_line_name)
)
sample_info_df['stripped_cell_line_name_Standardized'] = (
    sample_info_df['stripped_cell_line_name'].map(_standardize_cell_line_name)
)

# Build a one-to-one lookup from standardized name to DepMap_ID.
# A name that resolves to more than one DepMap_ID is ambiguous; keeping it would
# silently duplicate every drug-response row for that name in the merge below,
# so such names are removed.
name_to_depmap_df = (
    sample_info_df[['DepMap_ID', 'stripped_cell_line_name_Standardized']]
    .dropna()
    .drop_duplicates()
)
ambiguous_names = name_to_depmap_df['stripped_cell_line_name_Standardized'].duplicated(keep=False)
if ambiguous_names.any():
    print(f"Dropping {ambiguous_names.sum()} sample-info rows whose standardized name maps to more than one DepMap_ID.")
name_to_depmap_df = name_to_depmap_df[~ambiguous_names]

# Merge drug_response_df with the lookup to get the DepMap_ID (ACH-XXXXX format).
# Rows without a usable name or without a target value are dropped first;
# `validate` makes pandas raise if the lookup is ever not unique per name.
drug_response_mapped_df = pd.merge(
    drug_response_df.dropna(subset=['CCLE_Name_Standardized', 'logIC50']),
    name_to_depmap_df,
    left_on='CCLE_Name_Standardized',
    right_on='stripped_cell_line_name_Standardized',
    how='inner',
    validate='many_to_one',
)

# Now, merge the expression data with the mapped drug response data on the common 'DepMap_ID' (ACH-XXXXX).
# NOTE(review): the drug response table appears to hold one row per (drug, cell
# line) pair, so this merge repeats each cell line's full expression vector once
# per drug while the drug identity itself is not carried into the features.
# Consider filtering to a single drug or adding a drug feature, and check that
# the resulting matrix fits in memory.
merged_df = pd.merge(
    expression_df.reset_index(),  # Reset index to make 'DepMap_ID' a column
    drug_response_mapped_df[['DepMap_ID', 'logIC50']],
    on='DepMap_ID',
    how='inner'
)

# Fail early with a clear message instead of a confusing error further down.
if merged_df.empty:
    raise ValueError(
        "No rows left after merging expression and drug response data; "
        "check that the cell line names in both files can be matched."
    )

# Prepare X (features) and y (target)
X = merged_df.drop(columns=['DepMap_ID', 'logIC50']).values
y = merged_df['logIC50'].values

print(f"Final X shape: {X.shape}")
print(f"Final y shape: {y.shape}")

mRNA Expression Data Head:


Unnamed: 0_level_0,TSPAN6 (7105),TNMD (64102),DPM1 (8813),SCYL3 (57147),C1orf112 (55732),FGR (2268),CFH (3075),FUCA2 (2519),GCLC (2729),NFYA (4800),...,H3C2 (8358),H3C3 (8352),AC098582.1 (8916),DUS4L-BCAP29 (115253422),C8orf44-SGK3 (100533105),ELOA3B (728929),NPBWR1 (2831),ELOA3D (100506888),ELOA3 (162699),CDR1 (1038)
DepMap_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ACH-001113,4.331992,0.0,7.364397,2.792855,4.470537,0.028569,1.226509,3.042644,6.499686,4.739848,...,2.689299,0.189034,0.201634,2.130931,0.555816,0.0,0.275007,0.0,0.0,0.0
ACH-001289,4.566815,0.584963,7.106537,2.543496,3.50462,0.0,0.189034,3.813525,4.221104,3.481557,...,1.286881,1.049631,0.321928,1.464668,0.632268,0.0,0.014355,0.0,0.0,0.0
ACH-001339,3.15056,0.0,7.379032,2.333424,4.227279,0.056584,1.31034,6.687061,3.682573,3.273516,...,0.594549,1.097611,0.831877,2.946731,0.475085,0.0,0.084064,0.0,0.0,0.042644
ACH-001538,5.08534,0.0,7.154109,2.545968,3.084064,0.0,5.868143,6.165309,4.489928,3.956986,...,0.214125,0.632268,0.298658,1.641546,0.443607,0.0,0.028569,0.0,0.0,0.0
ACH-000242,6.729145,0.0,6.537607,2.456806,3.867896,0.799087,7.208381,5.569856,7.127014,4.568032,...,1.117695,2.358959,0.084064,1.910733,0.0,0.0,0.464668,0.0,0.0,0.0


mRNA Expression Data Shape: (1406, 19221)

Drug Response Data Head:


Unnamed: 0,Drug Name,Drug ID,Cell Line Name,Cosmic ID,TCGA Classification,Tissue,Tissue Sub-type,IC50,AUC,Max Conc,RMSE,Z score,Dataset Version
0,Camptothecin,1003,PFSK-1,683667,MB,nervous_system,medulloblastoma,-1.463887,0.93022,0.1,0.089052,0.433123,GDSC2
1,Camptothecin,1003,A673,684052,UNCLASSIFIED,soft_tissue,rhabdomyosarcoma,-4.869455,0.61497,0.1,0.111351,-1.4211,GDSC2
2,Camptothecin,1003,ES5,684057,UNCLASSIFIED,bone,ewings_sarcoma,-3.360586,0.791072,0.1,0.142855,-0.599569,GDSC2
3,Camptothecin,1003,ES7,684059,UNCLASSIFIED,bone,ewings_sarcoma,-5.04494,0.59266,0.1,0.135539,-1.516647,GDSC2
4,Camptothecin,1003,EW-11,684062,UNCLASSIFIED,bone,ewings_sarcoma,-3.741991,0.734047,0.1,0.128059,-0.807232,GDSC2


Drug Response Data Shape: (243466, 13)

Sample Info Data Head:


Unnamed: 0,DepMap_ID,cell_line_name,stripped_cell_line_name,CCLE_Name,alias,COSMICID,sex,source,RRID,WTSI_Master_Cell_ID,...,lineage_sub_subtype,lineage_molecular_subtype,default_growth_pattern,model_manipulation,model_manipulation_details,patient_id,parent_depmap_id,Cellosaurus_NCIt_disease,Cellosaurus_NCIt_id,Cellosaurus_issues
0,ACH-000016,SLR 21,SLR21,SLR21_KIDNEY,,,,Academic lab,CVCL_V607,,...,,,,,,PT-JnARLB,,Clear cell renal cell carcinoma,C4033,
1,ACH-000032,MHH-CALL-3,MHHCALL3,MHHCALL3_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,,,Female,DSMZ,CVCL_0089,,...,b_cell,,,,,PT-p2KOyI,,Childhood B acute lymphoblastic leukemia,C9140,
2,ACH-000033,NCI-H1819,NCIH1819,NCIH1819_LUNG,,,Female,Academic lab,CVCL_1497,,...,NSCLC_adenocarcinoma,,,,,PT-9p1WQv,,Lung adenocarcinoma,C3512,
3,ACH-000043,Hs 895.T,HS895T,HS895T_FIBROBLAST,,,Female,ATCC,CVCL_0993,,...,,,2D: adherent,,,PT-rTUVZQ,,Melanoma,C3224,
4,ACH-000049,HEK TE,HEKTE,HEKTE_KIDNEY,,,,Academic lab,CVCL_WS59,,...,,,,immortalized,,PT-qWYYgr,,,,No information is available about this cell li...


Sample Info Data Shape: (1840, 29)



### Train-Test Split and Data Normalization

We'll split the data into training (80%) and testing (20%) sets. Then, the input features (mRNA expression) will be normalized using `StandardScaler` to have a mean of 0 and a standard deviation of 1. This is crucial for neural network performance.

In [None]:
# Split data into training and testing sets BY CELL LINE.
# Each row of X is the expression profile of one cell line, and a cell line can
# occur in many rows of merged_df. A plain row-wise random split would place
# identical feature vectors in both sets and inflate the test score, so the
# unique DepMap_IDs are split 80/20 and rows follow their cell line.
cell_line_ids = merged_df['DepMap_ID'].to_numpy()
train_cell_lines, test_cell_lines = train_test_split(
    np.unique(cell_line_ids), test_size=0.2, random_state=42
)
test_mask = np.isin(cell_line_ids, test_cell_lines)
train_idx = np.flatnonzero(~test_mask)
test_idx = np.flatnonzero(test_mask)

# Shuffle the training rows (seeded): Keras' `validation_split` takes the LAST
# fraction of the training arrays, so they must not stay in merge order.
np.random.default_rng(42).shuffle(train_idx)

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

print(f"Cell lines: {len(train_cell_lines)} train / {len(test_cell_lines)} test")
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}\n")

# Data Normalization: fit the scaler on the training rows only and reuse the
# fitted statistics for the test rows, so no test information reaches training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Data normalized successfully.")

NameError: name 'train_test_split' is not defined

### Build and Compile the DNN Model

Here, we define a fully connected feed-forward neural network. The model will consist of an input layer, several hidden layers with ReLU activation, and a single output neuron with no activation (suitable for regression). The model will be compiled with the Adam optimizer and Mean Squared Error (MSE) as the loss function.

In [None]:
# Define the DNN model
def build_dnn_model(input_shape, dropout_rate=0.2):
    """Build an uncompiled fully connected network for regression.

    Args:
        input_shape: Number of input features per sample (an int).
        dropout_rate: Dropout fraction applied after each of the first two
            hidden layers. Defaults to 0.2.

    Returns:
        A ``keras.Sequential`` model laid out as
        Dense(256) -> Dropout -> Dense(128) -> Dropout -> Dense(64) -> Dense(1),
        with ReLU on the hidden layers and no activation on the output unit.
    """
    model = keras.Sequential([
        # Declare the input explicitly rather than passing `input_shape=` to
        # the first Dense layer, which newer Keras versions warn against.
        keras.Input(shape=(input_shape,)),
        layers.Dense(256, activation='relu'),
        layers.Dropout(dropout_rate),  # Dropout for regularization
        layers.Dense(128, activation='relu'),
        layers.Dropout(dropout_rate),
        layers.Dense(64, activation='relu'),
        layers.Dense(1)  # Output layer for regression (linear)
    ])
    return model

# Seed the Python, NumPy and backend random generators before the model is
# created, so weight initialisation, dropout and batch shuffling are
# repeatable from run to run.
keras.utils.set_random_seed(42)

# Get input shape (number of feature columns in the scaled training matrix)
input_shape = X_train_scaled.shape[1]
model = build_dnn_model(input_shape)

# Compile the model: Adam optimizer with mean squared error as the loss; MSE is
# also tracked as a metric, so evaluate() returns two values.
model.compile(optimizer='adam', loss='mse', metrics=['mse'])

model.summary()

NameError: name 'X_train_scaled' is not defined

### Train the Model

Now, we'll train the DNN model using the normalized training data. We'll use a `validation_split` to monitor performance on a portion of the training data that is held out during training.

In [None]:
# Fit the network on the scaled training features.
# 50 passes over the data in mini-batches of 32; 10% of the training rows are
# held back by Keras for validation, and per-epoch progress is printed.
history = model.fit(
    x=X_train_scaled,
    y=y_train,
    batch_size=32,
    epochs=50,
    verbose=1,
    validation_split=0.1,
)

print("\nModel training complete.")

NameError: name 'model' is not defined

### Evaluate the Model

Finally, we'll evaluate the trained model on the unseen test data. We'll report the Mean Squared Error (MSE) directly from the model's evaluation and calculate the Coefficient of Determination (R²) using scikit-learn for a more interpretable measure of model fit.

In [None]:
# Score the trained model on the held-out test set. With the compile settings
# used above, evaluate() returns the loss followed by the tracked MSE metric.
loss, mse = model.evaluate(x=X_test_scaled, y=y_test, verbose=0)
print(f"\nTest Loss (MSE): {loss:.4f}")

# Predict on the test features and collapse the (n, 1) output to a 1-D vector
y_pred = model.predict(X_test_scaled).flatten()

# Coefficient of determination between the true and predicted targets
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f"Test R-squared (R²): {r2:.4f}")

NameError: name 'model' is not defined