In [115]:
import glob
import os
from functools import partial

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from scipy import stats
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
from sklearn.preprocessing import OneHotEncoder

In [19]:
print("=" * 70)
print("LOADING GDSC DRUG RESPONSE DATA")
print("=" * 70)

# Open the workbook under its own name so that `df_raw` is always a DataFrame
# once this cell finishes (previously it stayed an ExcelFile whenever the
# workbook had more than one sheet). The context manager closes the file
# handle as soon as the sheet has been read.
with pd.ExcelFile('GDSC1_fitted_dose_response.xlsx') as workbook:
    sheet_names = workbook.sheet_names
    print(f'sheet names: {sheet_names}')
    if not sheet_names:
        raise ValueError("Workbook 'GDSC1_fitted_dose_response.xlsx' contains no sheets")
    # The first sheet is loaded; extra sheets are reported below, not silently ignored.
    name = sheet_names[0]
    df_raw = pd.read_excel(workbook, sheet_name=name)

if len(sheet_names) == 1:
    print('Loaded single sheet: ', name)
else:
    print(f'Workbook has {len(sheet_names)} sheets; loaded the first one: ', name)


LOADING GDSC DRUG RESPONSE DATA
column names: ['Sheet 1']
Loaded single sheet:  Sheet 1


In [20]:
# Show the column labels, how many there are, and a preview of the first rows.
raw_columns = df_raw.columns
print(raw_columns)
print(len(raw_columns))
df_raw.head()

Index(['DATASET', 'NLME_RESULT_ID', 'NLME_CURVE_ID', 'COSMIC_ID',
       'CELL_LINE_NAME', 'SANGER_MODEL_ID', 'TCGA_DESC', 'DRUG_ID',
       'DRUG_NAME', 'PUTATIVE_TARGET', 'PATHWAY_NAME', 'COMPANY_ID',
       'WEBRELEASE', 'MIN_CONC', 'MAX_CONC', 'LN_IC50', 'AUC', 'RMSE',
       'Z_SCORE'],
      dtype='object')
19


Unnamed: 0,DATASET,NLME_RESULT_ID,NLME_CURVE_ID,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,TCGA_DESC,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,PATHWAY_NAME,COMPANY_ID,WEBRELEASE,MIN_CONC,MAX_CONC,LN_IC50,AUC,RMSE,Z_SCORE
0,GDSC1,342,15580432,684057,ES5,SIDM00263,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,3.966813,0.985678,0.026081,1.299144
1,GDSC1,342,15580806,684059,ES7,SIDM00269,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,2.69209,0.97269,0.110059,0.156076
2,GDSC1,342,15581198,684062,EW-11,SIDM00203,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,2.47799,0.944459,0.087019,-0.035912
3,GDSC1,342,15581542,684072,SK-ES-1,SIDM01111,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,2.033564,0.950758,0.01629,-0.434437
4,GDSC1,342,15581930,687448,COLO-829,SIDM00909,SKCM,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,2.966007,0.954778,0.180255,0.401702


In [None]:
print("\n" + "=" * 70)
print("CLEANING COLUMN NAMES")
print("=" * 70)

# Remove stray whitespace around the header labels before matching them.
df_raw.columns = df_raw.columns.str.strip()

# Source header -> name used by the rest of the notebook.
# Identity entries (AUC, LN_IC50, RMSE) are listed for completeness only.
column_mapping = {
    'DRUG_NAME': 'Drug_Name',
    'CELL_LINE_NAME': 'Cell_Line_Name',
    'TCGA_DESC': 'TCGA_Class',
    'AUC': 'AUC',
    'LN_IC50': 'LN_IC50',
    'COSMIC_ID': 'Cosmic_ID',
    'PATHWAY_NAME': 'Pathway_Name',
    'PUTATIVE_TARGET': 'Putative_Target',
    'DRUG_ID': 'Drug_ID',
    'SANGER_MODEL_ID': 'Sanger_Model_ID',
    'MIN_CONC': 'Min_Conc',
    'MAX_CONC': 'Max_Conc',
    'RMSE': 'RMSE',
    'Z_SCORE': 'Z_Score'
}

# Keep only the entries whose source header is actually present, rename them
# in a single call, then report every rename that changed a label.
present_mapping = {
    source: target
    for source, target in column_mapping.items()
    if source in df_raw.columns
}
df_raw.rename(columns=present_mapping, inplace=True)
for old_name, new_name in present_mapping.items():
    if old_name != new_name:
        print(f"   Mapped: '{old_name}' ‚Üí '{new_name}'")

print(f"\n‚úÖ Column mapping complete")
print(f"   Available columns after mapping: {list(df_raw.columns)}")
print(len(df_raw.columns))



CLEANING COLUMN NAMES
   Mapped: 'DRUG_NAME' ‚Üí 'Drug_Name'
   Mapped: 'CELL_LINE_NAME' ‚Üí 'Cell_Line_Name'
   Mapped: 'TCGA_DESC' ‚Üí 'TCGA_Class'
   Mapped: 'COSMIC_ID' ‚Üí 'Cosmic_ID'
   Mapped: 'PATHWAY_NAME' ‚Üí 'Pathway_Name'
   Mapped: 'PUTATIVE_TARGET' ‚Üí 'Putative_Target'
   Mapped: 'DRUG_ID' ‚Üí 'Drug_ID'
   Mapped: 'SANGER_MODEL_ID' ‚Üí 'Sanger_Model_ID'
   Mapped: 'MIN_CONC' ‚Üí 'Min_Conc'
   Mapped: 'MAX_CONC' ‚Üí 'Max_Conc'
   Mapped: 'Z_SCORE' ‚Üí 'Z_Score'

‚úÖ Column mapping complete
   Available columns after mapping: ['DATASET', 'NLME_RESULT_ID', 'NLME_CURVE_ID', 'Cosmic_ID', 'Cell_Line_Name', 'Sanger_Model_ID', 'TCGA_Class', 'Drug_ID', 'Drug_Name', 'Putative_Target', 'Pathway_Name', 'COMPANY_ID', 'WEBRELEASE', 'Min_Conc', 'Max_Conc', 'LN_IC50', 'AUC', 'RMSE', 'Z_Score']
19


In [70]:


# Number of distinct Cosmic_ID values (a missing value, if any, counts as one).
df_raw['Cosmic_ID'].nunique(dropna=False)


970

In [None]:
print("\n" + "=" * 70)
print(" SELECTING RELEVANT COLUMNS")
print("=" * 70)

# Columns carried forward into modelling.
columns_to_keep = [
    'Drug_Name',
    'Cell_Line_Name',
    'AUC',
    'TCGA_Class',
    'Pathway_Name',
    'Putative_Target',
    'LN_IC50'
]

# Split the wish-list into columns present in the file and columns that are not.
available_cols = []
missing_cols = []
for wanted in columns_to_keep:
    if wanted in df_raw.columns:
        available_cols.append(wanted)
    else:
        missing_cols.append(wanted)

print(f"‚úÖ Available columns for modeling: {available_cols}")
if missing_cols:
    print(f"‚ö†Ô∏è  Missing columns (will be skipped): {missing_cols}")

# Independent working copy so later edits never touch df_raw.
df = df_raw.loc[:, available_cols].copy()

print(f"\nüìã Column Summary:")
print(f"   Total columns in file: {len(df_raw.columns)}")
print(f"   Columns selected for modeling: {len(available_cols)}")
print(f"   Columns: {', '.join(available_cols)}")


 SELECTING RELEVANT COLUMNS
‚úÖ Available columns for modeling: ['Drug_Name', 'Cell_Line_Name', 'AUC', 'TCGA_Class', 'Pathway_Name', 'Putative_Target', 'LN_IC50']

üìã Column Summary:
   Total columns in file: 19
   Columns selected for modeling: 7
   Columns: Drug_Name, Cell_Line_Name, AUC, TCGA_Class, Pathway_Name, Putative_Target, LN_IC50


In [72]:
# Count of missing values per column in the working frame.
df.isna().sum()

Drug_Name             0
Cell_Line_Name        0
AUC                   0
TCGA_Class          580
Pathway_Name          0
Putative_Target    3652
LN_IC50               0
dtype: int64

In [None]:
print("\n" + "=" * 70)
print(" TARGET VARIABLE SELECTION")
print("=" * 70)

target_col = 'AUC'

# Fall back to LN_IC50 when AUC is absent; give up if neither exists.
if target_col not in df.columns:
    print(f"‚ùå Error: Target column '{target_col}' not found!")
    print(f"   Available columns: {df.columns.tolist()}")
    if 'LN_IC50' not in df.columns:
        raise ValueError("No suitable target column found!")
    print(f"   Using 'LN_IC50' as alternative target")
    target_col = 'LN_IC50'

print(f"‚úÖ Using '{target_col}' as target variable")

print(f"\nüìä Data cleaning:")
initial_rows = len(df)
print(f"   Initial rows: {initial_rows:,}")

# Step 1: drop rows where the target is missing.
df = df.dropna(subset=[target_col])
rows_removed = initial_rows - len(df)
print(f"   After removing missing {target_col}: {len(df):,} rows")
print(f"   Removed: {rows_removed:,} rows ({rows_removed/initial_rows*100:.2f}%)")

# Step 2: force the target to numeric; unparseable entries become NaN and are dropped.
df[target_col] = pd.to_numeric(df[target_col], errors='coerce')
df = df.dropna(subset=[target_col])

print(f"   After numeric conversion: {len(df):,} rows")

# Step 3: trim extreme values using a 3*IQR fence around the quartiles.
target_values = df[target_col]
Q1 = target_values.quantile(0.25)
Q3 = target_values.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 3 * IQR
upper_bound = Q3 + 3 * IQR

# The target has no NaN at this point, so "not inside the fence" is the
# same as "below the lower bound or above the upper bound".
inside_fence = target_values.between(lower_bound, upper_bound)
outliers = (~inside_fence).sum()
if outliers > 0:
    print(f"   ‚ö†Ô∏è  Found {outliers} potential outliers (outside 3*IQR)")
    print(f"      Range: [{lower_bound:.3f}, {upper_bound:.3f}]")
    df = df[inside_fence]

print(f"\n‚úÖ Final data shape: {df.shape[0]:,} rows √ó {df.shape[1]} columns")


 TARGET VARIABLE SELECTION
‚úÖ Using 'AUC' as target variable

üìä Data cleaning:
   Initial rows: 333,161
   After removing missing AUC: 333,161 rows
   Removed: 0 rows (0.00%)
   After numeric conversion: 333,161 rows
   ‚ö†Ô∏è  Found 3192 potential outliers (outside 3*IQR)
      Range: [0.151, 1.590]

‚úÖ Final data shape: 329,969 rows √ó 7 columns


In [74]:
# The GDSC dataset has 333K+ samples. For faster training, you can sample it.
# For production, use the full dataset.

print("\n" + "=" * 70)
print("DATA SAMPLING (OPTIONAL)")
print("=" * 70)

#  Use full dataset (recommended for final model)
USE_FULL_DATA = False

#  Sample for faster training/experimentation
SAMPLE_SIZE = 20000  # Number of samples to use (if sampling)

if not USE_FULL_DATA and len(df) > SAMPLE_SIZE:
    print(f"üìâ Sampling {SAMPLE_SIZE:,} samples from {len(df):,} total samples")
    print(f"   This speeds up training for experimentation")
    print(f"   For final model, set USE_FULL_DATA = True")

    # Stratified sampling by drug (to maintain drug diversity)
    if 'Drug_Name' in df.columns:
        # Draw from each drug in proportion to its share of the rows.
        # The groups are iterated explicitly instead of using
        # groupby(...).apply(...): apply() on the grouping column is
        # deprecated and, once pandas excludes that column, Drug_Name would
        # vanish from the sampled frame.
        total_rows = len(df)
        sampled_parts = []
        for _, drug_rows in df.groupby('Drug_Name'):
            quota = min(len(drug_rows), int(SAMPLE_SIZE * len(drug_rows) / total_rows))
            if quota > 0:
                sampled_parts.append(drug_rows.sample(quota, random_state=42))
        # df.iloc[0:0] keeps the schema when every per-drug quota rounds to 0.
        df_sampled = pd.concat(sampled_parts) if sampled_parts else df.iloc[0:0]

        # int() rounds every quota down, so top up with random leftover rows.
        if len(df_sampled) < SAMPLE_SIZE:
            remaining = df[~df.index.isin(df_sampled.index)]
            n_needed = SAMPLE_SIZE - len(df_sampled)
            df_sampled = pd.concat([df_sampled, remaining.sample(n_needed, random_state=42)])
        # Shuffle and give the sample a clean 0..n-1 index.
        df = df_sampled.sample(n=min(SAMPLE_SIZE, len(df_sampled)), random_state=42).reset_index(drop=True)
    else:
        df = df.sample(n=SAMPLE_SIZE, random_state=42).reset_index(drop=True)

    print(f"   ‚úÖ Sampled dataset: {len(df):,} rows")
else:
    # Reset the index here too: earlier filtering leaves gaps in it, and the
    # sampled branch already hands downstream cells a clean 0..n-1 index.
    df = df.reset_index(drop=True)
    print(f"‚úÖ Using full dataset: {len(df):,} rows")
    print(f"   Note: Training may take longer with full dataset")


DATA SAMPLING (OPTIONAL)
üìâ Sampling 20,000 samples from 329,969 total samples
   This speeds up training for experimentation
   For final model, set USE_FULL_DATA = True
   ‚úÖ Sampled dataset: 20,000 rows


In [None]:
print("\n" + "=" * 70)
print("EXPLORATORY DATA ANALYSIS")
print("=" * 70)

n_samples = len(df)

print(f"\nüìä Dataset Overview:")
print(f"   Total samples: {n_samples:,}")
print(f"   Total features: {len(df.columns)}")

# Summary statistics of the regression target.
print(f"\nüéØ Target Variable ({target_col}) Statistics:")
print(df[target_col].describe())

# Cardinality (and most frequent values) of the categorical columns.
print(f"\nüìã Categorical Features:")
if 'Drug_Name' in df.columns:
    drug_names = df['Drug_Name']
    n_drugs = drug_names.nunique()
    print(f"   Drugs: {n_drugs:,} unique")
    print(f"   Top 10 drugs by frequency:")
    print(drug_names.value_counts().head(10).to_string())

if 'Cell_Line_Name' in df.columns:
    n_cell_lines = df['Cell_Line_Name'].nunique()
    print(f"\n   Cell Lines: {n_cell_lines:,} unique")

if 'TCGA_Class' in df.columns:
    tcga_classes = df['TCGA_Class']
    n_tcga = tcga_classes.nunique()
    print(f"   TCGA Classes: {n_tcga:,} unique")
    print(f"   Top 10 TCGA classes:")
    print(tcga_classes.value_counts().head(10).to_string())

# Report only the columns that actually contain missing values.
print(f"\n‚ö†Ô∏è  Missing Values:")
null_counts = df.isna().sum()
missing_summary = null_counts[null_counts > 0]
if missing_summary.empty:
    print("   ‚úÖ No missing values in key columns!")
else:
    for col, count in missing_summary.items():
        print(f"   {col}: {count:,} ({count/n_samples*100:.2f}%)")


EXPLORATORY DATA ANALYSIS

üìä Dataset Overview:
   Total samples: 20,000
   Total features: 7

üéØ Target Variable (AUC) Statistics:
count    20000.000000
mean         0.842951
std          0.179049
min          0.151531
25%          0.776113
50%          0.919074
75%          0.974072
max          0.999530
Name: AUC, dtype: float64

üìã Categorical Features:
   Drugs: 378 unique
   Top 10 drugs by frequency:
Drug_Name
Cisplatin             113
Olaparib              111
JQ1                   111
AKT inhibitor VIII    111
Avagacestat           111
PLX-4720              111
AZD6482               110
AZD7762               110
SB505124              110
Afatinib              110

   Cell Lines: 958 unique
   TCGA Classes: 31 unique
   Top 10 TCGA classes:
TCGA_Class
UNCLASSIFIED    3977
LUAD            1585
SCLC            1088
COREAD          1087
SKCM            1074
BRCA            1028
NB               822
DLBC             744
HNSC             742
GBM              655

‚ö†Ô∏è  Miss

In [None]:
print("\n" + "=" * 70)
print("FEATURE ENGINEERING")
print("=" * 70)

print("\nüîç Checking and filling missing values in categorical features...")

# Every object-dtype column except Cell_Line_Name is one-hot encoded later;
# Cell_Line_Name is handled separately because of its cardinality.
categorical_cols = df.select_dtypes(include=['object']).columns
categorical_cols_to_encode = [
    name for name in categorical_cols if name != 'Cell_Line_Name'
]

# Replace missing categories with an explicit 'UNKNOWN' level.
for col in categorical_cols_to_encode:
    n_missing = df[col].isna().sum()
    if n_missing > 0:
        print(f"   ‚úÖ Filled {n_missing} missing values in {col}")
        df[col] = df[col].fillna('UNKNOWN')
    else:
        print(f"   ‚úÖ No missing values in {col}")

print(f"\n‚úÖ All missing values filled in categorical features")


FEATURE ENGINEERING

üîç Checking and filling missing values in categorical features...
   ‚úÖ No missing values in Drug_Name
   ‚úÖ Filled 38 missing values in TCGA_Class
   ‚úÖ No missing values in Pathway_Name
   ‚úÖ Filled 219 missing values in Putative_Target

‚úÖ All missing values filled in categorical features


In [None]:

print("\nüî¢ Encoding categorical features using OneHotEncoder...")

# OneHotEncoder is imported with the other sklearn.preprocessing names in the
# notebook's import cell rather than in the middle of the notebook.
#
# NOTE(review): the encoder is fitted on the whole frame, i.e. before the
# train/test split done later in the notebook, so its category vocabulary is
# learned from rows that end up in the test set.
if categorical_cols_to_encode:
    df_categorical = df[categorical_cols_to_encode].copy()

    # One-hot encoding preserves the nominal nature of these features without
    # imposing artificial order. drop='first' omits one indicator per column;
    # handle_unknown='ignore' encodes unseen categories as all zeros.
    ohe = OneHotEncoder(
        sparse_output=False,
        handle_unknown='ignore',
        drop='first'
    )

    X_categorical_ohe = ohe.fit_transform(df_categorical)
    categorical_feature_names = ohe.get_feature_names_out(categorical_cols_to_encode)

    print(f"   ‚úÖ OneHot encoded {len(categorical_cols_to_encode)} categorical columns")
    print(f"      Created {X_categorical_ohe.shape[1]} binary features")
    print(f"      Original categories:")
    for col in categorical_cols_to_encode:
        n_cats = df[col].nunique()
        print(f"         {col}: {n_cats} categories")
    print(f"      Example features: {list(categorical_feature_names[:5])}")

    # Second handle on the fitted encoder under the name `ohe_encoder`.
    ohe_encoder = ohe
else:
    # Zero-column placeholder so the later concatenation still works.
    X_categorical_ohe = np.empty((len(df), 0))
    categorical_feature_names = []
    ohe_encoder = None
    print("   ‚ö†Ô∏è  No categorical columns to encode")


üî¢ Encoding categorical features using OneHotEncoder...
   Why OneHotEncoder? Drug names, TCGA classes have NO order (nominal data)
   ‚úÖ OneHot encoded 4 categorical columns
      Created 719 binary features
      Original categories:
         Drug_Name: 378 categories
         TCGA_Class: 32 categories
         Pathway_Name: 24 categories
         Putative_Target: 289 categories
      Example features: ['Drug_Name_5-Fluorouracil', 'Drug_Name_965-D2', 'Drug_Name_993-D2', 'Drug_Name_A-443654', 'Drug_Name_A-770041']


In [89]:
# Sanity check: (rows, one-hot columns) of the encoded categorical matrix.
X_categorical_ohe.shape

(20000, 719)

In [90]:
# Sanity check: number of one-hot feature names (one per encoded column).
len(categorical_feature_names)

719

In [None]:
print("\nüî¢ Handling high cardinality features...")

if 'Cell_Line_Name' not in df.columns:
    # No cell-line column: contribute a zero-column block.
    X_cell_line = np.array([]).reshape(len(df), 0)
    cell_line_feature_names = []
    cell_line_handled = False
else:
    n_cell_lines = df['Cell_Line_Name'].nunique()

    if n_cell_lines <= 50:
        print(f"   ‚úÖ Cell_Line_Name: {n_cell_lines} categories (low cardinality)")
        print(f"      Using OneHotEncoder (correct for nominal data)")

        ohe_cell = OneHotEncoder(
            sparse_output=False,
            handle_unknown='ignore',
            drop='first'
        )
        X_cell_line = ohe_cell.fit_transform(df[['Cell_Line_Name']])
        cell_line_feature_names = ohe_cell.get_feature_names_out(['Cell_Line_Name'])

        print(f"      ‚úÖ Created {X_cell_line.shape[1]} binary features")
    else:
        print(f"   ‚ö†Ô∏è  Cell_Line_Name: {n_cell_lines} categories (high cardinality)")
        print(f"      Using LabelEncoder to avoid creating {n_cell_lines} binary features")
        print(f"      Note: This loses nominal property, but keeps feature count manageable")

        # Trade-off: a single integer-coded column instead of hundreds of
        # indicators; the integer codes carry an arbitrary ordering.
        le_cell = LabelEncoder()
        cell_line_encoded = le_cell.fit_transform(df['Cell_Line_Name'])
        X_cell_line = cell_line_encoded.reshape(-1, 1)
        cell_line_feature_names = ['Cell_Line_Encoded']

        print(f"      ‚úÖ Encoded as 1 feature (values: 0 to {n_cell_lines-1})")

    cell_line_handled = True


üî¢ Handling high cardinality features...
   ‚ö†Ô∏è  Cell_Line_Name: 958 categories (high cardinality)
      Using LabelEncoder to avoid creating 958 binary features
      Note: This loses nominal property, but keeps feature count manageable
      ‚úÖ Encoded as 1 feature (values: 0 to 957)


In [None]:
# Assemble the model matrix: one-hot columns, the encoded cell line, LN_IC50.
feature_names = list(categorical_feature_names) + list(cell_line_feature_names)

# Give X the same index as df so that X, LN_IC50 and y stay row-aligned even
# when df's index has gaps (e.g. after row filtering without a reset). With a
# fresh RangeIndex, pd.concat(axis=1) would align on labels and silently
# introduce NaNs and extra rows.
X = pd.DataFrame(
    np.concatenate([X_categorical_ohe, X_cell_line], axis=1),
    columns=feature_names,
    index=df.index,
)
# Attach LN_IC50 positionally; the encoded blocks were built from df row by row.
X['LN_IC50'] = df['LN_IC50'].to_numpy()

# NOTE(review): the target is hard-coded to 'AUC' here although an earlier
# cell may switch `target_col` to 'LN_IC50', which would then also be a
# feature. Confirm the intended target/feature pairing.
y = df['AUC']

In [134]:
# Display the assembled feature matrix (pandas truncates the rendering).
X

Unnamed: 0,Drug_Name_5-Fluorouracil,Drug_Name_965-D2,Drug_Name_993-D2,Drug_Name_A-443654,Drug_Name_A-770041,Drug_Name_A-83-01,Drug_Name_ACY-1215,Drug_Name_AGI-6780,Drug_Name_AICA Ribonucleotide,Drug_Name_AKT inhibitor VIII,...,Putative_Target_mTOR,"Putative_Target_mTOR, LCK","Putative_Target_mTOR, PI3K","Putative_Target_mTORC1, mTORC2",Putative_Target_not defined,Putative_Target_p38,"Putative_Target_p38, JNK2","Putative_Target_p38alpha, p38beta",Cell_Line_Encoded,LN_IC50
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,684.0,4.978434
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,308.0,0.961609
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,523.0,4.434201
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,641.0,2.278400
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,846.0,1.726612
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,120.0,2.026394
19996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,529.0,3.942135
19997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,752.0,3.818997
19998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,219.0,5.013664


In [None]:
def _save_top_counts_barh(counts, ylabel, title, out_path):
    """Save a horizontal bar chart of pre-computed value counts.

    Parameters
    ----------
    counts : pandas.Series
        Counts indexed by category, already sorted and truncated by the caller.
    ylabel, title : str
        Axis label for the categories and the figure title.
    out_path : str
        Destination image path; also echoed in the confirmation message.
    """
    fig, ax = plt.subplots(figsize=(14, 6))
    positions = range(len(counts))
    ax.barh(positions, counts.values)
    ax.set_yticks(positions)
    ax.set_yticklabels(counts.index)
    ax.set_xlabel('Number of Samples')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.invert_yaxis()  # largest count at the top
    fig.tight_layout()
    fig.savefig(out_path, dpi=150, bbox_inches='tight')
    plt.close(fig)
    print(f"   ‚úÖ Saved: {out_path}")


print("\n" + "=" * 70)
print(" DATA VISUALIZATION")
print("=" * 70)

# Create output directory for plots
os.makedirs('outputs', exist_ok=True)

# Summary frame for plotting: features plus target and the raw labels.
# Everything is attached positionally (.to_numpy()) so the rows line up even
# if X and y/df do not share index labels.
df_summary = X.copy()
df_summary[target_col] = y.to_numpy()
if 'Drug_Name' in df.columns:
    df_summary['Drug_Name'] = df['Drug_Name'].to_numpy()
if 'Cell_Line_Name' in df.columns:
    df_summary['Cell_Line_Name'] = df['Cell_Line_Name'].to_numpy()

print("\nüìä Creating visualizations...")

# 1. Target variable distribution: histogram, box plot, normal Q-Q plot.
fig, (ax_hist, ax_box, ax_qq) = plt.subplots(1, 3, figsize=(14, 5))

df_summary[target_col].hist(bins=50, edgecolor='black', alpha=0.7, ax=ax_hist)
ax_hist.set_xlabel(target_col)
ax_hist.set_ylabel('Frequency')
ax_hist.set_title(f'Distribution of {target_col}')
ax_hist.grid(True, alpha=0.3)

df_summary[target_col].plot(kind='box', ax=ax_box)
ax_box.set_ylabel(target_col)
ax_box.set_title(f'Box Plot of {target_col}')
ax_box.grid(True, alpha=0.3)

# Q-Q plot for normality check; probplot draws onto the Axes it is given.
stats.probplot(df_summary[target_col], dist="norm", plot=ax_qq)
ax_qq.set_title('Q-Q Plot (Normality Check)')
ax_qq.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig('outputs/target_distribution.png', dpi=150, bbox_inches='tight')
plt.close(fig)
print("   ‚úÖ Saved: outputs/target_distribution.png")

# 2. Drug distribution
if 'Drug_Name' in df_summary.columns:
    drug_counts = df_summary['Drug_Name'].value_counts().head(20)  # Top 20
    _save_top_counts_barh(
        drug_counts,
        ylabel='Drug Name',
        title='Top 20 Drugs by Sample Count',
        out_path='outputs/drug_distribution.png',
    )

# 3. TCGA class distribution
if 'TCGA_Class' in df.columns:
    tcga_counts = df['TCGA_Class'].value_counts().head(15)  # Top 15
    _save_top_counts_barh(
        tcga_counts,
        ylabel='TCGA Class',
        title='Top 15 TCGA Classes by Sample Count',
        out_path='outputs/tcga_distribution.png',
    )

# 4. Target by drug (only drawn when there are at most 20 drugs)
if 'Drug_Name' in df_summary.columns and df_summary['Drug_Name'].nunique() <= 20:
    fig, ax = plt.subplots(figsize=(14, 6))
    df_summary.boxplot(column=target_col, by='Drug_Name', ax=ax)
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    ax.set_title(f'{target_col} Distribution by Drug')
    fig.suptitle('')  # Remove the default "Boxplot grouped by ..." title
    fig.tight_layout()
    fig.savefig('outputs/target_by_drug.png', dpi=150, bbox_inches='tight')
    plt.close(fig)
    print("   ‚úÖ Saved: outputs/target_by_drug.png")

print("\n‚úÖ All visualizations saved to outputs/ directory")


 DATA VISUALIZATION

üìä Creating visualizations...
   ‚úÖ Saved: outputs/target_distribution.png
   ‚úÖ Saved: outputs/drug_distribution.png
   ‚úÖ Saved: outputs/tcga_distribution.png

‚úÖ All visualizations saved to outputs/ directory


In [None]:
print("\n" + "=" * 70)
print("STEP 3: DATA PREPROCESSING")
print("=" * 70)

print("\nüìä Splitting data into train/test sets...")
# Hold out 20% of the rows before feature selection and scaling, so those two
# steps can be fitted on the training portion only. (The categorical encoders
# were fitted earlier on the whole frame.)
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)
X_train, X_test, y_train, y_test = split_parts

n_total = len(X)
n_train, n_features = X_train.shape
n_test = X_test.shape[0]

print(f"   ‚úÖ Training set: {n_train:,} samples ({n_train/n_total*100:.1f}%)")
print(f"   ‚úÖ Test set: {n_test:,} samples ({n_test/n_total*100:.1f}%)")
print(f"   ‚úÖ Features: {n_features}")

# Untouched copy of the test features, taken before X_test is reassigned.
X_test_original = X_test.copy()


STEP 3: DATA PREPROCESSING

üìä Splitting data into train/test sets...
   ‚úÖ Training set: 16,000 samples (80.0%)
   ‚úÖ Test set: 4,000 samples (20.0%)
   ‚úÖ Features: 721


In [144]:
# Sanity check: (rows, columns) of the training feature matrix.
X_train.shape

(16000, 721)

In [118]:
# Per-column dtypes of the training matrix.
# NOTE(review): requires X_train to still be a DataFrame, i.e. to be run
# before the feature-selection cell reassigns X_train.
X_train.dtypes

Drug_Name_5-Fluorouracil    float64
Drug_Name_965-D2            float64
Drug_Name_993-D2            float64
Drug_Name_A-443654          float64
Drug_Name_A-770041          float64
                             ...   
Cell_Line_Encoded           float64
LN_IC50                     float64
AUC                         float64
Drug_Name                    object
Cell_Line_Name               object
Length: 724, dtype: object

In [None]:


print("\nüéØ Feature selection...")

# Keep more features when there is more training data to support them.
if X_train.shape[0] > 15000:
    n_features_to_select = min(500, X_train.shape[1])
else:
    n_features_to_select = min(150, X_train.shape[1])

print(f"   Selecting top {n_features_to_select} features from {X_train.shape[1]} total")

# Mutual information captures both linear and non-linear relationships.
# Its estimator is randomised, so the seed is pinned here; otherwise the
# selected feature set, and every metric computed downstream, can change
# between runs. functools.partial (rather than a lambda) keeps the fitted
# selector picklable.
selector = SelectKBest(
    score_func=partial(mutual_info_regression, random_state=42),
    k=n_features_to_select,
)

# Fit on the training rows only, then apply the same column mask to the test rows.
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

# Map the selected column positions back to readable names.
selected_feature_indices = selector.get_support(indices=True)
if isinstance(X_train, pd.DataFrame):
    all_feature_names = list(X_train.columns)
else:
    # X_train is no longer a DataFrame: use `feature_names` when its length
    # matches, otherwise fall back to positional integers.
    all_feature_names = feature_names if len(feature_names) == X_train.shape[1] else list(range(X_train.shape[1]))
selected_feature_names = [all_feature_names[i] for i in selected_feature_indices]

print(f"   ‚úÖ Selected {X_train_selected.shape[1]} features")
print(f"      Reduction: {(1 - X_train_selected.shape[1]/X_train.shape[1])*100:.1f}%")

print(f"\n   Top 10 selected features:")
feature_scores = selector.scores_[selected_feature_indices]
top_features = sorted(zip(selected_feature_names, feature_scores),
                      key=lambda x: x[1], reverse=True)[:10]
for feat_name, score in top_features:
    print(f"      {feat_name}: {score:.2f}")

# Downstream cells read X_train / X_test, so the selected arrays replace them.
# Re-running this cell therefore selects from the already-reduced arrays.
X_train = X_train_selected
X_test = X_test_selected



üéØ Feature selection...
   Selecting top 500 features from 721 total
   ‚úÖ Selected 500 features
      Reduction: 30.7%

   Top 10 selected features:
      LN_IC50: 0.76
      Cell_Line_Encoded: 0.15
      Pathway_Name_Chromatin histone acetylation: 0.01
      Pathway_Name_Mitosis: 0.01
      TCGA_Class_DLBC: 0.01
      Pathway_Name_PI3K/MTOR signaling: 0.01
      TCGA_Class_ALL: 0.01
      Pathway_Name_DNA replication: 0.01
      Pathway_Name_Hormone-related: 0.01
      Pathway_Name_WNT signaling: 0.01


In [147]:
# Number of encoded feature names (LN_IC50 is not part of this list).
len(feature_names)

720

In [None]:
print("\nüìè Scaling features...")

# RobustScaler centres on the median and scales by the IQR, so it is less
# sensitive to outliers than StandardScaler. The statistics are learned from
# the training matrix and then reused unchanged on the test matrix.
scaler = RobustScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

train_min, train_max = X_train_scaled.min(), X_train_scaled.max()
test_min, test_max = X_test_scaled.min(), X_test_scaled.max()

print("   ‚úÖ Features scaled using RobustScaler (fitted on train only)")
print(f"      Train range: [{train_min:.2f}, {train_max:.2f}]")
print(f"      Test range: [{test_min:.2f}, {test_max:.2f}]")

print("\n‚úÖ Data preprocessing complete (no data leakage!)")


üìè Scaling features...
   ‚úÖ Features scaled using RobustScaler (fitted on train only)
      Train range: [-3.46, 2.81]
      Test range: [-3.51, 2.61]

‚úÖ Data preprocessing complete (no data leakage!)


In [None]:

print("\n" + "=" * 70)
print(" MODEL TRAINING AND COMPARISON")
print("=" * 70)

# Registries filled by the model cells below:
# model name -> fitted estimator, and model name -> metrics/predictions.
models, results = {}, {}


STEP 4: MODEL TRAINING AND COMPARISON


In [None]:
print("\n" + "-" * 70)
print("MODEL 1: LINEAR REGRESSION (Baseline)")
print("-" * 70)

# Ordinary least squares on the scaled features as the reference baseline.
lr_model = LinearRegression()
print("   Training Linear Regression...")
lr_model.fit(X_train_scaled, y_train)

# Score the baseline on the held-out rows.
y_pred_lr = lr_model.predict(X_test_scaled)
r2_lr = r2_score(y_test, y_pred_lr)
mae_lr = mean_absolute_error(y_test, y_pred_lr)
mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)

# Register the estimator and its metrics for the comparison step.
models['Linear Regression'] = lr_model
results['Linear Regression'] = dict(
    RMSE=rmse_lr,
    MAE=mae_lr,
    R2=r2_lr,
    predictions=y_pred_lr,
)

print(f"   ‚úÖ Training complete!")
print(f"   üìä Results:")
print(f"      RMSE: {rmse_lr:.4f} (lower is better)")
print(f"      MAE:  {mae_lr:.4f} (lower is better)")
print(f"      R¬≤:   {r2_lr:.4f} (higher is better, max=1.0)")


----------------------------------------------------------------------
MODEL 1: LINEAR REGRESSION (Baseline)
----------------------------------------------------------------------
   Training Linear Regression...
   ‚úÖ Training complete!
   üìä Results:
      RMSE: 0.0764 (lower is better)
      MAE:  0.0530 (lower is better)
      R¬≤:   0.8197 (higher is better, max=1.0)


In [None]:
print("\n" + "-" * 70)
print("MODEL 2: RANDOM FOREST REGRESSION")
print("-" * 70)

# Fixed hyperparameters for the first-pass forest.
rf_params = dict(
    n_estimators=100,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1,
    verbose=0,
)
rf_model = RandomForestRegressor(**rf_params)

print("   Training Random Forest (this may take a few minutes)...")
rf_model.fit(X_train_scaled, y_train)

# Score on the held-out rows.
y_pred_rf = rf_model.predict(X_test_scaled)
r2_rf = r2_score(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)

# Register the estimator and its metrics for the comparison step.
models['Random Forest'] = rf_model
results['Random Forest'] = dict(
    RMSE=rmse_rf,
    MAE=mae_rf,
    R2=r2_rf,
    predictions=y_pred_rf,
)

print(f"   ‚úÖ Training complete!")
print(f"   üìä Results:")
print(f"      RMSE: {rmse_rf:.4f}")
print(f"      MAE:  {mae_rf:.4f}")
print(f"      R¬≤:   {r2_rf:.4f}")

# Rank the selected features by the forest's importance scores.
print(f"\n   üîç Feature Importance Analysis:")
feature_importance = pd.DataFrame({
    'Feature': selected_feature_names,
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=False)

print(f"   Top 10 Most Important Features:")
top_10 = feature_importance.head(10)
for feature_label, importance in zip(top_10['Feature'], top_10['Importance']):
    print(f"      {feature_label:30s}: {importance:.4f}")

# Horizontal bar chart of the 20 highest-ranked features, largest on top.
top_20 = feature_importance.head(20)
bar_positions = range(len(top_20))
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(bar_positions, top_20['Importance'].values)
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_20['Feature'].values)
ax.set_xlabel('Feature Importance')
ax.set_title('Top 20 Most Important Features (Random Forest)')
ax.invert_yaxis()
fig.tight_layout()
fig.savefig('outputs/feature_importance.png', dpi=150, bbox_inches='tight')
plt.close(fig)
print(f"   ‚úÖ Saved: outputs/feature_importance.png")


----------------------------------------------------------------------
MODEL 2: RANDOM FOREST REGRESSION
----------------------------------------------------------------------
   Training Random Forest (this may take a few minutes)...
   ‚úÖ Training complete!
   üìä Results:
      RMSE: 0.0807
      MAE:  0.0533
      R¬≤:   0.7988

   üîç Feature Importance Analysis:
   Top 10 Most Important Features:
      LN_IC50                       : 0.7849
      Pathway_Name_Mitosis          : 0.0153
      Putative_Target_HSF1          : 0.0144
      Drug_Name_Bryostatin 1        : 0.0087
      Putative_Target_PKC           : 0.0079
      Drug_Name_DMOG                : 0.0077
      Putative_Target_PI3K (class 1), MTORC1, MTORC2: 0.0072
      Putative_Target_dsDNA break induction: 0.0067
      Putative_Target_HIF-PH        : 0.0064
      Cell_Line_Encoded             : 0.0061
   ‚úÖ Saved: outputs/feature_importance.png


In [None]:
print("\n" + "-" * 70)
print("MODEL 3: XGBOOST REGRESSION")
print("-" * 70)

# Fixed hyperparameters for the first-pass gradient-boosted model.
xgb_params = dict(
    n_estimators=200,
    max_depth=8,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    verbosity=0,
)
xgb_model = xgb.XGBRegressor(**xgb_params)

print("   Training XGBoost (this may take a few minutes)...")
xgb_model.fit(X_train_scaled, y_train)

# Score on the held-out rows.
y_pred_xgb = xgb_model.predict(X_test_scaled)
r2_xgb = r2_score(y_test, y_pred_xgb)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)

# Register the estimator and its metrics for the comparison step.
models['XGBoost'] = xgb_model
results['XGBoost'] = dict(
    RMSE=rmse_xgb,
    MAE=mae_xgb,
    R2=r2_xgb,
    predictions=y_pred_xgb,
)

print(f"   ‚úÖ Training complete!")
print(f"   üìä Results:")
print(f"      RMSE: {rmse_xgb:.4f}")
print(f"      MAE:  {mae_xgb:.4f}")
print(f"      R¬≤:   {r2_xgb:.4f}")


----------------------------------------------------------------------
MODEL 3: XGBOOST REGRESSION
----------------------------------------------------------------------
   Training XGBoost (this may take a few minutes)...
   ‚úÖ Training complete!
   üìä Results:
      RMSE: 0.0700
      MAE:  0.0465
      R¬≤:   0.8487


In [None]:
print("\n" + "=" * 70)
print("STEP 5: MODEL COMPARISON")
print("=" * 70)

# One row per trained model, best R2 first.
metrics = ['RMSE', 'MAE', 'R2']
comparison_rows = [
    {'Model': model_name, **{metric: scores[metric] for metric in metrics}}
    for model_name, scores in results.items()
]
comparison_df = pd.DataFrame(comparison_rows, columns=['Model'] + metrics)
comparison_df = comparison_df.sort_values('R2', ascending=False)

print("\nüìä Model Performance Comparison:")
print(comparison_df.to_string(index=False))

# One horizontal bar panel per metric; colours follow the sorted row order.
bar_colors = ['#2ecc71', '#3498db', '#e74c3c']
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for panel, metric in zip(axes, metrics):
    metric_values = comparison_df[metric]
    panel.barh(comparison_df['Model'], metric_values, color=bar_colors)
    panel.set_xlabel(metric, fontsize=12)
    panel.set_title(f'Model Comparison - {metric}', fontsize=14, fontweight='bold')
    panel.grid(True, alpha=0.3, axis='x')

    # Write each value just past the end of its bar.
    for bar_pos, value in enumerate(metric_values):
        panel.text(value, bar_pos, f' {value:.4f}', va='center', fontsize=10)

    # Keep the R2 axis anchored at 0 and reaching at least 1.0.
    if metric == 'R2':
        panel.set_xlim([0, max(1.0, metric_values.max() * 1.1)])

plt.tight_layout()
plt.savefig('outputs/regression_model_comparison.png', dpi=150, bbox_inches='tight')
plt.close()
print("\n‚úÖ Saved: outputs/regression_model_comparison.png")

# The top row after sorting is the model with the highest R2.
best_row = comparison_df.iloc[0]
best_model_name = best_row['Model']
best_model = models[best_model_name]
best_r2 = best_row['R2']

print(f"\nüèÜ Best Model: {best_model_name}")
print(f"   R¬≤ Score: {best_r2:.4f}")
print(f"   RMSE: {best_row['RMSE']:.4f}")
print(f"   MAE: {best_row['MAE']:.4f}")


STEP 5: MODEL COMPARISON

📊 Model Performance Comparison:
            Model     RMSE      MAE       R2
          XGBoost 0.070001 0.046535 0.848689
Linear Regression 0.076421 0.053047 0.819662
    Random Forest 0.080725 0.053271 0.798778

✅ Saved: outputs/regression_model_comparison.png

🏆 Best Model: XGBoost
   R² Score: 0.8487
   RMSE: 0.0700
   MAE: 0.0465


In [None]:
print("\n" + "=" * 70)
print("STEP 6: HYPERPARAMETER TUNING")
print("=" * 70)

print(f"Tuning hyperparameters for: {best_model_name}")
print(f"Original R²: {best_r2:.4f}")

# Choose the search space and an unfitted estimator for the winning model
# type. param_grid is left as None when there is nothing to tune.
if best_model_name == 'Random Forest':
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 15, 20, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
    base_model = RandomForestRegressor(random_state=42, n_jobs=-1)

elif best_model_name == 'XGBoost':
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [6, 8, 10],
        'learning_rate': [0.01, 0.05, 0.1],
        'subsample': [0.8, 0.9, 1.0]
    }
    base_model = xgb.XGBRegressor(random_state=42, n_jobs=-1, verbosity=0)

elif best_model_name == 'Linear Regression':
    print("   ⚠️  Linear Regression has limited hyperparameters to tune")
    print("   Skipping hyperparameter tuning for Linear Regression")
    tuned_model = best_model
    param_grid = None
else:
    print(f"   ⚠️  Hyperparameter tuning not implemented for {best_model_name}")
    tuned_model = best_model
    param_grid = None

if param_grid is not None:
    print("\n   🔍 Performing Grid Search with 5-fold cross-validation...")
    print("   This may take 10-30 minutes depending on dataset size...")

    # For large training sets, swap in a smaller grid to keep the search tractable.
    if X_train.shape[0] > 50000:
        print("   ⚠️  Large dataset detected. Using reduced parameter grid for speed...")
        if best_model_name == 'Random Forest':
            param_grid = {
                'n_estimators': [100, 200],
                'max_depth': [10, 15],
                'min_samples_split': [5, 10]
            }
        elif best_model_name == 'XGBoost':
            param_grid = {
                'n_estimators': [100, 200],
                'max_depth': [6, 8],
                'learning_rate': [0.05, 0.1]
            }

    # NOTE(review): both the estimator and GridSearchCV request n_jobs=-1, so
    # every parallel CV worker may itself spawn one thread per core. If the
    # search is slower than expected, parallelise only one of the two levels.
    grid_search = GridSearchCV(
        base_model,
        param_grid,
        cv=5,
        scoring='r2',
        n_jobs=-1,
        verbose=1,
        return_train_score=True
    )

    grid_search.fit(X_train_scaled, y_train)

    # best_estimator_ is refit on the full training data with the best parameters.
    tuned_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    best_cv_score = grid_search.best_score_

    print("\n   ✅ Grid Search Complete!")
    print(f"   Best Parameters: {best_params}")
    print(f"   Best CV R² Score: {best_cv_score:.4f}")

    y_pred_tuned = tuned_model.predict(X_test_scaled)
    r2_tuned = r2_score(y_test, y_pred_tuned)
    rmse_tuned = np.sqrt(mean_squared_error(y_test, y_pred_tuned))
    mae_tuned = mean_absolute_error(y_test, y_pred_tuned)

    improvement = r2_tuned - best_r2
    # A relative change is undefined when the baseline R² is exactly zero.
    relative_change = f"{improvement / best_r2 * 100:+.2f}%" if best_r2 != 0 else "n/a"

    print("\n   📊 Comparison:")
    print(f"      Original R²: {best_r2:.4f}")
    print(f"      Tuned R²:    {r2_tuned:.4f}")
    print(f"      Improvement: {improvement:+.4f} ({relative_change})")

    # NOTE(review): tuned vs. original is decided on the test set, which makes
    # the final reported test metrics slightly optimistic.
    if improvement > 0:
        print("   ✅ Tuned model is better! Using tuned model.")
        best_model = tuned_model
        models[best_model_name] = tuned_model
        # Overwrite the stored metrics so later cells report the tuned model.
        results[best_model_name]['R2'] = r2_tuned
        results[best_model_name]['RMSE'] = rmse_tuned
        results[best_model_name]['MAE'] = mae_tuned
        results[best_model_name]['predictions'] = y_pred_tuned
    else:
        print("   ⚠️  Tuned model didn't improve. Using original model.")
        best_model = models[best_model_name]
else:
    best_model = models[best_model_name]



STEP 6: HYPERPARAMETER TUNING
Tuning hyperparameters for: XGBoost
Original R²: 0.8487

   🔍 Performing Grid Search with 5-fold cross-validation...
   This may take 10-30 minutes depending on dataset size...
Fitting 5 folds for each of 81 candidates, totalling 405 fits

   ✅ Grid Search Complete!
   Best Parameters: {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 300, 'subsample': 0.9}
   Best CV R² Score: 0.8978

   📊 Comparison:
      Original R²: 0.8487
      Tuned R²:    0.9027
      Improvement: +0.0540 (+6.36%)
   ✅ Tuned model is better! Using tuned model.


In [None]:
print("\n" + "=" * 70)
print("STEP 7: PREDICTION VISUALIZATION")
print("=" * 70)

y_pred_best = best_model.predict(X_test_scaled)
residuals = y_test - y_pred_best
r2_best = r2_score(y_test, y_pred_best)

# Make sure the target directory exists so the cell also works on a fresh checkout.
os.makedirs('outputs', exist_ok=True)

fig, (ax_fit, ax_resid) = plt.subplots(1, 2, figsize=(12, 5))

# 1. Predictions vs Actual scatter plot; the dashed diagonal marks y = x.
ax_fit.scatter(y_test, y_pred_best, alpha=0.5, s=20)
actual_range = [y_test.min(), y_test.max()]
ax_fit.plot(actual_range, actual_range, 'r--', lw=2, label='Perfect Prediction')
ax_fit.set_xlabel(f'Actual {target_col}', fontsize=12)
ax_fit.set_ylabel(f'Predicted {target_col}', fontsize=12)
ax_fit.set_title(f'Predictions vs Actual - {best_model_name}', fontsize=14, fontweight='bold')
ax_fit.legend()
ax_fit.grid(True, alpha=0.3)

# Display R² in the top-left corner of the panel (axes-fraction coordinates).
ax_fit.text(0.05, 0.95, f'R² = {r2_best:.4f}', transform=ax_fit.transAxes,
            fontsize=12, verticalalignment='top',
            bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

# 2. Residual plot
ax_resid.scatter(y_pred_best, residuals, alpha=0.5, s=20)
ax_resid.axhline(y=0, color='r', linestyle='--', lw=2)
ax_resid.set_xlabel(f'Predicted {target_col}', fontsize=12)
ax_resid.set_ylabel('Residuals (Actual - Predicted)', fontsize=12)
ax_resid.set_title('Residual Plot', fontsize=14, fontweight='bold')
ax_resid.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig('outputs/predictions_vs_actual.png', dpi=150, bbox_inches='tight')
plt.close(fig)
print("✅ Saved: outputs/predictions_vs_actual.png")

# 3. Error distribution
fig_err, ax_err = plt.subplots(figsize=(10, 6))
ax_err.hist(residuals, bins=50, edgecolor='black', alpha=0.7)
ax_err.set_xlabel('Residuals (Actual - Predicted)', fontsize=12)
ax_err.set_ylabel('Frequency', fontsize=12)
ax_err.set_title('Distribution of Prediction Errors', fontsize=14, fontweight='bold')
ax_err.axvline(x=0, color='r', linestyle='--', lw=2, label='Zero Error')
ax_err.legend()
ax_err.grid(True, alpha=0.3, axis='y')
fig_err.tight_layout()
fig_err.savefig('outputs/error_distribution.png', dpi=150, bbox_inches='tight')
plt.close(fig_err)
print("✅ Saved: outputs/error_distribution.png")


STEP 7: PREDICTION VISUALIZATION
✅ Saved: outputs/predictions_vs_actual.png
✅ Saved: outputs/error_distribution.png


In [None]:
import json

print("\n" + "=" * 70)
print("STEP 8: SAVE MODEL")
print("=" * 70)

# Create models directory
os.makedirs('models', exist_ok=True)

# Save model and preprocessing components. The model file name is derived
# from the winning model's display name, e.g. "Random Forest" -> "random_forest".
model_filename = best_model_name.lower().replace(" ", "_")
model_path = f'models/{model_filename}_regression.pkl'
scaler_path = 'models/regression_scaler.pkl'
selector_path = 'models/regression_selector.pkl'

joblib.dump(best_model, model_path)
joblib.dump(scaler, scaler_path)
joblib.dump(selector, selector_path)

print(f"✅ Model saved to: {model_path}")
print(f"✅ Scaler saved to: {scaler_path}")
print(f"✅ Feature selector saved to: {selector_path}")

# Save OneHotEncoder (for categorical features)
if ohe_encoder is not None:
    ohe_path = 'models/regression_onehot_encoder.pkl'
    joblib.dump(ohe_encoder, ohe_path)
    print(f"✅ OneHotEncoder saved to: {ohe_path}")

# Save cell line encoder (if used). The n_cell_lines > 50 check mirrors the
# condition under which a LabelEncoder was used for cell lines.
if cell_line_handled and 'Cell_Line_Name' in df.columns and n_cell_lines > 50:
    cell_encoder_path = 'models/regression_cell_line_encoder.pkl'
    joblib.dump(le_cell, cell_encoder_path)
    print(f"✅ Cell line encoder (LabelEncoder) saved to: {cell_encoder_path}")

# Save feature names for reference. Sequences are coerced to plain lists so
# json.dump does not fail if they arrive as a pandas Index or NumPy array.
feature_info = {
    'selected_features': list(selected_feature_names),
    'target_column': target_col,
    'model_type': best_model_name,
    'categorical_columns': list(categorical_cols_to_encode) if categorical_cols_to_encode else [],
    'encoding_method': 'OneHotEncoder' if ohe_encoder is not None else 'None'
}
with open('models/feature_info.json', 'w', encoding='utf-8') as f:
    json.dump(feature_info, f, indent=2)
print("✅ Feature info saved to: models/feature_info.json")


STEP 8: SAVE MODEL
✅ Model saved to: models/xgboost_regression.pkl
✅ Scaler saved to: models/regression_scaler.pkl
✅ Feature selector saved to: models/regression_selector.pkl
✅ OneHotEncoder saved to: models/regression_onehot_encoder.pkl
✅ Cell line encoder (LabelEncoder) saved to: models/regression_cell_line_encoder.pkl
✅ Feature info saved to: models/feature_info.json


In [None]:
def predict_drug_response(gene_expression_data, categorical_features=None,
                          model_path=model_path, scaler_path=scaler_path,
                          selector_path=selector_path,
                          ohe_path='models/regression_onehot_encoder.pkl'):
    """
    Predict drug response from gene expression and categorical features.

    The saved artifacts are loaded from disk on every call and applied in the
    order: (optional) one-hot encoding -> feature selection -> scaling ->
    model prediction.

    Parameters:
    -----------
    gene_expression_data : np.array or list
        Gene expression values (n_samples, n_genes). A 1-D input is treated
        as a single sample.
        Should match the number of genes used in training
    categorical_features : dict, optional
        Dictionary with keys: 'TCGA_Class', 'Drug_Name', etc.
        Values should be the actual category names (will be encoded). A value
        may be a single category (applied to every sample) or a sequence with
        one category per sample. Encoder columns that are absent from the
        dictionary are filled with 'UNKNOWN'.
    model_path : str
        Path to saved model
    scaler_path : str
        Path to saved scaler
    selector_path : str
        Path to saved feature selector
    ohe_path : str
        Path to saved OneHotEncoder; the encoder is optional and is skipped
        when the file does not exist

    Returns:
    --------
    predictions : np.array
        Predicted drug response values

    Raises:
    -------
    ValueError
        If a per-sample categorical sequence does not have n_samples entries.
    """
    # Load models
    # NOTE: joblib files are pickles; only load artifacts from trusted sources.
    model = joblib.load(model_path)
    scaler = joblib.load(scaler_path)
    selector = joblib.load(selector_path)

    # Load OneHotEncoder if available
    ohe = joblib.load(ohe_path) if os.path.exists(ohe_path) else None

    # Convert to numpy array
    if isinstance(gene_expression_data, (list, tuple)):
        gene_expression_data = np.array(gene_expression_data)

    # Reshape if single sample
    if gene_expression_data.ndim == 1:
        gene_expression_data = gene_expression_data.reshape(1, -1)

    n_samples = gene_expression_data.shape[0]

    # Handle categorical features using OneHotEncoder
    X_categorical_encoded = None
    if categorical_features and ohe is not None:
        # Column names the encoder was fitted on (empty if it was fitted
        # without feature names).
        encoder_cols = list(getattr(ohe, 'feature_names_in_', []))
        has_match = any(col in categorical_features for col in encoder_cols)

        if has_match:
            # Build one row per sample and one column per encoder column, in
            # the encoder's own column order.
            column_values = {}
            for col in encoder_cols:
                value = categorical_features.get(col)  # None -> 'UNKNOWN' below
                if isinstance(value, (list, tuple, np.ndarray, pd.Series)):
                    values = list(value)
                    if len(values) != n_samples:
                        raise ValueError(
                            f"categorical feature '{col}' has {len(values)} values "
                            f"but {n_samples} samples were provided"
                        )
                else:
                    # A scalar category applies to every sample.
                    values = [value] * n_samples
                column_values[col] = values

            # Fill missing values
            # NOTE(review): assumes the encoder was fitted with an 'UNKNOWN'
            # category or handle_unknown='ignore' — confirm against training.
            categorical_df = pd.DataFrame(column_values, columns=encoder_cols)
            categorical_df = categorical_df.fillna('UNKNOWN')

            # OneHot encode; densify because the encoder may return a sparse
            # matrix, which np.hstack cannot combine with a dense array.
            encoded = ohe.transform(categorical_df)
            if hasattr(encoded, 'toarray'):
                encoded = encoded.toarray()
            X_categorical_encoded = np.asarray(encoded)

    # Combine gene expression and categorical features
    # NOTE(review): assumes training stacked the numeric columns before the
    # one-hot columns — confirm against the feature-building cell.
    if X_categorical_encoded is not None:
        X_combined = np.hstack([gene_expression_data, X_categorical_encoded])
    else:
        X_combined = gene_expression_data
        if categorical_features:
            if ohe is None:
                print("⚠️  Warning: Categorical features provided but OneHotEncoder not found")
            else:
                print("⚠️  Warning: Categorical features provided but none match the OneHotEncoder's columns")
            print("   Using only gene expression features")

    # Feature selection
    X_selected = selector.transform(X_combined)

    # Scale
    X_scaled = scaler.transform(X_selected)

    # Predict
    predictions = model.predict(X_scaled)

    return predictions

print("\n✅ Inference function created!")
print("   Use predict_drug_response() to make predictions on new data")

# Test inference function
print("\n🧪 Testing inference function...")
# Use the first test sample with ALL of its features (one-hot encoded
# categorical columns + numeric columns). These are already fully encoded,
# so the whole row is passed as "gene_expression_data" and no
# categorical_features dict is supplied.
if isinstance(X_test_original, pd.DataFrame):
    test_sample = X_test_original.iloc[:1].values
else:
    test_sample = X_test_original[:1, :]
pred_test = predict_drug_response(test_sample)
print(f"   Test prediction: {pred_test[0]:.4f}")
# Handle both Series and array for y_test
actual_value = y_test.iloc[0] if hasattr(y_test, 'iloc') else y_test[0]
print(f"   Actual value: {actual_value:.4f}")
print(f"   Error: {abs(pred_test[0] - actual_value):.4f}")


✅ Inference function created!
   Use predict_drug_response() to make predictions on new data

🧪 Testing inference function...
   Test prediction: 0.9158
   Actual value: 0.9222
   Error: 0.0064




In [None]:
print("\n" + "=" * 70)
print("PROJECT 3 COMPLETE! 🎉")
print("=" * 70)

# Metrics come from `results`, which STEP 6 overwrites when the tuned model
# beats the original, so this reports the model that was actually saved.
final_scores = results[best_model_name]

print("\n📊 Final Results:")
print(f"   Best Model: {best_model_name}")
print(f"   R² Score: {final_scores['R2']:.4f}")
print(f"   RMSE: {final_scores['RMSE']:.4f}")
print(f"   MAE: {final_scores['MAE']:.4f}")

print("\n📁 Outputs saved:")
print(f"   - Model: {model_path}")
print("   - Visualizations: outputs/ directory")
print("   - Feature importance: outputs/feature_importance.png")




PROJECT 3 COMPLETE! 🎉

📊 Final Results:
   Best Model: XGBoost
   R² Score: 0.9027
   RMSE: 0.0561
   MAE: 0.0321

📁 Outputs saved:
   - Model: models/xgboost_regression.pkl
   - Visualizations: outputs/ directory
   - Feature importance: outputs/feature_importance.png
