In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# Load synthetic data (combined genomic, drug response, and EHR data)
combined_data = pd.read_csv('synthetic_data.csv')

# Identify features (X) and target variable (y)
features = combined_data.drop(['Patient_ID', 'Aspirin_Efficacy_Score', 'Insulin_Efficacy_Score', 'Statins_Efficacy_Score'], axis=1)
target_aspirin = combined_data['Aspirin_Side_Effects']
target_insulin = combined_data['Insulin_Side_Effects']
target_statins = combined_data['Statins_Side_Effects']

# Separate numeric and categorical features
numeric_features = features.drop(['Gender', 'Medical_History', 'Current_Medications'], axis=1)
categorical_features = features[['Gender', 'Medical_History', 'Current_Medications']]

# Create a preprocessor to handle numeric and categorical features separately
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_features.columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features.columns)
    ])

# Define the pipeline with preprocessing and the classifier
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Train and evaluate classifiers for each drug
def train_and_evaluate_classifier(X, y, model_name):
    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train the classifier
    pipeline.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = pipeline.predict(X_test)

    # Evaluate the classifier
    accuracy = accuracy_score(y_test, y_pred)
    print(f'{model_name} - Accuracy: {accuracy:.2f}')
    print(f'{model_name} - Classification Report:\n{classification_report(y_test, y_pred)}')

# Train and evaluate classifiers for each drug
train_and_evaluate_classifier(features, target_aspirin, 'Aspirin Classifier')
train_and_evaluate_classifier(features, target_insulin, 'Insulin Classifier')
train_and_evaluate_classifier(features, target_statins, 'Statins Classifier')


Aspirin Classifier - Accuracy: 1.00
Aspirin Classifier - Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00        10

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20

Insulin Classifier - Accuracy: 1.00
Insulin Classifier - Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         8
           1       1.00      1.00      1.00        12

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20

Statins Classifier - Accuracy: 1.00
Statins Classifier - Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        