In [None]:
# Install essential libraries
!pip install pandas scikit-learn matplotlib seaborn

In [24]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns

In [25]:
# Function to load and preprocess data
def load_and_preprocess_data(filepath):
    """Load the dataset and turn it into a scaled feature matrix plus targets.

    Rows lacking any of the three therapy labels are discarded. Every
    remaining column except ``patient_id`` and the labels is used as a
    feature: object-typed columns are label-encoded, missing values are
    replaced by the column mean, and all features are standardized.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file; passed straight to ``pandas.read_csv``.

    Returns
    -------
    X : numpy.ndarray
        Standardized feature matrix, one row per retained row of the file.
    y : numpy.ndarray
        One column per therapy, in the order chemotherapy, hormone_therapy,
        radio_therapy.

    Notes
    -----
    The imputer and the scaler are fitted on every row returned here. If the
    caller splits the result into train and test sets afterwards, the
    imputation means and scaling statistics will include the test rows.
    """
    target_cols = ['chemotherapy', 'hormone_therapy', 'radio_therapy']

    # Load the dataset
    data = pd.read_csv(filepath)

    # Drop rows with missing target values
    data = data.dropna(subset=target_cols)

    # Separate features and target variables
    X = data.drop(['patient_id'] + target_cols, axis=1)

    # Creating the target variable as a multi-label problem
    y = data[target_cols].values

    # Preprocessing: Handling categorical variables
    categorical_cols = X.select_dtypes(include=['object']).columns

    # Apply Label Encoding to categorical columns. Casting to str gives the
    # encoder uniform input even when a column mixes value types.
    # NOTE(review): with object-dtype columns the cast turns missing values into
    # the literal string 'nan', so they receive their own code instead of being
    # imputed below -- confirm this is intended.
    for col in categorical_cols:
        X[col] = LabelEncoder().fit_transform(X[col].astype(str))

    # Preprocessing: Handling missing values.
    # The imputer output is consumed as a plain array. SimpleImputer discards
    # columns that contain no observed values, so re-labelling its output with
    # the original column names would fail with a shape mismatch whenever such
    # a column exists.
    X = SimpleImputer(strategy='mean').fit_transform(X)

    # Feature scaling
    X = StandardScaler().fit_transform(X)

    return X, y

In [34]:
# Build the feature matrix and the therapy targets from the METABRIC CSV
X, y = load_and_preprocess_data(filepath='datasets/metabric.csv')

In [27]:
# Function to train the model
def train_model(X_train, y_train, n_estimators=100, random_state=42):
    """Fit a RandomForestClassifier on the training data and return it.

    Parameters
    ----------
    X_train : array-like
        Training features, passed unchanged to ``RandomForestClassifier.fit``.
    y_train : array-like
        Training targets, passed unchanged to ``RandomForestClassifier.fit``.
    n_estimators : int, default=100
        Number of trees in the forest.
    random_state : int or None, default=42
        Seed forwarded to the classifier so repeated runs build the same forest.

    Returns
    -------
    RandomForestClassifier
        The fitted model.
    """
    model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    return model

In [28]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [29]:
# Fit the classifier on the training split
model = train_model(X_train=X_train, y_train=y_train)

In [30]:
# Function to visualize test samples
def visualize_test_samples(y_test, y_pred, therapies=None):
    """Plot one confusion-matrix heatmap per therapy.

    Parameters
    ----------
    y_test : array of shape (n_samples, n_therapies)
        True labels; column ``i`` belongs to ``therapies[i]``.
    y_pred : array of shape (n_samples, n_therapies)
        Predicted labels, laid out like ``y_test``.
    therapies : sequence of str, optional
        Display name for each target column. Defaults to
        ``['Chemotherapy', 'Hormone Therapy', 'Radiation Therapy']``.
    """
    if therapies is None:
        therapies = ['Chemotherapy', 'Hormone Therapy', 'Radiation Therapy']

    for i in range(y_test.shape[1]):
        actual, predicted = y_test[:, i], y_pred[:, i]

        # Sorted union of the observed classes: the same set confusion_matrix
        # would infer on its own, made explicit so the tick labels can follow it.
        classes = sorted(set(actual) | set(predicted))
        cm = confusion_matrix(actual, predicted, labels=classes)

        # 'No'/'Yes' only fits a 2x2 matrix. When a column holds a single class
        # (or more than two), fixed two-item tick labels no longer match the
        # matrix size and the heatmap fails, so show the class values instead.
        tick_labels = ['No', 'Yes'] if len(classes) == 2 else classes

        fig, ax = plt.subplots(figsize=(6, 4))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=tick_labels, yticklabels=tick_labels, ax=ax)
        ax.set_ylabel('Actual')
        ax.set_xlabel('Predicted')
        ax.set_title(f'Confusion Matrix for {therapies[i]}')
        plt.show()

In [None]:
# Predict the therapies for the held-out samples
y_pred = model.predict(X_test)

# Plot a confusion matrix per therapy from those predictions
visualize_test_samples(y_test=y_test, y_pred=y_pred)

In [32]:
# Function to evaluate the model
def evaluate_model(model, X_test, y_test):
    """Print a classification report per therapy for the model's test predictions."""
    predictions = model.predict(X_test)
    therapy_names = ('Chemotherapy', 'Hormone Therapy', 'Radiation Therapy')

    # One report per target column, headed by the matching therapy name
    for col in range(y_test.shape[1]):
        header = f"\nClassification Report for {therapy_names[col]}:"
        print(header)
        report = classification_report(y_test[:, col], predictions[:, col])
        print(report)

In [None]:
# Print the per-therapy classification reports for the test split
evaluate_model(model=model, X_test=X_test, y_test=y_test)