In [None]:
# Install essential libraries
!pip install pandas scikit-learn matplotlib seaborn

In [14]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns

In [15]:
# Function to load and preprocess data
def load_and_preprocess_data(filepath):
    """Load the dataset and turn it into a scaled, fully numeric feature matrix.

    Processing steps, in order:
      1. Read the CSV and drop rows whose 'overall_survival' target is missing.
      2. Split off the target and drop the 'patient_id' column.
      3. Label-encode every non-numeric column (values are cast to ``str``
         first, so a missing value is encoded as its own 'nan' category
         rather than being imputed).
      4. Drop columns with no observed values, mean-impute the remaining gaps.
      5. Standardize every feature with ``StandardScaler``.

    Args:
        filepath: Location of the CSV file, passed straight to ``pd.read_csv``.
            The file must contain 'overall_survival' and 'patient_id' columns.

    Returns:
        tuple: ``(X, y)`` where ``X`` is the array returned by
        ``StandardScaler.fit_transform`` and ``y`` is the 'overall_survival'
        Series of the retained rows (it keeps the row labels from the CSV, so
        it lines up with ``X`` by position, not by index label).

    Raises:
        KeyError: If 'overall_survival' or 'patient_id' is absent from the file.

    Note:
        The imputer and scaler are fitted on all rows, i.e. before any
        train/test split, so their statistics include rows that later end up
        in the test set.
    """
    # Load the dataset and keep only rows with a known target
    data = pd.read_csv(filepath)
    data = data.dropna(subset=['overall_survival'])

    # Separate features and target variable
    X = data.drop(['overall_survival', 'patient_id'], axis=1)
    y = data['overall_survival']

    # Everything that is not numeric/boolean is treated as categorical. This
    # also catches 'category' and string-dtype columns, which an object-only
    # filter would leave for the mean imputer to choke on.
    categorical_cols = X.select_dtypes(exclude=['number', 'bool']).columns
    for col in categorical_cols:
        X[col] = LabelEncoder().fit_transform(X[col].astype(str))

    # A column without a single observed value has no mean; SimpleImputer
    # would silently drop it and change the feature count, so remove such
    # columns explicitly before imputing.
    X = X.dropna(axis=1, how='all')

    # Fill remaining gaps with the column mean, then standardize
    X_imputed = SimpleImputer(strategy='mean').fit_transform(X)
    X = StandardScaler().fit_transform(X_imputed)

    return X, y

In [24]:
# Build the feature matrix and target vector from the raw CSV
X, y = load_and_preprocess_data(filepath='datasets/metabric.csv')

In [17]:
# Function to train the model
def train_model(X_train, y_train, n_estimators=100, random_state=42):
    """Fit a RandomForestClassifier on the training data.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels, one per row of ``X_train``.
        n_estimators: Number of trees in the forest (default 100).
        random_state: Seed passed to the classifier so repeated runs build
            the same forest (default 42).

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    return model

In [18]:
# Train-test split (80/20). `stratify=y` keeps the class proportions of the
# target the same in both partitions, so the test metrics are not skewed by
# an unlucky draw.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [19]:
# Fit the classifier on the training partition
model = train_model(X_train=X_train, y_train=y_train)

In [20]:
def visualize_test_samples(y_test, y_pred, class_names=('Not Survived', 'Survived')):
    """Plot the confusion matrix of the test predictions as an annotated heatmap.

    Rows are the actual classes and columns the predicted classes, both in
    ascending order of label value.

    Args:
        y_test: True labels of the test samples.
        y_pred: Labels predicted for the same samples.
        class_names: Display names for the classes, given in ascending order
            of label value. The default assumes the target is encoded as
            0 = not survived, 1 = survived -- TODO confirm against the
            dataset's data dictionary. If the number of names does not match
            the number of classes found, the raw label values are shown.
    """
    # Pin the class order explicitly so the tick labels below are guaranteed
    # to line up with the matrix rows/columns.
    classes = sorted(set(y_test) | set(y_pred))
    cm = confusion_matrix(y_test, y_pred, labels=classes)

    if len(class_names) == len(classes):
        tick_labels = list(class_names)
    else:
        # e.g. only one class present in this sample: avoid mislabeled axes
        tick_labels = [str(c) for c in classes]

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=tick_labels, yticklabels=tick_labels)
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.title('Confusion Matrix')
    plt.show()

In [None]:
# Generate class predictions for the held-out samples
y_pred = model.predict(X_test)

# Show how the predictions compare with the true labels
visualize_test_samples(y_test=y_test, y_pred=y_pred)

In [22]:
# Function to evaluate the model
def evaluate_model(model, X_test, y_test):
    """Print the classification report and the AUC-ROC score for the test set.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        X_test: Test feature matrix.
        y_test: True labels for ``X_test``.
    """
    predictions = model.predict(X_test)

    # Per-class precision / recall / F1
    print("\nClassification Report:")
    report = classification_report(y_test, predictions)
    print(report)

    # AUC is computed from the probability of the second class (column 1)
    print("\nAUC-ROC Score:")
    second_class_proba = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, second_class_proba)
    print(auc)

In [None]:
# Report test-set metrics for the trained classifier
evaluate_model(model=model, X_test=X_test, y_test=y_test)