In [None]:
# Clone the CaliForest repository and make the clone the working directory
# for the cells that follow.
# NOTE(review): presumably not idempotent -- re-running in the same session
# would try to clone into an existing directory and `%cd` one level deeper;
# run once per fresh runtime. TODO confirm.
!git clone https://github.com/yubin-park/califorest.git
%cd califorest

In [None]:
!pip install numpy pandas scikit-learn matplotlib

In [None]:
import pandas as pd

# Load the dataset -- point this path at your own copy of the CSV.
data = pd.read_csv('Synthetic_MIMIC-III_Dataset.csv')

# Quick look at the first rows and the available columns
print(data.head(), data.columns, sep="\n")

# Separate the label column from the remaining feature columns
y = data['mortality_risk']
X = data.drop(columns=['mortality_risk'])

# Sanity-check the shapes of both pieces
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

In [None]:
# First, check what's available in the repository
!ls -la /content/califorest/
!ls -la /content/califorest/califorest/
# See what's in the califorest module
import CaliForest
print(dir(califorest))

!cat /content/califorest/califorest/__init__.py

In [None]:
# Inspect the upstream CaliForest class: constructor signature, then docstring
import inspect
from califorest import CaliForest

# First line of output: the __init__ signature.
# Remaining output: the class docstring (prints "None" if the class has none).
print(
    inspect.signature(CaliForest.__init__),
    CaliForest.__doc__,
    sep="\n",
)

In [None]:
# Create our own implementation of CaliForest with updated scikit-learn compatibility
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV

class MyCalibForest:
    """Random forest whose predicted probabilities are calibrated.

    A ``RandomForestClassifier`` template is built in ``__init__`` and wrapped
    in ``CalibratedClassifierCV`` (5-fold) when ``fit`` is called. Prediction
    methods delegate to the fitted calibrated classifier.
    """

    def __init__(self,
                 n_estimators=300,
                 criterion='gini',
                 max_depth=5,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 ctype='isotonic',
                 alpha0=100,
                 beta0=25):
        """
        Implementation of CaliForest that works with modern scikit-learn

        Parameters:
        -----------
        n_estimators : int, default=300
            Number of trees in the forest
        criterion : str, default='gini'
            Function to measure the quality of a split
        max_depth : int, default=5
            Maximum depth of the trees
        min_samples_split : int, default=2
            Minimum number of samples required to split a node
        min_samples_leaf : int, default=1
            Minimum number of samples required at a leaf node
        ctype : str, default='isotonic'
            Calibration method ('isotonic' or 'sigmoid')
        alpha0, beta0 : float, default=100, 25
            Prior parameters for calibration. They are stored on the instance
            but are not used anywhere in this implementation.
        """
        self.n_estimators = n_estimators
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.ctype = ctype
        self.alpha0 = alpha0
        self.beta0 = beta0

        # Unfitted template forest; it is handed to CalibratedClassifierCV in fit()
        self.base_estimator = RandomForestClassifier(
            n_estimators=n_estimators,
            criterion=criterion,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_features="sqrt",  # Using 'sqrt' instead of 'auto'
            bootstrap=True,
            random_state=42,
            n_jobs=-1
        )

    def fit(self, X, y):
        """Fit the calibrated classifier on (X, y) and return ``self``."""
        # The wrapped model is passed through the `estimator` keyword
        # (the attribute on this class is still called `base_estimator`).
        self.calibrated_clf = CalibratedClassifierCV(
            estimator=self.base_estimator,
            method=self.ctype,
            cv=5
        )
        self.calibrated_clf.fit(X, y)
        self.classes_ = self.calibrated_clf.classes_
        return self

    def predict_proba(self, X):
        """Predict class probabilities"""
        return self.calibrated_clf.predict_proba(X)

    def predict(self, X):
        """Predict class labels"""
        return self.calibrated_clf.predict(X)

    @property
    def feature_importances_(self):
        """Feature importances averaged over the fitted forests.

        Each entry of ``calibrated_clf.calibrated_classifiers_`` holds one
        fitted forest in its ``estimator`` attribute; the importances of those
        forests are averaged element-wise, so the result describes the model
        that actually produces the predictions.

        Raises
        ------
        AttributeError
            If the model has not been fitted yet (this also makes
            ``hasattr(model, 'feature_importances_')`` return False).
        """
        # Local import: the cell defining this class does not import numpy.
        import numpy as np

        calibrated_clf = getattr(self, "calibrated_clf", None)
        fitted_members = getattr(calibrated_clf, "calibrated_classifiers_", None)
        if not fitted_members:
            raise AttributeError(
                "feature_importances_ is only available after fit() has been called"
            )

        per_forest = [
            member.estimator.feature_importances_ for member in fitted_members
        ]
        return np.mean(per_forest, axis=0)

# Hold out a stratified 20% test set (fixed seed so the split is repeatable)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Build the calibrated forest and train it (fit() returns the model itself)
model = MyCalibForest(
    n_estimators=300,
    criterion='gini',
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=1,
    ctype='isotonic',
).fit(X_train, y_train)

# Hard class predictions on the held-out set
y_pred = model.predict(X_test)

# Score the predictions with the standard classification metrics
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(y_test, y_pred)

print(
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1 Score: {f1:.4f}",
    "Confusion Matrix:",
    conf_matrix,
    sep="\n",
)

# Class-probability estimates (one column per class), used later for ROC curves
y_prob = model.predict_proba(X_test)
print(f"Probability shape: {y_prob.shape}")
print("Sample probabilities:", y_prob[:5], sep="\n")  # first 5 rows only

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import roc_curve, auc, precision_recall_curve

# Positive-class probability (second column of predict_proba) drives both curves
y_prob = model.predict_proba(X_test)[:, 1]

# --- ROC curve ---
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(fpr, tpr, label=f'ROC curve (area = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], 'k--')  # diagonal = random guessing
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")
plt.show()

# --- Precision-Recall curve ---
precision, recall, _ = precision_recall_curve(y_test, y_prob)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(recall, precision, label='Precision-Recall curve')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve')
ax.legend()
plt.show()

# --- Feature importance (bars sorted from most to least important) ---
if hasattr(model, 'feature_importances_'):
    importances = model.feature_importances_
    indices = np.argsort(importances)[::-1]

    fig, ax = plt.subplots(figsize=(12, 8))
    ax.bar(range(X.shape[1]), importances[indices])
    ax.set_xticks(range(X.shape[1]))
    ax.set_xticklabels(X.columns[indices], rotation=90)
    ax.set_xlabel('Features')
    ax.set_ylabel('Importance')
    ax.set_title('Feature Importance')
    fig.tight_layout()
    plt.show()

# --- Confusion matrix heatmap ---
import seaborn as sns

fig, ax = plt.subplots(figsize=(8, 6))
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# Compare calibrated forests that each change one hyperparameter from the default
import pandas as pd

models = {
    'Default': MyCalibForest(),
    'Deep Trees': MyCalibForest(max_depth=10),
    'More Trees': MyCalibForest(n_estimators=500),
    'Fewer Trees': MyCalibForest(n_estimators=100),
    'Min Samples Split 5': MyCalibForest(min_samples_split=5),
    'Min Samples Leaf 5': MyCalibForest(min_samples_leaf=5),
    'Entropy Criterion': MyCalibForest(criterion='entropy'),
    'Sigmoid Calibration': MyCalibForest(ctype='sigmoid')
}

# Fit every variant on the training split and score it on the test split.
# Loop-local names (`candidate`, `candidate_pred`) are used on purpose: reusing
# `model` / `y_pred` / `accuracy` / ... here would overwrite the baseline model
# and its metrics, so re-running the plotting cell afterwards would silently
# plot the last variant instead of the baseline.
results = {}
for name, candidate in models.items():
    candidate.fit(X_train, y_train)
    candidate_pred = candidate.predict(X_test)
    results[name] = {
        'Accuracy': accuracy_score(y_test, candidate_pred),
        'Precision': precision_score(y_test, candidate_pred),
        'Recall': recall_score(y_test, candidate_pred),
        'F1': f1_score(y_test, candidate_pred),
    }

# Display results (one row per variant, one column per metric)
results_df = pd.DataFrame(results).T
print(results_df)

# Plot results
results_df.plot(kind='bar', figsize=(14, 8))
plt.title('Model Performance Comparison')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Train comparison models: the calibrated forest against three standard classifiers
models = {
    'CaliForest': MyCalibForest(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'SVM': SVC(probability=True, random_state=42)
}

# Fit each model on the training split and score it on the test split.
# Loop-local names (`candidate`, `candidate_pred`) keep the baseline `model`,
# `y_pred` and scalar metrics from the earlier training cell intact, so cells
# that plot the baseline still work if they are re-run after this one.
results = {}
for name, candidate in models.items():
    candidate.fit(X_train, y_train)
    candidate_pred = candidate.predict(X_test)
    results[name] = {
        'Accuracy': accuracy_score(y_test, candidate_pred),
        'Precision': precision_score(y_test, candidate_pred),
        'Recall': recall_score(y_test, candidate_pred),
        'F1': f1_score(y_test, candidate_pred),
    }

# Display results (one row per model, one column per metric)
comparison_df = pd.DataFrame(results).T
print(comparison_df)

# Plot results
comparison_df.plot(kind='bar', figsize=(12, 6))
plt.title('Model Comparison')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()