In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Load the patient blood-test table from a CSV located in the working directory.
data = pd.read_csv("patient_treatment_classification.csv")

In [None]:
# Preview the first rows to sanity-check that the columns parsed as expected.
data.head()

Unnamed: 0,HAEMATOCRIT,HAEMOGLOBINS,ERYTHROCYTE,LEUCOCYTE,THROMBOCYTE,MCH,MCHC,MCV,AGE,SEX,SOURCE
0,33.8,11.1,4.18,4.6,150,26.6,32.8,80.9,33,F,1
1,44.6,14.0,6.86,6.3,232,20.4,31.4,65.0,36,M,0
2,42.9,14.0,4.57,6.2,336,30.6,32.6,93.9,70,F,0
3,41.9,14.4,4.67,3.5,276,30.8,34.4,89.7,18,F,0
4,40.6,13.3,4.85,14.9,711,27.4,32.8,83.7,36,M,0


In [None]:
# Summarise column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3309 entries, 0 to 3308
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HAEMATOCRIT   3309 non-null   float64
 1   HAEMOGLOBINS  3309 non-null   float64
 2   ERYTHROCYTE   3309 non-null   float64
 3   LEUCOCYTE     3309 non-null   float64
 4   THROMBOCYTE   3309 non-null   int64  
 5   MCH           3309 non-null   float64
 6   MCHC          3309 non-null   float64
 7   MCV           3309 non-null   float64
 8   AGE           3309 non-null   int64  
 9   SEX           3309 non-null   object 
 10  SOURCE        3309 non-null   int64  
dtypes: float64(7), int64(3), object(1)
memory usage: 284.5+ KB


In [None]:
# Remove the SEX column (the only object-dtype column reported by data.info()).
# Re-assigning instead of mutating in place, together with errors="ignore",
# keeps this cell idempotent: running it a second time is a no-op rather than
# a KeyError.
data = data.drop(columns=["SEX"], errors="ignore")

In [None]:
# Count rows that exactly repeat an earlier row.
data.duplicated().sum()

0

In [None]:
# Class balance of the SOURCE target column.
data["SOURCE"].value_counts()

Unnamed: 0_level_0,count
SOURCE,Unnamed: 1_level_1
0,1992
1,1317


In [None]:
# Number of missing values per column.
data.isnull().sum()

Unnamed: 0,0
HAEMATOCRIT,0
HAEMOGLOBINS,0
ERYTHROCYTE,0
LEUCOCYTE,0
THROMBOCYTE,0
MCH,0
MCHC,0
MCV,0
AGE,0
SOURCE,0


In [None]:
# Shuffle the rows with a fixed seed so that a fresh "Restart & Run All"
# reproduces the same ordering (and therefore the same train/test split and
# search results), then renumber the index from 0.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Preview the rows after shuffling and index reset.
data.head()

Unnamed: 0,HAEMATOCRIT,HAEMOGLOBINS,ERYTHROCYTE,LEUCOCYTE,THROMBOCYTE,MCH,MCHC,MCV,AGE,SOURCE
0,41.7,13.7,5.41,6.0,75,25.3,32.9,77.1,9,1
1,27.0,8.8,3.24,76.6,300,27.2,32.6,83.3,69,1
2,39.4,13.4,4.89,2.7,25,27.4,34.0,80.6,29,1
3,40.2,13.4,4.64,8.2,296,28.9,33.3,86.6,33,1
4,40.3,13.3,4.81,4.3,245,27.7,33.0,83.8,25,0


In [None]:
# SOURCE is the label; every remaining column is a predictor.
Y = data['SOURCE']
X = data.drop(columns=['SOURCE'])

In [None]:
# Confirm the feature matrix and the target have the same number of rows.
X.shape, Y.shape

((3309, 9), (3309,))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score

In [None]:
!pip install bayesian-optimization

Collecting bayesian-optimization
  Downloading bayesian_optimization-2.0.3-py3-none-any.whl.metadata (9.0 kB)
Collecting colorama<0.5.0,>=0.4.6 (from bayesian-optimization)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading bayesian_optimization-2.0.3-py3-none-any.whl (31 kB)
Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: colorama, bayesian-optimization
Successfully installed bayesian-optimization-2.0.3 colorama-0.4.6


In [None]:
import itertools
import os
import time

import joblib
import numpy as np
import pandas as pd
import tensorflow as tf
from bayes_opt import BayesianOptimization
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score

class MetaLearnerPredictor:
    """
    Wrapper around a saved Keras meta-learner that predicts the accuracy a
    Random Forest would reach for a given dataset description and set of
    hyperparameters.
    """

    def __init__(self, model_dir='saved_model'):
        """
        Load the saved model, scaler, and mappings

        Parameters
        ----------
        model_dir : str
            Directory containing 'meta_learner_model.keras',
            'feature_scaler.pkl' and 'feature_mappings.pkl'.
        """
        self.model = tf.keras.models.load_model(os.path.join(model_dir, 'meta_learner_model.keras'))

        # NOTE(review): joblib.load unpickles arbitrary Python objects, so
        # model_dir must only ever point at trusted artefacts.
        self.feature_scaler = joblib.load(os.path.join(model_dir, 'feature_scaler.pkl'))

        # Load the mappings between max_features names and their numeric codes
        mappings = joblib.load(os.path.join(model_dir, 'feature_mappings.pkl'))
        self.max_features_mapping = mappings['max_features_mapping']
        self.inverse_max_features_mapping = mappings['inverse_max_features_mapping']

    def predict_accuracy(self, dataset_features, hyperparams):
        """
        Predict accuracy for given dataset features and hyperparameters

        Parameters
        ----------
        dataset_features : array-like
            A single dataset description, either as one row of a 2-D array
            or as a flat 1-D sequence (promoted to one row).
        hyperparams : dict
            Must contain 'n_estimators', 'max_depth', 'max_features' and
            'min_samples_split'. A string 'max_features' is encoded through
            ``max_features_mapping``; unknown strings fall back to code 1.

        Returns
        -------
        float
            The model's prediction for the first (only) row.

        Raises
        ------
        ValueError
            If any of the four hyperparameters is None, because None cannot
            be placed in the numeric input row fed to the model.
        """
        # Scale dataset features (promote a flat vector to a single row first)
        dataset_scaled = self.feature_scaler.transform(np.atleast_2d(dataset_features))

        # Convert max_features to numerical value
        max_features = hyperparams['max_features']
        if isinstance(max_features, str):
            max_features_encoded = self.max_features_mapping.get(max_features, 1)  # default to 'sqrt'
        else:
            max_features_encoded = max_features

        hyper_values = {
            'n_estimators': hyperparams['n_estimators'],
            'max_depth': hyperparams['max_depth'],
            'max_features': max_features_encoded,
            'min_samples_split': hyperparams['min_samples_split'],
        }
        missing = [key for key, value in hyper_values.items() if value is None]
        if missing:
            # Fail early with a readable message instead of building an
            # object-dtype array that the model cannot consume.
            raise ValueError(f"Hyperparameters must be numeric, got None for: {missing}")

        # Combine features: scaled dataset description followed by the
        # (unscaled) hyperparameter values, in the same order as before.
        X = np.hstack([
            dataset_scaled,
            np.array([list(hyper_values.values())])
        ])

        # verbose=0 stops Keras from printing a progress bar on every call,
        # which otherwise floods the notebook when this runs inside a grid loop.
        return float(self.model.predict(X, verbose=0)[0][0])

def find_best_hyperparameters(meta_learner, dataset_features, param_grid):
    """
    Find best hyperparameters using meta-learner

    Every combination of 'n_estimators', 'max_depth', 'max_features' and
    'min_samples_split' in ``param_grid`` is scored with
    ``meta_learner.predict_accuracy`` and the highest-scoring one is kept
    (the first one encountered wins ties).

    Parameters
    ----------
    meta_learner : object
        Must provide ``predict_accuracy(dataset_features, params)`` and an
        ``inverse_max_features_mapping`` dict.
    dataset_features : array-like
        Passed through unchanged to ``predict_accuracy``.
    param_grid : dict
        Maps each of the four hyperparameter names to a list of candidates.

    Returns
    -------
    tuple
        ``(best_params, best_accuracy, time_taken)`` where ``time_taken`` is
        the elapsed search time in seconds.

    Raises
    ------
    ValueError
        If no combination produced a comparable prediction (for example an
        empty grid or all-NaN predictions).
    """
    start_time = time.perf_counter()

    # Start from -inf rather than 0 so a winner is still selected when the
    # meta-learner outputs zero or negative values for every candidate.
    best_accuracy = float('-inf')
    best_params = None

    grid_axes = (
        param_grid['n_estimators'],
        param_grid['max_depth'],
        param_grid['max_features'],
        param_grid['min_samples_split'],
    )

    # Try all combinations from param_grid (same order as nested loops)
    for n_est, depth, feat, min_split in itertools.product(*grid_axes):
        params = {
            'n_estimators': n_est,
            'max_depth': depth,
            'max_features': feat,
            'min_samples_split': min_split
        }

        pred_accuracy = meta_learner.predict_accuracy(dataset_features, params)

        if pred_accuracy > best_accuracy:
            best_accuracy = pred_accuracy
            best_params = params

    if best_params is None:
        raise ValueError(
            "No hyperparameter combination could be ranked: param_grid is empty "
            "or the meta-learner returned no comparable predictions"
        )

    # Convert max_features back to string if it's in the mapping
    encoded = best_params['max_features']
    if isinstance(encoded, (int, float)) and encoded in meta_learner.inverse_max_features_mapping:
        best_params['max_features'] = meta_learner.inverse_max_features_mapping[round(encoded)]

    time_taken = time.perf_counter() - start_time

    return best_params, best_accuracy, time_taken

def compare_methods(dataset_features, param_grid, X_train, y_train, cv=3):
    """
    Compare meta-learner with traditional methods
    Returns time taken and best parameters for each method

    Parameters
    ----------
    dataset_features : array-like
        Dataset description forwarded to the meta-learner.
    param_grid : dict
        Candidate lists for 'n_estimators', 'max_depth', 'max_features' and
        'min_samples_split'.
    X_train, y_train
        Training data used by grid search and Bayesian optimization.
    cv : int, default 3
        Number of cross-validation folds used by BOTH grid search and the
        Bayesian objective, so their 'best_score' values are comparable.

    Returns
    -------
    dict
        Keys 'grid_search', 'bayesian_opt' and 'meta_learner', each mapping
        to a dict with 'time' (seconds), 'best_params' and 'best_score'.
        For the first two, 'best_score' is a mean cross-validated score; for
        the meta-learner it is the model's *predicted* accuracy.
    """
    results = {}

    # 1. Grid Search
    start_time = time.perf_counter()
    grid_search = GridSearchCV(
        RandomForestClassifier(random_state=42),
        param_grid,
        cv=cv,
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)
    grid_time = time.perf_counter() - start_time

    results['grid_search'] = {
        'time': grid_time,
        'best_params': grid_search.best_params_,
        'best_score': grid_search.best_score_
    }

    # 2. Bayesian Optimization
    def bo_objective(n_estimators, max_depth, min_samples_split):
        # The optimizer proposes real-valued points; truncate to the integers
        # RandomForestClassifier expects.
        clf = RandomForestClassifier(
            n_estimators=int(n_estimators),
            max_depth=int(max_depth),
            max_features='sqrt',  # Fixed to 'sqrt' for simplicity
            min_samples_split=int(min_samples_split),
            random_state=42
        )
        # Score with cross-validation instead of accuracy on the training
        # data itself: training accuracy rewards overfitting (deep forests
        # reach 1.0) and is not comparable with GridSearchCV's best_score_.
        return cross_val_score(clf, X_train, y_train, cv=cv, n_jobs=-1).mean()

    start_time = time.perf_counter()
    # Use min/max so the bounds are right even if a candidate list is unsorted.
    pbounds = {
        name: (min(param_grid[name]), max(param_grid[name]))
        for name in ('n_estimators', 'max_depth', 'min_samples_split')
    }

    bo = BayesianOptimization(
        f=bo_objective,
        pbounds=pbounds,
        random_state=42
    )

    bo.maximize(init_points=5, n_iter=10)
    bo_time = time.perf_counter() - start_time

    best_point = bo.max['params']
    results['bayesian_opt'] = {
        'time': bo_time,
        # Same int() truncation as in bo_objective, so the reported values
        # are exactly the ones that were evaluated.
        'best_params': {
            'n_estimators': int(best_point['n_estimators']),
            'max_depth': int(best_point['max_depth']),
            'max_features': 'sqrt',
            'min_samples_split': int(best_point['min_samples_split'])
        },
        'best_score': bo.max['target']
    }

    # 3. Meta-Learner (model loading is not included in the reported time)
    meta_learner = MetaLearnerPredictor()
    best_params, best_accuracy, meta_time = find_best_hyperparameters(
        meta_learner, dataset_features, param_grid
    )

    results['meta_learner'] = {
        'time': meta_time,
        'best_params': best_params,
        'best_score': best_accuracy
    }

    return results

In [None]:

X = data.drop('SOURCE', axis=1)
Y = data['SOURCE']

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Dataset characteristics fed to the meta-learner. These are derived from the
# data instead of being typed in by hand, so they cannot go stale if the CSV or
# the preprocessing above changes.
num_features = X_train.shape[1]  # Total number of features
num_samples = X_train.shape[0]   # Number of samples

# Majority/minority class ratio on the full label column, which is what the
# previously hard-coded 1.5125 (1992 / 1317) was taken from.
class_counts = Y.value_counts()
class_imbalance_ratio = class_counts.max() / class_counts.min()
num_classes = Y.nunique()
num_numerical_features = X.select_dtypes(include='number').shape[1]
num_categorical_features = num_features - num_numerical_features

# Create dataset features array (single row, fixed column order)
dataset_features = np.array([[
    num_features,
    num_samples,
    class_imbalance_ratio,
    num_classes,
    num_categorical_features,
    num_numerical_features
]])

# Define parameter grid
param_grid = {
    'n_estimators': [50, 75, 100, 150],
    'max_depth': [3, 5, 7, 10, 50],
    'max_features': ['sqrt', 'log2'],
    'min_samples_split': [2, 5, 10]
}

# Compare methods
results = compare_methods(dataset_features, param_grid, X_train, Y_train)

# Print results
for method, result in results.items():
    print(f"\n{method}:")
    print(f"Time taken: {result['time']:.2f} seconds")
    print("Best parameters:", result['best_params'])
    print(f"Best score: {result['best_score']:.4f}")

|   iter    |  target   | max_depth | min_sa... | n_esti... |
-------------------------------------------------------------
| [39m1        [39m | [39m0.9498   [39m | [39m20.6     [39m | [39m9.606    [39m | [39m123.2    [39m |
| [35m2        [39m | [35m0.9996   [39m | [35m31.14    [39m | [35m3.248    [39m | [35m65.6     [39m |
| [39m3        [39m | [39m0.7748   [39m | [39m5.73     [39m | [39m8.929    [39m | [39m110.1    [39m |
| [35m4        [39m | [35m1.0      [39m | [35m36.28    [39m | [35m2.165    [39m | [35m147.0    [39m |
| [39m5        [39m | [39m1.0      [39m | [39m42.12    [39m | [39m3.699    [39m | [39m68.18    [39m |
| [39m6        [39m | [39m0.9528   [39m | [39m46.78    [39m | [39m9.267    [39m | [39m128.1    [39m |
| [39m7        [39m | [39m1.0      [39m | [39m41.37    [39m | [39m2.258    [39m | [39m68.31    [39m |
| [39m8        [39m | [39m0.9142   [39m | [39m13.73    [39m | [39m8.413    [39m | [

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

def validate_rf_params(X_train, Y_train, params, name="", X_eval=None, Y_eval=None):
    """
    Train a Random Forest with given parameters and return the accuracy

    Parameters
    ----------
    X_train, Y_train
        Data the forest is fitted on.
    params : dict
        Must contain 'n_estimators', 'max_depth', 'max_features' and
        'min_samples_split'.
    name : str
        Label printed in front of the report.
    X_eval, Y_eval : optional
        Held-out data. When both are given, accuracy on this data is also
        printed and is the value returned. Training accuracy alone says
        little about generalisation, since a deep forest can memorise its
        training set.

    Returns
    -------
    float
        Training accuracy when no evaluation data is supplied (the original
        behaviour); otherwise the held-out accuracy.

    Raises
    ------
    ValueError
        If only one of ``X_eval`` / ``Y_eval`` is provided.
    """
    if (X_eval is None) != (Y_eval is None):
        raise ValueError("X_eval and Y_eval must be provided together")

    # Create and train the model
    rf = RandomForestClassifier(
        n_estimators=params['n_estimators'],
        max_depth=params['max_depth'],
        max_features=params['max_features'],
        min_samples_split=params['min_samples_split'],
        random_state=42  # for reproducibility
    )
    rf.fit(X_train, Y_train)

    # Accuracy on the very data the model was fitted on
    acc = accuracy_score(Y_train, rf.predict(X_train))

    print(f"\n{name} Parameters:")
    print(f"Parameters used: {params}")
    print(f"Actual Training Accuracy: {acc:.4f}")

    if X_eval is None:
        return acc

    # Accuracy on data the model has not seen
    eval_acc = accuracy_score(Y_eval, rf.predict(X_eval))
    print(f"Held-out Accuracy: {eval_acc:.4f}")
    return eval_acc

# Take the parameters each method actually selected from `results` instead of
# copying them by hand from printed output: hand-copied values silently go
# stale whenever the comparison cell is re-run and picks different winners.
grid_search_params = results['grid_search']['best_params']
bayesian_opt_params = results['bayesian_opt']['best_params']
meta_learner_params = results['meta_learner']['best_params']

# Validate all three parameter combinations
grid_acc = validate_rf_params(X_train, Y_train, grid_search_params, "Grid Search")
bayesian_acc = validate_rf_params(X_train, Y_train, bayesian_opt_params, "Bayesian Optimization")
meta_acc = validate_rf_params(X_train, Y_train, meta_learner_params, "Meta-learner")


Grid Search Parameters:
Parameters used: {'n_estimators': 150, 'max_depth': 50, 'max_features': 'sqrt', 'min_samples_split': 10}
Actual Training Accuracy: 0.9456

Bayesian Optimization Parameters:
Parameters used: {'n_estimators': 146, 'max_depth': 36, 'max_features': 'sqrt', 'min_samples_split': 2}
Actual Training Accuracy: 1.0000

Meta-learner Parameters:
Parameters used: {'n_estimators': 50, 'max_depth': 50, 'max_features': 'sqrt', 'min_samples_split': 5}
Actual Training Accuracy: 0.9924
