# Data Processing

In [5]:
import pandas as pd
import numpy as np

In [6]:
raw_data = pd.read_csv('GSE145668_combined_data.csv')

In [7]:
data = raw_data.drop(columns=['GeneName','GeneSymbol'])
data = data.transpose()
data.columns = raw_data['GeneSymbol']
data['Label'] = np.where(data.index.str.contains('Ctrl', na=False), 'Ctrl', 'AA')

In [8]:
data

GeneSymbol,A1BG,A1CF,A2M,A2ML1,A4GALT,AAAS,AACS,AADAC,AADAT,AAED1,...,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,Label
Ctrl1_L1_sc01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,0,Ctrl
Ctrl1_L1_sc02,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Ctrl
Ctrl1_L1_sc03,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Ctrl
Ctrl1_L1_sc04,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,Ctrl
Ctrl1_L1_sc05,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Ctrl
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P9_L3_sc92,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,AA
P9_L3_sc93,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,AA
P9_L3_sc94,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,AA
P9_L3_sc95,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,AA


# Batch Selection Method Functions

In [9]:
def calculate_batch_fisher_information(X_candidates, model, uncertainty_scores):
    fisher_scores = np.zeros(len(X_candidates))
    
    # Calculate feature-wise scores without constructing the full matrix
    for i in range(len(X_candidates)):
        x = X_candidates[i]
        proba = model.predict_proba(x.reshape(1, -1))[0, 1]
        
        # Clip probabilities to avoid numerical issues
        p = np.clip(proba, 1e-6, 1-1e-6)
        
        # For logistic regression, diagonal of Fisher Information is p(1-p)*x^2
        fisher_diag = p * (1 - p) * np.square(x)
        
        # Add a small offset to prevent zeros
        fisher_score = np.sum(fisher_diag) + 1e-10
        
        # Combine with uncertainty score
        fisher_scores[i] = uncertainty_scores[i] * fisher_score
    
    # Normalize scores to [0,1] range
    if np.max(fisher_scores) > 0:
        fisher_scores = fisher_scores / np.max(fisher_scores)
    else:
        # If all scores are zero, use uniform distribution
        fisher_scores = np.ones(len(fisher_scores)) / len(fisher_scores)
    
    return fisher_scores

def select_batch_min_fisher(indices, fisher_scores, batch_size):
    sorted_indices = np.argsort(-fisher_scores)  
    selected_batch = sorted_indices[:batch_size]
    return indices[selected_batch]

# Baseline Methods

## Passive Learning

In [10]:
import pandas as pd
import numpy as np
import random
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import KFold
from tqdm import tqdm

def random_sampling(data, n_simulations, n_folds, n_init, n_term, batch_size, seed):
    X = data.drop(columns=['Label']).values
    y = data['Label'].values

    training_results = np.zeros((n_folds, n_simulations, (n_term - n_init) // batch_size + 1))
    testing_results = np.zeros((n_folds, n_simulations, (n_term - n_init) // batch_size + 1))

    kf = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
    for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X)):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        for iteration in tqdm(range(n_simulations)):
            indices = np.arange(len(X_train))
            starting_indices = np.random.choice(indices, size=n_init, replace=False)

            X_start_train = X_train[starting_indices]
            y_start_train = y_train[starting_indices]

            indices = np.setdiff1d(indices, starting_indices)
            
            batch_counter = 0
            model = SGDClassifier(loss="log_loss", random_state=seed)
            model.fit(X_start_train, y_start_train)
            training_results[fold_idx, iteration, batch_counter] = model.score(X_start_train, y_start_train)
            testing_results[fold_idx, iteration, batch_counter] = model.score(X_test, y_test)
            batch_counter += 1

            while len(X_start_train) < n_term and len(indices) >= batch_size:
                # Select a random batch
                batch_indices = np.random.choice(indices, size=batch_size, replace=False)
                indices = np.setdiff1d(indices, batch_indices)
                
                X_batch = X_train[batch_indices]
                y_batch = y_train[batch_indices]
                
                X_start_train = np.vstack([X_start_train, X_batch])
                y_start_train = np.append(y_start_train, y_batch)
                
                model = SGDClassifier(loss="log_loss", random_state=seed)
                model.fit(X_start_train, y_start_train)
                training_results[fold_idx, iteration, batch_counter] = model.score(X_start_train, y_start_train)
                testing_results[fold_idx, iteration, batch_counter] = model.score(X_test, y_test)
                batch_counter += 1

    return training_results, testing_results

## Uncertainty Sampling

In [11]:
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import KFold
from tqdm import tqdm

def uncertainty_sampling(data, n_simulations, n_folds, n_init, n_term, batch_size, seed):
    X = data.drop(columns=['Label']).values
    y = data['Label'].values

    feature_names = data.drop(columns=['Label']).columns

    training_results = np.zeros((n_folds, n_simulations, (n_term - n_init) // batch_size + 1))
    testing_results = np.zeros((n_folds, n_simulations, (n_term - n_init) // batch_size + 1))

    feature_selection_counts = {feature: 0 for feature in feature_names}
    feature_importance_values = {feature: [] for feature in feature_names}

    kf = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
    for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X)):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        for iteration in tqdm(range(n_simulations)):
            indices = np.arange(len(X_train))
            starting_indices = np.random.choice(indices, size=n_init, replace=False)

            X_start_train = X_train[starting_indices]
            y_start_train = y_train[starting_indices]

            indices = np.setdiff1d(indices, starting_indices)
            
            batch_counter = 0
            model = SGDClassifier(loss="log_loss", random_state=seed)
            model.fit(X_start_train, y_start_train)
            training_results[fold_idx, iteration, batch_counter] = model.score(X_start_train, y_start_train)
            testing_results[fold_idx, iteration, batch_counter] = model.score(X_test, y_test)
            batch_counter += 1

            while len(X_start_train) < n_term and len(indices) >= batch_size:
                model = SGDClassifier(loss="log_loss", random_state=seed)
                model.fit(X_start_train, y_start_train)
                
                # Get uncertainty scores for remaining samples
                probabilities = model.predict_proba(X_train[indices])[:, 1]
                uncertainty_scores = np.abs(probabilities - 0.5)  # Distance from decision boundary
                
                # Compute Fisher Information Matrix for each candidate sample
                fisher_scores = calculate_batch_fisher_information(X_train[indices], model, uncertainty_scores)
                
                # Select batch that minimizes Fisher Information
                batch_indices = select_batch_min_fisher(indices, fisher_scores, batch_size)
                indices = np.setdiff1d(indices, batch_indices)
                
                X_batch = X_train[batch_indices]
                y_batch = y_train[batch_indices]

                selected_samples = X_train[batch_indices]
            
                for i, feature in enumerate(feature_names):
                    feature_importance_values[feature].extend(np.abs(selected_samples[:, i]).tolist())
                    significant_values = np.abs(selected_samples[:, i]) > np.mean(np.abs(X_train[:, i]))
                    feature_selection_counts[feature] += np.sum(significant_values)

                
                X_start_train = np.vstack([X_start_train, X_batch])
                y_start_train = np.append(y_start_train, y_batch)
                
                model = SGDClassifier(loss="log_loss", random_state=seed)
                model.fit(X_start_train, y_start_train)
                training_results[fold_idx, iteration, batch_counter] = model.score(X_start_train, y_start_train)
                testing_results[fold_idx, iteration, batch_counter] = model.score(X_test, y_test)
                batch_counter += 1
    
    avg_importance = {}
    for feature in feature_names:
        if feature_importance_values[feature]:
            avg_importance[feature] = np.mean(feature_importance_values[feature])
        else:
            avg_importance[feature] = 0
            
    print("\nTop 10 critical features for Uncertainty Sampling:")
    sorted_features = sorted(avg_importance.items(), key=lambda x: x[1], reverse=True)
    for feature, importance in sorted_features[:10]:
        print(f"  {feature}: {importance:.4f} (selected {feature_selection_counts[feature]} times)")
    

    return training_results, testing_results

## Query By Committee

In [12]:
import pandas as pd
import numpy as np
import random
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import KFold
from scipy.stats import entropy
from tqdm import tqdm

def qbc_sampling(data, n_simulations, n_folds, n_init, n_term, n_committee, batch_size, seed):
    X = data.drop(columns=['Label']).values
    y = data['Label'].values
    feature_names = data.drop(columns=['Label']).columns

    training_results = np.zeros((n_folds, n_simulations, (n_term - n_init) // batch_size + 1))
    testing_results = np.zeros((n_folds, n_simulations, (n_term - n_init) // batch_size + 1))

    feature_selection_counts = {feature: 0 for feature in feature_names}
    feature_importance_values = {feature: [] for feature in feature_names}

    kf = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
    for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X)):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        for iteration in tqdm(range(n_simulations)):
            indices = np.arange(len(X_train))
            starting_indices = np.random.choice(indices, size=n_init, replace=False)

            X_start_train = X_train[starting_indices]
            y_start_train = y_train[starting_indices]

            indices = np.setdiff1d(indices, starting_indices)
            
            batch_counter = 0
            model = SGDClassifier(loss="log_loss", random_state=seed)
            model.fit(X_start_train, y_start_train)
            training_results[fold_idx, iteration, batch_counter] = model.score(X_start_train, y_start_train)
            testing_results[fold_idx, iteration, batch_counter] = model.score(X_test, y_test)
            batch_counter += 1

            while len(X_start_train) < n_term and len(indices) >= batch_size:
                
                committee = [SGDClassifier(loss="log_loss", random_state=seed + j) for j in range(n_committee)]
                for model in committee:
                    model.fit(X_start_train, y_start_train)
                
                # Calculate disagreement for each sample
                committee_probs = np.array([model.predict_proba(X_train[indices]) for model in committee])
                committee_probs = np.transpose(committee_probs, (1, 0, 2))  # (samples, models, classes)
                mean_pred_probs = np.mean(committee_probs, axis=1)

                # Calculate KL divergence for each sample and model
                disagreement_scores = []
                for i in range(committee_probs.shape[0]):
                    sample_kl = []
                    for j in range(committee_probs.shape[1]):
                        p = np.clip(committee_probs[i, j], 1e-10, 1)
                        q = np.clip(mean_pred_probs[i], 1e-10, 1)
                        sample_kl.append(entropy(p, q))
                    disagreement_scores.append(np.mean(sample_kl))
                disagreement_scores = np.array(disagreement_scores)
                
                # Compute Fisher Information Matrix for each candidate sample based on disagreement
                fisher_scores = calculate_batch_fisher_information(X_train[indices], committee[0], disagreement_scores)
                
                # Select batch that minimizes Fisher Information
                batch_indices = select_batch_min_fisher(indices, fisher_scores, batch_size)
                indices = np.setdiff1d(indices, batch_indices)
                
                X_batch = X_train[batch_indices]
                y_batch = y_train[batch_indices]

                selected_samples = X_train[batch_indices]
            
                for i, feature in enumerate(feature_names):
                    feature_importance_values[feature].extend(np.abs(selected_samples[:, i]).tolist())
                    significant_values = np.abs(selected_samples[:, i]) > np.mean(np.abs(X_train[:, i]))
                    feature_selection_counts[feature] += np.sum(significant_values)
                
                X_start_train = np.vstack([X_start_train, X_batch])
                y_start_train = np.append(y_start_train, y_batch)
                
                model = SGDClassifier(loss="log_loss", random_state=seed)
                model.fit(X_start_train, y_start_train)
                training_results[fold_idx, iteration, batch_counter] = model.score(X_start_train, y_start_train)
                testing_results[fold_idx, iteration, batch_counter] = model.score(X_test, y_test)
                batch_counter += 1
    
    avg_importance = {}
    for feature in feature_names:
        if feature_importance_values[feature]:
            avg_importance[feature] = np.mean(feature_importance_values[feature])
        else:
            avg_importance[feature] = 0
    
    print("\nTop 10 critical features for Query-by-Committee:")
    sorted_features = sorted(avg_importance.items(), key=lambda x: x[1], reverse=True)
    for feature, importance in sorted_features[:10]:
        print(f"  {feature}: {importance:.4f} (selected {feature_selection_counts[feature]} times)")
    
    return training_results, testing_results

# Fuzzy KNN

In [23]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.linear_model import SGDClassifier
from sklearn.metrics.pairwise import euclidean_distances
from scipy.spatial.distance import cdist
from tqdm import tqdm

def fuzzy_knn_sampling(data, n_simulations, n_folds, n_init, n_term, batch_size, seed, k=5, m=2):
    X = data.drop(columns=['Label']).values
    y = data['Label'].values
    feature_names = data.drop(columns=['Label']).columns
    
    unique_labels = np.unique(y)
    if not np.array_equal(np.sort(unique_labels), np.array([0, 1])):
        label_map = {unique_labels[0]: 0, unique_labels[1]: 1}
        y = np.array([label_map[label] for label in y])

    training_results = np.zeros((n_folds, n_simulations, (n_term - n_init) // batch_size + 1))
    testing_results = np.zeros((n_folds, n_simulations, (n_term - n_init) // batch_size + 1))
    
    # Track selected features and their importance
    feature_selection_counts = {feature: 0 for feature in feature_names}
    feature_importance_values = {feature: [] for feature in feature_names}
    
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
    for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X)):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        
        for iteration in tqdm(range(n_simulations)):
            indices = np.arange(len(X_train))
            starting_indices = np.random.choice(indices, size=n_init, replace=False)
            
            X_start_train = X_train[starting_indices]
            y_start_train = y_train[starting_indices]
            
            indices = np.setdiff1d(indices, starting_indices)
            
            batch_counter = 0
            model = SGDClassifier(loss="log_loss", random_state=seed)
            model.fit(X_start_train, y_start_train)
            
            training_results[fold_idx, iteration, batch_counter] = model.score(X_start_train, y_start_train)
            testing_results[fold_idx, iteration, batch_counter] = model.score(X_test, y_test)
            batch_counter += 1
            
            while len(X_start_train) < n_term and len(indices) >= batch_size:
                # Get unlabeled samples
                X_unlabeled = X_train[indices]
                
                # For binary classification, we can simplify the fuzzy KNN approach
                # by focusing on the membership to class 1
                
                # Calculate distances between unlabeled and labeled samples
                distances = cdist(X_unlabeled, X_start_train, 'euclidean')
                
                # For each unlabeled sample, get the k nearest neighbors
                nearest_neighbors = np.argsort(distances, axis=1)[:, :k]
                nearest_distances = np.take_along_axis(distances, nearest_neighbors, axis=1)
                
                # Apply fuzzy distance weighting
                # Avoid division by zero by adding a small epsilon
                weights = 1.0 / (nearest_distances ** (2 / (m - 1)) + 1e-10)
                
                # Normalize weights
                row_sums = weights.sum(axis=1, keepdims=True)
                weights = weights / row_sums
                
                # Calculate fuzzy membership to class 1
                membership_class_1 = np.zeros(len(X_unlabeled))
                
                for i in range(len(X_unlabeled)):
                    neighbor_labels = y_start_train[nearest_neighbors[i]]
                    membership_class_1[i] = np.sum(weights[i] * neighbor_labels)
                
                # For binary classification, uncertainty is highest at 0.5
                uncertainty_scores = 0.5 - np.abs(membership_class_1 - 0.5)
                
                # Select the most uncertain samples
                batch_indices = np.argsort(uncertainty_scores)[-batch_size:]
                
    
                original_batch_indices = indices[batch_indices]
                indices = np.setdiff1d(indices, original_batch_indices)
            
                selected_samples = X_train[original_batch_indices]
                for i, feature in enumerate(feature_names):
                    feature_importance_values[feature].extend(np.abs(selected_samples[:, i]).tolist())
                    significant_values = np.abs(selected_samples[:, i]) > np.mean(np.abs(X_train[:, i]))
                    feature_selection_counts[feature] += np.sum(significant_values)
                
                X_batch = X_train[original_batch_indices]
                y_batch = y_train[original_batch_indices]
                
                X_start_train = np.vstack([X_start_train, X_batch])
                y_start_train = np.append(y_start_train, y_batch)
                

                model = SGDClassifier(loss="log_loss", random_state=seed)
                model.fit(X_start_train, y_start_train)
                
                training_results[fold_idx, iteration, batch_counter] = model.score(X_start_train, y_start_train)
                testing_results[fold_idx, iteration, batch_counter] = model.score(X_test, y_test)
                batch_counter += 1
    

    avg_importance = {}
    for feature in feature_names:
        if feature_importance_values[feature]:
            avg_importance[feature] = np.mean(feature_importance_values[feature])
        else:
            avg_importance[feature] = 0

    print("\nTop 10 critical features for Fuzzy KNN Sampling:")
    sorted_features = sorted(avg_importance.items(), key=lambda x: x[1], reverse=True)
    for feature, importance in sorted_features[:10]:
        print(f"  {feature}: {importance:.4f} (selected {feature_selection_counts[feature]} times)")
    
    return training_results, testing_results

# QUIRE

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import KFold
from sklearn.metrics.pairwise import rbf_kernel
from numpy.linalg import pinv
from tqdm import tqdm
from collections import Counter

def compute_quire_scores(X_all, labeled_indices, unlabeled_indices, gamma=1.0):
    # Compute similarity matrix using RBF kernel
    W = rbf_kernel(X_all, gamma=gamma)

    # Construct Laplacian matrix
    D = np.diag(W.sum(axis=1))
    L = D - W

    # Compute L^{-1} (pseudo-inverse for numerical stability)
    L_inv = pinv(L + 1e-6 * np.eye(L.shape[0]))

    # Compute QUIRE scores using the diagonal of the inverse Laplacian
    scores = []
    for idx in unlabeled_indices:
        score = L_inv[idx, idx]
        scores.append(score)

    return np.array(scores)



def quire_sampling(data, n_simulations, n_folds, n_init, n_term, batch_size, seed):
    X = data.drop(columns=['Label']).values
    y = data['Label'].values

    training_results = np.zeros((n_folds, n_simulations, (n_term - n_init) // batch_size + 1))
    testing_results = np.zeros((n_folds, n_simulations, (n_term - n_init) // batch_size + 1))

    kf = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
    feature_counter = Counter()
    for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X)):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        for iteration in tqdm(range(n_simulations), desc=f"Fold {fold_idx + 1}/{n_folds}"):
            indices = np.arange(len(X_train))
            starting_indices = np.random.choice(indices, size=n_init, replace=False)

            X_start_train = X_train[starting_indices]
            y_start_train = y_train[starting_indices]

            indices = np.setdiff1d(indices, starting_indices)

            batch_counter = 0
            model = SGDClassifier(loss="log_loss", random_state=seed)
            model.fit(X_start_train, y_start_train)
            training_results[fold_idx, iteration, batch_counter] = model.score(X_start_train, y_start_train)
            testing_results[fold_idx, iteration, batch_counter] = model.score(X_test, y_test)
            batch_counter += 1

            while len(X_start_train) < n_term and len(indices) >= batch_size:
                model = SGDClassifier(loss="log_loss", random_state=seed)
                model.fit(X_start_train, y_start_train)

                # Get current labeled and unlabeled indices
                labeled_indices = np.setdiff1d(np.arange(len(X_train)), indices)
                unlabeled_indices = indices

                # Compute QUIRE scores
                quire_scores = compute_quire_scores(X_train, labeled_indices, unlabeled_indices)

                # Select samples with the lowest QUIRE scores
                selected_relative_indices = np.argsort(quire_scores)[:batch_size]
                batch_indices = unlabeled_indices[selected_relative_indices]

                # Update indices
                indices = np.setdiff1d(indices, batch_indices)

                # Add selected batch to training set
                X_batch = X_train[batch_indices]
                y_batch = y_train[batch_indices]

                for row in X_batch:
                    top_feature_indices = np.argsort(np.abs(row))[-3:]  # top 3 important features
                    for idx in top_feature_indices:
                        feature_counter[idx] += 1

                X_start_train = np.vstack([X_start_train, X_batch])
                y_start_train = np.append(y_start_train, y_batch)

                model = SGDClassifier(loss="log_loss", random_state=seed)
                model.fit(X_start_train, y_start_train)
                training_results[fold_idx, iteration, batch_counter] = model.score(X_start_train, y_start_train)
                testing_results[fold_idx, iteration, batch_counter] = model.score(X_test, y_test)
                batch_counter += 1
            
            with open('quire_current.npy', 'wb') as f:
                np.save(f, training_results)
                np.save(f, testing_results)

    top_features = feature_counter.most_common(10)
    print("\nTop 10 Most Important Features (based on frequency in selected samples):")
    for feature, count in top_features:
        print(f"Feature {feature} selected {count} times")
        
    return training_results, testing_results

# Deep Learning 

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import KFold
from tqdm import tqdm
import numpy as np
from collections import Counter


class DeepDropoutNN(nn.Module):
    def __init__(self, input_dim, dropout_prob=0.5):
        super(DeepDropoutNN, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(dropout_prob),

            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(dropout_prob),

            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(dropout_prob),

            nn.Linear(128, 2)  # Binary classification output logits
        )

    def forward(self, x):
        return self.net(x)


def predict_with_uncertainty(model, X, T=20):
    model.train()  # Enable dropout during inference
    softmax = nn.Softmax(dim=1)
    predictions = []

    with torch.no_grad():
        for _ in range(T):
            logits = model(X)
            probs = softmax(logits)
            predictions.append(probs.unsqueeze(0))

    preds = torch.cat(predictions, dim=0)  # [T, batch, num_classes]
    mean_probs = preds.mean(dim=0)
    variance = preds.var(dim=0).mean(dim=1)
    return mean_probs, variance


def active_learning_dropout(data, n_simulations, n_folds, n_init, n_term, batch_size, seed):
    torch.manual_seed(seed)
    np.random.seed(seed)

    X = data.drop(columns=['Label']).values.astype(np.float32)
    y = data['Label'].astype('category').cat.codes.values

    training_results = np.zeros((n_folds, n_simulations, (n_term - n_init) // batch_size + 1))
    testing_results = np.zeros((n_folds, n_simulations, (n_term - n_init) // batch_size + 1))

    cell_counter = Counter()

    kf = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
    for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X)):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        X_test_tensor = torch.tensor(X_test)
        y_test_tensor = torch.tensor(y_test, dtype=torch.long)

        for iteration in tqdm(range(n_simulations), desc=f"Fold {fold_idx + 1}/{n_folds}"):
            indices = np.arange(len(X_train))
            start_idx = np.random.choice(indices, size=n_init, replace=False)
            indices = np.setdiff1d(indices, start_idx)

            selected_X = torch.tensor(X_train[start_idx])
            selected_y = torch.tensor(y_train[start_idx], dtype=torch.long)

            batch_counter = 0
            model = DeepDropoutNN(X.shape[1])
            optimizer = optim.Adam(model.parameters(), lr=0.0005)
            loss_fn = nn.CrossEntropyLoss()

            def train_model(model, X, y, epochs=25):
                model.train()
                for _ in range(epochs):
                    optimizer.zero_grad()
                    outputs = model(X)
                    loss = loss_fn(outputs, y)
                    loss.backward()
                    optimizer.step()

            train_model(model, selected_X, selected_y)
            model.eval()
            with torch.no_grad():
                acc_train = (model(selected_X).argmax(dim=1) == selected_y).float().mean().item()
                acc_test = (model(X_test_tensor).argmax(dim=1) == y_test_tensor).float().mean().item()
            training_results[fold_idx, iteration, batch_counter] = acc_train
            testing_results[fold_idx, iteration, batch_counter] = acc_test
            batch_counter += 1

            while len(selected_X) < n_term and len(indices) >= batch_size:
                model.eval()
                X_pool = torch.tensor(X_train[indices])
                mean_probs, uncertainty = predict_with_uncertainty(model, X_pool, T=20)

                topk = torch.topk(uncertainty, batch_size)
                uncertain_indices = indices[topk.indices.numpy()]

                for idx in uncertain_indices:
                    global_idx = train_idx[idx]
                    cell_counter[global_idx] += 1

                batch_X = torch.tensor(X_train[uncertain_indices])
                batch_y = torch.tensor(y_train[uncertain_indices], dtype=torch.long)

                selected_X = torch.cat([selected_X, batch_X])
                selected_y = torch.cat([selected_y, batch_y])
                indices = np.setdiff1d(indices, uncertain_indices)

                model = DeepDropoutNN(X.shape[1])
                optimizer = optim.Adam(model.parameters(), lr=0.0005)
                train_model(model, selected_X, selected_y, epochs=25)

                with torch.no_grad():
                    acc_train = (model(selected_X).argmax(dim=1) == selected_y).float().mean().item()
                    acc_test = (model(X_test_tensor).argmax(dim=1) == y_test_tensor).float().mean().item()
                training_results[fold_idx, iteration, batch_counter] = acc_train
                testing_results[fold_idx, iteration, batch_counter] = acc_test
                batch_counter += 1

            with open('deep_mc_dropout_results.npy', 'wb') as f:
                np.save(f, training_results)
                np.save(f, testing_results)

    print("\nTop 10 Most Frequently Selected Cells:")
    for idx, count in cell_counter.most_common(10):
        print(f"Cell {idx} selected {count} times")

    return training_results, testing_results

## Perform Sampling Results

In [14]:
n_simulations = 3
n_folds = 4
n_init = int(len(data)/n_folds)
n_term = int(len(data) * (n_folds - 1)/(n_folds))
#n_init = 10
#n_term = 15
n_committee = 3
seed = 28
batch_size = 150

In [15]:
n_init, n_term, batch_size

(2240, 6721, 150)

In [16]:
passive_learning_training, passive_learning_testing = random_sampling(data, n_simulations, n_folds, n_init, n_term, batch_size, seed)

100%|██████████| 3/3 [07:20<00:00, 146.95s/it]
100%|██████████| 3/3 [07:47<00:00, 155.76s/it]
100%|██████████| 3/3 [07:40<00:00, 153.35s/it]
100%|██████████| 3/3 [06:57<00:00, 139.03s/it]


In [None]:
np.savez('passive_learning_results.npz', 
         training=passive_learning_training,
         testing=passive_learning_testing)

"\nwith open('passive_learning_results.npy', 'wb') as f:\n    np.save(f, passive_learning_training)\n    np.save(f, passive_learning_testing)\n"

In [18]:
uncertainty_sampling_training, uncertainty_sampling_testing = uncertainty_sampling(data, n_simulations, n_folds, n_init, n_term, batch_size, seed)

100%|██████████| 3/3 [16:04<00:00, 321.64s/it]
100%|██████████| 3/3 [16:51<00:00, 337.18s/it]
100%|██████████| 3/3 [16:22<00:00, 327.63s/it]
100%|██████████| 3/3 [16:29<00:00, 329.71s/it]



Top 10 critical features for Uncertainty Sampling:
  RPS27: 521.1275 (selected 14083 times)
  RPLP1: 346.1210 (selected 14486 times)
  MT-CO1: 340.7516 (selected 14040 times)
  RPL13: 320.5124 (selected 14917 times)
  RPS29: 312.1715 (selected 15543 times)
  MT-CO3: 284.8146 (selected 13135 times)
  TMSB4X: 284.6804 (selected 12227 times)
  RPS14: 276.2634 (selected 14874 times)
  RPL31: 260.7981 (selected 14565 times)
  MT-CYB: 249.2302 (selected 13442 times)


In [None]:
np.savez('uncertainty_sampling.npz', 
         training=uncertainty_sampling_training,
         testing=uncertainty_sampling_testing) 

"\nwith open('uncertainty_sampling.npy', 'wb') as f:\n    np.save(f, uncertainty_sampling_training)\n    np.save(f, uncertainty_sampling_testing)\n"

In [20]:
qbc_sampling_training, qbc_sampling_testing = qbc_sampling(data, n_simulations, n_folds, n_init, n_term, n_committee, batch_size, seed)

100%|██████████| 3/3 [29:44<00:00, 594.81s/it]
100%|██████████| 3/3 [31:08<00:00, 622.92s/it]
100%|██████████| 3/3 [32:16<00:00, 645.40s/it]
100%|██████████| 3/3 [26:26<00:00, 528.76s/it]



Top 10 critical features for Query-by-Committee:
  RPS27: 513.1295 (selected 13878 times)
  RPLP1: 337.3948 (selected 13944 times)
  MT-CO1: 332.1348 (selected 13592 times)
  RPL13: 312.4957 (selected 14455 times)
  RPS29: 305.5713 (selected 15238 times)
  TMSB4X: 279.5582 (selected 12123 times)
  MT-CO3: 278.4653 (selected 12797 times)
  RPS14: 270.2800 (selected 14449 times)
  RPL31: 254.9941 (selected 14193 times)
  MT-CYB: 244.5677 (selected 13194 times)


In [None]:
np.savez('query_by_committee.npz', 
         training=qbc_sampling_training,
         testing=qbc_sampling_testing) 

"\nwith open('query_by_committee.npy', 'wb') as f:\n    np.save(f, qbc_sampling_training)\n    np.save(f, qbc_sampling_testing)\n"

In [24]:
fuzzy_knn_training, fuzzy_knn_testing = fuzzy_knn_sampling(data, n_simulations, n_folds, n_init, n_term, batch_size, seed, k=5, m=2)

100%|██████████| 3/3 [1:06:41<00:00, 1333.74s/it]
100%|██████████| 3/3 [1:06:15<00:00, 1325.18s/it]
100%|██████████| 3/3 [1:11:29<00:00, 1430.00s/it]
100%|██████████| 3/3 [1:31:45<00:00, 1835.17s/it]



Top 10 critical features for Fuzzy KNN Sampling:
  RPS27: 511.8756 (selected 13883 times)
  RPLP1: 343.2138 (selected 14202 times)
  MT-CO1: 335.6958 (selected 13766 times)
  RPL13: 316.4106 (selected 14632 times)
  RPS29: 308.0835 (selected 15252 times)
  MT-CO3: 281.6321 (selected 12910 times)
  TMSB4X: 277.7790 (selected 12078 times)
  RPS14: 272.6684 (selected 14555 times)
  RPL31: 258.0197 (selected 14298 times)
  MT-CYB: 246.2987 (selected 13135 times)


In [25]:
np.savez('fuzzy_knn_results.npz', 
         training=fuzzy_knn_training,
         testing=fuzzy_knn_testing)

In [None]:
quire_training, quire_testing = quire_sampling(data, n_simulations, n_folds, n_init, n_term, batch_size, seed)

In [None]:
np.savez('quire_learning_results.npz', 
         training=quire_training,
         testing=quire_testing)

In [None]:
dl_training, dl_testing = active_learning_dropout(data, n_simulations, n_folds, n_init, n_term, batch_size, seed)

In [None]:
np.savez('nn_learning_results.npz', 
         training=dl_training,
         testing=dl_testing)