In [47]:
import numpy as np
from sklearn.neighbors import NearestNeighbors

In [48]:
def classify_examples(X: np.ndarray, Y: np.ndarray, k: int = 3) -> np.ndarray:
    """
    Flag each example as safe or noisy by comparing its label with nearby labels.

    A ``NearestNeighbors`` model with ``n_neighbors=k`` is fitted on ``X`` and
    queried with ``X`` itself, so an example normally shows up in its own
    neighbor list. The example is removed from that list, which leaves
    ``k - 1`` other examples to vote. An example is safe when at least
    ``k // 2`` of those voters carry the same label, and noisy otherwise.

    NOTE(review): the vote therefore uses ``k - 1`` neighbors and accepts ties
    (1 of 2 for the default ``k=3``), which is looser than a strict majority
    of ``k`` neighbors -- confirm this is the intended rule.

    Args:
        X (numpy.ndarray): The feature matrix of shape (n_samples, n_features).
        Y (numpy.ndarray): The target labels of shape (n_samples,).
        k (int, optional): The ``n_neighbors`` value of the query (the example
            itself included), defaults to 3.

    Returns:
        numpy.ndarray: An integer array of shape (n_samples,) holding 0 for
        safe and 1 for noisy examples. Empty when ``X`` has no rows.
    """
    n_samples = X.shape[0]
    if n_samples == 0:
        # Nothing to classify; avoid fitting a model on an empty matrix
        return np.zeros(0, dtype=int)

    labels = np.asarray(Y).ravel()

    # Fit once and query all rows in a single call instead of one call per row
    nn = NearestNeighbors(n_neighbors=k)
    nn.fit(X)
    neighbor_idx = nn.kneighbors(X, return_distance=False)  # (n_samples, k)

    # Remove each example from its own row by index, not by position: with
    # duplicated points the example is not guaranteed to be listed first
    own_idx = np.arange(n_samples)[:, np.newaxis]
    is_other = neighbor_idx != own_idx
    # A row without the example itself (more than k identical points) gives
    # up its first entry, so every row keeps exactly k - 1 voters
    self_missing = is_other.all(axis=1)
    is_other[self_missing, 0] = False
    voters = neighbor_idx[is_other].reshape(n_samples, k - 1)

    # Count the voters that agree with the example's own label
    agreeing = (labels[voters] == labels[:, np.newaxis]).sum(axis=1)
    return (agreeing < k // 2).astype(int)

In [49]:
def find_knn_indices(k, majority_class, example, safe_noisy_class, flags, Y, X):
    """
    Return the indices of the k candidate examples closest to ``example``.

    Candidates are the rows whose flag equals ``safe_noisy_class`` and whose
    label equals ``majority_class``. They are ranked by Euclidean distance to
    ``example``; equal distances are ordered by ascending row index.

    NOTE(review): the filter is applied before the nearest-neighbor search, so
    the result is "the k closest candidates", not "the candidates among the k
    nearest neighbors of ``example``" -- confirm which one is intended.

    Args:
        k (int): The maximum number of indices to return.
        majority_class (int): The label a candidate must have.
        example (np.ndarray): The example to measure distances from.
        safe_noisy_class (int): The flag a candidate must have (0 for safe, 1 for noisy).
        flags (np.ndarray): The flags indicating the type of each example (0 for safe, 1 for noisy).
        Y (np.ndarray): The target labels of shape (n_samples,)
        X (np.ndarray): The feature matrix of shape (n_samples, n_features).

    Returns:
        tuple: Up to ``k`` row indices into ``X``, nearest first. Empty when
        no example matches the flag/label filter.
    """
    candidates = np.where((flags == safe_noisy_class) & (Y == majority_class))[0]
    if candidates.size == 0:
        # No matching example: report "no neighbors" instead of failing
        return ()

    # Distance from the example to every candidate in one vectorised step
    distances = np.linalg.norm(X[candidates] - example, axis=1)

    # Candidates are already in ascending index order, so a stable sort
    # breaks distance ties by the lower index
    nearest_first = np.argsort(distances, kind="stable")
    return tuple(candidates[nearest_first[:k]])
    

In [50]:
def weak_amplification(
    X: np.ndarray, Y: np.ndarray, flags: np.ndarray, k: int = 3
) -> tuple[np.ndarray, np.ndarray]:
    """
    Performs weak amplification by appending copies of noisy examples.

    For every example flagged as noisy, ``find_knn_indices`` is asked for up
    to ``k`` safe examples of the majority class; the noisy example is then
    appended once per index returned. The inputs are never modified.

    Args:
        X (numpy.ndarray): The feature matrix of shape (n_samples, n_features).
        Y (numpy.ndarray): The target labels of shape (n_samples,).
        flags (numpy.ndarray): The flags indicating the type of each example (0 for safe, 1 for noisy).
        k (int, optional): The number of neighbors requested per noisy example, defaults to 3.

    Returns:
        tuple[numpy.ndarray, numpy.ndarray]: The feature matrix and the target
        labels after amplification. The original rows come first, followed by
        the copies in the order of the noisy examples they were made from.
    """
    n_samples = X.shape[0]
    if n_samples == 0:
        # Nothing to amplify, and no majority class can be determined
        return X.copy(), Y.copy()

    flags = np.asarray(flags)

    # Most frequent label; ties go to the smallest label. Unlike np.bincount
    # this also accepts negative or non-integer labels.
    labels, counts = np.unique(Y, return_counts=True)
    majority_class_label = labels[np.argmax(counts)]

    noisy_rows = np.flatnonzero(flags == 1)
    has_safe_majority = np.any((flags == 0) & (Y == majority_class_label))
    if noisy_rows.size == 0 or not has_safe_majority:
        # Without a noisy example or a safe majority example no copy is made
        return X.copy(), Y.copy()

    # Number of copies owed to each row (zero for rows that are not noisy)
    copies = np.zeros(n_samples, dtype=int)
    for row in noisy_rows:
        safe_indices = find_knn_indices(k, majority_class_label, X[row], 0, flags, Y, X)
        copies[row] = len(safe_indices)

    # Append all copies in one step rather than re-stacking inside the loop
    extra_rows = np.repeat(np.arange(n_samples), copies)
    new_X = np.vstack((X, X[extra_rows]))
    new_Y = np.hstack((Y, Y[extra_rows]))
    return new_X, new_Y

In [51]:
# Example usage: five 2-D points with binary labels
X = np.arange(1, 11).reshape(5, 2)
y = np.array([0, 0, 1, 1, 0])

# Flag every point as safe (0) or noisy (1) and show the flags
flags = classify_examples(X, y)
print("flags", flags)

# Append copies of the noisy points, then show the enlarged data set
new_X, new_y = weak_amplification(X, y, flags)
print(new_X, new_y, sep="\n")

flags [0 0 0 0 1]
[[ 1  2]
 [ 3  4]
 [ 5  6]
 [ 7  8]
 [ 9 10]
 [ 9 10]
 [ 9 10]]
[0 0 1 1 0 0 0]
