In [12]:
def generate_dataset_refined(n_samples=100000, n_outliers=100, dimensions=20):
    """Build a synthetic dataset of uniform inliers plus subspace outliers.

    Inliers are drawn uniformly from [-1, 1) in every dimension. Each outlier
    is drawn the same way, except that one contiguous block of dimensions
    ``[start, end)`` is overwritten with values drawn uniformly from [1, 10).
    Ten block templates, written for a 10-dimensional layout, are rescaled
    proportionally to ``dimensions`` and the outliers are split across them
    (any remainder goes to the first templates, one extra outlier each).

    Random draws use NumPy's global ``np.random`` state; call
    ``np.random.seed`` beforehand if reproducible output is needed.

    Args:
        n_samples: Total number of rows in the returned dataset.
        n_outliers: Number of outlier rows; must satisfy
            ``0 <= n_outliers <= n_samples``.
        dimensions: Number of columns; must be at least 1.

    Returns:
        A tuple ``(dataset, outlier_indices, outlier_dims)`` where
        ``dataset`` is an array of shape ``(n_samples, dimensions)`` with the
        inliers first and the outliers last, ``outlier_indices`` is the list
        of row indices of the outliers in ``dataset``, and ``outlier_dims``
        is the list of half-open ``(start, end)`` column ranges that deviate
        for each outlier, in the same order as ``outlier_indices``.

    Raises:
        ValueError: If ``dimensions < 1`` or ``n_outliers`` is outside
            ``[0, n_samples]``.
    """
    if dimensions < 1:
        raise ValueError("dimensions must be at least 1")
    if not 0 <= n_outliers <= n_samples:
        raise ValueError("n_outliers must satisfy 0 <= n_outliers <= n_samples")

    # Generate inliers uniformly within [-1, 1)
    inliers = np.random.uniform(-1, 1, size=(n_samples - n_outliers, dimensions))

    # Deviating column ranges [start, end), expressed on a 10-dimension layout
    base_clusters = [
        (0, 7),    # First 7 dimensions
        (5, 10),   # Last 5 dimensions
        (3, 8),    # 5 dimensions in the middle
        (1, 6),    # 5 dimensions starting from the second
        (6, 10),   # Last 4 dimensions
        (0, 4),    # First 4 dimensions
        (2, 7),    # 5 dimensions starting from the third
        (4, 9),    # 5 dimensions starting near the middle
        (3, 6),    # 3 dimensions in the middle
        (7, 10),   # Last 3 dimensions
    ]

    # Rescale proportionally. For multiples of 10 this equals multiplying by
    # dimensions // 10; for other sizes it still spans the full width, and
    # every range keeps at least one column even when dimensions < 10.
    cluster_definitions = []
    for start, end in base_clusters:
        scaled_start = start * dimensions // 10
        scaled_end = max(end * dimensions // 10, scaled_start + 1)
        cluster_definitions.append((scaled_start, scaled_end))

    # Spread the outliers over the clusters without dropping the remainder,
    # so the result always has exactly n_samples rows.
    per_cluster, remainder = divmod(n_outliers, len(cluster_definitions))

    # Preallocating keeps the (0, dimensions) shape valid when n_outliers == 0.
    outlier_samples = np.empty((n_outliers, dimensions))
    outlier_indices = []
    outlier_dims = []
    first_outlier_row = len(inliers)

    row = 0
    for cluster_idx, (start, end) in enumerate(cluster_definitions):
        count = per_cluster + (1 if cluster_idx < remainder else 0)
        for _ in range(count):
            # Normal values everywhere first ...
            outlier = np.random.uniform(-1, 1, dimensions)
            # ... then more extreme values for the deviating dimensions
            outlier[start:end] = np.random.uniform(1, 10, end - start)

            outlier_samples[row] = outlier
            outlier_indices.append(first_outlier_row + row)
            outlier_dims.append((start, end))
            row += 1

    # Combine inliers and outliers (outliers occupy the trailing rows)
    dataset = np.vstack([inliers, outlier_samples])

    return dataset, outlier_indices, outlier_dims

# Build the 20-dimensional example; sample and outlier counts keep their defaults
(
    dataset_refined,
    outlier_indices_refined,
    outlier_dims_refined,
) = generate_dataset_refined(dimensions=20)

# Cell output: dataset shape, then the first ten outlier row indices and
# their (start, end) deviating ranges
(
    dataset_refined.shape,
    outlier_indices_refined[:10],
    outlier_dims_refined[:10],
)


((100000, 20),
 [99900, 99901, 99902, 99903, 99904, 99905, 99906, 99907, 99908, 99909],
 [(0, 14),
  (0, 14),
  (0, 14),
  (0, 14),
  (0, 14),
  (0, 14),
  (0, 14),
  (0, 14),
  (0, 14),
  (0, 14)])

In [13]:
# Show one generated outlier row next to its deviating (start, end) range.
# Both lines read the *_refined results at the same position (1), so the row
# and the range describe the same outlier and no stale variable is needed.
print(dataset_refined[outlier_indices_refined[1], :])
print(outlier_dims_refined[1])

[ 6.85014661e+00  9.52686681e+00  8.04773645e+00  1.58311848e+00
  1.11156149e+00  1.37681425e+00  6.63531996e+00  1.68197813e+00
  7.34665157e+00  7.79260837e+00  9.20191937e+00  1.33944793e+00
  6.84117393e+00  5.13482208e+00 -7.54419762e-01  1.28547983e-01
  2.21224127e-03 -8.09729614e-02 -9.60154386e-01 -4.09687961e-01]
(0, 14)


In [7]:
# Prints the row of `dataset` at position outlier_indices[0] - 1, i.e. the row
# just before the first entry of `outlier_indices`.
# NOTE(review): `dataset` and `outlier_indices` are not defined in the cells
# visible here — presumably they come from an earlier cell; confirm they still
# exist after Restart Kernel -> Run All.
print(dataset[outlier_indices[0]-1,:])
# print(outlier_dims[0])

[-0.09381236 -0.29806589 -0.46524531 -0.38053464  0.0726339   0.37105881
 -0.59153716  0.91395219  0.52950104  0.65878209  0.271685   -0.49476214
  0.58861706  0.66072736  0.47341661 -0.64412631 -0.96381697 -0.21474503
  0.18271335 -0.21766542]
