In [55]:
import pandas as pd
import numpy as np

In [56]:
dataset = pd.read_csv("./datasets/hospital/dirty.csv")

In [57]:
from datasketch import MinHash, MinHashLSH

records = dataset.values.tolist()

# Create an LSH index with a threshold and number of permutations
lsh = MinHashLSH(threshold=0.5, num_perm=128)

# Insert each record into the LSH index
for i, record in enumerate(records):
    m = MinHash(num_perm=128)
    for feature in record:
        m.update(str(feature).encode("utf8"))  # Hashing the attributes of the record
    lsh.insert(i, m)

In [58]:
# To store the buckets (groups of similar records)
buckets = []
visited = set()  # To track records that have already been assigned to a bucket

# Querying similar records for each record
for i, record in enumerate(records):
    if i not in visited:  # Only process records that haven't been visited
        # Create MinHash for the current record
        m = MinHash(num_perm=128)
        for feature in record:
            m.update(str(feature).encode("utf8"))

        # Query LSH to get similar records
        similar_records = lsh.query(m)

        # Add the current record and its similar ones as a new bucket
        buckets.append(similar_records)

        # Mark all similar records as visited
        visited.update(similar_records)

# Print the buckets
# for idx, bucket in enumerate(buckets):
#     print(f"Bucket {idx + 1}: {bucket}")

In [59]:
# Calculate the sizes of all buckets
bucket_sizes = [(i, len(bucket)) for i, bucket in enumerate(buckets)]

# Sort buckets by size in descending order
sorted_buckets = sorted(bucket_sizes, key=lambda x: x[1], reverse=True)

# Retrieve the top 10 buckets
top_buckets_raw = sorted_buckets[:10]  # Get the top 10 buckets

# Define a minimum size threshold (e.g., 2)
min_size_threshold = 2

# Filter the top buckets based on the minimum size threshold
filtered_top_buckets = [
    bucket for bucket in top_buckets_raw if bucket[1] >= min_size_threshold
]

# Print the filtered top buckets and their sizes
print(f"Top Buckets by Size (Minimum Size Threshold: {min_size_threshold}):")
for idx, size in filtered_top_buckets:
    print(f"Bucket {idx} (Size: {size}): {buckets[idx]}")

# If you want to handle the case where no buckets meet the threshold
if not filtered_top_buckets:
    print("No buckets met the minimum size threshold.")

Top Buckets by Size (Minimum Size Threshold: 2):
Bucket 29 (Size: 39): [896, 898, 899, 900, 416, 418, 419, 420, 423, 424, 426, 429, 195, 69, 70, 73, 74, 76, 77, 78, 80, 84, 90, 91, 348, 92, 352, 878, 881, 882, 883, 884, 885, 886, 887, 888, 889, 892, 895]
Bucket 2 (Size: 27): [384, 8, 9, 10, 392, 396, 13, 11, 12, 14, 15, 17, 294, 295, 296, 452, 459, 460, 461, 463, 464, 465, 852, 853, 854, 986, 858]
Bucket 0 (Size: 20): [0, 1, 2, 3, 4, 5, 6, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 308, 309, 310]
Bucket 173 (Size: 20): [591, 592, 593, 594, 595, 596, 597, 598, 600, 603, 604, 607, 608, 609, 610, 611, 612, 613, 614, 615]
Bucket 225 (Size: 20): [774, 775, 776, 778, 781, 783, 784, 785, 787, 789, 790, 792, 793, 794, 795, 796, 798, 799, 800, 801]
Bucket 23 (Size: 19): [654, 529, 404, 790, 918, 282, 679, 815, 307, 57, 454, 326, 207, 479, 993, 232, 629, 504, 890]
Bucket 53 (Size: 19): [169, 170, 171, 172, 173, 174, 175, 177, 178, 181, 182, 183, 184, 187, 188, 191, 192, 724, 741]
Bucket 1