# LSH Running Testing by Double 2T
### Nguyen Minh Tuan
### Duong Minh Tung

Make sure the Libraries are imported

In [17]:
## Import Necessary Libraries
import pandas as pd
import numpy as np
import time
from numba import cuda, jit
from datasketch import MinHash, MinHashLSH
import LSH_class


In [18]:
## Load Data
# Load the CSV file
df = pd.read_csv('movies_with_genres.csv')
movies = {}
for index, row in df.iterrows():
    title = row['originalTitle']
    tags = str(row['genres']).split(', ')
    tags = [tag.strip() for tag in tags]
    movies[title] = tags

## Original LSH Implementation
def original_lsh(movies, num_hashes=128):
    start = time.time()
    lsh = MinHashLSH(threshold=0.5, num_perm=num_hashes)

    for title, tags in movies.items():
        m = MinHash(num_perm=num_hashes)
        for tag in tags:
            m.update(tag.encode('utf8'))
        lsh.insert(title, m)  

    end = time.time()
    return lsh, end - start

In [19]:
## Parallel LSH Implementation
@jit
def custom_hash(x, seed):
    return (x * 0x9e3779b9 + seed) & 0xFFFFFFFF

@cuda.jit
def cuda_update(tags, minhashes, num_hashes):
    pos = cuda.grid(1)
    if pos < tags.shape[0]:
        tag = tags[pos]
        for i in range(num_hashes):
            tag_hash = custom_hash(tag, i)
            cuda.atomic.min(minhashes, (pos, i), tag_hash)

def update_minhash_gpu(tags, num_hashes):
    tags_gpu = cuda.to_device(np.array(tags, dtype=np.int32))
    num_tags = tags_gpu.size
    minhashes_gpu = cuda.to_device(np.full((num_tags, num_hashes), np.inf, dtype=np.float32))

    threads_per_block = 256
    blocks_per_grid = (num_tags + (threads_per_block - 1)) // threads_per_block

    cuda_update[blocks_per_grid, threads_per_block](tags_gpu, minhashes_gpu, num_hashes)

    return minhashes_gpu.copy_to_host()

def parallel_lsh(movies, num_hashes=128):
    start = time.time()
    lsh = LSH_class.LSH(num_hashes=num_hashes)

    unique_tags = set(tag for tags in movies.values() for tag in tags)
    tag_to_int = {tag: i for i, tag in enumerate(unique_tags)}

    int_tags_dict = {title: np.array([tag_to_int[tag] for tag in tags], dtype=np.int32) for title, tags in movies.items()}
    all_tags = np.concatenate(list(int_tags_dict.values()))
    tags_offset = np.cumsum([0] + [len(tags) for tags in int_tags_dict.values()])

    all_tags_gpu = cuda.to_device(all_tags)
    minhashes_gpu = cuda.to_device(np.full((all_tags.size, num_hashes), np.inf, dtype=np.float32))

    threads_per_block = 256
    blocks_per_grid = (all_tags.size + (threads_per_block - 1)) // threads_per_block

    cuda_update[blocks_per_grid, threads_per_block](all_tags_gpu, minhashes_gpu, num_hashes)

    all_minhashes = minhashes_gpu.copy_to_host()

    for i, (title, tags) in enumerate(movies.items()):
        start_idx = tags_offset[i]
        end_idx = tags_offset[i + 1]
        minhash_signature = np.min(all_minhashes[start_idx:end_idx], axis=0)
        lsh.insert(title, minhash_signature)

    end = time.time()
    return lsh, end - start

In [20]:
## Comparison
num_hashes = 128

# Original LSH
# original_lsh_instance, original_time = original_lsh(movies, num_hashes)

# Parallel LSH
parallel_lsh_instance, parallel_time = parallel_lsh(movies, num_hashes)

# Print Results
# print(f"Original LSH Time: {original_time:.2f} seconds")
print(f"Parallel LSH Time: {parallel_time:.2f} seconds")

Parallel LSH Time: 72.34 seconds


In [21]:
# Parallel LSH
parallel_lsh_instance.show_signatures()

Key: Carmencita, Bands: [(np.float32(3362296600.0),)]
Key: Chinese Opium Den, Bands: [(np.float32(3362296600.0),)]
Key: Das boxende Känguruh, Bands: [(np.float32(3362296600.0),)]
Key: Les forgerons, Bands: [(np.float32(3362296600.0),)]
Key: Baignade en mer, Bands: [(np.float32(3362296600.0),)]
Key: Barnet Horse Fair, Bands: [(np.float32(3362296600.0),)]
Key: Bataille de neige, Bands: [(np.float32(3362296600.0),)]
Key: Le bivouac, Bands: [(np.float32(3362296600.0),)]
Key: Les blanchisseuses, Bands: [(np.float32(3362296600.0),)]
Key: Les chevaux de bois, Bands: [(np.float32(3362296600.0),)]


In [23]:
## Query Function
def query_lsh(lsh, query_str, tag_to_int, num_hashes):
    query = [tag.strip() for tag in query_str.split(',')]
    query_minhash = MinHash(num_perm=num_hashes)
    for tag in query:
        query_minhash.update(tag.encode('utf8'))
    result = lsh.query(query_minhash)
    return result

## Query Example
query_str = "Action, Adventure"
# original_result = query_lsh(original_lsh_instance, query_str, movies, num_hashes)
parallel_result = query_lsh(parallel_lsh_instance, query_str, movies, num_hashes)

# Print Query Results
# print("Original LSH Query Result:")
# print(original_result)

print("Parallel LSH Query Result:")
print(parallel_result)

TypeError: 'MinHash' object is not subscriptable