In [155]:
import ast
import random

import cupy as cp
import numpy as np
import pandas as pd
from numba import cuda, jit

In [156]:
# Load the movie table; later cells read its 'genres' and 'originalTitle' columns.
df = pd.read_csv('movie_with_test.csv')
def lower_all_characters(text):
    """Return ``text`` lowercased, using the value's own ``lower()`` method."""
    lowered = text.lower()
    return lowered

# Normalise genre strings to lowercase so shingles are case-insensitive.
# NOTE(review): astype(str) turns missing values into the literal text 'nan',
# which is then shingled like a real genre — confirm that is acceptable.
df['genres'] = df['genres'].astype(str).apply(lower_all_characters)

In [157]:
def create_shingles(text, min_length=1, max_length=None):
    """Build the set of word-prefix shingles for a comma-separated tag string.

    The text is split on commas into tags and each tag on whitespace into
    words. For every word, each prefix ``word[:k]`` with
    ``min_length <= k <= limit`` is added, where ``limit`` is the word length,
    capped by ``max_length`` when one is given.

    Args:
        text: Comma-separated tags, e.g. ``"action, sci fi"``.
        min_length: Shortest prefix length to emit.
        max_length: Longest prefix length to emit; ``None`` means the whole
            word (no cap).

    Returns:
        A set of prefix strings. Empty tags (empty input, doubled or
        leading/trailing commas) contribute nothing instead of raising.
    """
    all_shingles = set()
    for tag in text.split(','):
        # str.split() with no argument also discards surrounding whitespace,
        # so blank tags simply yield no words.
        for word in tag.split():
            limit = len(word) if max_length is None else min(len(word), max_length)
            for end in range(min_length, limit + 1):
                all_shingles.add(word[:end])
    return all_shingles

# Attach each movie's prefix-shingle set, derived from its lowercased genre string.
df['shingles'] = df['genres'].apply(create_shingles)

In [158]:

# Create hash functions with coefficients a and b
def create_hash_functions(num_hash_functions, max_val):
    """Generate a fixed, reproducible list of ``(a, b)`` hash coefficients.

    Args:
        num_hash_functions: How many ``(a, b)`` pairs to generate.
        max_val: Inclusive upper bound for both coefficients; ``a`` is drawn
            from ``[1, max_val]`` and ``b`` from ``[0, max_val]``.

    Returns:
        A list of ``(a, b)`` integer tuples. The sequence depends only on the
        arguments because a generator seeded with 40 is used on every call.

    Raises:
        ValueError: Propagated from ``randint`` when ``max_val < 1`` and at
            least one pair is requested.
    """
    # A private generator yields the same numbers as reseeding the module-level
    # one with 40, but leaves the global random state untouched for other code.
    rng = random.Random(40)
    hash_functions = []
    for _ in range(num_hash_functions):
        a = rng.randint(1, max_val)
        b = rng.randint(0, max_val)
        hash_functions.append((a, b))
    return hash_functions

# Convert hash functions to arrays for CUDA
def hash_functions_to_arrays(hash_functions):
    """Split ``(a, b)`` coefficient pairs into two parallel int32 arrays.

    Args:
        hash_functions: Sequence of pairs; element 0 is ``a``, element 1 is ``b``.

    Returns:
        ``(a_array, b_array)``, both ``numpy.int32`` and in input order.
    """
    multipliers = []
    offsets = []
    for coeffs in hash_functions:
        multipliers.append(coeffs[0])
        offsets.append(coeffs[1])
    return np.array(multipliers, dtype=np.int32), np.array(offsets, dtype=np.int32)

# CUDA kernel to compute hash functions and update signatures
# Kernel contract (as used by the launcher in this notebook):
#   shingles   - 2-D array, one row of numeric shingle codes per item; a 0
#                entry ends the row (rows are zero-padded on the right).
#   num_hashes - number of (a, b) hash functions to evaluate.
#   signatures - 2-D output array; entry [row, i] receives the minimum of
#                (a_array[i] * code + b_array[i]) % max_val over the row's codes.
#   a_array, b_array - per-hash coefficients.
#   max_val    - modulus applied to every hash value.
@cuda.jit
def cuda_hash_function(shingles, num_hashes, signatures, a_array, b_array, max_val):
    # One thread handles one row of `shingles`; surplus threads do nothing.
    idx = cuda.grid(1)
    if idx < shingles.shape[0]:
        for i in range(num_hashes):
            # Starting value for the running minimum (a large constant instead of inf).
            # NOTE(review): if a row has no codes this constant is written out
            # unchanged; the launcher allocates `signatures` as int32, which
            # cannot represent 2**32 - 1 — confirm the resulting value is acceptable.
            hash_value = 2**32 - 1  # A large constant instead of inf
            for j in range(shingles.shape[1]):
                # 0 marks the end of this row's data.
                if shingles[idx, j] == 0:
                    break
                current_hash = (a_array[i] * shingles[idx, j] + b_array[i]) % max_val
                hash_value = min(hash_value, current_hash)
            signatures[idx, i] = hash_value

# Compute MinHash signatures on GPU
def minhash_signatures_cuda(movies, num_hash_functions=30):
    """Compute MinHash signatures for every entry of ``movies`` on the GPU.

    Args:
        movies: Mapping of title -> iterable of shingle strings.
        num_hash_functions: Length of each signature.

    Returns:
        Dict mapping each title to a list of ``num_hash_functions`` ints.
        An empty ``movies`` mapping yields an empty dict.

    Notes:
        * Each shingle is reduced to the sum of its characters' code points
          before hashing, so shingles with equal sums are indistinguishable.
        * The hash modulus is the number of distinct shingles across
          ``movies``; signatures from calls with different inputs therefore
          use different hash parameters.
    """
    if not movies:
        # Avoids max() over an empty sequence and a zero-block kernel launch.
        return {}

    all_shingles = set()
    for shingles in movies.values():
        all_shingles.update(shingles)
    # Modulus for the hash functions. Clamped to 1 so that inputs with no
    # shingles at all do not break coefficient generation or the modulo.
    max_val = max(len(all_shingles), 1)

    hash_functions = create_hash_functions(num_hash_functions, max_val)
    a_array, b_array = hash_functions_to_arrays(hash_functions)

    # Zero-padded matrix of numeric shingle codes; at least one column wide so
    # the device array is never zero-sized.
    max_shingle_len = max(1, max(len(shingles) for shingles in movies.values()))
    shingles_gpu = np.zeros((len(movies), max_shingle_len), dtype=np.int32)
    # Initial fill only; the kernel writes every entry. 2**32 - 1 does not fit
    # in int32 (recent NumPy versions reject it), so use the int32 maximum.
    signatures_gpu = np.full(
        (len(movies), num_hash_functions), np.iinfo(np.int32).max, dtype=np.int32
    )

    for i, shingles in enumerate(movies.values()):
        codes = [sum(ord(c) for c in shingle) for shingle in shingles]
        # 0 is the kernel's end-of-row marker, so a zero code (empty shingle)
        # must not appear inside the data or it would truncate the row.
        shingle_hashes = np.array([code for code in codes if code != 0], dtype=np.int32)
        shingles_gpu[i, :len(shingle_hashes)] = shingle_hashes

    # Transfer data to GPU
    shingles_gpu_device = cuda.to_device(shingles_gpu)
    signatures_gpu_device = cuda.to_device(signatures_gpu)
    a_array_device = cuda.to_device(a_array)
    b_array_device = cuda.to_device(b_array)

    # Launch one thread per movie, rounded up to whole blocks.
    threads_per_block = 256
    blocks_per_grid = (len(movies) + threads_per_block - 1) // threads_per_block
    cuda_hash_function[blocks_per_grid, threads_per_block](
        shingles_gpu_device,
        num_hash_functions,
        signatures_gpu_device,
        a_array_device,
        b_array_device,
        max_val,
    )

    # Copy signatures back to CPU and key them by title.
    signatures_cpu = signatures_gpu_device.copy_to_host()
    signatures_dict = {title: signatures_cpu[i].tolist() for i, title in enumerate(movies.keys())}

    return signatures_dict


# Build shingles, compute MinHash signatures on the GPU, and persist the result.
# The 'shingles' column is recomputed here from the genre strings.
df['shingles'] = df['genres'].astype(str).apply(create_shingles)
# Signatures are keyed by 'originalTitle'.
# NOTE(review): rows sharing a title collapse to one dict entry — confirm titles are unique.
new_signatures_cuda = minhash_signatures_cuda(df.set_index('originalTitle')['shingles'].to_dict())
df['minhash_signature'] = df['originalTitle'].map(new_signatures_cuda)
# LSH.movie_query reads this file back; the signature lists come back as text there.
df.to_csv('temp_gpu.csv')




In [159]:
class LSH:
    """Banded locality-sensitive hashing index over MinHash signatures.

    A signature is cut into ``num_bands`` consecutive bands of
    ``num_rows_per_band`` values. Each band is folded into a 32-bit polynomial
    hash (base 31) and reduced modulo ``max_val`` to a bucket id. Items that
    share a bucket id are candidate neighbours.

    NOTE(review): buckets are keyed by bucket id only, so all bands share one
    table and a band of one item can collide with a different band of another.

    Attributes:
        num_bands: Number of bands read from each signature.
        num_rows_per_band: Signature values per band.
        max_val: Number of distinct bucket ids.
        df: Contents of 'movie_with_test.csv', loaded at construction; not
            used by the methods of this class.
        buckets: Dict mapping bucket id -> list of inserted titles.
    """

    def __init__(self, num_bands, num_rows_per_band, max_val):
        self.num_bands = num_bands
        self.num_rows_per_band = num_rows_per_band
        self.max_val = max_val
        self.df = pd.read_csv('movie_with_test.csv')
        self.buckets = {}

    def _band_bucket_id(self, signature, band_index):
        """Return the bucket id of one band, using the same arithmetic as the insert kernel."""
        start_row = band_index * self.num_rows_per_band
        band_signature = 0
        for row in range(start_row, start_row + self.num_rows_per_band):
            band_signature = (band_signature * 31 + int(signature[row])) & 0xFFFFFFFF
        return band_signature % self.max_val

    def insert_parallel(self, signatures_dict):
        """Rebuild ``self.buckets`` from a title -> signature mapping using the GPU.

        Args:
            signatures_dict: Mapping of title -> equal-length sequence of ints.

        Raises:
            ValueError: If the signatures are not a rectangular 2-D table with
                at least ``num_bands * num_rows_per_band`` values per row (the
                kernel would otherwise read past the end of a row).
        """
        titles = list(signatures_dict.keys())
        num_movies = len(titles)
        if num_movies == 0:
            # Nothing to hash; skip the device transfer and kernel launch.
            self.buckets = {}
            return

        # Row i of `signatures` belongs to titles[i]; `keys` carries the row ids.
        keys = np.arange(num_movies, dtype=np.int32)
        signatures = np.array(list(signatures_dict.values()), dtype=np.int32)

        rows_needed = self.num_bands * self.num_rows_per_band
        if signatures.ndim != 2 or signatures.shape[1] < rows_needed:
            raise ValueError(
                "each signature must contain at least num_bands * num_rows_per_band "
                f"({rows_needed}) values"
            )

        # Transfer data to GPU
        keys_device = cuda.to_device(keys)
        signatures_device = cuda.to_device(signatures)

        # One thread per item: hash every band of its signature into a bucket id.
        @cuda.jit
        def parallel_insert(keys, signatures, num_bands, num_rows_per_band, max_buckets, bucket_ids):
            idx = cuda.grid(1)
            if idx < keys.size:
                signature = signatures[idx]
                for band_index in range(num_bands):
                    start_row = band_index * num_rows_per_band
                    end_row = start_row + num_rows_per_band
                    band_signature = 0
                    for row in range(start_row, end_row):
                        band_signature = (band_signature * 31 + signature[row]) & 0xFFFFFFFF
                    bucket_id = band_signature % max_buckets
                    bucket_ids[idx, band_index] = bucket_id

        # Output buffer: one bucket id per (item, band).
        bucket_ids = np.zeros((num_movies, self.num_bands), dtype=np.int32)
        bucket_ids_device = cuda.to_device(bucket_ids)

        # Launch kernel
        threads_per_block = 256
        blocks_per_grid = (num_movies + threads_per_block - 1) // threads_per_block
        parallel_insert[blocks_per_grid, threads_per_block](
            keys_device,
            signatures_device,
            self.num_bands,
            self.num_rows_per_band,
            self.max_val,
            bucket_ids_device,
        )

        # Copy bucket ids back to the CPU and group titles by bucket.
        bucket_ids_cpu = bucket_ids_device.copy_to_host()
        buckets_dict = {}
        for title, band_buckets in zip(titles, bucket_ids_cpu):
            for bucket_id in band_buckets:
                buckets_dict.setdefault(int(bucket_id), []).append(title)

        self.buckets = buckets_dict

    def movie_similar(self, query_signature):
        """Return the set of titles sharing at least one bucket with the query.

        Args:
            query_signature: Indexable sequence of ints with at least
                ``num_bands * num_rows_per_band`` entries.

        Returns:
            Set of titles found in the query's buckets (possibly empty).
        """
        candidates = set()
        for band_index in range(self.num_bands):
            # Bucket ids are stored reduced modulo max_val at insert time, so
            # the lookup must apply the same reduction to hit the same bucket.
            bucket_id = self._band_bucket_id(query_signature, band_index)
            candidates.update(self.buckets.get(bucket_id, ()))
        return candidates

    def movie_query(self, tag_query):
        """Find candidates similar to the first movie whose genres match ``tag_query``.

        Reads 'temp_gpu.csv', selects the first row whose 'genres' contains
        ``tag_query`` (case-insensitive; pandas treats it as a regular
        expression), parses that title's stored signature and looks it up.

        Returns:
            Set of candidate titles; an empty set when no genre matches.
        """
        df = pd.read_csv('temp_gpu.csv')

        matches = df[df['genres'].str.contains(tag_query, case=False, na=False)]
        if matches.empty:
            return set()
        movie_title = matches.iloc[0]['originalTitle']

        query_signature = df[df['originalTitle'] == movie_title]['minhash_signature'].values[0]

        # The CSV stores the signature as the text of a Python list. Parse it
        # as a literal only; eval() would execute arbitrary file content.
        query_signature = ast.literal_eval(query_signature)

        return self.movie_similar(query_signature)

    def show_buckets(self):
        """Print the number of non-empty buckets."""
        print(len(self.buckets))


In [160]:
def generate_query_signature(query_tags, minhash_signatures_func, num_hash_functions=30):
    """Compute a MinHash signature for an ad-hoc tag string.

    Args:
        query_tags: Comma-separated tag text, shingled with ``create_shingles``.
        minhash_signatures_func: Callable taking ``(title -> shingles dict,
            num_hash_functions)`` and returning a ``title -> signature`` dict.
            ``None`` falls back to ``minhash_signatures_cuda``.
        num_hash_functions: Signature length to request.

    Returns:
        The signature (list of ints) produced for the query.

    NOTE(review): ``minhash_signatures_cuda`` derives its hash modulus from the
    shingles it is given, so a signature computed for a single query presumably
    uses different hash parameters than signatures computed over the full
    dataset — confirm before using the result for LSH lookups.
    """
    if minhash_signatures_func is None:
        minhash_signatures_func = minhash_signatures_cuda
    query_shingles = create_shingles(query_tags)
    # Single-entry mapping so the batch signature function can be reused as-is.
    query_signatures = minhash_signatures_func({'query_movie': query_shingles}, num_hash_functions)
    return query_signatures['query_movie']


In [161]:
# Initialize LSH with parameters
max_val = 2**16  # Number of bucket ids; adjust based on your needs
# Split each MinHash signature into 5 bands of 2 rows each.
# NOTE(review): that reads only the first 10 values of each signature, while
# signatures are generated with 30 hash functions by default — confirm intended.
lsh = LSH(num_bands=5, num_rows_per_band=2, max_val=max_val)

# Title -> signature mapping taken from the in-memory frame.
signatures_dict = df.set_index('originalTitle')['minhash_signature'].to_dict()

lsh.insert_parallel(signatures_dict)



In [162]:
# Query movies similar to a given title.
query_movie = 'Der Hausmeister'
# Take the signature of the first row with this title.
query_signature = df[df['originalTitle'] == query_movie]['minhash_signature'].iloc[0]

candidate_movies = lsh.movie_similar(query_signature)

In [163]:
# Look up candidates for the first movie whose genres contain the fragment "fami".
tag = "fami"
query_result = lsh.movie_query(tag)
query_result

'The Millionaire'