# Algorithms for Data Science -- Laboratory 2
Author: Pablo Mollá Chárlez

## Finding Similar Items

The objective of this lab is to implement the Min-Hashing and Locality Sensitive Hashing. This lab needs Python and Jupyter, along with the NumPy package.

1. We first load the required libraries and the files containing the documents.

In [1]:
import sys
import numpy as np
import urllib.request
import re
import string
import random

file_location = 'https://phparis.net/slides/algo_4_ds/week2/tweets.txt' #you can change this to a local file on your computer

# Keeping document in memory
infile = urllib.request.urlopen(file_location)
docs = []
for line in infile: 
  docs.append(str(line.strip()).lower())
print("Number of documents: %d"%len(docs))
print(docs)

Number of documents: 497
["b'@stellargirl i loooooooovvvvvveee my kindle2. not that the dx is cool, but the 2 is fantastic in its own right.'", "b'reading my kindle2...  love it... lee childs is good read.'", "b'ok, first assesment of the #kindle2 ...it fucking rocks!!!'", 'b"@kenburbary you\'ll love your kindle2. i\'ve had mine for a few months and never looked back. the new big one is huge! no need for remorse! :)"', 'b"@mikefish  fair enough. but i have the kindle2 and i think it\'s perfect  :)"', 'b"@richardebaker no. it is too big. i\'m quite happy with the kindle2."', "b'fuck this economy. i hate aig and their non loan given asses.'", "b'jquery is my new best friend.'", "b'loves twitter'", "b'how can you not love obama? he makes jokes about himself.'", 'b"check this video out -- president obama at the white house correspondents\' dinner http://bit.ly/imxum"', 'b"@karoli i firmly believe that obama/pelosi have zero desire to be civil.  it\'s a charade and a slogan, but they want t

2. We transform the document into $k$-shingles, and we hash them to their integer values. We compute the Jaccard similarity between two documents given as sets of shingle ids.

In [2]:
# k for shingles
k = 5

shingle_id = {}
id_shingle = []
m = []
ids = 0

total_shingles = 0

for d in docs:
  # Removing whitespace
  d_new = ''.join(c for c in d if c.isalnum())
  char_shing = [d_new[i:i+k] for i in range(len(d_new)-k+1)]
  total_shingles += len(char_shing)
  sid = set()
  for sh in char_shing:
    if sh not in shingle_id:
      shingle_id[sh]=ids
      id_shingle.append(sh)
      ids=ids+1
    sid.add(shingle_id[sh])
  m.append(sid)

print ("Unique shingles: %d"%len(id_shingle))
print ("Total shingles: %d"%total_shingles)
print(shingle_id)
print(id_shingle)
print(m)
m1 = [m[0], m[1]]
  

Unique shingles: 19532
Total shingles: 28150
{'bstel': 0, 'stell': 1, 'tella': 2, 'ellar': 3, 'llarg': 4, 'largi': 5, 'argir': 6, 'rgirl': 7, 'girli': 8, 'irlil': 9, 'rlilo': 10, 'liloo': 11, 'ilooo': 12, 'loooo': 13, 'ooooo': 14, 'oooov': 15, 'ooovv': 16, 'oovvv': 17, 'ovvvv': 18, 'vvvvv': 19, 'vvvve': 20, 'vvvee': 21, 'vveee': 22, 'veeem': 23, 'eeemy': 24, 'eemyk': 25, 'emyki': 26, 'mykin': 27, 'ykind': 28, 'kindl': 29, 'indle': 30, 'ndle2': 31, 'dle2n': 32, 'le2no': 33, 'e2not': 34, '2nott': 35, 'notth': 36, 'ottha': 37, 'tthat': 38, 'thatt': 39, 'hatth': 40, 'atthe': 41, 'tthed': 42, 'thedx': 43, 'hedxi': 44, 'edxis': 45, 'dxisc': 46, 'xisco': 47, 'iscoo': 48, 'scool': 49, 'coolb': 50, 'oolbu': 51, 'olbut': 52, 'lbutt': 53, 'butth': 54, 'utthe': 55, 'tthe2': 56, 'the2i': 57, 'he2is': 58, 'e2isf': 59, '2isfa': 60, 'isfan': 61, 'sfant': 62, 'fanta': 63, 'antas': 64, 'ntast': 65, 'tasti': 66, 'astic': 67, 'stici': 68, 'ticin': 69, 'icini': 70, 'cinit': 71, 'inits': 72, 'nitso': 73, 'i

In [3]:
def jaccard_similarity(doc1, doc2):
  if len(doc1)==0 or len(doc2)==0:
    return 0.0
  else:
    inter = doc1.intersection(doc2)
    #print(inter)
    union = doc1.union(doc2)
    #print(union)
    return float(len(inter))/float(len(union))

# Example
print(m[0])

print(jaccard_similarity(m[0],m[1]))

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80}
0.043859649122807015


3. We implement the method for min-hashing given a permutation.

In [4]:
def min_hash(doc, perm):
  for d in perm:
    if d in doc:
      return d

perm = list(range(len(id_shingle))) #([0,...,19532])
random.shuffle(perm)

min_hash(m[0],perm)

print(len(m))
print(len(id_shingle))


497
19532


4. Implement the full Min-Hashing signature matrix for a given number $n$ of permutations. Implement the similarity estimation based on Min-Hashing (i.e., the number of permutation on which two documents agree).

In [14]:
# YOUR CODE HERE
# Small example to understand
#m4 = [{0,1,2,3}, {1,2,4,5}, {3,1,5,0}]
#id_shingle = ['bstel', 'stell', 'tella', 'ellar', 'llarg', 'largi']


def compute_signature_matrix(docs, n_permutations):
    signature_matrix = np.full((n_permutations, len(docs)), np.inf)  # Initialize with infinity
    #print("Signature Matrix Empty:", signature_matrix)

    # Generate multiple permutations
    perm_list = [list(range(len(id_shingle))) for _ in range(n_permutations)]
    #print("Perm List:", perm_list)
    
    for perm in perm_list:
        random.shuffle(perm)

    for perm_idx, perm in enumerate(perm_list):
        for doc_idx, doc in enumerate(docs):
            min_hash_value = min_hash(doc, perm)
            #print(doc, perm, min_hash_value)
            signature_matrix[perm_idx, doc_idx] = min_hash_value
    
    return signature_matrix

# Example: 100 permutations
n_permutations = 500
signature_matrix = compute_signature_matrix(m, n_permutations)

print("Signature Matrix:\n\n", signature_matrix)

def minhash_similarity(sig_matrix, doc1_idx, doc2_idx):
    sig1 = sig_matrix[:, doc1_idx]
    #print(sig1)
    sig2 = sig_matrix[:, doc2_idx]
    #print(sig2)
    #print(sig1 == sig2)
    # Instead of calling the jaccard_similarity function (we would need to convert to sets)
    # We can use basic numpy operations
    return np.mean(sig1 == sig2)

# Example usage
sim_est = minhash_similarity(signature_matrix, 0, 1)
print(f"\nMinHash similarity estimation: {sim_est}")

Signature Matrix:

 [[ 3. nan nan ... nan nan nan]
 [ 0. nan nan ... nan nan nan]
 [ 5. nan nan ... nan nan nan]
 ...
 [ 5. nan nan ... nan nan nan]
 [ 3. nan nan ... nan nan nan]
 [ 1. nan nan ... nan nan nan]]

MinHash similarity estimation: 0.0


5. __TASK__ Implement Locality-Sensitive Hashing, given $b$ bands of $r$ rows such that $br=n$. Compute the similarity threshold needed using the formula in the lecture $t=(1/b)^{1/r}$. Assume that signatures in the same band are similar only if the are the same (i.e., they agree on all columns). Check for similarity all documents that agree in at least one band, and compare with the true jaccard similarity.

In [12]:
# YOUR CODE HERE
def compute_threshold(b, r):
    t = (1 / b) ** (1 / r)
    #print(f"Threshold: {t}")
    return t

def lsh(sig_matrix, b, r):
    print("Signature Matrix:\n", np.transpose(sig_matrix))
    num_docs = sig_matrix.shape[1]
    print("Number of Documents:", num_docs)
    candidates = set()

    for band_idx in range(b):
        print("Band number:", band_idx)
        buckets = {}
        for doc_idx in range(num_docs):
            print("     Document Index:", doc_idx)
            # Getting the rows corresponding to the current band
            band = tuple(sig_matrix[band_idx*r:(band_idx+1)*r, doc_idx])
            print("     Band:", band)
            if band in buckets:
                buckets[band].append(doc_idx)
            else:
                buckets[band] = [doc_idx]
            #print("     Buckets:", buckets)
        
        # Adding documents that hash to the same value in this band
        for docs_in_band in buckets.values():
            #print("Documents in Band:",docs_in_band)
            if len(docs_in_band) > 1:
                for i in range(len(docs_in_band)):
                    for j in range(i + 1, len(docs_in_band)):
                        candidates.add((docs_in_band[i], docs_in_band[j]))
        #print(candidates)

    return candidates

# Parameters
b = 100  # Number of bands (for dataset should be 20)
r = 5   # Rows per band (for dataset should be 5)
n = b * r  # Total number of permutations (should match n_permutations from step 4) (for dataset should be 100)

# Apply LSH
candidate_pairs = lsh(signature_matrix, b, r)
print(f"Candidate pairs: {candidate_pairs}")

# Compute the similarity threshold
threshold = compute_threshold(b, r)
print("Threshold:", threshold)

# Compute Min-Hash similarity for the candidates and compare with the threshold
for (doc1, doc2) in candidate_pairs:
    minhash_sim = minhash_similarity(signature_matrix, doc1, doc2)
    print(f"        MinHash similarity between doc {doc1} and doc {doc2}: {minhash_sim}")
    
    if minhash_sim >= threshold:
        print(f"    Documents {doc1} and {doc2} are similar based on MinHash (similarity: {minhash_sim})")

print("\n")
# Compute the actual/true Jaccard similarity for comparison
for (doc1, doc2) in candidate_pairs:
    true_sim = jaccard_similarity(m[doc1], m[doc2])
    #print("Docs:", m4[doc1], m4[doc2])
    print(f"Jaccard similarity between doc {doc1} and doc {doc2}: {true_sim}")

Candidate pairs: {(301, 303), (110, 352), (107, 348), (244, 245), (303, 305), (124, 363), (125, 362), (109, 352), (183, 423), (53, 288), (109, 110)}
Threshold: 0.39810717055349726
        MinHash similarity between doc 301 and doc 303: 0.218
        MinHash similarity between doc 110 and doc 352: 0.67
    Documents 110 and 352 are similar based on MinHash (similarity: 0.67)
        MinHash similarity between doc 107 and doc 348: 0.516
    Documents 107 and 348 are similar based on MinHash (similarity: 0.516)
        MinHash similarity between doc 244 and doc 245: 0.672
    Documents 244 and 245 are similar based on MinHash (similarity: 0.672)
        MinHash similarity between doc 303 and doc 305: 0.524
    Documents 303 and 305 are similar based on MinHash (similarity: 0.524)
        MinHash similarity between doc 124 and doc 363: 0.608
    Documents 124 and 363 are similar based on MinHash (similarity: 0.608)
        MinHash similarity between doc 125 and doc 362: 0.318
        MinHa