# Homework 1: Finding Similar Items: Textually Similar Documents

The task is to implement shingling, minhashing and LSH algorithms in order to compare text-based documents.

In [17]:
#read data
import pandas as pd
import numpy as np

# Load the tab-separated review dataset. Rows the parser cannot handle are
# skipped (with a message) instead of aborting the whole load.
# NOTE(review): `error_bad_lines` is deprecated in recent pandas releases in
# favour of `on_bad_lines` — confirm the installed pandas version before upgrading.
df = pd.read_csv(
    'dataset-CalheirosMoroRita-2017.csv',
    sep="\t",
    engine='python',
    error_bad_lines=False,
)
# Preview the first rows of the loaded frame
df.head()

Skipping line 5: '	' expected after '"'


Unnamed: 0,Review
0,Test one two three
1,Test one two four
2,"Everything from the weather, staff, food, prop..."
3,"We full enjoyed the place, and facilities."
4,"Thanks for the """"cidreira"""" and """"madalenas"""" ..."


In [18]:
# Remove punctuation (double quotes included) from every review. Whitespace and
# newlines are kept on purpose: the shingling step splits the text on them.
# `regex=True` is passed explicitly because pandas 2.0 changed the default of
# `Series.str.replace` to literal matching, which would silently leave the
# text uncleaned. `\s` already covers `\n`, and `"` is already matched by the
# negated class, so a single substitution is enough.
df['Review'] = df['Review'].str.replace(r'[^\w\s]', '', regex=True)
df.head()

Unnamed: 0,Review
0,Test one two three
1,Test one two four
2,Everything from the weather staff food propert...
3,We full enjoyed the place and facilities
4,Thanks for the cidreira and madalenas tea at r...


## Shingling phase

Here we perform the shingling phase: each document is divided into word tuples whose size is given by the variable `shingle_size`.

1. Assign shingling size
1. Split the documents in list of words
1. Group words in tuples of `shingle_size`, keeping only the tuples whose first word is shorter than 5 characters
1. Transform the list of tuples into a set of tuples to avoid duplication and put it in the `shingles` column of the dataframe
1. Create a new column with a hashed version of shingles:
    * join the tuples into a string
    * hash the byte-encoded string using `s_hash` function

In [21]:
#shingling phase
#number of consecutive words that form one shingle
shingle_size = 3

def shingles(words, n = shingle_size, max_first_word_len = 5):
    """Group consecutive words into shingles of ``n`` words.

    A sliding window of ``n`` words is moved over ``words`` one position at a
    time. A window is kept only when its first word is shorter than
    ``max_first_word_len`` characters.

    Args:
        words: sequence of word tokens (e.g. the result of ``str.split``).
        n: number of words per shingle; defaults to ``shingle_size``.
        max_first_word_len: exclusive upper bound on the length of a
            shingle's first word. Pass ``None`` to keep every window.

    Returns:
        A list of ``words[i:i+n]`` slices in document order. The list is
        empty when ``words`` holds fewer than ``n`` tokens.
    """
    return [
        words[i:i + n]
        for i in range(len(words) - n + 1)
        if max_first_word_len is None or len(words[i]) < max_first_word_len
    ]

# Tokenise the reviews on whitespace in a copy, so the text column of `df` stays as it is
df_shingled = df.copy()
df_shingled['Review'] = df['Review'].map(lambda text: text.split())

# Build the word shingles of every review, then drop repeated shingles by
# passing them through a set of (hashable) tuples
df['shingles'] = df_shingled['Review'].map(shingles)
df['shingles'] = df['shingles'].map(lambda groups: list(set(map(tuple, groups))))

df.head(5)

Unnamed: 0,Review,shingles,hashes
0,Test one two three,"[(Test, one, two), (one, two, three)]","[2080834851, 2015744983]"
1,Test one two four,"[(one, two, four), (Test, one, two)]","[515743642, 2080834851]"
2,Everything from the weather staff food propert...,"[(dcor, spa, rooms), (spa, rooms, and), (the, ...","[2981500306, 184128390, 3069561215, 1132586348..."
3,We full enjoyed the place and facilities,"[(full, enjoyed, the), (We, full, enjoyed), (t...","[2575702922, 3949555090, 3903531572]"
4,Thanks for the cidreira and madalenas tea at r...,"[(and, madalenas, tea), (for, the, cidreira), ...","[1353929374, 1824094365, 813168974, 2672588445]"


In [22]:
import binascii

def s_hash(shingle):
    """Return the CRC32 checksum of ``shingle`` as an unsigned 32-bit integer.

    Args:
        shingle: bytes-like object to hash.

    Returns:
        An int in the range [0, 2**32 - 1].
    """
    checksum = binascii.crc32(shingle)
    # Keep only the low 32 bits so the result is never negative
    unsigned_mask = 0xffffffff
    return checksum & unsigned_mask

def _hash_shingles(shingle_list):
    # One hash per shingle: its words joined by single spaces, encoded to bytes, then hashed
    return [s_hash(" ".join(shingle).encode()) for shingle in shingle_list]

df['hashes'] = df['shingles'].map(_hash_shingles)

df.head(5)

Unnamed: 0,Review,shingles,hashes
0,Test one two three,"[(Test, one, two), (one, two, three)]","[2080834851, 2015744983]"
1,Test one two four,"[(one, two, four), (Test, one, two)]","[515743642, 2080834851]"
2,Everything from the weather staff food propert...,"[(dcor, spa, rooms), (spa, rooms, and), (the, ...","[2981500306, 184128390, 3069561215, 1132586348..."
3,We full enjoyed the place and facilities,"[(full, enjoyed, the), (We, full, enjoyed), (t...","[2575702922, 3949555090, 3903531572]"
4,Thanks for the cidreira and madalenas tea at r...,"[(and, madalenas, tea), (for, the, cidreira), ...","[1353929374, 1824094365, 813168974, 2672588445]"


## MinHash phase
In this phase we calculate the minhashes for each document.

1. Set the number of hashes per document in `numhashes`
2. Generate the two lists of coefficients for the minhash functions
3. Create the signature lists using generated coefficients and save it to the dataframe

In [24]:
import random

#number of hashes per document
numhashes = 10

#largest possible shingle id: the shingle hashes are unsigned 32-bit values
max_shingle_id = 2**32-1
#a prime number bigger than max_shingle_id, used as the modulus c in (ax+b)%c
big_prime = 4294967311

def rand_coefficients(k):
    """Draw ``k`` distinct random coefficients for the (a*x + b) % c hash functions.

    The values are sampled without replacement from ``[1, max_shingle_id]``.
    Zero is excluded so that a multiplier ``a`` can never collapse a hash
    function to the constant ``b``.

    Args:
        k: number of coefficients to generate.

    Returns:
        A list of ``k`` unique ints; an empty list when ``k <= 0``.

    Raises:
        ValueError: if ``k`` exceeds the number of available values.
    """
    if k <= 0:
        return []
    # random.sample guarantees uniqueness without the quadratic
    # "draw again while already in the list" loop
    return random.sample(range(1, max_shingle_id + 1), k)


#fix the seed so the coefficients (and therefore every signature) are the same on each run
random.seed(42)

#generate a for ax+b
coeff1 = rand_coefficients(numhashes)
#generate b for ax+b
coeff2 = rand_coefficients(numhashes)
#the space before "numhashes" keeps the number and the word apart in the message
print("coefficient array created for " + str(numhashes) + " numhashes")

coefficient array created for 10numhashes


In [47]:
#filter dataframe to eliminate empty hash lists; take a copy so the column
#assignment below writes to an independent frame instead of a slice of the
#old one (which would raise SettingWithCopyWarning)
df = df[df.hashes.apply(len) > 0].copy()

#create a column of empty lists (one separate list per row)
df["signature"] = [[] for _ in range(len(df))]

def min_hash(row, a_coeffs=None, b_coeffs=None, prime=None):
    """Compute the MinHash signature of one document and store it on ``row``.

    For every hash function ``(a*x + b) % prime`` the function is applied to
    all shingle hashes in ``row.hashes`` and the minimum result becomes one
    component of the signature.

    Args:
        row: object exposing ``hashes`` (non-empty iterable of ints) and a
            writable ``signature`` attribute, e.g. a dataframe row.
        a_coeffs: multipliers ``a``; defaults to the first ``numhashes``
            entries of the module-level ``coeff1``.
        b_coeffs: offsets ``b``; defaults to the first ``numhashes`` entries
            of the module-level ``coeff2``.
        prime: modulus; defaults to the module-level ``big_prime``.

    Returns:
        The same ``row`` with ``signature`` set to the list of minima.

    Raises:
        ValueError: if ``row.hashes`` is empty (``min`` of an empty sequence).
    """
    # Fall back to the notebook-wide configuration when nothing is passed in
    if a_coeffs is None:
        a_coeffs = coeff1[:numhashes]
    if b_coeffs is None:
        b_coeffs = coeff2[:numhashes]
    if prime is None:
        prime = big_prime

    #apply (ax+b)%c to every shingle hash and keep the minimum per hash function
    row.signature = [
        min((a * h + b) % prime for h in row.hashes)
        for a, b in zip(a_coeffs, b_coeffs)
    ]
    return row

# Compute the MinHash signature of every document, one dataframe row at a time
df = df.apply(min_hash, axis="columns")
df.head()


Unnamed: 0,Review,shingles,hashes,signature
0,Test one two three,"[(Test, one, two), (one, two, three)]","[2080834851, 2015744983]","[1877537571, 1759969086, 3674356561, 670456082..."
1,Test one two four,"[(one, two, four), (Test, one, two)]","[515743642, 2080834851]","[1374638825, 1369335934, 3283383602, 670456082..."
2,Everything from the weather staff food propert...,"[(dcor, spa, rooms), (spa, rooms, and), (the, ...","[2981500306, 184128390, 3069561215, 1132586348...","[1034231098, 144710362, 989412207, 41469827, 7..."
3,We full enjoyed the place and facilities,"[(full, enjoyed, the), (We, full, enjoyed), (t...","[2575702922, 3949555090, 3903531572]","[1035613178, 1079444216, 159366595, 922632930,..."
4,Thanks for the cidreira and madalenas tea at r...,"[(and, madalenas, tea), (for, the, cidreira), ...","[1353929374, 1824094365, 813168974, 2672588445]","[262562206, 599345854, 670457660, 844186963, 1..."


In [39]:
def jaccard(sig1, sig2):
    """Jaccard similarity of two collections, treated as sets.

    Args:
        sig1: first iterable of hashable items.
        sig2: second iterable of hashable items.

    Returns:
        Size of the intersection divided by the size of the union.

    Raises:
        ZeroDivisionError: if both inputs are empty.
    """
    left, right = set(sig1), set(sig2)
    shared = left & right
    combined = left | right
    return len(shared) / len(combined)

def min_hash_sim(sig1, sig2):
    """Estimate the similarity of two documents from their MinHash signatures.

    The estimate is the fraction of positions at which both signatures hold
    the same value. The length of the signatures themselves is used, so the
    function does not depend on any notebook-level setting.

    Args:
        sig1: first signature (sequence of ints).
        sig2: second signature, same length as ``sig1``.

    Returns:
        A float in [0, 1]: matching positions divided by signature length.

    Raises:
        ValueError: if the signatures differ in length or are empty.
    """
    if len(sig1) != len(sig2):
        raise ValueError("signatures must have the same length")
    if len(sig1) == 0:
        raise ValueError("signatures must not be empty")
    matches = sum(1 for left, right in zip(sig1, sig2) if left == right)
    return matches / len(sig1)


In [40]:
#quick sanity check of min_hash_sim on the first two documents
#(the set union/intersection expressions that used to precede this call were
#evaluated and thrown away, so they have been dropped)
min_hash_sim(df.signature.iloc[1], df.signature.iloc[0])

0.3

The `threshold` sets the lower bound for similarity.
We iterate over all pairs of documents and select only the pairs whose similarity score is at or above the threshold.

In [41]:
#minimum MinHash similarity a pair must reach to be reported
threshold = 0.1

#pull the signatures out of the dataframe once: repeating the .iloc lookups
#inside the quadratic pair loop is needlessly slow
all_signatures = df.signature.tolist()

#visit every unordered pair (i, j) with i < j exactly once
for i in range(len(all_signatures) - 1):
    for j in range(i + 1, len(all_signatures)):
        sim = min_hash_sim(all_signatures[i], all_signatures[j])
        if sim >= threshold:
            print(i," and ", j, ": ", sim)

0  and  1 :  0.3
3  and  135 :  0.1
8  and  12 :  0.2
8  and  20 :  0.1
8  and  44 :  0.4
8  and  45 :  0.1
8  and  50 :  0.1
8  and  57 :  0.2
8  and  88 :  0.3
8  and  90 :  0.3
8  and  92 :  0.4
8  and  125 :  0.3
8  and  130 :  0.3
8  and  134 :  0.4
8  and  146 :  0.1
8  and  149 :  0.4
8  and  184 :  0.2
8  and  192 :  0.3
8  and  225 :  0.2
8  and  360 :  0.3
9  and  143 :  0.2
9  and  334 :  0.1
10  and  67 :  0.1
10  and  186 :  0.1
10  and  192 :  0.1
10  and  228 :  0.1
10  and  229 :  0.2
12  and  20 :  0.1
12  and  44 :  0.2
12  and  45 :  0.2
12  and  50 :  0.1
12  and  56 :  0.1
12  and  57 :  0.1
12  and  88 :  0.2
12  and  90 :  0.2
12  and  92 :  0.2
12  and  125 :  0.2
12  and  130 :  0.2
12  and  134 :  0.2
12  and  146 :  0.1
12  and  149 :  0.2
12  and  184 :  0.1
12  and  192 :  0.2
12  and  224 :  0.1
12  and  225 :  0.1
12  and  360 :  0.2
13  and  46 :  0.2
13  and  179 :  0.1
13  and  330 :  0.3
13  and  345 :  0.1
13  and  366 :  0.2
13  and  385 :  0.1
13  

144  and  259 :  0.1
144  and  357 :  0.1
144  and  386 :  0.2
145  and  182 :  0.1
145  and  204 :  0.1
145  and  227 :  0.3
145  and  228 :  0.2
145  and  338 :  0.1
145  and  396 :  0.1
146  and  149 :  0.2
146  and  184 :  0.1
146  and  192 :  0.1
146  and  225 :  0.1
146  and  289 :  0.1
146  and  360 :  0.1
148  and  322 :  0.3
148  and  395 :  0.1
149  and  184 :  0.1
149  and  192 :  0.2
149  and  225 :  0.1
149  and  360 :  0.2
152  and  153 :  1.0
154  and  157 :  0.1
154  and  162 :  0.1
154  and  182 :  0.1
154  and  197 :  0.1
154  and  226 :  0.1
154  and  237 :  0.1
154  and  245 :  0.1
154  and  251 :  0.1
154  and  253 :  0.1
154  and  271 :  0.1
154  and  274 :  0.1
154  and  275 :  0.1
154  and  281 :  0.1
154  and  354 :  0.1
154  and  358 :  0.1
154  and  363 :  0.1
157  and  162 :  0.1
157  and  274 :  0.1
157  and  275 :  0.1
157  and  281 :  0.1
161  and  304 :  0.1
162  and  219 :  0.1
162  and  274 :  0.1
162  and  275 :  0.1
162  and  281 :  0.1
162  and  295

## LSH phase

We set the number of elements per band and add all the possible candidate pairs to a set. 
To do this we iterate over all pairs of documents once per band, comparing the corresponding band of their signatures. This leads to inefficient code, but it is acceptable for a small number of documents (<500), as in this test. 

In [42]:
#lsh implementation
#number of signature components that make up one band
elem_per_band = 2
candidate_pairs = set()

#pull the signatures out of the dataframe once instead of on every comparison
all_signatures = df.signature.tolist()
num_docs = len(all_signatures)

#trailing components that do not fill a whole band are ignored
for b in range(numhashes // elem_per_band):
    lo = b * elem_per_band
    hi = lo + elem_per_band
    #cut this band out of every signature once per band
    bands = [tuple(sig[lo:hi]) for sig in all_signatures]
    for i in range(num_docs - 1):
        for j in range(i + 1, num_docs):
            #compare the band element by element; comparing only the sums of
            #the band would also match different bands that add up to the same total
            if bands[i] == bands[j]:
                candidate_pairs.add((i, j))
candidate_pairs

{(8, 44),
 (8, 134),
 (44, 134),
 (55, 136),
 (55, 227),
 (55, 228),
 (75, 76),
 (90, 92),
 (90, 125),
 (90, 130),
 (90, 192),
 (90, 360),
 (92, 125),
 (92, 130),
 (92, 192),
 (92, 360),
 (97, 131),
 (97, 163),
 (112, 133),
 (125, 130),
 (131, 163),
 (134, 187),
 (136, 227),
 (136, 228),
 (144, 242),
 (144, 386),
 (148, 322),
 (152, 153),
 (192, 360),
 (206, 252),
 (226, 251),
 (227, 228),
 (242, 386)}

In [46]:
# Example: pick one pair of document positions from the candidate set
signatures = next(iter(candidate_pairs))
sig1, sig2 = signatures

# Show the text of both documents
for doc in (sig1, sig2):
    print(df.Review.iloc[doc])
print()
# Show the MinHash signature of both documents
for doc in (sig1, sig2):
    print(df.signature.iloc[doc])

# Estimated similarity of the pair, displayed as the cell result
min_hash_sim(df.signature.iloc[sig1], df.signature.iloc[sig2])

     Absolutely gorgeous and wonderful We will repeat for sure Thanks for the kindness        
 
Yesterday I drive to the restaurant Areas do Seixo to enjoy the peace and praised the quality of the environment and flavors
However I had not made a reservation while browsing the site after reading the Evases magazine about the reopening I did not get the idea that it was necessary
Regards the chef Leonardo Pereira authorized an exception and we could have lunch
There are no words to express the gratitude for the kindness
And then it was a memorable dining experience  Since the care and education of employees the combination of flavors the friendliness of the chef Leonardo Pereira Who kindly presented us with a surprise dessert
It is with great appreciation that mean that approx trip 700km worth of all the wonderful experience
In this regard I must complement all the elements of your team            

[876264128, 1003172853, 114275788, 218140582, 144069921, 636831237, 1123852149, 12456300

0.2