In [1]:
# dev/python3


A class Shingling that constructs k–shingles of a given length k (e.g., 10) from a given document, computes a hash value for each unique shingle, and represents the document in the form of an ordered set of its hashed k-shingles.

A class CompareSets that computes the Jaccard similarity of two sets of integers – two sets of hashed shingles.

A class MinHashing that builds a minHash signature (in the form of a vector or a set) of a given length n from a given set of integers (a set of hashed shingles).

A class CompareSignatures that estimates similarity of two integer vectors – minhash signatures – as a fraction of components, in which they agree.

(Optional task for extra 2 bonus) A class LSH that implements the LSH technique: given a collection of minhash signatures (integer vectors) and a similarity threshold t, the LSH class (using banding and hashing) finds all candidate pairs of signatures that agree on at least fraction t of their components.

In [2]:
#read data
import pandas as pd
import numpy as np

df = pd.read_csv('dataset-CalheirosMoroRita-2017.csv',  error_bad_lines=False, engine='python', sep="\t")
df.count()

Skipping line 5: '	' expected after '"'


Review    404
dtype: int64

In [3]:
#clean punctuation
df['Review'] = df['Review'].str.replace(r'[^\w\s\n]', '')
df['Review'] = df['Review'].str.replace('\"', '')

df.head(10)

Unnamed: 0,Review
0,Test one two three
1,Test one two four
2,Everything from the weather staff food propert...
3,We full enjoyed the place and facilities
4,Thanks for the cidreira and madalenas tea at r...
5,One dream Cozy and comfortable Hotel The bes...
6,Hotel concept is hard to grasp They communicat...
7,This is a wonderful hotel for a romantic escap...
8,Gold room fantastic We loved and we will come ...
9,One of the best Hotels in the world and in the...


In [35]:
#shingling 
shingle_size = 3


def shingles(words, n = shingle_size):
    return [words[i:i+n] for i in range(len(words) - n + 1) if len(words[i]) < 5]

df_shingled = df.copy()
df_shingled['Review'] = df.Review.map(lambda x : x.split())
df['shingles'] = df_shingled.Review.map(shingles)
df['shingles'] = df.shingles.map(lambda shingles : [x for x in set(tuple(x) for x in shingles)])

df.head(5)

Unnamed: 0,Review,shingles,hashes,signature
0,Test one two three,"[(Test, one, two), (one, two, three)]","[392562732, 3439823151, 3376122026]","[987681289, 132727393, 1119435899, 401586331, ..."
1,Test one two four,"[(Test, one, two), (one, two, four)]","[3439823151, 3977348428, 3376122026]","[1992896621, 319662534, 3105089854, 1987699199..."
2,Everything from the weather staff food propert...,"[(dcor, spa, rooms), (were, top, notch), (the,...","[803584749, 1253960632, 2359637912, 3648159412...","[214855121, 180436794, 1188378776, 476518534, ..."
3,We full enjoyed the place and facilities,"[(full, enjoyed, the), (the, place, and), (We,...","[609615837, 601731606, 3925338499, 3621188725]","[2064255174, 860178008, 360105669, 1084418888,..."
4,Thanks for the cidreira and madalenas tea at r...,"[(the, cidreira, and), (and, madalenas, tea), ...","[2136692008, 1821160267, 1070508967, 411529165...","[1440009519, 2670617479, 318438173, 132802080,..."


In [36]:
#hashing
import binascii
def s_hash(shingle):
    return binascii.crc32(shingle) & 0xffffffff

df['hashes'] = df.shingles.map(lambda shingles: [s_hash((" ".join(word for word in shingle)).encode()) for shingle in shingles])

df.head(5)

Unnamed: 0,Review,shingles,hashes,signature
0,Test one two three,"[(Test, one, two), (one, two, three)]","[2080834851, 2015744983]","[987681289, 132727393, 1119435899, 401586331, ..."
1,Test one two four,"[(Test, one, two), (one, two, four)]","[2080834851, 515743642]","[1992896621, 319662534, 3105089854, 1987699199..."
2,Everything from the weather staff food propert...,"[(dcor, spa, rooms), (were, top, notch), (the,...","[2981500306, 3207390694, 3069561215, 109145352...","[214855121, 180436794, 1188378776, 476518534, ..."
3,We full enjoyed the place and facilities,"[(full, enjoyed, the), (the, place, and), (We,...","[2575702922, 3903531572, 3949555090]","[2064255174, 860178008, 360105669, 1084418888,..."
4,Thanks for the cidreira and madalenas tea at r...,"[(the, cidreira, and), (and, madalenas, tea), ...","[2672588445, 1353929374, 1824094365, 813168974]","[1440009519, 2670617479, 318438173, 132802080,..."


In [37]:
#minhash signatures
import random
import time

t0 = time.time()
max_shingle_id = 2**32-1
big_prime = 4294967311

def rand_coefficients(k):
    rand_coeff = []  
    while k > 0:
        randIndex = random.randint(0, max_shingle_id) 
        while randIndex in rand_coeff:
                randIndex = random.randint(0, max_shingle_id) 
        rand_coeff.append(randIndex)
        k = k - 1
    return rand_coeff
numhashes = 10
coeff1 = rand_coefficients(numhashes)
coeff2 = rand_coefficients(numhashes)
print("coefficient array created for " + str(numhashes))

coefficient array created for 10


In [38]:
df = df[df.hashes.apply(len) > 0]
df["signature"] = np.empty((len(df),0)).tolist()
df.head()

Unnamed: 0,Review,shingles,hashes,signature
0,Test one two three,"[(Test, one, two), (one, two, three)]","[2080834851, 2015744983]",[]
1,Test one two four,"[(Test, one, two), (one, two, four)]","[2080834851, 515743642]",[]
2,Everything from the weather staff food propert...,"[(dcor, spa, rooms), (were, top, notch), (the,...","[2981500306, 3207390694, 3069561215, 109145352...",[]
3,We full enjoyed the place and facilities,"[(full, enjoyed, the), (the, place, and), (We,...","[2575702922, 3903531572, 3949555090]",[]
4,Thanks for the cidreira and madalenas tea at r...,"[(the, cidreira, and), (and, madalenas, tea), ...","[2672588445, 1353929374, 1824094365, 813168974]",[]


In [39]:
#loop over hash functions

def min_hash(row):
    signature = []
    f_results = []
    for i in range(numhashes):
        f_results = [((coeff1[i] * h  + coeff2[i]) % big_prime) for h in  row.hashes]
        signature.append(min(f_results))
    row.signature = signature
    return row

df = df.apply(min_hash, axis=1)
df.signature.head()


0    [1533411855, 2664973751, 1503616671, 126040450...
1    [1428629154, 2328271854, 1459505473, 928314541...
2    [683400583, 838273357, 318838460, 554377663, 1...
3    [1084248128, 540453385, 2381694502, 164769707,...
4    [230031610, 479488910, 215881626, 49776448, 79...
Name: signature, dtype: object

In [40]:
#similarity
def jaccard(sig1, sig2):
    return len(set(sig1).intersection(set(sig2)))/len(set(sig1).union(set(sig2)))

def min_hash_sim(sig1, sig2):
    count = 0
    for i in range(numhashes):
        if (sig1[i] == sig2[i]):
            count += 1
    return count/numhashes


In [41]:
#test cases
# df.signature.iloc[1]
# set(df.signature.iloc[0]).union(df.signature.iloc[1])
# set(df.signature.iloc[0]).intersection(df.signature.iloc[1])
# jaccard(df.signature.iloc[1], df.signature.iloc[0])

In [42]:
for i in range (0, len(df)-1):
    for j in range (i+1, len(df)):
        sim = min_hash_sim(df.signature.iloc[i], df.signature.iloc[j]);
        if sim > 0.1: 
            print(i," and ", j, ": ", sim)
        
            

0  and  1 :  0.2
3  and  135 :  0.3
12  and  66 :  0.2
12  and  192 :  0.4
13  and  330 :  0.2
13  and  387 :  0.2
31  and  351 :  0.2
41  and  143 :  0.3
42  and  57 :  0.2
44  and  134 :  0.2
45  and  92 :  0.2
45  and  125 :  0.2
45  and  192 :  0.2
50  and  130 :  0.2
50  and  360 :  0.2
55  and  136 :  0.3
55  and  145 :  0.2
55  and  227 :  0.3
55  and  228 :  0.2
60  and  308 :  0.2
60  and  311 :  0.2
66  and  192 :  0.3
66  and  254 :  0.2
70  and  72 :  0.2
70  and  393 :  0.3
71  and  253 :  0.2
72  and  393 :  0.2
75  and  76 :  1.0
79  and  113 :  0.2
80  and  102 :  0.3
87  and  109 :  0.2
87  and  112 :  0.2
90  and  171 :  0.2
90  and  292 :  0.2
92  and  125 :  0.2
92  and  130 :  0.2
92  and  192 :  0.3
95  and  193 :  0.2
98  and  234 :  0.3
98  and  280 :  0.2
101  and  170 :  0.2
109  and  112 :  0.2
109  and  225 :  0.2
119  and  120 :  0.3
123  and  360 :  0.2
125  and  192 :  0.4
127  and  342 :  0.2
130  and  360 :  0.3
133  and  253 :  0.2
133  and  356 :  0.2