In [1]:
# dev/python3


A class Shingling that constructs k–shingles of a given length k (e.g., 10) from a given document, computes a hash value for each unique shingle, and represents the document in the form of an ordered set of its hashed k-shingles.

A class CompareSets that computes the Jaccard similarity of two sets of integers – two sets of hashed shingles.

A class MinHashing that builds a minHash signature (in the form of a vector or a set) of a given length n from a given set of integers (a set of hashed shingles).

A class CompareSignatures that estimates similarity of two integer vectors – minhash signatures – as a fraction of components, in which they agree.

(Optional task for extra 2 bonus) A class LSH that implements the LSH technique: given a collection of minhash signatures (integer vectors) and a similarity threshold t, the LSH class (using banding and hashing) finds all candidate pairs of signatures that agree on at least fraction t of their components.

In [2]:
#read data
import pandas as pd
import numpy as np

df = pd.read_csv('dataset-CalheirosMoroRita-2017.csv',  error_bad_lines=False, engine='python', sep="\t")
df.count()

Skipping line 5: '	' expected after '"'


Review    404
dtype: int64

In [3]:
#clean punctuation
df['Review'] = df['Review'].str.replace(r'[^\w\s\n]', '')
df['Review'] = df['Review'].str.replace('\"', '')

df.head(10)

Unnamed: 0,Review
0,Test one two three
1,Test one two four
2,Everything from the weather staff food propert...
3,We full enjoyed the place and facilities
4,Thanks for the cidreira and madalenas tea at r...
5,One dream Cozy and comfortable Hotel The bes...
6,Hotel concept is hard to grasp They communicat...
7,This is a wonderful hotel for a romantic escap...
8,Gold room fantastic We loved and we will come ...
9,One of the best Hotels in the world and in the...


In [35]:
#shingling 
shingle_size = 3


def shingles(words, n = shingle_size):
    return [words[i:i+n] for i in range(len(words) - n + 1) if len(words[i]) < 5]

df_shingled = df.copy()
df_shingled['Review'] = df.Review.map(lambda x : x.split())
df['shingles'] = df_shingled.Review.map(shingles)
df['shingles'] = df.shingles.map(lambda shingles : [x for x in set(tuple(x) for x in shingles)])

df.head(5)

Unnamed: 0,Review,shingles,hashes,signature
0,Test one two three,"[(Test, one, two), (one, two, three)]","[392562732, 3439823151, 3376122026]","[987681289, 132727393, 1119435899, 401586331, ..."
1,Test one two four,"[(Test, one, two), (one, two, four)]","[3439823151, 3977348428, 3376122026]","[1992896621, 319662534, 3105089854, 1987699199..."
2,Everything from the weather staff food propert...,"[(dcor, spa, rooms), (were, top, notch), (the,...","[803584749, 1253960632, 2359637912, 3648159412...","[214855121, 180436794, 1188378776, 476518534, ..."
3,We full enjoyed the place and facilities,"[(full, enjoyed, the), (the, place, and), (We,...","[609615837, 601731606, 3925338499, 3621188725]","[2064255174, 860178008, 360105669, 1084418888,..."
4,Thanks for the cidreira and madalenas tea at r...,"[(the, cidreira, and), (and, madalenas, tea), ...","[2136692008, 1821160267, 1070508967, 411529165...","[1440009519, 2670617479, 318438173, 132802080,..."


In [36]:
#hashing
import binascii
def s_hash(shingle):
    return binascii.crc32(shingle) & 0xffffffff

df['hashes'] = df.shingles.map(lambda shingles: [s_hash((" ".join(word for word in shingle)).encode()) for shingle in shingles])

df.head(5)

Unnamed: 0,Review,shingles,hashes,signature
0,Test one two three,"[(Test, one, two), (one, two, three)]","[2080834851, 2015744983]","[987681289, 132727393, 1119435899, 401586331, ..."
1,Test one two four,"[(Test, one, two), (one, two, four)]","[2080834851, 515743642]","[1992896621, 319662534, 3105089854, 1987699199..."
2,Everything from the weather staff food propert...,"[(dcor, spa, rooms), (were, top, notch), (the,...","[2981500306, 3207390694, 3069561215, 109145352...","[214855121, 180436794, 1188378776, 476518534, ..."
3,We full enjoyed the place and facilities,"[(full, enjoyed, the), (the, place, and), (We,...","[2575702922, 3903531572, 3949555090]","[2064255174, 860178008, 360105669, 1084418888,..."
4,Thanks for the cidreira and madalenas tea at r...,"[(the, cidreira, and), (and, madalenas, tea), ...","[2672588445, 1353929374, 1824094365, 813168974]","[1440009519, 2670617479, 318438173, 132802080,..."


In [37]:
#minhash signatures
import random
import time

t0 = time.time()
max_shingle_id = 2**32-1
big_prime = 4294967311

def rand_coefficients(k):
    rand_coeff = []  
    while k > 0:
        randIndex = random.randint(0, max_shingle_id) 
        while randIndex in rand_coeff:
                randIndex = random.randint(0, max_shingle_id) 
        rand_coeff.append(randIndex)
        k = k - 1
    return rand_coeff
numhashes = 10
coeff1 = rand_coefficients(numhashes)
coeff2 = rand_coefficients(numhashes)
print("coefficient array created for " + str(numhashes))

coefficient array created for 10


In [38]:
df = df[df.hashes.apply(len) > 0]
df["signature"] = np.empty((len(df),0)).tolist()
df.head()

Unnamed: 0,Review,shingles,hashes,signature
0,Test one two three,"[(Test, one, two), (one, two, three)]","[2080834851, 2015744983]",[]
1,Test one two four,"[(Test, one, two), (one, two, four)]","[2080834851, 515743642]",[]
2,Everything from the weather staff food propert...,"[(dcor, spa, rooms), (were, top, notch), (the,...","[2981500306, 3207390694, 3069561215, 109145352...",[]
3,We full enjoyed the place and facilities,"[(full, enjoyed, the), (the, place, and), (We,...","[2575702922, 3903531572, 3949555090]",[]
4,Thanks for the cidreira and madalenas tea at r...,"[(the, cidreira, and), (and, madalenas, tea), ...","[2672588445, 1353929374, 1824094365, 813168974]",[]


In [47]:
#loop over hash functions

def min_hash(row):
    signature = []
    f_results = []
    for i in range(numhashes):
        f_results = [((coeff1[i] * h  + coeff2[i]) % big_prime) for h in  row.hashes]
        signature.append(min(f_results))
    row.signature = signature
    return row

df = df.apply(min_hash, axis=1)
df.head(10)


Unnamed: 0,Review,shingles,hashes,signature
0,Test one two three,"[(Test, one, two), (one, two, three)]","[2080834851, 2015744983]","[1533411855, 2664973751, 1503616671, 126040450..."
1,Test one two four,"[(Test, one, two), (one, two, four)]","[2080834851, 515743642]","[1428629154, 2328271854, 1459505473, 928314541..."
2,Everything from the weather staff food propert...,"[(dcor, spa, rooms), (were, top, notch), (the,...","[2981500306, 3207390694, 3069561215, 109145352...","[683400583, 838273357, 318838460, 554377663, 1..."
3,We full enjoyed the place and facilities,"[(full, enjoyed, the), (the, place, and), (We,...","[2575702922, 3903531572, 3949555090]","[1084248128, 540453385, 2381694502, 164769707,..."
4,Thanks for the cidreira and madalenas tea at r...,"[(the, cidreira, and), (and, madalenas, tea), ...","[2672588445, 1353929374, 1824094365, 813168974]","[230031610, 479488910, 215881626, 49776448, 79..."
5,One dream Cozy and comfortable Hotel The bes...,"[(One, dream, Cozy), (the, fire, pits), (best,...","[4109749916, 2567802074, 490876368, 3653948711...","[32292070, 94859988, 117637266, 22034978, 4621..."
6,Hotel concept is hard to grasp They communicat...,"[(what, you, get), (is, friendly, although), (...","[3089389884, 3535329763, 2210686231, 423931707...","[203510417, 61824738, 37292788, 236492456, 156..."
7,This is a wonderful hotel for a romantic escap...,"[(have, to, make), (a, wonderful, hotel), (is,...","[3418884546, 620493595, 3660244085, 2887515362...","[211242738, 90357391, 178287620, 85619366, 827..."
8,Gold room fantastic We loved and we will come ...,"[(Gold, room, fantastic), (room, fantastic, We...","[1966932107, 2027626870, 1100955699, 316264339...","[508237506, 609161917, 696724894, 1205400795, ..."
9,One of the best Hotels in the world and in the...,"[(Ma, Bo, is), (all, over, the), (best, Room, ...","[4192145606, 3853051168, 21685393, 1934729424,...","[102259674, 139967230, 32761807, 329252261, 89..."


In [40]:
#similarity
def jaccard(sig1, sig2):
    return len(set(sig1).intersection(set(sig2)))/len(set(sig1).union(set(sig2)))

def min_hash_sim(sig1, sig2):
    count = 0
    for i in range(numhashes):
        if (sig1[i] == sig2[i]):
            count += 1
    return count/numhashes


In [41]:
#test cases
# df.signature.iloc[1]
# set(df.signature.iloc[0]).union(df.signature.iloc[1])
# set(df.signature.iloc[0]).intersection(df.signature.iloc[1])
# jaccard(df.signature.iloc[1], df.signature.iloc[0])

In [77]:
for i in range (0, len(df)-1):
    for j in range (i+1, len(df)):
        sim = min_hash_sim(df.signature.iloc[i], df.signature.iloc[j]);
        if sim >= 0.1: 
            print(i," and ", j, ": ", sim)

0  and  1 :  0.2
2  and  298 :  0.1
3  and  135 :  0.3
5  and  87 :  0.1
5  and  109 :  0.1
5  and  112 :  0.1
8  and  12 :  0.1
8  and  44 :  0.1
8  and  134 :  0.1
8  and  146 :  0.1
9  and  11 :  0.1
9  and  23 :  0.1
9  and  101 :  0.1
9  and  143 :  0.1
9  and  170 :  0.1
9  and  332 :  0.1
11  and  101 :  0.1
11  and  170 :  0.1
11  and  332 :  0.1
12  and  44 :  0.1
12  and  66 :  0.2
12  and  125 :  0.1
12  and  134 :  0.1
12  and  146 :  0.1
12  and  192 :  0.4
13  and  46 :  0.1
13  and  179 :  0.1
13  and  315 :  0.1
13  and  330 :  0.2
13  and  387 :  0.2
13  and  389 :  0.1
15  and  73 :  0.1
15  and  146 :  0.1
15  and  214 :  0.1
15  and  387 :  0.1
17  and  288 :  0.1
17  and  298 :  0.1
18  and  52 :  0.1
18  and  189 :  0.1
19  and  163 :  0.1
22  and  139 :  0.1
22  and  306 :  0.1
22  and  393 :  0.1
24  and  50 :  0.1
25  and  224 :  0.1
27  and  41 :  0.1
27  and  143 :  0.1
29  and  48 :  0.1
30  and  34 :  0.1
30  and  73 :  0.1
30  and  78 :  0.1
30  and  80 : 

In [89]:
#lsh implementation
elem_per_band = 1
candidate_pairs = set()

for b in range (0, numhashes//elem_per_band):
    for i in range (0, len(df)-1):
        for j in range (i+1, len(df)):
            s1 = 0
            s2 = 0
            for e in range(b*elem_per_band, b*elem_per_band+elem_per_band):
                s1 += df.signature.iloc[i][e]
                s2 += df.signature.iloc[j][e]
            if (s1 == s2): 
                candidate_pairs.add((i, j))
candidate_pairs

{(0, 1),
 (2, 298),
 (3, 135),
 (5, 87),
 (5, 109),
 (5, 112),
 (8, 12),
 (8, 44),
 (8, 134),
 (8, 146),
 (9, 11),
 (9, 23),
 (9, 101),
 (9, 143),
 (9, 170),
 (9, 332),
 (11, 101),
 (11, 170),
 (11, 332),
 (12, 44),
 (12, 66),
 (12, 125),
 (12, 134),
 (12, 146),
 (12, 192),
 (13, 46),
 (13, 179),
 (13, 315),
 (13, 330),
 (13, 387),
 (13, 389),
 (15, 73),
 (15, 146),
 (15, 214),
 (15, 387),
 (17, 288),
 (17, 298),
 (18, 52),
 (18, 189),
 (19, 163),
 (22, 139),
 (22, 306),
 (22, 393),
 (24, 50),
 (25, 224),
 (27, 41),
 (27, 143),
 (29, 48),
 (30, 34),
 (30, 73),
 (30, 78),
 (30, 80),
 (30, 155),
 (30, 229),
 (31, 105),
 (31, 125),
 (31, 171),
 (31, 189),
 (31, 281),
 (31, 315),
 (31, 351),
 (39, 127),
 (39, 342),
 (41, 51),
 (41, 143),
 (41, 286),
 (42, 57),
 (44, 134),
 (44, 146),
 (45, 50),
 (45, 57),
 (45, 61),
 (45, 88),
 (45, 90),
 (45, 92),
 (45, 125),
 (45, 130),
 (45, 192),
 (45, 360),
 (46, 98),
 (46, 149),
 (46, 330),
 (46, 331),
 (49, 136),
 (49, 189),
 (49, 204),
 (49, 282),


In [88]:
sig1 = 41
sig2 = 143

print(df.Review.iloc[sig1])
print(df.Review.iloc[sig2])
print()
print(df.signature.iloc[sig1])
print(df.signature.iloc[sig2])

min_hash_sim(df.signature.iloc[sig1], df.signature.iloc[sig2])

      Areas do Seixo is a realization of one dream and with your dream I found my dream       
     Areas do Seixo is the best Well come back        

[362575815, 200228398, 316249003, 581361384, 193860451, 1124273824, 560545114, 429417603, 82040367, 379650150]
[1025792058, 139967230, 316249003, 754296082, 155860831, 579184100, 890659176, 211692224, 82040367, 379650150]


0.3