In [1]:
# dev/python3


A class Shingling that constructs k–shingles of a given length k (e.g., 10) from a given document, computes a hash value for each unique shingle, and represents the document in the form of an ordered set of its hashed k-shingles.

A class CompareSets that computes the Jaccard similarity of two sets of integers – two sets of hashed shingles.

A class MinHashing that builds a minHash signature (in the form of a vector or a set) of a given length n from a given set of integers (a set of hashed shingles).

A class CompareSignatures that estimates similarity of two integer vectors – minhash signatures – as a fraction of components, in which they agree.

(Optional task for extra 2 bonus) A class LSH that implements the LSH technique: given a collection of minhash signatures (integer vectors) and a similarity threshold t, the LSH class (using banding and hashing) finds all candidate pairs of signatures that agree on at least fraction t of their components.

In [79]:
#read data
import pandas as pd
import numpy as np

df = pd.read_csv('dataset-CalheirosMoroRita-2017.csv',  error_bad_lines=False, engine='python', sep="\t")
df.head(10)

Unnamed: 0,Review
0,"Everything from the weather, staff, food, pro..."
1,"The hotel it is fantastic built by the sea, li..."
2,One dream! Cozy and comfortable Hotel! The b...
3,Hotel concept is hard to grasp. They communica...
4,"This is a wonderful hotel, for a romantic esca..."
5,Gold room fantastic. We loved and we will come...
6,"One of the best Hotels in the world, and in th..."
7,"Land Room is wonderful! An amazing place, as ..."
8,One of the greatest meals ever. Everything wa...
9,"A peaceful place, built with a lot of tas..."


In [94]:
#clean punctuation
df['Review'] = df['Review'].str.replace(r'[^\w\s]+\n', '')
df.head(10)

Unnamed: 0,Review
0,Everything from the weather staff food proper...
1,The hotel it is fantastic built by the sea liv...
2,One dream Cozy and comfortable Hotel The bes...
3,Hotel concept is hard to grasp They communicat...
4,This is a wonderful hotel for a romantic escap...
5,Gold room fantastic We loved and we will come ...
6,One of the best Hotels in the world and in the...
7,Land Room is wonderful An amazing place as so...
8,One of the greatest meals ever Everything was...
9,A peaceful place built with a lot of tast...


In [135]:
#shingling 
shingle_size = 5


def shingles(words, n = shingle_size):
    return [words[i:i+n] for i in range(len(words) - n + 1) if len(words[i]) < 5]

df_shingled = df.copy()
df_shingled['Review'] = df.Review.map(lambda x : x.split())
df['shingles'] = df_shingled.Review.map(shingles)
df['shingles'] = df.shingles.map(lambda shingles : [x for x in set(tuple(x) for x in shingles)])

print(df.iloc[0])

Review       Everything from the weather staff food proper...
shingles    [(and, beach, were, top, notch), (from, the, w...
Name: 0, dtype: object


In [159]:
#hashing
import binascii
def s_hash(shingle):
    return binascii.crc32(shingle) & 0xffffffff
df['hashes'] = df.shingles.map(lambda shingles: [s_hash((" ".join(word for word in shingle)).encode()) for shingle in shingles])

df.head(5)

Unnamed: 0,Review,shingles,hashes
0,Everything from the weather staff food proper...,"[(and, beach, were, top, notch), (from, the, w...","[2904611771, 1845932384, 4179125628, 353316970..."
1,The hotel it is fantastic built by the sea liv...,"[(is, fantastic, built, by, the), (The, hotel,...","[1474786912, 3874797931, 3410383113, 148981026..."
2,One dream Cozy and comfortable Hotel The bes...,"[(and, all, the, guests, were), (the, end, of,...","[330661309, 538340614, 2284279399, 3491894724,..."
3,Hotel concept is hard to grasp They communicat...,"[(it, lacks, in, choice, Service), (has, highq...","[3780940546, 1701566673, 3644347488, 804875039..."
4,This is a wonderful hotel for a romantic escap...,"[(a, toast, with, the, free), (free, red, wine...","[2099714984, 3526732456, 2610462274, 332096853..."


In [175]:
#minhash signatures
import time
import random

t0 = time.time()
max_shingle_id = 2**32-1
big_prime = 4294967311



def rand_coefficients(k):
    rand_coeff = []
    while k > 0:
        randIndex = random.randint(0, maxShingleID) 

    while randIndex in randList:
          randIndex = random.randint(0, maxShingleID) 

    randList.append(randIndex)
    k = k - 1
    return rand_coeff

numhashes = df.hashes.map(lambda x: len(x)).sum()

coeff1 = rand_coefficients(numhashes)
coeff2 = rand_coefficients(numhashes)








10766