In [1]:
# dev/python3


A class Shingling that constructs k–shingles of a given length k (e.g., 10) from a given document, computes a hash value for each unique shingle, and represents the document in the form of an ordered set of its hashed k-shingles.

A class CompareSets that computes the Jaccard similarity of two sets of integers – two sets of hashed shingles.

A class MinHashing that builds a minHash signature (in the form of a vector or a set) of a given length n from a given set of integers (a set of hashed shingles).

A class CompareSignatures that estimates similarity of two integer vectors – minhash signatures – as a fraction of components, in which they agree.

(Optional task for extra 2 bonus) A class LSH that implements the LSH technique: given a collection of minhash signatures (integer vectors) and a similarity threshold t, the LSH class (using banding and hashing) finds all candidate pairs of signatures that agree on at least fraction t of their components.

In [2]:
#read data
import pandas as pd
import numpy as np

df = pd.read_csv('dataset-CalheirosMoroRita-2017.csv',  error_bad_lines=False, engine='python', sep="\t")
df.head(10)

Skipping line 3: '	' expected after '"'


Unnamed: 0,Review
0,"Everything from the weather, staff, food, prop..."
1,"We full enjoyed the place, and facilities."
2,"Thanks for the """"cidreira"""" and """"madalenas"""" ..."
3,One dream! Cozy and comfortable Hotel! The b...
4,Hotel concept is hard to grasp. They communica...
5,"This is a wonderful hotel, for a romantic esca..."
6,Gold room fantastic. We loved and we will come...
7,"One of the best Hotels in the world, and in th..."
8,"Land Room is wonderful! An amazing place, as ..."
9,One of the greatest meals ever. Everything wa...


In [3]:
#clean punctuation
df['Review'] = df['Review'].str.replace(r'[^\w\s]+\n', '')
df.head(10)

Unnamed: 0,Review
0,"Everything from the weather, staff, food, prop..."
1,"We full enjoyed the place, and facilities."
2,"Thanks for the """"cidreira"""" and """"madalenas"""" ..."
3,One dream! Cozy and comfortable Hotel! The b...
4,Hotel concept is hard to grasp. They communica...
5,"This is a wonderful hotel, for a romantic esca..."
6,Gold room fantastic. We loved and we will come...
7,"One of the best Hotels in the world, and in th..."
8,"Land Room is wonderful! An amazing place, as ..."
9,One of the greatest meals ever. Everything wa...


In [4]:
#shingling 
shingle_size = 5


def shingles(words, n = shingle_size):
    return [words[i:i+n] for i in range(len(words) - n + 1) if len(words[i]) < 5]

df_shingled = df.copy()
df_shingled['Review'] = df.Review.map(lambda x : x.split())
df['shingles'] = df_shingled.Review.map(shingles)
df['shingles'] = df.shingles.map(lambda shingles : [x for x in set(tuple(x) for x in shingles)])

print(df.iloc[0])

Review      Everything from the weather, staff, food, prop...
shingles    [(and, beach, were, top, notch), (from, the, w...
Name: 0, dtype: object


In [5]:
#hashing
import binascii
def s_hash(shingle):
    return binascii.crc32(shingle) & 0xffffffff
df['hashes'] = df.shingles.map(lambda shingles: [s_hash((" ".join(word for word in shingle)).encode()) for shingle in shingles])

df.head(5)

Unnamed: 0,Review,shingles,hashes
0,"Everything from the weather, staff, food, prop...","[(and, beach, were, top, notch), (from, the, w...","[2904611771, 791644852, 3029275621, 4254103425..."
1,"We full enjoyed the place, and facilities.","[(We, full, enjoyed, the, place,), (full, enjo...","[2635458344, 1813270520]"
2,"Thanks for the """"cidreira"""" and """"madalenas"""" ...","[(for, the, """"cidreira"""", and, """"madalenas""""),...","[3444279829, 2888105924, 2008101651]"
3,One dream! Cozy and comfortable Hotel! The b...,"[(were, received, in, the, fire), (have, glute...","[3809192752, 1872724818, 3649858269, 102701028..."
4,Hotel concept is hard to grasp. They communica...,"[(is, hard, to, grasp., They), (was, ok,, but,...","[2972139538, 3843584761, 3946819817, 346379532..."


In [43]:
#minhash signatures
import time
import random

t0 = time.time()
max_shingle_id = 2**32-1
big_prime = 4294967311



def rand_coefficients(k):
    
    rand_coeff = []
        
    while k > 0:
        randIndex = random.randint(0, max_shingle_id) 

        while randIndex in rand_coeff:
                randIndex = random.randint(0, max_shingle_id) 

        rand_coeff.append(randIndex)
        k = k - 1
    return rand_coeff

numhashes = df.hashes.map(lambda x: len(x)).sum()

coeff1 = rand_coefficients(numhashes)
coeff2 = rand_coefficients(numhashes)

print("coefficient array created")

coefficient array created


In [64]:
#loop over hash functions

for i in range(0, 1):
    
    #minHashCode = big_prime + 1
    df["signatures"] = df.hashes.map(lambda hashes: [((coeff1[i] * h + coeff2[i]) % big_prime) for h in hashes])
    df["MinHash"] = [min(sig) for sig in df.signatures.tolist()]

df


ValueError: min() arg is an empty sequence