# Importing phase

In [16]:
import pandas as pd
import numpy as np

df = pd.read_csv('dataset-CalheirosMoroRita-2017.csv',  error_bad_lines=False, engine='python', sep="\t")
df['Review'] = df['Review'].str.replace(r'[^\w\s]+', '')
documents = df
documents.head()

Unnamed: 0,Review
0,Everything from the weather staff food propert...
1,The hotel it is fantastic built by the sea liv...
2,One dream Cozy and comfortable Hotel The bes...
3,Hotel concept is hard to grasp They communicat...
4,This is a wonderful hotel for a romantic escap...


# Shingling phase

In [17]:
import time
import binascii
import re

print("Shingling articles...")
print()

#initializing variables
shingle_size = 5
docsAsShingleSets = {}
docNames = []
totalShingles = 0
shingleNo = 0

# loop through all the documents
for index, row in documents.iterrows():
    
    # Read all of the words (they are all on one line)
    review = row.iloc[0]
    words = re.sub("[^\w]", " ",  review).split()

    # Retrieve the article ID
    docID = index

    # Maintain a list of all document IDs.
    docNames.append(docID)

    # 'shinglesInDoc' will hold all of the unique shingles present in the
    # current document. If a shingle ID occurs multiple times in the document,
    # it will only appear once in the set.

  
    # keep word shingles
    shinglesInDocWords = set()

    # keep hashed shingles
    shinglesInDocInts = set()

    shingle = []
    # For each word in the document...
    for index in range(len(words) - shingle_size + 1):
        # Construct the shingle text by combining k words together.
        shingle = words[index:index + shingle_size]
        shingle = ' '.join(shingle)
        #print(shingle)
        
        # Hash the shingle to a 32-bit integer.
        crc = binascii.crc32(shingle.encode("utf-8")) & 0xffffffff

        if shingle not in shinglesInDocWords:
            shinglesInDocWords.add(shingle)
        # Add the hash value to the list of shingles for the current document.
        # Note that set objects will only add the value to the set if the set
        # doesn't already contain it.

        if crc not in shinglesInDocInts:
            shinglesInDocInts.add(crc)
            # Count the number of shingles across all documents.
            shingleNo = shingleNo + 1
        
        else:
            del shingle
            index = index - 1

    # Store the completed list of shingles for this document in the dictionary.
    docsAsShingleSets[docID] = shinglesInDocInts

totalShingles = shingleNo

print('Total number of shingles', shingleNo)
# Report how long shingling took.
print('\nShingled ' + str(len(docsAsShingleSets)) + ' docs ')
print('\nAverage shingles per doc: %.2f' % (shingleNo / len(docsAsShingleSets)))


Shingling articles...

Total number of shingles 17370

Shingled 401 docs 

Average shingles per doc: 43.32


# MinHash phase

In [24]:
# Define virtual Triangle matrices to hold the similarity values. For storing
# similarities between pairs, we only need roughly half the elements of a full
# matrix. Using a triangle matrix requires less than half the memory of a full
# matrix. Using a triangle matrix requires less than half the memory of a full
# matrix, and can protect the programmer from inadvertently accessing one of
# the empty/invalid cells of a full matrix.

# Calculate the number of elements needed in our triangle matrix
numElems = int(len(docsAsShingleSets) * (len(docsAsShingleSets) - 1) / 2)

# Initialize two empty lists to store the similarity values.
# 'JSim' will be for the actual Jaccard Similarity values.
# 'estJSim' will be for the estimated Jaccard Similarities found by comparing
# the MinHash signatures.
JSim = [0 for x in range(numElems)]
estJSim = [0 for x in range(numElems)]


# Define a function to map a 2D matrix coordinate into a 1D index.
def getTriangleIndex(i, j):
    # If i == j that's an error.
    if i == j:
        sys.stderr.write("Can't access triangle matrix with i == j")
        sys.exit(1)
    # If j < i just swap the values.
    if j < i:
        temp = i
        i = j
        j = temp

    # Calculate the index within the triangular array.
    # This fancy indexing scheme is taken from pg. 211 of:
    # http://infolab.stanford.edu/~ullman/mmds/ch6.pdf
    # But I adapted it for a 0-based index.
    # Note: The division by two should not truncate, it
    #       needs to be a float.
    k = int(i * (len(docsAsShingleSets) - (i + 1) / 2.0) + j - i) - 1

    return k



In [34]:
#print '\nGenerating random hash functions...'

numHashes = 10

# Record the maximum shingle ID that we assigned.
maxShingleID = 2**32-1

# We need the next largest prime number above 'maxShingleID'.
# I looked this value up here: 
# http://compoasso.free.fr/primelistweb/page/prime/liste_online_en.php
nextPrime = 4294967311

# Our random hash function will take the form of:
#   h(x) = (a*x + b) % c
# Where 'x' is the input value, 'a' and 'b' are random coefficients, and 'c' is
# a prime number just greater than maxShingleID.


# For each of the 'numHashes' hash functions, generate a different coefficient 'a' and 'b'.   
coeffA = np.random.randint(10, size=numHashes)
coeffB = np.random.randint(10, size=numHashes)

#print '\nGenerating MinHash signatures for all documents...'

# List of documents represented as signature vectors
signatures = []

# Rather than generating a random permutation of all possible shingles, 
# we'll just hash the IDs of the shingles that are *actually in the document*,
# then take the lowest resulting hash code value. This corresponds to the index 
# of the first shingle that you would have encountered in the random order.

# For each document...
for docID in docNames:
  
  # Get the shingle set for this document.
  shingleIDSet = docsAsShingleSets[docID]
  
  # The resulting minhash signature for this document. 
  signature = []
  
  # For each of the random hash functions...
  for i in range(0, numHashes):
    
    # For each of the shingles actually in the document, calculate its hash code
    # using hash function 'i'. 
    
    # Track the lowest hash ID seen. Initialize 'minHashCode' to be greater than
    # the maximum possible value output by the hash.
    minHashCode = nextPrime + 1
    
    # For each shingle in the document...
    for shingleID in shingleIDSet:
      # Evaluate the hash function.
      hashCode = (coeffA[i] * shingleID + coeffB[i]) % nextPrime 
      
      # Track the lowest hash code seen.
      if hashCode < minHashCode:
        minHashCode = hashCode

    # Add the smallest hash code value as component number 'i' of the signature.
    signature.append(minHashCode)
  
  # Store the MinHash signature for this document.
  signatures.append(signature)

       
#print "\nGenerating MinHash signatures took %.2fsec" % elapsed  
#print(signatures)




[[465994212, 22061690, 371702080, 465994210, 1461071360, 285208245, 123900697, 285208238, 285208239, 123900696], [297962055, 59262782, 82285406, 297962053, 73602191, 148981033, 29631397, 148981026, 148981027, 29631396], [93110333, 59870251, 181920127, 93110331, 11757624, 46555172, 60640046, 46555165, 46555166, 60640045], [3908057, 11724162, 17586250, 3908055, 15632224, 1954034, 5862087, 1954027, 1954028, 5862086], [68358925, 94149560, 141224347, 68358923, 56689912, 34179468, 47074786, 34179461, 34179462, 47074785], [765916699, 416005388, 624008089, 765916697, 1344014333, 382958355, 208002700, 382958348, 382958349, 208002699], [60632665, 181897986, 247042178, 60632663, 4598504, 30316338, 90948999, 30316331, 30316332, 90948998], [84435424, 60191708, 401935924, 84435422, 178357328, 108482156, 133978645, 108482149, 108482150, 133978644], [122525415, 133664816, 152266527, 122525413, 121281462, 61262713, 66832414, 61262706, 61262707, 66832413], [410878307, 312734896, 58260140, 410878305, 684

In [36]:
numDocs = documents.size

# Creates a N x N matrix initialized to 0.

# Time this step.
t0 = time.time()

# For each of the test documents...
for i in range(0, numDocs):
  # Get the MinHash signature for document i.
  signature1 = signatures[i]
    
  # For each of the other test documents...
  for j in range(i + 1, numDocs):
    
    # Get the MinHash signature for document j.
    signature2 = signatures[j]
    
    count = 0
    # Count the number of positions in the minhash signature which are equal.
    for k in range(0, numHashes):
      count = count + (signature1[k] == signature2[k])
    
    # Record the percentage of positions which matched.    
    estJSim[getTriangleIndex(i, j)] = (count / numHashes)

In [47]:
s0 = len(docsAsShingleSets[0])
# For every document pair...
i = 90

# Print progress every 100 documents.
if (i % 100) == 0:
    print("  (" + str(i) + " / " + str(len(docsAsShingleSets)) + ")")

# Retrieve the set of shingles for document i.
s1 = docsAsShingleSets[docNames[i]]
neighbors_of_given_documentSHINGLES = {}

for j in range(0, len(docsAsShingleSets)):
    if j != i:
        # Retrieve the set of shingles for document j.
        s2 = docsAsShingleSets[docNames[j]]

        # Calculate and store the actual Jaccard similarity.
        JSim[getTriangleIndex(i, j)] = (len(s1.intersection(s2)) / float(len(s1.union(s2))))
        percsimilarity = JSim[getTriangleIndex(i, j)] * 100
        if (percsimilarity > 0):
            # Print out the match and similarity values with pretty spacing.
            print("  %5s --> %5s   %.2f%s   " % (docNames[i], docNames[j], percsimilarity, '%'))
            neighbors_of_given_documentSHINGLES[j] = percsimilarity

sorted_neigborsSHINGLES = sorted(neighbors_of_given_documentSHINGLES.items(), key=lambda x: x[1], reverse=True)

#print 'Comparing Shingles ...'