# Requiment 1

## Library

In [2]:
import pandas as pd
import numpy as np
from numpy import dot
from numpy.linalg import norm
import hashlib
import random

## Data

In [85]:
def read_file(filename):
      with open(filename, 'r') as file:
          text = file.readlines()
      df = pd.DataFrame(text, columns=["Text"])
      return df

## LSH

In [151]:
from sklearn.metrics.pairwise import cosine_similarity
class InMemoryMinHashLSH:
    def read_file(self, filename):
        with open(filename, 'r') as file:
            text = file.readlines()
        df = pd.DataFrame(text, columns=["Text"])
        return df
    def __init__(self, documents):
      ##Init self data
      data = self.read_file("/content/WebOfScience-5736.txt")
      self.documents = data
      self.minhashing_self = None
      self.lsh_self = None
      self.shingled_self = self.shingling()

      ## Init Query data
      self.documents = documents
      self.shingled_query = self.shingling()
      self.minhashing_query = None
      self.lsh_query = None

      self.boolean_vectors_self, self.boolean_vectors_query = self.convert_to_boolean_vectors()


    def convert_to_boolean_vectors(self):
      bool_vectors_query = []
      bool_vectors_self = []
      # Create a set to store all unique shingles from both self and query shingled data
      shingles_set = set()
      shingles_set.update(self.shingled_self)
      shingles_set.update(self.shingled_query)
      for eles in shingles_set:
        if eles in self.shingled_self:
          bool_vectors_self.append(1)
        else:
          bool_vectors_self.append(0)

      for eles in shingles_set:
        if eles in self.shingled_query:
          bool_vectors_query.append(1)
        else:
          bool_vectors_query.append(0)

      return bool_vectors_self, bool_vectors_query


    def shingling(self):
      k = 8  # Length of each shingle
      all_shingles = set()  # Set to store all unique shingles

      # Iterate over each document in the 'Text' column of self.documents
      for line in self.documents['Text']:
          # Split the line into words and remove punctuation
          words_list = line.replace(",","").replace(";","").replace(".","").replace(":","").split(" ")

          # Iterate over the words to create shingles of length k
          for i in range(len(words_list) - k + 1):
              # Extract a shingle of length k as a string
              shingle = " ".join(words_list[i:i+k])
              all_shingles.add(shingle)  # Add the shingle to the set of all shingles

      self.shingled_query = all_shingles  # Set self.shingled_query to the set containing all unique shingles
      if self.minhashing_self:
        self.convert_to_boolean_vectors()
      return all_shingles

    def minhashing(self, numhash):
      arr = list(range(1, len(self.boolean_vectors_query) + 1))
      minhash_self = []
      minhash_query =[]
      flag1 = 0
      flag2 = 0

      for num in range(numhash):
          random.shuffle(arr)
          for i in range(1, len(arr) + 1):
              if self.boolean_vectors_self[arr.index(i)] == 1 and flag1 == 0:
                  minhash_self.append(i)
                  flag1 = flag1 + 1
              if self.boolean_vectors_query[arr.index(i)] == 1 and flag2 == 0:
                  minhash_query.append(i)
                  flag2 = flag2 + 1
              if flag1 == flag2 == 1:
                  flag1 = 0
                  flag2 = 0
                  break

      return minhash_self, minhash_query



    def locality_sensity_hashing(self):
        # Your locality sensitive hashing implementation here
        pass

    def run(self):
      # Run the entire process
      self.shingled_query = self.shingling()
      self.boolean_vectors_self, self.boolean_vectors_query = self.convert_to_boolean_vectors()
      self.minhashing_self,self.minhashing_query = self.minhashing(10)
      print(self.boolean_vectors_query)
      return


    def approxNearestNeighbors(self, key, n):
        pass


In [152]:
data = read_file("/content/datatest.txt")
minhash_lsh = InMemoryMinHashLSH(data)
minhash_lsh.run()

[0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 

In [136]:
import random

arr1 = [1, 1, 0, 0, 0, 1, 1]
arr2 = [1, 0, 0, 0, 0, 1, 1]
numhash = 100
arr = list(range(1, len(arr1) + 1))
minhashing1 = []
minhashing2 = []
flag1 = 0
flag2 = 0
for num in range(numhash):
    random.shuffle(arr)
    for i in range(1, len(arr) + 1):
        if arr1[arr.index(i)] == 1 and flag1 == 0:
            minhashing1.append(i)
            flag1 = flag1 + 1
        if arr2[arr.index(i)] == 1 and flag2 == 0:
            minhashing2.append(i)
            flag2 = flag2 + 1
        if flag1 == flag2 == 1:
            flag1 = 0
            flag2 = 0
            break
def find_union(arr1,arr2):
  result = 0
  for i in range(len(arr1)):
    if(arr1[i]==arr2[i]):
      result += 1
  return result

similariy_arr = find_union(arr1,arr2)/len(arr1)
similariy_hash = find_union(minhashing1,minhashing2)/numhash

print(similariy_arr)
print(similariy_hash)

0.8571428571428571
0.78
