In [7]:
import random

# Reading Files function
def read_text_files(file_paths):
  contents = []
  for file_path in file_paths:
    with open(file_path, 'r', encoding='utf-8') as file:
      contents.append(file.read())
  return contents

# Generating Shingles k=75
def g_shingles(text, k):
  shingles = set()
  for i in range(len(text) - k + 1):
    shingles.add(text[i:i+k])
  return shingles

# Vocabulary and one-hot encoding
def c_vocab(shingles):
  vocabulary = {}
  for index, shingle in enumerate(shingles):
    vocabulary[shingle] = index
  return vocabulary

# Hash function
def simple_hash(string):
  hash_val = 0
  for char in string:
    hash_val = (hash_val * 31 + ord(char)) % 2**32
  return hash_val

# Signature using hash function
def g_signature(shingles, signature_length):
  signature = [float('inf')] * signature_length
  for shingle in shingles:
    hashed_shingle = simple_hash(shingle)
    for i in range(signature_length):
      a = random.randint(0, 2**32 - 1)
      b = random.randint(0, 2**32 - 1)
      hashed_value = (a * hashed_shingle + b) % (2**32)
      signature[i] = min(signature[i], hashed_value)
  return signature

# Jaccard similarity
def jaccard_similarity(set1, set2):
  intersection = len(set1.intersection(set2))
  union = len(set1.union(set2))
  return intersection / union

file_paths = ["text1.txt", "text2.txt", "text3.txt"]
roll_no = 15
signature_length = int(str(roll_no)[-2:])

file_contents = read_text_files(file_paths)

# Defining k
k = 15

file_shingles = [g_shingles(content, k) for content in file_contents]

vocabulary = c_vocab(set().union(*file_shingles))

file_signatures = [g_signature(shingles, signature_length) for shingles in file_shingles]

shingle_similarities = [
  jaccard_similarity(file_shingles[0], file_shingles[1]),
  jaccard_similarity(file_shingles[0], file_shingles[2]),
  jaccard_similarity(file_shingles[1], file_shingles[2])
]

signature_sets = [set(signature) for signature in file_signatures]
signature_similarities = [
  jaccard_similarity(signature_sets[0], signature_sets[1]),
  jaccard_similarity(signature_sets[0], signature_sets[2]),
  jaccard_similarity(signature_sets[1], signature_sets[2])
]

print("Shingle Similarities:", shingle_similarities)
print("Signature Similarities:", signature_similarities)
print("Signature Length:", signature_length)


Shingle Similarities: [0.005708498296657766, 0.0048614914433693715, 0.005373126388168482]
Signature Similarities: [0.0, 0.0, 0.0]
Signature Length: 15


In [8]:
import random

# Reading Files function
def read_text_files(file_paths):
  contents = []
  for file_path in file_paths:
    with open(file_path, 'r', encoding='utf-8') as file:
      contents.append(file.read())
  return contents

# Generating Shingles k=75
def g_shingles(text, k):
  shingles = set()
  for i in range(len(text) - k + 1):
    shingles.add(text[i:i+k])
  return shingles

# Vocabulary and one-hot encoding
def c_vocab(shingles):
  vocabulary = {}
  for index, shingle in enumerate(shingles):
    vocabulary[shingle] = index
  return vocabulary

# Hash function
def simple_hash(string):
  hash_val = 0
  for char in string:
    hash_val = (hash_val * 31 + ord(char)) % 2**32
  return hash_val

# Signature using hash function
def g_signature(shingles, signature_length):
  signature = [float('inf')] * signature_length
  for shingle in shingles:
    hashed_shingle = simple_hash(shingle)
    for i in range(signature_length):
      a = random.randint(0, 2**32 - 1)
      b = random.randint(0, 2**32 - 1)
      hashed_value = (a * hashed_shingle + b) % (2**32)
      signature[i] = min(signature[i], hashed_value)
  return signature

# Jaccard similarity
def jaccard_similarity(set1, set2):
  intersection = len(set1.intersection(set2))
  union = len(set1.union(set2))
  return intersection / union

file_paths = ["text1.txt", "text2.txt", "text3.txt"]
roll_no = 15
signature_length = int(str(roll_no)[-2:])

file_contents = read_text_files(file_paths)

# Defining k
k = 4

file_shingles = [g_shingles(content, k) for content in file_contents]

vocabulary = c_vocab(set().union(*file_shingles))

file_signatures = [g_signature(shingles, signature_length) for shingles in file_shingles]

shingle_similarities = [
  jaccard_similarity(file_shingles[0], file_shingles[1]),
  jaccard_similarity(file_shingles[0], file_shingles[2]),
  jaccard_similarity(file_shingles[1], file_shingles[2])
]

signature_sets = [set(signature) for signature in file_signatures]
signature_similarities = [
  jaccard_similarity(signature_sets[0], signature_sets[1]),
  jaccard_similarity(signature_sets[0], signature_sets[2]),
  jaccard_similarity(signature_sets[1], signature_sets[2])
]

print("Shingle Similarities:", shingle_similarities)
print("Signature Similarities:", signature_similarities)


Shingle Similarities: [0.3791843220338983, 0.3477113050883543, 0.3578845587926178]
Signature Similarities: [0.0, 0.0, 0.0]
