In [1]:
import hashlib
from collections import Counter

def tokenize(text):
    """Break ``text`` into lowercase, whitespace-delimited tokens.

    The whole string is lowercased first and then split on runs of
    whitespace. Nothing else is stripped, so punctuation stays attached
    to the word it touches (``"dog."`` and ``"dog"`` are different tokens).

    Args:
        text: The string to tokenize.

    Returns:
        A list of lowercase tokens; empty when ``text`` is empty or
        contains only whitespace.
    """
    lowered = text.lower()
    return lowered.split()

def get_features(tokens):
    """Count how many times each token occurs.

    Args:
        tokens: An iterable of hashable tokens (for example the list
            returned by ``tokenize``).

    Returns:
        A plain ``dict`` mapping each distinct token to its number of
        occurrences, with keys in first-seen order. An empty iterable
        yields an empty dict.
    """
    # Counter does the tallying; convert back to a plain dict so callers
    # keep getting exactly the type they received before.
    return dict(Counter(tokens))

def simhash(features, hashbits=64):
    """Compute a SimHash fingerprint from weighted string features.

    Every feature is hashed with MD5. For each of the low ``hashbits``
    bit positions, the feature's weight is added to that position's
    tally when the digest has the bit set and subtracted otherwise.
    The fingerprint has a 1 in every position whose final tally is
    zero or positive.

    Args:
        features: Mapping of feature string -> numeric weight, such as
            the dict produced by ``get_features``.
        hashbits: Number of bit positions in the fingerprint (default 64).

    Returns:
        The fingerprint as a non-negative ``int``.

    Notes:
        * A tally of exactly zero counts as "set", so an empty
          ``features`` mapping yields a fingerprint with all
          ``hashbits`` bits set.
        * An MD5 digest has only 128 bits, so positions at or above
          bit 128 never see a set digest bit and stay 0 for any
          positively weighted feature.
    """
    tally = [0] * hashbits
    for feature, weight in features.items():
        # Big-endian read of the raw digest: the same integer as parsing
        # the hex digest in base 16.
        digest = int.from_bytes(hashlib.md5(feature.encode('utf-8')).digest(), 'big')
        for pos in range(hashbits):
            # Vote this bit position down when the digest bit is clear,
            # up when it is set.
            if (digest >> pos) & 1 == 0:
                tally[pos] -= weight
            else:
                tally[pos] += weight
    # Each qualifying position contributes a distinct power of two, so
    # summing them is the same as OR-ing them together.
    return sum(1 << pos for pos, score in enumerate(tally) if score >= 0)

def hamming_distance(hash1, hash2):
    """Count the bit positions in which two integer hashes differ.

    Args:
        hash1: First hash, as an integer.
        hash2: Second hash, as an integer.

    Returns:
        The number of 1 bits in ``hash1 ^ hash2``.

    Raises:
        ValueError: If exactly one of the hashes is negative. The XOR is
            then negative, which in Python's unbounded two's-complement
            view has infinitely many set bits; a clear-lowest-bit loop
            would never terminate on such a value.
    """
    diff = hash1 ^ hash2
    if diff < 0:
        raise ValueError(
            "hamming_distance is undefined for hashes of opposite sign"
        )
    # bin() renders the non-negative XOR as '0b...'; the prefix contains
    # no '1', so counting '1' characters is exactly the population count.
    return bin(diff).count("1")

# Example usage: two sentences that differ in a single word
# ("jumps" vs "leaps").
text1 = "The quick brown fox jumps over the lazy dog"
text2 = "The quick brown fox leaps over the lazy dog"

# Run both texts through each pipeline stage side by side:
# tokens -> token frequencies -> SimHash fingerprint.
tokens1, tokens2 = tokenize(text1), tokenize(text2)
features1, features2 = get_features(tokens1), get_features(tokens2)
hash1, hash2 = simhash(features1), simhash(features2)

# Show both fingerprints and the number of bits in which they differ.
print(f"SimHash of text1: {hash1}")
print(f"SimHash of text2: {hash2}")
print(f"Hamming distance: {hamming_distance(hash1, hash2)}")


SimHash of text1: 3279303472042380063
SimHash of text2: 4397603971125643799
Hamming distance: 9


In [2]:
# Example usage: two unrelated sentences, for contrast with a
# near-duplicate pair.
text1 = "This is another test"
text2 = "The quick brown fox leaps over the lazy dog"

# Same pipeline for both texts:
# tokens -> token frequencies -> SimHash fingerprint.
tokens1, tokens2 = tokenize(text1), tokenize(text2)
features1, features2 = get_features(tokens1), get_features(tokens2)
hash1, hash2 = simhash(features1), simhash(features2)

# Show both fingerprints and the number of bits in which they differ.
print(f"SimHash of text1: {hash1}")
print(f"SimHash of text2: {hash2}")
print(f"Hamming distance: {hamming_distance(hash1, hash2)}")


SimHash of text1: 17212422180994792698
SimHash of text2: 4397603971125643799
Hamming distance: 37
