In [3]:
stateless_semantic_hashing = [
  "SimHash",
  "MinHash",
  "Random Projection LSH",
  "Random Hyperplane Hashing",
  "Feature Hashing",
  "Spectral Hashing",
  "Iterative Quantization (ITQ)",
  "Character Hash",
  "N-gram Hash",
  "Phonetic Hash"
]


In [10]:
import hashlib

def simhash(text, hash_bits=32):
    text = text.lower()
    features = [text[i:i+2] for i in range(len(text) - 1)]
    v = [0] * hash_bits
    
    for feature in features:
        h = int(hashlib.md5(feature.encode()).hexdigest(), 16)
        for i in range(hash_bits):
            if h & (1 << i):
                v[i] += 1
            else:
                v[i] -= 1
    
    fingerprint = 0
    for i in range(hash_bits):
        if v[i] > 0:
            fingerprint |= (1 << i)
    
    return format(fingerprint, '08x')

def minhash(text, num_hashes=32):
    text = text.lower()
    shingles = set(text[i:i+2] for i in range(len(text) - 1))
    
    hash_val = 0
    for i in range(num_hashes):
        min_hash = float('inf')
        for shingle in shingles:
            h = int(hashlib.md5(f"{shingle}{i}".encode()).hexdigest()[:8], 16)
            min_hash = min(min_hash, h)
        if (min_hash % 2) == 0:
            hash_val |= (1 << i)
    
    return format(hash_val, '08x')

def random_projection_lsh(text, num_projections=32, seed=42):
    text = text.lower()
    feature_vec = [text.count(chr(ord('a') + i)) for i in range(26)]
    
    hash_val = 0
    for i in range(num_projections):
        projection = sum(
            feature_vec[j] * ((hash((seed, i, j)) % 3) - 1)
            for j in range(len(feature_vec))
        )
        if projection > 0:
            hash_val |= (1 << i)
    
    return format(hash_val, '08x')

def random_hyperplane_hash(text, num_planes=32, seed=123):
    text = text.lower()
    feature_vec = [text.count(chr(ord('a') + i)) for i in range(26)]
    
    hash_val = 0
    for i in range(num_planes):
        dot_product = sum(
            feature_vec[j] * ((hash((seed, i, j)) % 5) - 2)
            for j in range(len(feature_vec))
        )
        if dot_product > 0:
            hash_val |= (1 << i)
    
    return format(hash_val, '08x')

def feature_hash(text, hash_bits=32):
    text = text.lower()
    features = [
        len(text),
        sum(text.count(v) for v in 'aeiou'),
        sum(text.count(c) for c in 'bcdfghjklmnpqrstvwxyz'),
        ord(text[0]) if text else 0,
        ord(text[-1]) if text else 0,
        len(set(text)),
    ]
    
    hash_input = ''.join(str(v) for v in features)
    h = int(hashlib.md5(hash_input.encode()).hexdigest(), 16)
    
    return format(h % (2**hash_bits), '08x')

def spectral_hash(text, hash_bits=32):
    text = text.lower()
    freq = [0] * 26
    for char in text:
        if 'a' <= char <= 'z':
            freq[ord(char) - ord('a')] += 1
    
    hash_val = 0
    for i in range(hash_bits):
        idx1 = (i * 7) % 26
        idx2 = (i * 13) % 26
        if freq[idx1] - freq[idx2] > 0:
            hash_val |= (1 << i)
    
    return format(hash_val, '08x')

def itq_hash(text, hash_bits=32):
    text = text.lower()
    features = [text.count(chr(ord('a') + i)) for i in range(26)]
    
    hash_val = 0
    for i in range(hash_bits):
        idx1 = (i * 3) % 26
        idx2 = (i * 5) % 26
        rotated_val = features[idx1] - features[idx2]
        if rotated_val > 0:
            hash_val |= (1 << i)
    
    return format(hash_val, '08x')

def character_hash(text, hash_bits=32):
    text = text.lower()
    hash_val = 0
    
    for i in range(min(len(text), hash_bits)):
        char_val = ord(text[i]) if i < len(text) else 0
        if (char_val * (i + 1)) % 2 == 0:
            hash_val |= (1 << i)
    
    return format(hash_val, '08x')

def ngram_hash(text, hash_bits=32):
    text = text.lower()
    patterns = ['a', 'e', 'i', 'o', 'u', 'ra', 'an', 'ar', 'sh', 'ee', 
                'ja', 'vi', 'sa', 'pr', 'de', 'am']
    
    hash_val = 0
    for i, pattern in enumerate(patterns[:hash_bits]):
        if pattern in text:
            hash_val |= (1 << i)
    
    return format(hash_val, '08x')

def phonetic_hash(text, hash_bits=32):
    text = text.lower()
    
    features = []
    features.append(1 if any(v in text for v in 'aeiou') else 0)
    features.append(1 if 'sh' in text or 'ch' in text else 0)
    features.append(1 if 'th' in text or 'ph' in text else 0)
    features.append(1 if text.startswith(tuple('aeiou')) else 0)
    features.append(1 if text.endswith(tuple('aeiou')) else 0)
    
    for cons in ['r', 'j', 's', 'v', 'k', 'm', 'n', 'd', 'p']:
        features.append(1 if cons in text else 0)
    
    hash_val = 0
    for i, feature in enumerate(features[:hash_bits]):
        if feature:
            hash_val |= (1 << i)
    
    return format(hash_val, '08x')

def hamming_distance(h1, h2):
    x = int(h1, 16) ^ int(h2, 16)
    return bin(x).count('1')

def similarity_score(hamming_dist, total_bits=32):
    return ((total_bits - hamming_dist) / total_bits) * 100
    

In [17]:
indian_names = [
    "Suresh", "Mahesh",
    "Arun", "Arjun", "Varun", "Deepak",
    "Deepa", "Sanjay", "Sanjeev", "Vikram",
    "Vikas", "Sujesh", "Sujeesh"
]

In [18]:
from tabulate import tabulate

table_data = []
headers = ["Name", "SimHash", "MinHash", "RandProj", "RandHP", "Feature", 
           "Spectral", "ITQ", "CharHash", "N-gram", "Phonetic"]
for name in indian_names:
    row = [
        name,
        simhash(name),
        minhash(name),
        random_projection_lsh(name),
        random_hyperplane_hash(name),
        feature_hash(name),
        spectral_hash(name),
        itq_hash(name),
        character_hash(name),
        ngram_hash(name),
        phonetic_hash(name)
    ]
    table_data.append(row)
print(tabulate(table_data, headers=headers, tablefmt="grid"))


+---------+-----------+-----------+------------+----------+-----------+------------+----------+------------+----------+------------+
| Name    | SimHash   | MinHash   | RandProj   | RandHP   | Feature   | Spectral   | ITQ      | CharHash   |   N-gram | Phonetic   |
| Suresh  | d8e4fcb6  | 3f9add43  | 00aaa8c0   | 29542b59 | ed6cc774  | 08204502   | 01800c40 | 0000002e   | 00000112 | 000000a3   |
+---------+-----------+-----------+------------+----------+-----------+------------+----------+------------+----------+------------+
| Mahesh  | 57a5cea0  | 15785d47  | 20abfdc8   | 695cab1d | dec57bc6  | 08000002   | 40000c10 | 0000002e   | 00000103 | 00000483   |
+---------+-----------+-----------+------------+----------+-----------+------------+----------+------------+----------+------------+
| Arun    | b2416e82  | 13153bdb  | c8000aa2   | 295c2b4b | fbfb092e  | 00000000   | 01800000 | 0000000a   | 00000091 | 00000829   |
+---------+-----------+-----------+------------+----------+----------

In [20]:
from tabulate import tabulate

# Similar pairs
similar_pairs = [
    ("Rajesh", "Rajeev"),
    ("Arun", "Arjun"),
    ("Varun", "Arun"),
    ("Amit", "Amita"),
    ("Deepak", "Deepa"),
    ("Anjali", "Anjalika"),
    ("Sanjay", "Sanjeev"),
    ("Vikas", "Vikram"),
    ("Sujesh", "Sujeesh")
]

algorithms = [
    ("SimHash", simhash),
    ("MinHash", minhash),
    ("RandProj", random_projection_lsh),
    ("RandHP", random_hyperplane_hash),
    ("Feature", feature_hash),
    ("Spectral", spectral_hash),
    ("ITQ", itq_hash),
    ("CharHash", character_hash),
    ("N-gram", ngram_hash),
    ("Phonetic", phonetic_hash)
]

# Prepare data for tabulate
table_data = []
headers = ["Name Pair", "SimHash", "MinHash", "RandProj", "RandHP", "Feature", 
           "Spectral", "ITQ", "CharHash", "N-gram", "Phonetic"]

# Track scores for averaging
algo_scores = {algo_name: [] for algo_name, _ in algorithms}

for name1, name2 in similar_pairs:
    pair_name = f"{name1}-{name2}"
    row = [pair_name]
    
    for algo_name, algo_func in algorithms:
        h1 = algo_func(name1)
        h2 = algo_func(name2)
        hamming = hamming_distance(h1, h2)
        similarity = similarity_score(hamming)
        
        algo_scores[algo_name].append(similarity)
        row.append(f"{similarity:.1f}%")
    
    table_data.append(row)

# Add average row
avg_row = ["AVERAGE"]
for algo_name, _ in algorithms:
    avg_score = sum(algo_scores[algo_name]) / len(algo_scores[algo_name])
    avg_row.append(f"{avg_score:.1f}%")

table_data.append(avg_row)

# Display using tabulate
print(tabulate(table_data, headers=headers, tablefmt="grid"))


+-----------------+-----------+-----------+------------+----------+-----------+------------+--------+------------+----------+------------+
| Name Pair       | SimHash   | MinHash   | RandProj   | RandHP   | Feature   | Spectral   | ITQ    | CharHash   | N-gram   | Phonetic   |
| Rajesh-Rajeev   | 84.4%     | 56.2%     | 84.4%      | 84.4%    | 59.4%     | 84.4%      | 96.9%  | 100.0%     | 93.8%    | 90.6%      |
+-----------------+-----------+-----------+------------+----------+-----------+------------+--------+------------+----------+------------+
| Arun-Arjun      | 71.9%     | 71.9%     | 96.9%      | 90.6%    | 46.9%     | 100.0%     | 93.8%  | 93.8%      | 100.0%   | 96.9%      |
+-----------------+-----------+-----------+------------+----------+-----------+------------+--------+------------+----------+------------+
| Varun-Arun      | 87.5%     | 90.6%     | 96.9%      | 100.0%   | 40.6%     | 100.0%     | 96.9%  | 90.6%      | 100.0%   | 93.8%      |
+-----------------+--------