In [5]:
%pip install datasketch hashlib

[31mERROR: Ignored the following yanked versions: 20081119[0m[31m
[0m[31mERROR: Could not find a version that satisfies the requirement hashlib (from versions: none)[0m[31m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[31mERROR: No matching distribution found for hashlib[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [11]:
import hashlib

# Dataset: Indian first names
indian_names = [
    "Rajesh", "Rajeev", "Suresh", "Mahesh", "Priya", "Priyanka",
    "Arun", "Arjun", "Varun", "Anjali", "Anjalika", "Deepak",
    "Deepa", "Sanjay", "Sanjeev", "Amit", "Amita", "Sujesh", "Sujeesh"
]

def alphabet_hash_v1(text):
    text = text.lower()
    result = []
    
    for i, char in enumerate(text):
        # Create hash from character and position
        hash_input = f"{char}{i}".encode()
        hash_val = int(hashlib.md5(hash_input).hexdigest(), 16)
        # Map to alphabet (a-z)
        letter = chr(ord('a') + (hash_val % 26))
        result.append(letter)
    return ''.join(result)


def alphabet_hash_v2(text):
    text = text.lower()
    result = []
    
    # Vowels and consonants mapping
    vowels = 'aeiou'
    consonants = 'bcdfghjklmnpqrstvwxyz'
    
    for i, char in enumerate(text):
        if char in vowels:
            # Map vowels to vowels deterministically
            hash_val = (ord(char) * (i + 1)) % len(vowels)
            result.append(vowels[hash_val])
        elif char.isalpha():
            # Map consonants to consonants
            hash_val = (ord(char) * (i + 1)) % len(consonants)
            result.append(consonants[hash_val])
        else:
            # Spaces or special chars -> 'x'
            result.append('x')
    
    return ''.join(result)


def alphabet_hash_v3(text):
    """
    Position-aware alphabet hash with semantic preservation
    Uses MD5 but outputs only letters
    """
    text = text.lower()
    result = []
    
    for i, char in enumerate(text):
        # Combine character with its neighbors for context
        context = text[max(0, i-1):min(len(text), i+2)]
        hash_input = f"{context}{i}".encode()
        hash_val = int(hashlib.md5(hash_input).hexdigest()[:4], 16)
        
        # Map to alphabet
        letter = chr(ord('a') + (hash_val % 26))
        result.append(letter)
    
    return ''.join(result)


def alphabet_hash_v4(text):
    """
    Phonetic-aware alphabet hash
    Preserves some phonetic properties
    """
    text = text.lower()
    result = []
    
    # Phonetic groups
    phonetic_map = {
        'a': 'aeiou', 'e': 'aeiou', 'i': 'aeiou', 'o': 'aeiou', 'u': 'aeiou',
        'b': 'bpv', 'p': 'bpv', 'v': 'bpv',
        'd': 'dt', 't': 'dt',
        'k': 'kgc', 'g': 'kgc', 'c': 'kgc',
        'r': 'rl', 'l': 'rl',
        's': 'sz', 'z': 'sz',
        'n': 'mn', 'm': 'mn',
        'j': 'jy', 'y': 'jy'
    }
    
    for i, char in enumerate(text):
        if char in phonetic_map:
            group = phonetic_map[char]
            # Deterministic selection within phonetic group
            hash_val = (ord(char) + i) % len(group)
            result.append(group[hash_val])
        elif char.isalpha():
            # Default fallback
            result.append(chr(ord('a') + ((ord(char) - ord('a') + i) % 26)))
        else:
            result.append('x')
    
    return ''.join(result)


# Test all alphabet hash functions
print("Alphabet-Only Hash Functions (Length Preserved)")
print("=" * 100)
print(f"{'Name':<12} {'Length':<8} {'Hash-V1':<12} {'Hash-V2':<12} {'Hash-V3':<12} {'Hash-V4':<12}")
print("-" * 100)

for name in indian_names:
    h1 = alphabet_hash_v1(name)
    h2 = alphabet_hash_v2(name)
    h3 = alphabet_hash_v3(name)
    h4 = alphabet_hash_v4(name)
    
    print(f"{name:<12} {len(name):<8} {h1:<12} {h2:<12} {h3:<12} {h4:<12}")



Alphabet-Only Hash Functions (Length Preserved)
Name         Length   Hash-V1      Hash-V2      Hash-V3      Hash-V4     
----------------------------------------------------------------------------------------------------
Rajesh       6        qmdohh       mufult       yvsbbo       rojuzm      
Rajeev       6        qmdogd       mufuat       yvsblp       rojuab      
Suresh       6        lmsohh       nujult       hzzabo       zoruzm      
Mahesh       6        mmtohh       guxult       utgibo       nojuzm      
Priya        5        lqzxv        kxaca        dprza        plije       
Priyanka     8        lqzxvoeu     kxacamse     dprzprsi     plijencu    
Arun         4        kqxx         ixez         zdvo         ilun        
Arjun        5        kqdkt        ixfog        zhjqb        iljam       
Varun        5        amskt        rujog        iqhob        poram       
Anjali       6        kcdwss       infota       byzcie       injara      
Anjalika     8        kcdwsseu     in

In [12]:
import hashlib
import math

# Dataset: Indian first names
indian_names = [
    "Rajesh", "Rajeev", "Suresh", "Mahesh", "Priya", "Priyanka",
    "Arun", "Arjun", "Varun", "Anjali", "Anjalika", "Deepak",
    "Deepa", "Sanjay", "Sanjeev", "Amit", "Amita", "Vikram",
    "Vikas", "Rahul"
]

# 1. SimHash - Most common semantic hashing algorithm
def simhash(text, hash_bits=32):
    """
    SimHash: Semantic hashing for near-duplicate detection
    Similar texts produce similar hash codes (low Hamming distance)
    Stateless, deterministic
    """
    text = text.lower()
    # Create features (character bigrams)
    features = [text[i:i+2] for i in range(len(text) - 1)]
    
    # Initialize vector
    v = [0] * hash_bits
    
    # Weight each feature
    for feature in features:
        # Hash the feature
        h = int(hashlib.md5(feature.encode()).hexdigest(), 16)
        
        # Update vector based on hash bits
        for i in range(hash_bits):
            if h & (1 << i):
                v[i] += 1
            else:
                v[i] -= 1
    
    # Generate fingerprint
    fingerprint = 0
    for i in range(hash_bits):
        if v[i] > 0:
            fingerprint |= (1 << i)
    
    # Return as binary string
    return format(fingerprint, f'0{hash_bits}b')


# 2. Random Projection Hash (LSH-based)
def random_projection_hash(text, num_projections=32, seed=42):
    """
    Random Projection Hash: Projects features onto random hyperplanes
    Preserves cosine similarity in Hamming space
    Stateless when seed is fixed
    """
    text = text.lower()
    # Create feature vector from character counts
    feature_vec = [text.count(chr(ord('a') + i)) for i in range(26)]
    
    # Generate hash
    hash_bits = []
    for i in range(num_projections):
        # Deterministic random projection using seed
        projection = sum(
            feature_vec[j] * hash((seed, i, j)) % 3 - 1
            for j in range(len(feature_vec))
        )
        hash_bits.append('1' if projection > 0 else '0')
    
    return ''.join(hash_bits)


# 3. MinHash-based Semantic Hash
def minhash_semantic(text, num_hashes=32):
    """
    MinHash: Estimates Jaccard similarity between sets
    Used for semantic similarity via set-based representation
    Stateless
    """
    text = text.lower()
    # Create shingles (character n-grams)
    shingles = set(text[i:i+2] for i in range(len(text) - 1))
    
    hash_values = []
    for i in range(num_hashes):
        min_hash = float('inf')
        for shingle in shingles:
            h = int(hashlib.md5(f"{shingle}{i}".encode()).hexdigest()[:8], 16)
            min_hash = min(min_hash, h)
        # Convert to bit
        hash_values.append('1' if (min_hash % 2) == 0 else '0')
    
    return ''.join(hash_values)


# 4. Feature Hash (Simple Semantic)
def feature_hash(text, hash_bits=32):
    """
    Feature Hashing: Maps text features to binary code
    Preserves structural similarity
    Stateless
    """
    text = text.lower()
    
    # Extract semantic features
    features = {
        'length': len(text),
        'vowels': sum(text.count(v) for v in 'aeiou'),
        'consonants': sum(text.count(c) for c in 'bcdfghjklmnpqrstvwxyz'),
        'first_char': ord(text[0]) if text else 0,
        'last_char': ord(text[-1]) if text else 0,
        'unique_chars': len(set(text)),
    }
    
    # Create hash from features
    hash_input = ''.join(str(v) for v in features.values())
    h = int(hashlib.md5(hash_input.encode()).hexdigest(), 16)
    
    return format(h % (2**hash_bits), f'0{hash_bits}b')


# 5. Spectral Hash (Simplified)
def spectral_hash_simple(text, hash_bits=32):
    """
    Spectral Hash: Uses spectral graph theory principles
    Simplified version using character frequency spectrum
    Stateless
    """
    text = text.lower()
    
    # Create frequency spectrum
    freq = [0] * 26
    for char in text:
        if 'a' <= char <= 'z':
            freq[ord(char) - ord('a')] += 1
    
    # Apply simple spectral transformation
    hash_bits_list = []
    for i in range(hash_bits):
        # Use different frequency combinations
        idx1 = (i * 7) % 26
        idx2 = (i * 13) % 26
        value = freq[idx1] - freq[idx2]
        hash_bits_list.append('1' if value > 0 else '0')
    
    return ''.join(hash_bits_list)


# Compute all hashes
print("SEMANTIC HASHING ALGORITHMS (Stateless)")
print("=" * 120)
print(f"{'Name':<12} {'SimHash (32-bit)':<35} {'RandomProj':<35} {'MinHash':<35}")
print("-" * 120)

for name in indian_names:
    h1 = simhash(name, 32)
    h2 = random_projection_hash(name, 32)
    h3 = minhash_semantic(name, 32)
    
    print(f"{name:<12} {h1:<35} {h2:<35} {h3:<35}")

print("\n" + "=" * 120)

SEMANTIC HASHING ALGORITHMS (Stateless)
Name         SimHash (32-bit)                    RandomProj                          MinHash                            
------------------------------------------------------------------------------------------------------------------------
Rajesh       01010111110010101100100010000110    00000000000000000000000000000000    11101110001010101100110110100000   
Rajeev       01000111010110101100100010011110    00000000000000000000000000000000    11010111010111000110010110110001   
Suresh       11011000111001001111110010110110    00000000000000000000000000000000    11000010101110110101100111111100   
Mahesh       01010111101001011100111010100000    00000000000000000000000000000000    11100010101110100001111010101000   
Priya        00010000001000100001010111000000    00000000000000000000000000000000    00010001111001011101100100110011   
Priyanka     01111110011110110101110111101000    00000000000000000000000000000000    0111010001100101110111010011

In [15]:
import hashlib

indian_names = [
    "Rajesh", "Rajeev", "Suresh", "Mahesh", "Priya", "Priyanka",
    "Arun", "Arjun", "Varun", "Anjali", "Anjalika", "Deepak",
    "Deepa", "Sanjay", "Sanjeev", "Amit", "Amita", "Vikram",
    "Vikas", "Rahul"
]

def simhash(text, hash_bits=32):
    """SimHash returning binary string representation"""
    text = text.lower()
    features = [text[i:i+2] for i in range(len(text) - 1)]
    v = [0] * hash_bits
    
    for feature in features:
        h = int(hashlib.md5(feature.encode()).hexdigest(), 16)
        for i in range(hash_bits):
            if h & (1 << i):
                v[i] += 1
            else:
                v[i] -= 1
    
    fingerprint = 0
    for i in range(hash_bits):
        if v[i] > 0:
            fingerprint |= (1 << i)
    
    # Return as binary string
    return format(fingerprint, f'0{hash_bits}b')

# Show different output formats for the same hash
print("32-BIT HASH OUTPUT FORMATS")
print("=" * 90)
print(f"{'Name':<12} {'Binary (32-bit)':<35} {'Hex':<12} {'Integer':<12}")
print("-" * 90)

for name in indian_names[:10]:  # Show first 10 for clarity
    binary = simhash(name, 32)
    hex_val = format(int(binary, 2), '08x')
    int_val = int(binary, 2)
    
    print(f"{name:<12} {binary:<35} {hex_val:<12} {int_val:<12}")

print("\n" + "=" * 90)


# Compare similar names
name1, name2 = "Rajesh", "Rajeev"
h1_bin = simhash(name1, 32)
h2_bin = simhash(name2, 32)
h1_hex = format(int(h1_bin, 2), '08x')
h2_hex = format(int(h2_bin, 2), '08x')

print(f"{name1:<10} (binary): {h1_bin}")
print(f"{name2:<10} (binary): {h2_bin}")
print()
print(f"{name1:<10} (hex):    {h1_hex}")
print(f"{name2:<10} (hex):    {h2_hex}")
print()

# Calculate Hamming distance
hamming = sum(b1 != b2 for b1, b2 in zip(h1_bin, h2_bin))
print(f"Hamming Distance: {hamming} bits out of 32")
print(f"Similarity: {(32 - hamming) / 32 * 100:.1f}%")

print("\n" + "=" * 90)
print("\nRECOMMENDATION:")
print("- Use BINARY STRINGS for analysis, visualization, and understanding")
print("- Use HEX for compact storage and display")
print("- Use INTEGER for mathematical operations and indexing")
print("=" * 90)

32-BIT HASH OUTPUT FORMATS
Name         Binary (32-bit)                     Hex          Integer     
------------------------------------------------------------------------------------------
Rajesh       01010111110010101100100010000110    57cac886     1472907398  
Rajeev       01000111010110101100100010011110    475ac89e     1197131934  
Suresh       11011000111001001111110010110110    d8e4fcb6     3638885558  
Mahesh       01010111101001011100111010100000    57a5cea0     1470484128  
Priya        00010000001000100001010111000000    102215c0     270669248   
Priyanka     01111110011110110101110111101000    7e7b5de8     2122014184  
Arun         10110010010000010110111010000010    b2416e82     2990632578  
Arjun        11000000000000010010111000000100    c0012e04     3221302788  
Varun        10110000010000010110100000000010    b0416802     2957076482  
Anjali       01001111010111100010011111100000    4f5e27e0     1331570656  

Rajesh     (binary): 01010111110010101100100010000110
Ra