## Create encoding and similarity functions

In [8]:
# Import numpy and scipy for sparse matrices
import numpy as np
from scipy import sparse

# Supported characters, grouped as letters, symbols, digits and the space.
# A character's position in this list is its row/column index in the table.
alphabet = [
    *"abcdefghijklmnopqrstuvwxyz",
    *"./-*_@£§&#",
    *"1234567890",
    " ",
]

# One "neuron" per supported character
n_neurons = len(alphabet)

# Define a function that encodes a string into a sparse table
def encode_string(string):
    """Encode a string as a sparse table of adjacent-character transitions.

    The string is lowercased, then every pair of consecutive characters
    (prev, curr) sets the cell at row ``alphabet.index(prev)`` and column
    ``alphabet.index(curr)`` to 1. Repeated pairs do not accumulate: the
    table only records whether a transition occurs.

    Args:
        string: Text to encode. If the lowercased text has fewer than two
            characters it contains no pair, so an all-zero table is returned
            and its characters are not checked against the alphabet.

    Returns:
        A scipy.sparse COO matrix of shape (n_neurons, n_neurons) holding 1
        for each transition present in the string.

    Raises:
        ValueError: If ``string`` is not a ``str``, or if a string with at
            least two characters contains a character that is not in
            ``alphabet``.
    """
    # Check if input is a string
    if not isinstance(string, str):
        raise ValueError("Input must be a string")

    # Build the character -> index lookup once per call so every character
    # is resolved with a single dictionary access instead of two list scans.
    # setdefault keeps the first position, matching what list.index returns.
    index_of = {}
    for position, char in enumerate(alphabet):
        index_of.setdefault(char, position)

    # Initialize an empty sparse matrix of shape (n_neurons, n_neurons)
    table = sparse.lil_matrix((n_neurons, n_neurons), dtype=int)

    # Make the string lowercase
    string = string.lower()

    # No adjacent pair exists, so there is nothing to validate or set
    if len(string) < 2:
        return table.tocoo()

    # Resolve every character once, naming the exact offender on failure
    indices = []
    for char in string:
        if char not in index_of:
            raise ValueError(f"Character '{char}' not in alphabet")
        indices.append(index_of[char])

    # Mark each (previous, current) transition in the table
    for prev_idx, curr_idx in zip(indices, indices[1:]):
        table[prev_idx, curr_idx] = 1

    # Return the table as a sparse coo matrix
    return table.tocoo()

def similarity_score(str1, str2):
    """Return the share of str1's adjacent-character pairs also found in str2.

    Both strings are encoded with ``encode_string``; the non-zero cells of
    each table are turned back into (previous, current) character pairs. The
    score is the number of pairs common to both strings divided by the number
    of distinct pairs in ``str1``. Only ``str1`` contributes to the
    denominator, so swapping the arguments can change the result.

    Args:
        str1: Reference string whose pairs define the denominator.
        str2: String compared against ``str1``.

    Returns:
        The score as a float between 0.0 and 1.0.

    Raises:
        ValueError: If either input is not a ``str``, if ``encode_string``
            rejects one of the inputs, or if ``str1`` yields no pairs.
    """

    def char_pairs(table):
        # Map every non-zero (row, col) cell back to its two characters
        return {(alphabet[r], alphabet[c]) for r, c in zip(*table.nonzero())}

    # Reject non-string inputs up front
    if not (isinstance(str1, str) and isinstance(str2, str)):
        raise ValueError("Both inputs must be strings")

    # Encode both strings before anything else is computed
    try:
        first_table = encode_string(str1)
        second_table = encode_string(str2)
    except ValueError as e:
        raise ValueError("Error encoding string: " + str(e))

    reference_pairs = char_pairs(first_table)
    other_pairs = char_pairs(second_table)

    # Without any reference pair the ratio is undefined
    if not reference_pairs:
        raise ValueError("Cardinality is zero, cannot calculate similarity score")

    shared = reference_pairs & other_pairs
    return float(len(shared) / len(reference_pairs))

## Examples

In [10]:
import pandas as pd

# Pairs of strings to compare, each given as (string_1, string_2)
test_cases = [
    ("Microsoft", "Microsft"),                # typo in a company name
    ("Apple Inc.", "Apple"),                  # full vs. short company name
    ("Google", "Alphabet"),                   # company vs. its parent company
    ("Barack Obama", "Barak Obama"),          # typo in a person's name
    ("John F. Kennedy", "JFK"),               # full name vs. initials
    ("New York", "New York City"),            # city name vs. longer form
    ("Python", "Pyhton"),                     # typo in a language name
    ("Machine Learning", "Machin Learning"),  # typo in a field name
    ("Artificial Intelligence", "AI"),        # field name vs. abbreviation
    ("GitHub", "GitHUb"),                     # typo that differs only by case
]

# Score every pair, in the same order as test_cases
scores = [similarity_score(*pair) for pair in test_cases]

# Put the pairs and their scores side by side in one table
df = pd.DataFrame(test_cases, columns=["string_1", "string_2"]).assign(
    similarity_score=scores
)

print(df)

                  string_1         string_2  similarity_score
0                Microsoft         Microsft          0.750000
1               Apple Inc.            Apple          0.444444
2                   Google         Alphabet          0.000000
3             Barack Obama      Barak Obama          0.800000
4          John F. Kennedy              JFK          0.000000
5                 New York    New York City          1.000000
6                   Python           Pyhton          0.400000
7         Machine Learning  Machin Learning          0.857143
8  Artificial Intelligence               AI          0.000000
9                   GitHub           GitHUb          1.000000
