# Implémentation des métriques de similarité

In [None]:
# Distance de Hamming
def hamming_distance(s, t):
    if len(s) != len(t):
        raise ValueError("Les deux chaînes doivent avoir la même longueur.")

    distance = 0
    for a, b in zip(s, t):
        if a != b:
            distance += 1
    return distance


In [1]:
# Distance de Jaro

def jaro_distance(s, t):
    if s == t:
        return 1.0

    len_s, len_t = len(s), len(t)
    if len_s == 0 or len_t == 0:
        return 0.0

    max_dist = max(len_s, len_t) // 2 - 1

    match_s = [False] * len_s
    match_t = [False] * len_t

    matches = 0
    transpositions = 0

    # 1. Chercher les correspondances
    for i in range(len_s):
        start = max(0, i - max_dist)
        end = min(len_t - 1, i + max_dist)

        for j in range(start, end + 1):
            if s[i] == t[j] and not match_t[j]:
                match_s[i] = True
                match_t[j] = True
                matches += 1
                break

    if matches == 0:
        return 0.0

    # 2. Compter les transpositions
    k = 0
    for i in range(len_s):
        if match_s[i]:
            while not match_t[k]:
                k += 1
            if s[i] != t[k]:
                transpositions += 1
            k += 1

    transpositions /= 2

    # 3. Formule du Jaro
    return (1/3) * (
        matches / len_s +
        matches / len_t +
        (matches - transpositions) / matches
    )


In [4]:
# Distance de Jaro-Winkler

def jaro_winkler(texte1, texte2, bonus_prefixe=0.1):
    """
    Jaro-winkler.
    """
    score_jaro = jaro_distance(texte1, texte2)
    
    # Calculer la longueur du préfixe commun (max 4 caractères)
    longueur_prefixe = 0
    for i in range(min(len(texte1), len(texte2), 4)):
        if texte1[i] == texte2[i]:
            longueur_prefixe += 1
        else:
            break
    
    # Appliquer le bonus : plus le préfixe est long, plus le bonus est important
    bonus = longueur_prefixe * bonus_prefixe * (1 - score_jaro)
    score_final = score_jaro + bonus
    
    return min(score_final, 1.0)

In [5]:
# Jaccard similarity

def jaccard_similarity(set1, set2):
    """
    Calcule la similarité de Jaccard entre deux ensembles.
    
    Args:
        set1 (set): Premier ensemble
        set2 (set): Deuxième ensemble
    
    Returns:
        float: Similarité entre 0.0 (différent) et 1.0 (identique)
    """
    # Cas où les deux ensembles sont vides
    if len(set1) == 0 and len(set2) == 0:
        return 1.0
    
    # Calculer l'intersection et l'union
    intersection = set1.intersection(set2)
    union = set1.union(set2)
    
    # Éviter la division par zéro
    if len(union) == 0:
        return 0.0
    
    # Retourner la similarité
    return len(intersection) / len(union)


def jaccard_distance(set1, set2):
    """
    Calcule la distance de Jaccard (1 - similarité).
    """
    similarity = jaccard_similarity(set1, set2)
    return 1 - similarity


In [6]:
import math


def cosine_similarity(a, b):
    # Produit scalaire: O(n)
    dot_product = sum(a_i * b_i for a_i, b_i in zip(a, b))
    
    # Norme a: O(n)
    norm_a = math.sqrt(sum(a_i ** 2 for a_i in a))
    
    # Norme b: O(n)
    norm_b = math.sqrt(sum(b_i ** 2 for b_i in b))
    
    # Division: O(1)
    return dot_product / (norm_a * norm_b)

In [None]:
def levenshtein(s1: str, s2: str) -> int:
    m, n = len(s1), len(s2)

    # Création de la matrice
    D = [[0] * (n + 1) for _ in range(m + 1)]

    # Initialisation
    for i in range(m + 1):
        D[i][0] = i
    for j in range(n + 1):
        D[0][j] = j

    # Remplissage
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            cost = 0 if s1[i - 1] == s2[j - 1] else 1
            D[i][j] = min(
                D[i - 1][j] + 1,      # suppression
                D[i][j - 1] + 1,      # insertion
                D[i - 1][j - 1] + cost  # substitution
            )

    return D[m][n]


In [12]:
[x**2 for x in range(10)]

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

In [22]:
#### Similarité de Hamming

def hamming(chaine1: str, chaine2: str) -> int:
    """Implements the hamming similarity.

    Args:
        chaine1 (str): The first string to compare.
        chaine2 (str): The second string to compare.

    Returns:
        int: The value of the hamming similarity.
    """
    if len(chaine1) != len(chaine2):
        raise ValueError("String length differs")
    distance = 0
    for c1, c2 in zip(chaine1, chaine2):
        if c1 != c2:
            distance += 1
    return distance


def hamming_pythonique(chaine1: str, chaine2: str):
    """Implements the hamming similarity.

    Args:
        chaine1 (str): The first string to compare.
        chaine2 (str): The second string to compare.

    Returns:
        int: The value of the hamming similarity.
    """
    assert len(chaine1) == len(chaine2), "String length differs"
    return sum([c1 != c2 for c1, c2 in zip(chaine1, chaine2)])


print(hamming("chat", "chut"))

print(hamming_pythonique("chat", "chure"))

1


AssertionError: String length differs