# Find dog names within a given Levenshtein distance of a reference name in the Zurich dogs dataset

## Imports

In [None]:
import itertools
from typing import Iterable

import numpy as np
import pandas as pd
from Levenshtein import distance as levenshtein_distance

## Dataset URL Declaration

In [None]:
# CSV of dog names ("Hundenamen") hosted on data.stadt-zuerich.ch.
url = "https://data.stadt-zuerich.ch/dataset/sid_stapo_hundenamen/download/20210103_hundenamen.csv"

### Download the dataset

In [None]:
# Fetch the CSV at `url` and parse it into a DataFrame.
df = pd.read_csv(url)

In [None]:
# Preview the first rows to check the columns that were loaded.
df.head()

In [None]:
# Number of rows in the loaded table (one per dataset entry).
n_names = df.shape[0]
print(f"n_names: {n_names}")

In [None]:
# Distinct raw entries of the name column; double names are not split here.
unique_names = set(df["HUNDENAME"])
n_unique_names = len(unique_names)
print(f"n_unique_names: {n_unique_names}")

## Reference dog's name

In [None]:
# Reference name for the similarity search.
ref_name = 'Luca'

## Define the Levenshtein distance class

In [None]:
class LevenshteinDistance:
    """Select names lying at an exact Levenshtein distance from a reference name.

    Raw entries may carry alternative names in parentheses, e.g.
    ``"Lucy (Lu)"``. Each entry is split into its individual names before
    the candidates are de-duplicated.

    After construction the instance exposes:
        names: single-column (``'name'``) DataFrame of unique candidates,
            sorted alphabetically.
        ref_name: the name every candidate is compared against.
        distance: the edit distance a candidate must have to be selected.
    """

    def __init__(self, names: Iterable[str], ref_name: str = 'Luca', distance: int = 1):
        """Build the table of unique candidate names.

        Args:
            names: Any iterable of raw name entries (set, list, array,
                Series, ...). Entries that are not strings are skipped.
            ref_name: Name the candidates are compared against.
            distance: Exact Levenshtein distance required for a match.
        """
        self.distance = distance
        self.ref_name = ref_name

        unique_names = set()
        for raw_name in names:
            # Non-string values (e.g. the NaN pandas uses for missing
            # cells) are not names and would break the string handling.
            if isinstance(raw_name, str):
                unique_names.update(self._separate_double_names(raw_name))

        # Sorting makes the row order reproducible between runs; set
        # iteration order is arbitrary.
        self.names = pd.DataFrame(sorted(unique_names), columns=['name'])

    def _separate_double_names(self, name: str) -> list:
        """Split a raw entry into its individual, whitespace-stripped names.

        The text before the first ``(`` is one name, and the content of
        every parenthesised group is another. A missing closing ``)`` is
        tolerated. Empty fragments are dropped.

        Args:
            name: Raw entry such as ``"Lucy (Lu)"`` or ``"Luna"``.

        Returns:
            List of names contained in the entry, possibly empty.
        """
        head, *groups = name.split('(')
        # Keep only the text up to the closing parenthesis of each group.
        pieces = [head] + [group.split(')')[0] for group in groups]
        stripped = (piece.strip() for piece in pieces)
        return [piece for piece in stripped if piece]

    def compute(self, verbose=False):
        """Return the candidates at exactly ``self.distance`` from ``self.ref_name``.

        The comparison is case-insensitive (both sides are lower-cased).

        Args:
            verbose: Accepted for backward compatibility; currently has no
                effect.

        Returns:
            The rows of ``self.names`` whose name matches, with their
            original index labels.
        """
        # Lower-case the reference once instead of once per candidate.
        target = self.ref_name.lower()
        is_match = self.names['name'].map(
            lambda candidate: levenshtein_distance(candidate.lower(), target) == self.distance
        ).astype(bool)  # keeps the mask boolean even when the table is empty

        return self.names[is_match]

## Find similar names

In [None]:
# Distinct raw name entries that are handed to the distance search.
unique_names = set(df["HUNDENAME"])

In [None]:
# Use the notebook-level `ref_name` instead of repeating the literal, so
# changing the reference name in one place changes the search as well.
similar_names = LevenshteinDistance(unique_names, ref_name=ref_name, distance=1).compute()

In [None]:
# Flatten the result table into a plain list holding the first cell of each row.
similar_names = [row[0] for row in similar_names.to_numpy().tolist()]

In [None]:
# Report how many names matched, followed by the matches themselves.
print(
    f"n_similar_names: {len(similar_names)}",
    f"similar_names:\n {similar_names}",
    sep="\n",
)