In [1]:
import pandas as pd
import numpy as np

## Load Dataset

In [2]:
# Read the dog-name CSV into a DataFrame and print it.
data = pd.read_csv(filepath_or_buffer="20210103_hundenamen.csv")
print(data)

                         HUNDENAME  GEBURTSJAHR_HUND GESCHLECHT_HUND
0                            Ituma              2011               w
1         "Bo" Bendy of Treegarden              2020               m
2            "Bobby" Lord Sinclair              2009               m
3       "Buddy" Fortheringhay's J.              2011               m
4     "Fly" Showring i fly for you              2015               w
...                            ...               ...             ...
8569                     unbekannt              2010               w
8570                     unbekannt              2011               m
8571                     unbekannt              2018               m
8572                     unbekannt              2018               m
8573                     unbekannt              2017               m

[8574 rows x 3 columns]


## Check the data values

In [3]:
# Spot-check the name column for row labels 150 through 157 (label slicing is inclusive).
print(data.loc[150:157, 'HUNDENAME'])

150                           Alfie
151                          Alfred
152                         Alfredo
153    Algarvio (Casa Rastys Amigo)
154                             Ali
155                      Ali (Ally)
156                           Aliah
157                           Alice
Name: HUNDENAME, dtype: object


In [4]:
# Count the missing values in each column before working with the names.
nan_counts = data.isna().sum()
print("Check for Nan values:\n", nan_counts)

Check for Nan values:
 HUNDENAME           0
GEBURTSJAHR_HUND    0
GESCHLECHT_HUND     0
dtype: int64


In [5]:
# Count the rows whose name is exactly the string 'unbekannt'.
unbekannt_mask = data['HUNDENAME'] == 'unbekannt'
print("The word 'unbekannt' appears {} times in the dataset.".format(int(unbekannt_mask.sum())))

The word 'unbekannt' appears 5 times in the dataset.


## Prepare the dogs list

In [12]:
# Pull the name column out of the DataFrame as a flat array of names.
name_column = data['HUNDENAME']
dogs = name_column.values
print("Total dog names:", dogs.size)

Total dog names: 8574


## Calculate the Levenshtein distance

The Levenshtein distance is a metric that indicates how dissimilar two text sequences are. In particular, it measures the minimum total cost of the edits needed to transform one word, "word_a", into another, "word_b". Each edit is a deletion, an insertion, or a substitution. In the variant of the Levenshtein distance used here, a deletion or an insertion costs 1, whereas a substitution costs 2 (the same as one deletion followed by one insertion).

The advantage of this metric is that we can compare words with unequal lengths. 

In [7]:
def levenshtein(word_1, word_2, substitution_cost=2):
    """Return the weighted edit distance between two sequences.

    The distance is the minimum total cost of the deletions, insertions and
    substitutions needed to turn ``word_1`` into ``word_2``. Deletions and
    insertions cost 1 each; a substitution costs ``substitution_cost``.

    Parameters
    ----------
    word_1, word_2 : str
        The sequences to compare. They may have different lengths, and
        either (or both) may be empty.
    substitution_cost : int, optional
        Cost of replacing one character with a different one. Defaults to 2,
        the value this notebook uses; pass 1 for unit-cost substitutions.

    Returns
    -------
    int
        The edit distance; 0 when the two sequences are equal.
    """
    rows = len(word_1) + 1
    cols = len(word_2) + 1

    # distance[i][j] holds the cost of turning the first i characters of
    # word_1 into the first j characters of word_2.
    distance = [[0] * cols for _ in range(rows)]

    # Borders: reaching the empty prefix takes i deletions, and building a
    # j-character prefix from nothing takes j insertions. The two borders are
    # filled independently so each is set even when the other word is empty.
    for i in range(1, rows):
        distance[i][0] = i
    for j in range(1, cols):
        distance[0][j] = j

    # Fill the interior; every cell depends only on its upper, left and
    # upper-left neighbours, which have already been computed.
    for row in range(1, rows):
        for col in range(1, cols):
            if word_1[row - 1] == word_2[col - 1]:
                cost = 0
            else:
                cost = substitution_cost

            distance[row][col] = min(
                distance[row - 1][col] + 1,         # Cost of a deletion
                distance[row][col - 1] + 1,         # Cost of an insertion
                distance[row - 1][col - 1] + cost,  # Cost of a substitution (or a match)
            )

    # Index the final cell explicitly instead of relying on leftover loop
    # variables, which are undefined when either word is empty.
    return distance[rows - 1][cols - 1]


## Test algorithm for the given word "Luca"

In [13]:
# Report every dog name that lies exactly one edit away from the target word.
target_word = "Luca"
for name in dogs:
    dist = levenshtein(name, target_word)
    if dist != 1:
        continue
    print("Distance is", dist, "for word:", name)
   

Distance is 1 for word: Lua
Distance is 1 for word: Lua
Distance is 1 for word: Lua
Distance is 1 for word: Lua
Distance is 1 for word: Lua
Distance is 1 for word: Lua
Distance is 1 for word: Lucas
Distance is 1 for word: Lucia


## Re-run algorithm after removing duplicates

In [14]:
# Drop duplicate names while keeping the order of first appearance.
# dict.fromkeys keeps insertion order, so the result (and anything printed
# from it later) is the same on every run; iterating a set of strings can
# yield a different order each time the kernel is restarted.
dogs = list(dict.fromkeys(dogs))
print("Total dog names after duplicates deletion:", len(dogs))

Total dog names after duplicates deletion: 5192


In [16]:
# Search the current list of names for those exactly one edit away from "Luca".
wanted_distance = 1
for candidate in dogs:
    dist = levenshtein(candidate, "Luca")
    if dist != wanted_distance:
        continue
    print("Distance is", dist, "for word:", candidate)

Distance is 1 for word: Lucas
Distance is 1 for word: Lucia
Distance is 1 for word: Lua
