# Import data

In [4]:
import pandas as pd

# Load the tab-separated export and keep only the address column. The
# intermediate DataFrame is never bound to a name, so there is nothing to
# delete afterwards.
# Excel alternative:
#   pd.read_excel('data.xlsx', sheet_name='PS19A_PERSON__SmplR10K', keep_default_na=False)
addresses = pd.read_csv(
    'data_small.tsv.win.utf8.txt',
    sep='\t',
    keep_default_na=False,
)['person_address']

# Total number of addresses to compare
addressCount = len(addresses)

# Result store: each key is a tuple of two address indexes and each value is
# the Levenshtein ratio of that pair.
# For example (0, 1): 0.5 means that the addresses at index 0 and index 1
# have a Levenshtein ratio of 0.5.
data = {}

# Calculate all Levenshtein ratios and store the ones above a threshold (0.5)

Note that addresses are not compared with themselves, since that always yields a ratio of 1. The data therefore contains no entries like (1, 1): 1 or (2, 2): 1, which would carry no information.

Reverse comparisons are not included either. For example, the data will contain (1, 0): n but not also (0, 1): n, since both would have the same value.

In [5]:
import Levenshtein as lev
import multiprocessing
import threading
import math

# Number of threads to create: leave one core free, but never go below one.
# On a single-core machine cpu_count() - 1 is 0, and a thread count of 0
# can neither be used as a divisor when splitting the addresses nor start
# any worker.
threadCount = max(1, multiprocessing.cpu_count() - 1)

# Addresses handled by each thread: addressCount / threadCount rounded up,
# computed with integer arithmetic only.
threadAddressCount = (addressCount + threadCount - 1) // threadCount

class CalculateDistanceThread(threading.Thread):
    """Thread that computes Levenshtein ratios for one slice of the addresses.

    The thread with ID ``k`` handles the address indexes starting at
    ``threadAddressCount * k`` and covering at most ``threadAddressCount``
    entries, never going past ``addressCount``. Each of those addresses is
    compared with every address that has a lower index, and ratios above
    0.5 are written to the module-level ``data`` dict under the key
    ``(higher_index, lower_index)``.

    Reads the module-level names ``threadAddressCount``, ``addressCount``,
    ``addresses`` and ``lev``.

    NOTE(review): whether several of these threads actually run in parallel
    depends on whether ``lev.ratio`` releases the GIL - confirm.
    """

    def __init__(self, threadID):
        """Store the zero-based ``threadID`` that selects this thread's slice."""
        threading.Thread.__init__(self)
        self.threadID = threadID

    def run(self):
        """Compare every address in this thread's slice with all earlier ones."""
        # Clamp the slice once instead of testing the bound on every iteration;
        # the last slices may be shorter than threadAddressCount or empty.
        start = threadAddressCount * self.threadID
        stop = min(start + threadAddressCount, addressCount)
        for current in range(start, stop):
            # The lowercase form of the current address does not change in
            # the inner loop, so compute it a single time.
            current_address = addresses[current].lower()
            for earlier in range(current):
                ratio = lev.ratio(current_address, addresses[earlier].lower())
                # Only keep pairs above the 0.5 threshold
                if ratio > 0.5:
                    data[(current, earlier)] = ratio

# Create one worker per slice of the address list
threads = [CalculateDistanceThread(n) for n in range(threadCount)]

# Start every worker before waiting on any of them. Joining a thread right
# after starting it would make each worker finish before the next begins.
for t in threads:
    t.start()

# Wait until all workers are done so the results are complete
for t in threads:
    t.join()

del threadCount

# Pretty print the first 8 ratios above the threshold (0.5)

In [6]:
from itertools import islice
from pprint import pprint

# Take only the first 8 stored pairs; islice stops after 8 items instead of
# copying every entry of the dict into a list first.
pprint(list(islice(data.items(), 8)))

[((6, 1), 0.5423728813559322),
 ((10, 4), 0.5373134328358209),
 ((11, 4), 0.5230769230769231),
 ((11, 6), 0.5714285714285715),
 ((11, 7), 0.5230769230769231),
 ((11, 10), 0.5161290322580645),
 ((14, 0), 0.5538461538461539),
 ((14, 1), 0.576271186440678)]
