# Import data

In [7]:
import pandas as pd

# Load the sample sheet and keep only the address column. The DataFrame is
# never bound to a name, so there is no leftover variable to delete.
# keep_default_na=False stops pandas from turning its default NaN markers
# (such as empty cells) into NaN.
addresses = pd.read_excel(
    'data.xlsx',
    sheet_name='PS19A_PERSON__SmplR10K',
    keep_default_na=False,
)['person_address']
# Alternative input: the tab-separated export of the same data
#data = pd.read_csv('data.tsv.win.utf8.txt', sep='\t', keep_default_na=False)

# Total number of addresses to compare
addressCount = len(addresses)

# Result store: each key is a tuple of two address indexes and each value is
# the Levenshtein ratio of that pair.
# For example (0, 1): 0.5 means that the addresses with index 0 and index 1
# have a Levenshtein ratio of 0.5
data = dict()

# Calculate all Levenshtein ratios and store the ones above a threshold (0.5)

In [8]:
import Levenshtein as lev
import multiprocessing
import threading
import math

# Number of threads to create: one fewer than the reported CPU count.
# On a single-CPU machine that would be 0 and the division below would
# raise ZeroDivisionError, so always keep at least one thread.
threadCount = max(1, multiprocessing.cpu_count() - 1)

# Amount of addresses to use per thread, rounded up so that
# threadCount * threadAddressCount covers every address
threadAddressCount = int(math.ceil(float(addressCount) / threadCount))

class CalculateDistanceThread(threading.Thread):
    """Worker thread that scores one contiguous slice of the addresses.

    The thread with id ``threadID`` handles the address indexes starting at
    ``threadAddressCount * threadID``. Each of those addresses is compared,
    case-insensitively, with every address, and every pair whose Levenshtein
    ratio is above 0.5 is written to the module-level ``data`` dictionary
    under the key ``(index_in_slice, other_index)``.
    """

    def __init__(self, threadID):
        """Remember which slice (zero-based) this thread has to process."""
        threading.Thread.__init__(self)
        self.threadID = threadID

    def run(self):
        """Compute and store all above-threshold ratios for this slice."""
        # Calculate the start index
        start = threadAddressCount * self.threadID
        # Clamp the end so the last slice never runs past the final address
        stop = min(start + threadAddressCount, addressCount)
        for i in range(start, stop):
            # Lowercase the left-hand address once per row, not once per pair
            left = addresses[i].lower()
            for j in range(addressCount):
                ratio = lev.ratio(left, addresses[j].lower())
                # Only pairs above the 0.5 threshold are kept
                if ratio > 0.5:
                    data[(i, j)] = ratio

# Create n threads. All of them are started before any is joined: joining
# inside the start loop makes each thread wait for the previous one to
# finish, so the slices would be processed one after another.
threads = [CalculateDistanceThread(n) for n in range(threadCount)]
for t in threads:
    t.start()

# Wait until every slice has been processed. Entries are added to `data`
# in the order the threads produce them, so its iteration order can vary
# between runs.
for t in threads:
    t.join()

del threadCount

# Pretty print the first 8 ratios above the threshold (0.5)

In [9]:
from itertools import islice
from pprint import pprint

# Take the first 8 entries straight from the dictionary's items instead of
# copying every item into a list only to slice it.
pprint(list(islice(data.items(), 8)))

[((0, 0), 1.0),
 ((0, 918), 0.5952380952380952),
 ((1, 1), 1.0),
 ((1, 3), 0.5755395683453237),
 ((1, 7), 0.5584415584415584),
 ((1, 131), 0.6015037593984962),
 ((1, 531), 0.7972972972972973),
 ((1, 608), 0.5714285714285715)]
