# Correspondencia en direcciones postales

In [1]:
import random
import string
import numpy as np
import tensorflow as tf

In [27]:
# Number of synthetic address records to generate.
n = 10

# Vocabularies from which street names and street-type prefixes are drawn.
street_names = ["diagon", "elm", "abbey", "vallejo", "batiz"]
street_type = ["callejon", "calle", "carretera", "via", "avenida"]

# Pool of five random ZIP codes in [20000, 29999], followed by one random
# house number in [1, 999] per record.
street_zips = [random.randint(20000, 29999) for _i in range(5)]
numbers = [random.randint(1, 999) for _i in range(n)]

In [28]:
# Draw, for each of the n records, a street name, a street-type prefix and a
# ZIP code (in that order, one full pass per list).
streets = [random.choice(street_names) for _i in range(n)]
street_pref = [random.choice(street_type) for _i in range(n)]
zips = [random.choice(street_zips) for _i in range(n)]

# Assemble "<prefix> <name> <number>" address strings.
full_streets = [
    " ".join((prefix, name, str(number)))
    for prefix, name, number in zip(street_pref, streets, numbers)
]

# Pair every address with its ZIP code as a two-element list.
reference_data = [
    [address, zip_code] for address, zip_code in zip(full_streets, zips)
]

In [29]:
# Show the generated reference records (one [address, zip] pair per row).
reference_data

[['avenida batiz 506', 24070],
 ['avenida elm 846', 24198],
 ['calle diagon 414', 20545],
 ['callejon vallejo 505', 24198],
 ['calle elm 819', 21707],
 ['avenida diagon 788', 24198],
 ['callejon abbey 395', 24070],
 ['calle batiz 496', 27680],
 ['calle abbey 890', 21707],
 ['callejon batiz 885', 24070]]

In [30]:
def create_typo(s, prob=0.75):
    """Return `s` with at most one character replaced by a random lowercase letter.

    With probability `prob`, one position of `s` is chosen uniformly at random
    and overwritten with a letter drawn from `string.ascii_lowercase` (the new
    letter may happen to equal the original one). Otherwise `s` is returned
    unchanged.

    Args:
        s: Input string.
        prob: Probability, in [0, 1], of introducing a typo. Defaults to 0.75.

    Returns:
        A string with the same length as `s`.
    """
    # An empty input has no position to corrupt; return it untouched instead
    # of failing while picking an index from an empty range.
    if len(s) > 0 and random.uniform(0, 1) < prob:
        chars = list(s)
        rand_idx = random.randrange(len(chars))
        chars[rand_idx] = random.choice(string.ascii_lowercase)
        s = ''.join(chars)
    return s

In [31]:
# Run every sampled street name through create_typo (default typo probability).
typo_streets = list(map(create_typo, streets))

In [32]:
# Rebuild the address strings, this time with the possibly misspelled names.
typo_full_streets = [
    " ".join((prefix, name, str(number)))
    for prefix, name, number in zip(street_pref, typo_streets, numbers)
]

# Pair each address with the same ZIP codes used for the reference records.
test_data = [
    [address, zip_code] for address, zip_code in zip(typo_full_streets, zips)
]
test_data

[['avenida patiz 506', 24070],
 ['avenida elb 846', 24198],
 ['calle diwgon 414', 20545],
 ['callejon vallejo 505', 24198],
 ['calle elm 819', 21707],
 ['avenida diagon 788', 24198],
 ['callejon abbey 395', 24070],
 ['calle batiz 496', 27680],
 ['calle abbey 890', 21707],
 ['callejon baxiz 885', 24070]]

In [33]:
# Create a TensorFlow session object (TF 1.x-style tf.Session API).
session = tf.Session()

In [34]:
# Graph inputs for the query ("test") record: the address as a sparse string
# tensor and the ZIP code as a float tensor of shape [None, 1].
test_address = tf.sparse_placeholder(dtype = tf.string)
test_zip = tf.placeholder(shape = [None, 1], dtype=tf.float32)

# Graph inputs for the reference set: sparse string addresses and a float ZIP
# tensor with n columns (n is the record count defined earlier in the notebook).
ref_address = tf.sparse_placeholder(dtype = tf.string)
ref_zip = tf.placeholder(shape=[None, n], dtype=tf.float32 )

In [35]:
# Squared difference between the reference ZIPs and the test ZIP. The
# placeholders are declared as [None, n] and [None, 1], so the test value
# broadcasts across the n reference columns.
zip_dist = tf.square(tf.subtract(ref_zip, test_zip))
# Edit distance with test_address as hypothesis and ref_address as truth;
# normalize=True requests a length-normalized distance (per the
# tf.edit_distance docs, normalized by the length of the truth sequence).
address_dist = tf.edit_distance(test_address, ref_address, normalize=True)

- $S(x,y) = 0$ si $x$ e $y$ son totalmente diferentes (no se parecen en nada)
- $S(x,x) = 1$, ya que todo objeto es similar (si no igual) a sí mismo.
- $S(x,y) = \frac{D - d(x,y)}{D-d}$ donde $D$ es la mayor distancia entre dos objetos posibles, y $d$ es la menor.

In [36]:
# Largest and smallest squared ZIP distance, picked out of the squeezed
# distance tensor using the argmax/argmin index taken along axis 1.
# NOTE(review): squeezing and then gathering by a per-row index looks like it
# assumes a single test row per run — confirm before feeding larger batches.
zip_max = tf.gather(tf.squeeze(zip_dist), tf.argmax(zip_dist, 1))
zip_min = tf.gather(tf.squeeze(zip_dist), tf.argmin(zip_dist, 1))
# Min-max similarity: (max - dist) / (max - min).
# NOTE(review): if max == min (all reference ZIPs equally distant) the
# denominator is zero — confirm this cannot occur with the data fed in.
zip_sim = tf.divide(tf.subtract(zip_max, zip_dist), tf.subtract(zip_max, zip_min))

In [37]:
# Turn the normalized edit distance into a similarity: 1 - distance.
address_sim = tf.subtract(1.0, address_dist)

$$S(x,y) = \sum_{i=1}^k w_i S_i(x,y):\quad \sum_{i=1}^k w_i = 1$$

In [38]:
# Weights of the address and ZIP similarities in the combined score;
# zip_wi is derived from address_wi so that the two weights sum to 1.
address_wi = 0.5
zip_wi = 1.0 - address_wi

In [39]:
# Weighted sum of both similarities. The weighted address similarity is
# transposed before being added to the weighted ZIP similarity
# (presumably to line its axes up with zip_sim — TODO confirm shapes).
weighted_sim = tf.add(tf.transpose(tf.multiply(address_wi, address_sim)), tf.multiply(zip_wi, zip_sim))

In [40]:
# Index of the highest combined similarity along axis 1
# (presumably the position of the best-matching reference record — confirm).
top_match_idx = tf.argmax(weighted_sim, 1)

In [41]:
def sparse_from_word_vector(word_vector):
    """Build a character-level `tf.SparseTensorValue` from a list of strings.

    Every character of every string becomes one sparse entry whose index is
    `[word position, 0, character position]` and whose value is the character
    itself. The dense shape is reported as `[number of words, 1, 1]`.

    Args:
        word_vector: Sequence of strings.

    Returns:
        A `tf.SparseTensorValue` holding the indices, characters and shape
        described above.
    """
    indices = []
    for word_pos, word in enumerate(word_vector):
        for char_pos in range(len(word)):
            indices.append([word_pos, 0, char_pos])

    # Flatten all words into one character list, in the same order as `indices`.
    characters = list(''.join(word_vector))
    dense_shape = [len(word_vector), 1, 1]
    return tf.SparseTensorValue(indices, characters, dense_shape)

In [45]:
# Split the reference records into two parallel lists: addresses (field 0)
# and ZIP codes (field 1).
reference_address = [x[0] for x in reference_data]
reference_zips = [x[1] for x in reference_data]