# Correspondencia en direcciones postales

In [46]:
import random
import string
import numpy as np
import tensorflow as tf

In [47]:
# Size of the synthetic data set and the vocabulary the addresses are drawn from.
n = 10
street_names = [
    "diagon",
    "elm",
    "abbey",
    "vallejo",
    "batiz",
]
street_type = [
    "callejon",
    "calle",
    "carretera",
    "via",
    "avenida",
]
# Five candidate zip codes in [20000, 29999] and n house numbers in [1, 999].
street_zips = [random.randrange(20000, 30000) for _ in range(5)]
numbers = [random.randrange(1, 1000) for _ in range(n)]

In [48]:
# Draw n random street names, street types and zip codes (in that order).
streets = [random.choice(street_names) for _ in range(n)]
street_pref = [random.choice(street_type) for _ in range(n)]
zips = [random.choice(street_zips) for _ in range(n)]

# "<type> <name> <number>" address strings, each paired with its zip code.
full_streets = [
    " ".join((pref, name, str(number)))
    for pref, name, number in zip(street_pref, streets, numbers)
]
reference_data = [[address, zip_code] for address, zip_code in zip(full_streets, zips)]

In [49]:
# Show the generated reference records: one [address string, zip code] pair per row.
reference_data

[['carretera vallejo 537', 22391],
 ['carretera abbey 92', 22391],
 ['via abbey 512', 22759],
 ['calle abbey 971', 27631],
 ['callejon abbey 381', 28174],
 ['via batiz 176', 22391],
 ['via diagon 79', 22391],
 ['callejon diagon 899', 28174],
 ['callejon diagon 25', 27631],
 ['via abbey 565', 22391]]

In [50]:
def create_typo(s, prob=0.75):
    """Return `s` with, at most, one character replaced by a random letter.

    With probability `prob` a random position of `s` is overwritten with a
    random lowercase ASCII letter; otherwise `s` is returned unchanged.

    Args:
        s: The string to corrupt.
        prob: Probability (between 0 and 1) of introducing the typo.

    Returns:
        A string of the same length as `s`. It may be equal to `s` even when
        a replacement happens, because the random letter can coincide with
        the character it replaces.
    """
    # An empty string has no position to corrupt, so it is returned as is.
    if s and random.uniform(0, 1) < prob:
        rand_idx = random.randrange(len(s))
        s_list = list(s)
        s_list[rand_idx] = random.choice(string.ascii_lowercase)
        s = ''.join(s_list)
    return s

In [51]:
# Corrupt every street name with the default typo probability.
typo_streets = list(map(create_typo, streets))

In [52]:
# Rebuild the full addresses using the corrupted street names, keeping the
# original street types, house numbers and zip codes.
typo_full_streets = [
    " ".join((pref, name, str(number)))
    for pref, name, number in zip(street_pref, typo_streets, numbers)
]
test_data = [[address, zip_code] for address, zip_code in zip(typo_full_streets, zips)]
test_data

[['carretera vallero 537', 22391],
 ['carretera abbly 92', 22391],
 ['via abbeu 512', 22759],
 ['calle gbbey 971', 27631],
 ['callejon aabey 381', 28174],
 ['via xatiz 176', 22391],
 ['via diagon 79', 22391],
 ['callejon diagoc 899', 28174],
 ['callejon dgagon 25', 27631],
 ['via ubbey 565', 22391]]

In [53]:
# Open the TensorFlow session that will be used to evaluate the matching graph.
session = tf.Session()

In [54]:
# Inputs for the address under test: its characters (sparse strings) and its zip code.
test_address = tf.sparse_placeholder(tf.string)
test_zip = tf.placeholder(tf.float32, shape=[None, 1])

# Inputs for the reference set: the characters of every address and rows of n zip codes.
ref_address = tf.sparse_placeholder(tf.string)
ref_zip = tf.placeholder(tf.float32, shape=[None, n])

In [55]:
# Squared difference between each reference zip and the test zip.
zip_dist = tf.square(ref_zip - test_zip)
# Edit distance between test and reference addresses, normalized by the reference length.
address_dist = tf.edit_distance(hypothesis=test_address, truth=ref_address, normalize=True)

- $S(x,y) = 0$ si $x$ e $y$ son totalmente diferentes (no se parecen en nada)
- $S(x,x) = 1$, ya que todo objeto es similar (si no igual) a sí mismo.
- $S(x,y) = \frac{D - d(x,y)}{D-d}$ donde $D$ es la mayor distancia entre dos objetos posibles, y $d$ es la menor.

In [56]:
# Largest and smallest squared zip distance, picked out of the squeezed
# distance tensor at the argmax / argmin position along axis 1.
zip_max = tf.gather(tf.squeeze(zip_dist), tf.argmax(zip_dist, 1))
zip_min = tf.gather(tf.squeeze(zip_dist), tf.argmin(zip_dist, 1))
# Turn the distance into a similarity: (max - dist) / (max - min), so the
# closest reference zip scores 1 and the farthest one scores 0.
# NOTE(review): if every reference zip is equally far from the test zip then
# max == min and the denominator is zero -- confirm whether this can occur
# with the real data and guard against it if so.
zip_sim = tf.divide(tf.subtract(zip_max, zip_dist), tf.subtract(zip_max, zip_min))

In [57]:
# The edit distance is normalized, so its complement is used as the similarity.
address_sim = 1.0 - address_dist

$$S(x,y) = \sum_{i=1}^k w_i S_i(x,y):\quad \sum_{i=1}^k w_i = 1$$

In [58]:
# Weight of the address similarity in the combined score.
address_wi = 0.5
# The zip weight is the complement, so both weights add up to 1.
zip_wi = 1.0 - address_wi

In [59]:
# Weighted sum of both similarities; the address term is transposed before
# being added to the zip term.
weighted_sim = tf.transpose(address_wi * address_sim) + zip_wi * zip_sim

In [60]:
# Position of the highest combined similarity along axis 1.
top_match_idx = tf.argmax(weighted_sim, axis=1)

In [61]:
def sparse_from_word_vector(word_vector):
    """Encode a list of strings as a character-level sparse tensor value.

    Every character of word number `w` at position `c` is stored at index
    `[w, 0, c]`, which is the layout used here to feed `tf.edit_distance`.

    Args:
        word_vector: Sequence of strings to encode.

    Returns:
        A `tf.SparseTensorValue` whose values are the individual characters
        and whose dense shape is `[len(word_vector), 1, longest word length]`
        (the last dimension is never smaller than 1).
    """
    num_words = len(word_vector)
    idx = [[word_i, 0, char_i]
           for word_i, word in enumerate(word_vector)
           for char_i in range(len(word))]
    chars = list(''.join(word_vector))
    # The dense shape has to bound every index: character positions run up to
    # len(word) - 1, so the last dimension must be the longest word length
    # rather than a fixed 1.
    max_len = max([1] + [len(word) for word in word_vector])
    return tf.SparseTensorValue(idx, chars, [num_words, 1, max_len])

In [73]:
# Split the reference records into the address strings and a (1, n) row of zip codes.
reference_address = [record[0] for record in reference_data]
reference_zips = np.array([record[1] for record in reference_data]).reshape(1, -1)

In [74]:
# Encode the reference addresses once as a sparse character tensor value;
# the same value is fed to `ref_address` for every test address.
sparse_ref_set = sparse_from_word_vector(reference_address)

In [75]:
for i in range(n):
    # Each test record is an [address, zip code] pair.
    test_address_entry, test_zip_aux = test_data[i]
    test_zip_entry = [[test_zip_aux]]

    # Repeat the test address n times so it lines up with the n reference addresses.
    test_address_rep = [test_address_entry] * n
    sparse_test_set = sparse_from_word_vector(test_address_rep)

    feed_dict = {
        test_address: sparse_test_set,
        test_zip: test_zip_entry,
        ref_address: sparse_ref_set,
        ref_zip: reference_zips,
    }

    # Index of the reference record with the highest combined similarity.
    best_match = session.run(top_match_idx, feed_dict=feed_dict)
    best_idx = best_match[0]
    best_address = reference_address[best_idx]
    best_zip = reference_zips[0][best_idx]

    print("Dirección original: " + str(test_address_entry) + ", " + str(test_zip_aux))
    print("Dirección corregida: " + str(best_address) + ", " + str(best_zip) + "\n")

Dirección original: carretera vallero 537, 22391
Dirección corregida: carretera vallejo 537, 22391

Dirección original: carretera abbly 92, 22391
Dirección corregida: carretera abbey 92, 22391

Dirección original: via abbeu 512, 22759
Dirección corregida: via abbey 512, 22759

Dirección original: calle gbbey 971, 27631
Dirección corregida: calle abbey 971, 27631

Dirección original: callejon aabey 381, 28174
Dirección corregida: callejon abbey 381, 28174

Dirección original: via xatiz 176, 22391
Dirección corregida: via batiz 176, 22391

Dirección original: via diagon 79, 22391
Dirección corregida: via diagon 79, 22391

Dirección original: callejon diagoc 899, 28174
Dirección corregida: callejon diagon 899, 28174

Dirección original: callejon dgagon 25, 27631
Dirección corregida: callejon diagon 25, 27631

Dirección original: via ubbey 565, 22391
Dirección corregida: via abbey 565, 22391

