# `nearest_record` example

This notebook demonstrates `nearest_record` from the `synthimpute` package, using the `mpg` sample dataset.

## Setup

In [1]:
import synthimpute as si
import pandas as pd
import numpy as np
from scipy.spatial.distance import euclidean
import math

In [2]:
# Fetch the seaborn `mpg` sample data, leaving out the class columns and the
# sometimes-missing horsepower column.
mpg = pd.read_csv(
    'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/mpg.csv'
).drop(['origin', 'name', 'horsepower'], axis=1)

## Synthesize

In [3]:
# Build the synthetic dataset from `mpg`; random_state=0 is passed,
# presumably so the run is reproducible.
# NOTE(review): the progress output lists every column except `cylinders`,
# so the ['cylinders'] argument presumably names the column(s) kept as-is
# rather than synthesized — confirm against the `rf_synth` documentation.
synth = si.rf_synth(mpg, ['cylinders'], random_state=0)

Synthesizing feature 1 of 5: mpg...
Synthesizing feature 2 of 5: acceleration...
Synthesizing feature 3 of 5: weight...
Synthesizing feature 4 of 5: displacement...
Synthesizing feature 5 of 5: model_year...


## `nearest_record`

In [4]:
# Call `nearest_record` without blocking variables, using the Euclidean metric.
# The check cell that follows treats `id1` as a positional row of `synth` and
# `id2` as a positional row of `mpg`.
nearest = si.nearest_record(synth, mpg, metric='euclidean')
nearest.head()

Unnamed: 0,id1,dist,id2
0,0,0.22933,203
1,1,4.07558,365
2,2,3.621807,233
3,3,4.045211,125
4,4,11.679206,332


In [5]:
# Sanity check on the first result: recompute the Euclidean distance between
# the paired rows (id1 -> synth, id2 -> mpg) and compare it with the reported
# `dist`. The ids are cast with int() before being used as positions
# (presumably because the row Series holds them as floats next to `dist`).
# NOTE(review): the saved output of this cell is False, i.e. the reported
# distance does not equal this recomputation on the raw rows — worth
# investigating (e.g. whether `nearest_record` transforms the data first).
row = nearest.iloc[0]
math.isclose(row.dist, euclidean(synth.iloc[int(row.id1)], mpg.iloc[int(row.id2)]))

False

### Blocked

In [6]:
# Same call as the unblocked search, with ['cylinders'] passed as the third
# positional argument (the blocking variables). The progress output reports
# five blocks being processed.
nearest_blocked = si.nearest_record(synth, mpg, ['cylinders'], metric='euclidean')
nearest_blocked.head()

Running block 1 of 5...
Running block 2 of 5...
Running block 3 of 5...
Running block 4 of 5...
Running block 5 of 5...


Unnamed: 0,id1,dist,id2
0,0,0.22933,203
1,1,4.07558,365
2,2,3.621807,233
3,3,4.045211,125
4,4,11.679206,332


In [7]:
# Repeat the distance check for the first blocked match.
row_blocked = nearest_blocked.iloc[0]
# Look up the pair using row_blocked's own ids, so the check validates the
# blocked result itself and does not depend on the unblocked `row` variable.
math.isclose(row_blocked.dist,
             euclidean(synth.iloc[int(row_blocked.id1)],
                       mpg.iloc[int(row_blocked.id2)]))

False

#### Calculate from `block_cdist`

In [8]:
# Distances computed block by block on `cylinders`; the next cell reads the
# `id1` and `dist` columns of this result (presumably one row per
# within-block (id1, id2) pair — confirm against `block_cdist`).
dist_blocked = si.block_cdist(synth, mpg, ['cylinders'], metric='euclidean')

Running block 1 of 5...
Running block 2 of 5...
Running block 3 of 5...
Running block 4 of 5...
Running block 5 of 5...


In [9]:
# The blocked nearest distance should equal the smallest `block_cdist`
# distance recorded for the same id1. The saved output is True.
math.isclose(row_blocked.dist,
             dist_blocked[dist_blocked.id1 == row_blocked.id1].dist.min())

True

#### Compare blocked and unblocked

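Blocking restricts each search to records that share the same block values, so it can miss the true minimum distance. In the first five rows shown below, however, the blocked and unblocked results are identical (including `id1=1`).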

In [10]:
# Side-by-side comparison keyed on id1. With pandas' default merge suffixes,
# the *_x columns come from `nearest` (unblocked) and the *_y columns from
# `nearest_blocked`.
comp = nearest.merge(nearest_blocked, on='id1')
comp.head()

Unnamed: 0,id1,dist_x,id2_x,dist_y,id2_y
0,0,0.22933,203,0.22933,203
1,1,4.07558,365,4.07558,365
2,2,3.621807,233,3.621807,233
3,3,4.045211,125,4.045211,125
4,4,11.679206,332,11.679206,332


## Test a new `nearest` implementation

In [50]:
def nearest_single(XA, XB, **kwargs):
    """Keep, for every `id1`, only the smallest-distance row of `cdist_long`.

    Args:
        XA: First argument handed to `cdist_long`.
        XB: Second argument handed to `cdist_long`.
        **kwargs: Forwarded unchanged to `cdist_long`.

    Returns:
        DataFrame with columns `id1`, `dist`, `id2` (one row per `id1`) and
        a fresh 0..n-1 index.
    """
    all_pairs = cdist_long(XA, XB, **kwargs).reset_index(drop=True)
    # nsmallest(1) per group is indexed by (id1, original row label);
    # reset_index() exposes that row label as the `level_1` column.
    closest = all_pairs.groupby('id1')['dist'].nsmallest(1).reset_index()
    # Re-attach id2 by joining on the row label of each winning pair.
    closest = closest.set_index('level_1')
    matched = closest.join(all_pairs['id2'])
    return matched.reset_index(drop=True)

In [52]:
from synthimpute import *

def nearest(XA, XB, block_vars=None, verbose=True, **kwargs):
    """Find the nearest `XB` record for each `XA` record, optionally by block.

    NOTE: earlier in this notebook the name `nearest` is bound to the
    DataFrame returned by `si.nearest_record`; running this cell rebinds it
    to this function, so cells that use that DataFrame must run first.

    Args:
        XA: DataFrame of records to find matches for.
        XB: DataFrame of candidate records.
        block_vars: Column name, or list of column names, present in both
            frames. The search is run once per combination of values found
            in both frames (rows are selected with `subset_from_row`). If
            None, no blocking is applied and a single search is run.
        verbose: If True, print one progress line per block.
        **kwargs: Forwarded to `nearest_single`.

    Returns:
        The `nearest_single` results of all blocks concatenated, with a
        fresh 0..n-1 index.

    Raises:
        ValueError: If `XA` and `XB` share no `block_vars` values.
    """
    if block_vars is None:
        # No blocking requested: one unrestricted search over all records.
        return nearest_single(XA, XB, **kwargs)
    if isinstance(block_vars, str):
        # Accept a single column name; a list keeps the selections below
        # as DataFrames so they can be merged.
        block_vars = [block_vars]
    # TODO: Use adjacent_vars.
    A_blocks = XA[block_vars].drop_duplicates()
    B_blocks = XB[block_vars].drop_duplicates()
    # TODO: Warn when some blocks are dropped.
    # The default (inner) merge keeps only blocks present on both sides.
    blocks = A_blocks.merge(B_blocks, on=block_vars)
    n_blocks = blocks.shape[0]
    if n_blocks == 0:
        # pd.concat([]) would raise a ValueError with an unrelated message.
        raise ValueError('XA and XB share no values of block_vars: ' +
                         str(block_vars))
    res = []
    for block_number, (_, row) in enumerate(blocks.iterrows(), start=1):
        if verbose:
            print('Running block ' + str(block_number) + ' of ' +
                  str(n_blocks) + '...')
        res.append(nearest_single(subset_from_row(XA, row),
                                  subset_from_row(XB, row), **kwargs))
    return pd.concat(res).reset_index(drop=True)

In [55]:
# Work on copies so the experiments below cannot modify `synth` or `mpg`.
XA = synth.copy()
XB = mpg.copy()
# Block on the same column that was used for `nearest_blocked`.
block_vars = ['cylinders']

In [56]:
# Run the notebook-local `nearest`. Results are concatenated block by block,
# so the rows are not ordered by id1 (see the id1 column in the output).
nearest(XA, XB, block_vars)

Running block 1 of 5...
Running block 2 of 5...
Running block 3 of 5...
Running block 4 of 5...
Running block 5 of 5...


Unnamed: 0,id1,dist,id2
0,0,0.229330,203
1,2,3.621807,233
2,4,11.679206,332
3,6,1.292878,279
4,7,8.944928,332
5,11,4.007225,203
6,16,9.596968,296
7,17,11.002231,318
8,23,0.535770,279
9,31,10.488405,390


In [43]:
# Derive the first shared block here instead of relying on a `row` variable
# assigned in another cell: when the notebook runs top to bottom, `row` at
# this point still holds the first row of the unblocked `nearest` result.
first_block = (XA[block_vars].drop_duplicates()
               .merge(XB[block_vars].drop_duplicates(), on=block_vars)
               .iloc[0])
# All pairwise distances within that block, with a fresh 0..n-1 index.
tmp = cdist_long(subset_from_row(XA, first_block),
                 subset_from_row(XB, first_block)).reset_index(drop=True)

In [36]:
# Distances from the first synthetic record (kept as a one-row DataFrame via
# iloc[[0]]) to the records of XB; no metric is passed, so `cdist_long`'s
# default applies.
cdist_long(XA.iloc[[0]], XB)

Unnamed: 0,dist,id1,id2
0,1692.134766,0,0
1,456.795857,0,1
2,1581.064224,0,2
3,416.633242,0,3
4,1674.643862,0,4
5,388.034997,0,5
6,1385.023198,0,6
7,1672.298663,0,7
8,687.450679,0,8
9,269.741670,0,9


In [None]:
# Completed from a dangling method chain that had no receiver (it would raise
# NameError if run). It applies the same steps as `nearest_single` to `tmp`:
# take the per-id1 minimum, then re-attach id2 by joining on the row label.
(tmp.groupby('id1').dist.nsmallest(1).reset_index()
    .set_index('level_1').join(tmp.id2).reset_index(drop=True))

In [46]:
# Smallest distance per id1. After reset_index(), `level_1` holds the row
# label in `tmp` of the winning row.
tmp.groupby('id1').dist.nsmallest(1).reset_index()

Unnamed: 0,id1,level_1,dist
0,0,5459,71.125529
1,1,2885,0.033446
2,2,5461,14.640688
3,3,5462,0.994011
4,4,5463,16.291955
5,5,4434,33.742262
6,6,5980,60.054106
7,7,4436,42.675873
8,8,4540,67.712895
9,9,6292,31.080707


In [21]:
# Distinct block-key combinations present in each frame.
A_blocks, B_blocks = (frame[block_vars].drop_duplicates() for frame in (XA, XB))
# TODO: Warn when some blocks are dropped.
# The default (inner) merge keeps only keys found on both sides.
blocks = pd.merge(A_blocks, B_blocks, on=block_vars)
n_blocks = len(blocks)

In [24]:
# First shared block. NOTE: this rebinds `row`, which earlier in the notebook
# held the first row of the unblocked `nearest` result.
row = blocks.iloc[0]

In [28]:
# Smallest distance per id1 within the block selected by `row`, using the
# package-qualified helpers (`si.cdist_long`, `si.subset_from_row`).
si.cdist_long(si.subset_from_row(XA, row),
           si.subset_from_row(XB, row)).groupby('id1').dist.nsmallest(1).reset_index()

Unnamed: 0,id1,level_1,dist
0,0,5459,71.125529
1,1,2885,0.033446
2,2,5461,14.640688
3,3,5462,0.994011
4,4,5463,16.291955
5,5,4434,33.742262
6,6,5980,60.054106
7,7,4436,42.675873
8,8,4540,67.712895
9,9,6292,31.080707


In [11]:
# Reference result from `si.nearest_record` with blocking, displayed for
# comparison with the output of the notebook-local `nearest`.
nearest_blocked

Unnamed: 0,id1,dist,id2
0,0,0.229330,203
1,1,4.075580,365
2,2,3.621807,233
3,3,4.045211,125
4,4,11.679206,332
5,5,1.000708,72
6,6,1.292878,279
7,7,8.944928,332
8,8,20.026911,38
9,9,3.537331,280
