# `nearest_record` example

An example of `nearest_record` from the `synthimpute` package, using the `mpg` sample dataset.

## Setup

In [1]:
import synthimpute as si
import pandas as pd
import numpy as np
from scipy.spatial.distance import euclidean
import math

In [2]:
# Load the mpg sample data, leaving out the class columns and the
# sometimes-missing horsepower.
mpg = (
    pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/mpg.csv')
    .drop(columns=['origin', 'name', 'horsepower'])
)

## Synthesize

In [3]:
# Build the synthetic dataset from `mpg`. The log below lists every column
# except `cylinders` as synthesized, so `['cylinders']` is presumably the set
# of columns carried over from the original -- confirm against the
# synthimpute docs. random_state=0 presumably pins the random seed so the
# run is repeatable.
synth = si.rf_synth(mpg, ['cylinders'], random_state=0)

Synthesizing feature 1 of 5: mpg...
Synthesizing feature 2 of 5: displacement...
Synthesizing feature 3 of 5: weight...
Synthesizing feature 4 of 5: acceleration...
Synthesizing feature 5 of 5: model_year...


## `nearest_record`

In [4]:
# Match records between `synth` and `mpg` by Euclidean distance. The result
# has one row per `id1`, with the matched `id2` and the distance `dist`.
nearest = si.nearest_record(
    synth,
    mpg,
    metric='euclidean',
)
nearest.head(5)

Unnamed: 0,id1,dist,id2
0,0,0.996498,175
1,1,3.000161,254
2,2,0.435471,203
3,3,1.001864,199
4,4,6.309149,332


In [5]:
# Sanity check on the first match: recompute the Euclidean distance directly
# from the two rows it pairs up and compare it with the reported `dist`.
# NOTE(review): the saved output of this check is False, i.e. the two values
# disagree -- worth investigating before relying on the comparison.
row = nearest.iloc[0]
math.isclose(
    row['dist'],
    euclidean(synth.iloc[int(row['id1'])], mpg.iloc[int(row['id2'])]),
)

False

### Blocked

In [6]:
# Same matching as the unblocked call, but with `['cylinders']` passed as the
# blocking columns.
nearest_blocked = si.nearest_record(
    synth,
    mpg,
    ['cylinders'],
    metric='euclidean',
)
nearest_blocked.head(5)

Unnamed: 0,id1,dist,id2
0,0,0.996498,175
1,1,3.000161,254
2,2,0.435471,203
3,3,1.001864,199
4,4,6.309149,332


In [7]:
# Sanity check on the first blocked match: recompute the Euclidean distance
# directly from the two rows it pairs up and compare it with the reported
# `dist`.
# The lookup uses `row_blocked`'s own ids. Indexing with the ids of `row`
# from the unblocked check would compare the blocked distance against a pair
# chosen by the unblocked search, and would make this cell depend on that
# earlier one.
row_blocked = nearest_blocked.iloc[0]
math.isclose(row_blocked.dist,
             euclidean(synth.iloc[int(row_blocked.id1)],
                       mpg.iloc[int(row_blocked.id2)]))

False

#### Calculate from `block_cdist`

In [8]:
# Compute distances between `synth` and `mpg` records, passing `['cylinders']`
# as in the blocked `nearest_record` call. The log below reports five blocks;
# presumably one per distinct `cylinders` value, with distances computed only
# within a block -- confirm against the synthimpute docs.
dist_blocked = si.block_cdist(synth, mpg, ['cylinders'], metric='euclidean')

Running block 1 of 5...
Running block 2 of 5...
Running block 3 of 5...
Running block 4 of 5...
Running block 5 of 5...


In [9]:
# The distance reported for the first blocked match should equal the smallest
# distance that `block_cdist` lists for the same `id1`.
math.isclose(
    row_blocked['dist'],
    dist_blocked.loc[dist_blocked['id1'] == row_blocked['id1'], 'dist'].min(),
)

True

#### Compare blocked and unblocked

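Blocking restricts the search to records within the same block, so it might miss the true minimum distance. In the rows shown below, however, the blocked and unblocked results are identical, including for `id1=1`.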

In [10]:
# Put the two results side by side, joined on `id1`. With pandas' default
# suffixes, the `_x` columns come from the unblocked result and the `_y`
# columns from the blocked one.
comp = pd.merge(nearest, nearest_blocked, on='id1')
comp.head(5)

Unnamed: 0,id1,dist_x,id2_x,dist_y,id2_y
0,0,0.996498,175,0.996498,175
1,1,3.000161,254,3.000161,254
2,2,0.435471,203,0.435471,203
3,3,1.001864,199,1.001864,199
4,4,6.309149,332,6.309149,332
