# `nearest_record` example

For `synthimpute` package. Uses the `mpg` sample dataset.

## Setup

In [1]:
import synthimpute as si
import pandas as pd
import numpy as np
from scipy.spatial.distance import euclidean
import math

In [2]:
mpg = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/mpg.csv')
# Drop class columns and sometimes-missing horsepower.
mpg.drop(['origin', 'name', 'horsepower'], axis=1, inplace=True)

## Synthesize

In [3]:
synth = si.rf_synth(mpg, ['cylinders'], random_state=0)

Synthesizing feature 1 of 5: weight...
Synthesizing feature 2 of 5: acceleration...
Synthesizing feature 3 of 5: model_year...
Synthesizing feature 4 of 5: displacement...
Synthesizing feature 5 of 5: mpg...


## `nearest_record`

In [4]:
nearest = si.nearest_record(synth, mpg, metric='euclidean')
nearest.head()

NameError: ("name 'XA1' is not defined", 'occurred at index 0')

In [7]:
from scipy.spatial.distance import cdist

def nearest_record1(XA1, XB):
    """Get the nearest record between XA1 and XB.

    Args:
        XA: Series.
        XB: DataFrame.

    Returns:
        DataFrame with columns for id_B (from XB) and dist.
    """
    dist = cdist(XA1.values.reshape(1, -1), XB)[0]
    return pd.Series([np.amin(dist), np.argmin(dist)], 
                     index=['dist', 'id_B'])

def nearest_record(XA, XB):
    """Get the nearest record in XA for each record in XB.

    Args:
        XA: DataFrame. Each record is matched against the nearest in XB.
        XB: DataFrame.

    Returns:
        DataFrame with columns for id_A (from XA), id_B (from XB), and dist.
        Each id_A maps to a single id_B, which is the nearest record from XB.
    """
    res = XA.apply(lambda x: nearest_record1(x, XB), axis=1)
    res['id_A'] = XA.index
    # id_B is sometimes returned as an object.
    res['id_B'] = res.id_B.astype(int)
    # Reorder columns.
    return res[['id_A', 'id_B', 'dist']]

In [13]:
%timeit nearest = nearest_record(synth, mpg)

237 ms ± 26.9 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [11]:
%timeit nearest = si.nearest_record(synth, mpg, metric='euclidean')

433 ms ± 4.26 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [5]:
row = nearest.iloc[0]
math.isclose(row.dist, euclidean(synth.iloc[int(row.id1)], mpg.iloc[int(row.id2)]))

False

### Blocked

In [6]:
nearest_blocked = si.nearest_record(synth, mpg, ['cylinders'], metric='euclidean')
nearest_blocked.head()

Unnamed: 0,id1,dist,id2
0,0,2.516109,84
1,1,1.153921,258
2,2,0.05237,279
3,3,0.096679,386
4,4,1.027272,172


In [7]:
row_blocked = nearest_blocked.iloc[0]
math.isclose(row_blocked.dist, 
             euclidean(synth.iloc[int(row.id1)], mpg.iloc[int(row.id2)]))

False

#### Calculate from `block_cdist`

In [8]:
dist_blocked = si.block_cdist(synth, mpg, ['cylinders'], metric='euclidean')

Running block 1 of 5...
Running block 2 of 5...
Running block 3 of 5...
Running block 4 of 5...
Running block 5 of 5...


In [9]:
math.isclose(row_blocked.dist,
             dist_blocked[dist_blocked.id1 == row_blocked.id1].dist.min())

True

#### Compare blocked and unblocked

Blocking means we might miss the true minimum distance, as we did for `id1=1`.

In [10]:
comp = nearest.merge(nearest_blocked, on='id1')
comp.head()

Unnamed: 0,id1,dist_x,id2_x,dist_y,id2_y
0,0,2.516109,84,2.516109,84
1,1,1.153921,258,1.153921,258
2,2,0.05237,279,0.05237,279
3,3,0.096679,386,0.096679,386
4,4,1.027272,172,1.027272,172


In [15]:
%%timeit
nearest_blocked2 = si.nearest_record2(synth, mpg, ['cylinders'], metric='euclidean')

365 ms ± 9.11 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
%%timeit
nearest_blocked = si.nearest_record(synth, mpg, ['cylinders'], metric='euclidean')

382 ms ± 7.79 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
nearest_blocked.equals(nearest_blocked2)

True

In [12]:
nearest_blocked2.head()

Unnamed: 0,id1,dist,id2
0,0,2.516109,84
1,1,1.153921,258
2,2,0.05237,279
3,3,0.096679,386
4,4,1.027272,172
