# Section 1

Generating our data: a small dataset with in-distribution (IND) data points and a clearly separated out-of-distribution (OOD) cluster that we will use to test our model's performance on unseen data.


In [2]:
# 1. imports and set-up

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
# import seaborn as sns

from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

# Plot styling: default figure size, and hide the top and right axes spines.
plt.rcParams.update({
    'figure.figsize': (6.2, 5.2),
    'axes.spines.top': False,
    'axes.spines.right': False,
})

# One seeded generator; the data-generation and splitting calls below all draw from it.
rng = np.random.RandomState(42)

# 2. generate our IND data - two interlocking half circles (moons)
# Dataset parameters are named here so they can be tuned in one place.
N_IND_SAMPLES = 600        # total IND points, before the train/test split
MOON_NOISE = 0.2           # std. dev. of the Gaussian noise added to the moons
TEST_FRACTION = 0.3        # share of the IND points held out as the IND test set
N_OOD_SAMPLES = 120        # number of OOD points
OOD_CENTER = [3.4, 3.0]    # per-axis mean of the OOD cluster
OOD_SCALE = [0.35, 0.35]   # per-axis std. dev. of the OOD cluster

# NOTE: `rng` is a single stateful generator shared by the three calls below,
# so reordering them (or changing a sample count) changes every later draw.
X_all, y_all = make_moons(
    n_samples=N_IND_SAMPLES, noise=MOON_NOISE, random_state=rng
)

# stratify on the labels so train and test keep the same class balance
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=TEST_FRACTION, random_state=rng, stratify=y_all
)

# 3. generate OOD data - separate gaussian cluster
X_ood = rng.normal(loc=OOD_CENTER, scale=OOD_SCALE, size=(N_OOD_SAMPLES, 2))

# Cheap invariants: fail loudly here rather than in a later cell.
assert len(X_train) + len(X_test) == N_IND_SAMPLES, "split lost or duplicated rows"
assert len(X_train) == len(y_train) and len(X_test) == len(y_test), "features/labels misaligned"
assert X_ood.shape == (N_OOD_SAMPLES, 2), "unexpected OOD shape"

# 4. print shape summary
# Pair each label with its array's shape, then emit everything on one
# space-separated line.
split_shapes = [
    ("Train:", X_train.shape),
    ("IND test:", X_test.shape),
    ("OOD test:", X_ood.shape),
]
print(" ".join(
    ["Shapes: "] + [str(field) for pair in split_shapes for field in pair]
))

Shapes:  Train: (420, 2) IND test: (180, 2) OOD test: (120, 2)
