In [1]:
# Enable IPython's autoreload extension so that edits to imported modules
# (e.g. the local `data_reconstruct` package) are picked up without
# restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules before executing each cell.
%autoreload 2

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch

from data_reconstruct import *
from data_reconstruct.backend import *
from data_reconstruct.examples import *
from data_reconstruct.utilities import *

In [3]:
# Single seed shared by every stochastic step in this notebook: it is reused
# as `random_state=` for the pandas sampling below.
random_state = 42
np.random.seed(random_state)
# Later cells feed `torch.Tensor` inputs to a model trained via `train_model`,
# so torch's global generator is seeded as well; seeding NumPy alone leaves
# torch-side randomness (presumably weight init / batch shuffling) different
# on every kernel restart.
torch.manual_seed(random_state)

# Load Data

In [None]:
# Read both raw tables; the slice drops the first data row of the decennial
# file (presumably a secondary header/description row -- confirm).
mod1 = pd.read_csv('data/decennial_2020_nov8GAN.csv').iloc[1:]
mod2 = pd.read_csv('data/public_150k_plus_210630.csv')

# Process each table. The complete result is kept as `*_tagged` for the merge
# step, and is also unpacked into its three components.
mod1, mod1_cols, mod1_tags_cols = mod1_tagged = process_decennial(mod1)
mod2, mod2_cols, mod2_tags_cols = mod2_tagged = process_ppp(mod2)

# Merge the two processed modalities; `mod1` / `mod2` are rebound to the
# merged outputs from here on.
mod1, mod2, annotations = merge_data(mod1_tagged, mod2_tagged, agg_by_tag=True)

# Sample at most 2000 rows from each output, using the same size and seed for
# all three.
# NOTE(review): identical seeds only select matching rows if the three
# objects have the same length -- confirm that `merge_data` guarantees this.
num_samples = min(len(mod1), 2000)
mod1_sample, mod2_sample, annotations_sample = (
    frame.sample(num_samples, random_state=random_state)
    for frame in (mod1, mod2, annotations)
)

# Generate Merged Anonymous Records

In [None]:
# Pass both sampled modalities to `anonymize` as plain NumPy arrays,
# requesting a 5-dimensional embedding. The options in `embedding_kwargs` are
# handed over as-is; their exact meaning is defined by `anonymize` (the
# epoch_* / log_* names suggest iteration counts and logging intervals --
# confirm against its documentation).
anonymized_data = anonymize(
    mod1_sample.to_numpy(),
    mod2_sample.to_numpy(),
    embedding_dim=5,
    embedding_kwargs=dict(
        project_mode='tsne',
        epoch_pd=2000,
        log_pd=200,
        epoch_DNN=200,
        log_DNN=100,
    ),
)

# Preview Results

In [None]:
# Display the first element of the anonymized output as a table, labelling
# each row with the first column of the sampled annotations.
pd.DataFrame(
    data=anonymized_data[0],
    index=annotations_sample.iloc[:, 0],
)

# Usability

## Arbitrary Statistic Prediction

In [None]:
# Model inputs: the first element of the anonymized output.
source = anonymized_data[0]
# Regression target: square root of the product of one column from each
# modality, shaped as a single-column 2-D array.
target = np.sqrt(
    mod1_sample['totalpop_other'] * mod2_sample['HEALTH_CARE_PROCEED']
).to_numpy()[:, np.newaxis]

In [None]:
from data_reconstruct.backend import create_dataloader, train_model
from data_reconstruct.model_classes import Model

# Train on the first 80% of the sampled rows; `split_idx` is reused by the
# plotting cell to mark where the training rows end.
split_idx = int(.8 * target.shape[0])
training_loader = create_dataloader(source[:split_idx], target[:split_idx])

# Use the `Model` class imported above. The previous `model_classes.Model`
# spelling relied on `model_classes` leaking in through a star import and
# left the explicit `Model` import unused.
# NOTE(review): the first positional argument is target.shape[1] (1 after the
# reshape to a single column), yet the model is trained on and evaluated with
# rows of `source`. If that argument is the input width it should probably be
# source.shape[1] -- confirm against Model's signature.
model = Model(target.shape[1], 1, hidden_dim=20)
train_model(model, training_loader)

In [None]:
# Run the trained model over every row of `source` (not just the training
# slice) and bring the result back to NumPy.
prediction = (
    model(torch.Tensor(source))
    .detach()  # cut the result loose from the autograd graph before converting
    .numpy()
)

In [None]:
# Plot the true targets against the model's predictions; `split_idx` is the
# index at which the training rows end.
plot_example_results(
    target,
    prediction,
    split_idx=split_idx,
)