In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import matplotlib.pyplot as plt
import pandas as pd

from data_reconstruct import *
from data_reconstruct.backend import *
from data_reconstruct.examples import *
from data_reconstruct.utilities import *

In [3]:
random_state = 42
np.random.seed(random_state)

# Load Data

In [4]:
# Read
mod1 = pd.read_csv('data/decennial_2020_nov8GAN.csv')[1:]
mod2 = pd.read_csv('data/public_150k_plus_210630.csv')
mod1_tagged = process_decennial(mod1)
mod2_tagged = process_ppp(mod2)
mod1, mod1_cols, mod1_tags_cols = mod1_tagged
mod2, mod2_cols, mod2_tags_cols = mod2_tagged

# Merge
mod1, mod2, annotations = merge_data(mod1_tagged, mod2_tagged)

# Sample
num_samples = min(mod1.shape[0], 2000)
mod1_sample = mod1.sample(num_samples, random_state=random_state)
mod2_sample = mod2.sample(num_samples, random_state=random_state)
annotations_sample = annotations.sample(num_samples, random_state=random_state);

# Generate Merged Anonymous Records

In [5]:
anonymized_data = anonymize(
    mod1_sample.to_numpy(),
    mod2_sample.to_numpy(),
    embedding_dim=5,
    embedding_kwargs={
        'project_mode': 'tsne',
        'epoch_pd': 4000,
        'log_pd': 200,
        'epoch_DNN': 300,
        'log_DNN': 100,
    },
)

use random seed: 666
Shape of Raw data
Dataset 0: (729, 6)
Dataset 1: (729, 4)
---------------------------------
Find correspondence between Dataset 1 and Dataset 2
use device: cpu
epoch:[200/4000] err:0.4023 alpha:1.1304
epoch:[400/4000] err:0.1626 alpha:0.4578
epoch:[600/4000] err:0.1996 alpha:0.5643
epoch:[800/4000] err:0.2308 alpha:0.6561
epoch:[1000/4000] err:0.2571 alpha:0.7368
epoch:[1200/4000] err:0.2789 alpha:0.8077
epoch:[1400/4000] err:0.2965 alpha:0.8698
epoch:[1600/4000] err:0.3103 alpha:0.9243
epoch:[1800/4000] err:0.3205 alpha:0.9726
epoch:[2000/4000] err:0.3275 alpha:1.0156
epoch:[2200/4000] err:0.3318 alpha:1.0545
epoch:[2400/4000] err:0.3335 alpha:1.0901
epoch:[2600/4000] err:0.3331 alpha:1.1234
epoch:[2800/4000] err:0.3308 alpha:1.1547
epoch:[3000/4000] err:0.3270 alpha:1.1843
epoch:[3200/4000] err:0.3225 alpha:1.2119
epoch:[3400/4000] err:0.3174 alpha:1.2375
epoch:[3600/4000] err:0.3122 alpha:1.2610
epoch:[3800/4000] err:0.3066 alpha:1.2832
epoch:[4000/4000] err:0.3

# Preview Results

In [6]:
pd.DataFrame(anonymized_data[0], index=annotations.iloc[:, 0])

Unnamed: 0_level_0,0,1,2,3,4
ProjectCountyName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ACADIA,-3.707166,1.931105,-2.695745,-3.733490,-4.279046
ACCOMACK,1.298629,3.896966,-5.030778,1.583157,-3.908888
ADAIR,1.360886,2.667762,-3.574935,1.569941,-2.424872
ADAMS,1.037992,2.813919,-3.740922,1.253703,-2.756526
ALAMEDA,0.260375,-1.541044,1.309018,-0.079070,1.666547
...,...,...,...,...,...
YELLOWSTONE,-2.082055,3.247779,-4.238104,-1.941257,-4.922127
YOLO,0.329283,0.273046,-0.726581,0.366358,-0.127813
YORK,-2.484622,1.291496,-1.959514,-2.531929,-2.908816
YOUNG,-2.435240,2.156137,-2.967344,-2.405302,-3.869308
