In [None]:
from os import path
import sys
from time import time
import pickle
from collections import namedtuple, Counter, defaultdict
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import scipy.stats
import cohd_temporal as cohd
import importlib
from cohd_temporal import AgeCounter, DeltaCounter  # Needed to load pickled cads and deltas

In [None]:
# Destination directory that receives the tab-delimited export files
dir_data_out = '/path/to/data/out/dir'
# Source directory holding the pickled input files
dir_data = '/path/to/data/dir'

Note: use the SQL load query to insert the dataset_id so that we don't have to redundantly write out the dataset_id in each row of these files.  
  
Sort the data in the same order as the primary keys / indexes of their destination SQL tables for faster loading.

In [None]:
# Read the pickled concept counts from the input directory.
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
concept_counts_file = path.join(dir_data, 'concept_counts_perturbed.pkl')
with open(concept_counts_file, 'rb') as f:
    concept_counts = pickle.load(f)

In [None]:
# Order the concept IDs ascending so rows are written in sorted key order
concepts_sorted = sorted(concept_counts.keys())

# Emit a header line followed by one tab-separated row per concept
concept_counts_out = path.join(dir_data_out, 'concept_counts.csv')
with open(concept_counts_out, 'w') as f:
    f.write('concept_id\tconcept_count\n')
    for concept_id in concepts_sorted:
        row = '{}\t{}\n'.format(concept_id, concept_counts[concept_id])
        f.write(row)

In [None]:
# Read the pickled concept pair counts from the input directory.
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
concept_pair_counts_file = path.join(dir_data, 'concept_pair_counts_perturbed.pkl')
with open(concept_pair_counts_file, 'rb') as f:
    concept_pair_counts = pickle.load(f)

In [None]:
# Order the concept pair keys ascending so rows are written in sorted key order
concept_pairs_sorted = sorted(concept_pair_counts.keys())

# Emit a header line followed by one tab-separated row per concept pair
concept_pair_counts_out = path.join(dir_data_out, 'concept_pair_counts.csv')
with open(concept_pair_counts_out, 'w') as f:
    f.write('concept_id_1\tconcept_id_2\tconcept_count\n')
    for pair in concept_pairs_sorted:
        row = '{}\t{}\t{}\n'.format(pair[0], pair[1], concept_pair_counts[pair])
        f.write(row)

In [None]:
# Read the pickled grouped age distributions, keyed by concept ID.
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
gacs_file = path.join(dir_data, 'concept_age_group_dists_perturbed.pkl')
with open(gacs_file, 'rb') as f:
    gacs = pickle.load(f)

# Order the concept IDs ascending so rows are written in sorted key order
concepts_sorted = sorted(gacs.keys())

# Two output files: per-bin counts, and the binning scheme of each concept
file_age = path.join(dir_data_out, 'age_distributions.csv')
file_age_scheme = path.join(dir_data_out, 'age_schemes.csv')
with open(file_age, 'w') as f_age, open(file_age_scheme, 'w') as f_age_schemes:

    # Header rows
    f_age.write('concept_id\tbin\tconcept_count\n')
    f_age_schemes.write('concept_id\tbin_width\tbins\n')

    for concept_id in concepts_sorted:
        gac = gacs[concept_id]
        counts = gac.counts

        # One scheme row per concept; bin_width and bins use their default string formatting
        scheme_row = '{}\t{}\t{}\n'.format(concept_id, gac.bin_width, gac.bins)
        f_age_schemes.write(scheme_row)

        # One count row per bin, identified by its position in counts
        for i, c in enumerate(counts):
            f_age.write('{}\t{}\t{}\n'.format(concept_id, i, c))
    

In [None]:
# Read the pickled grouped delta counts from the input directory.
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
gdcs_file = path.join(dir_data, 'deltas_group_perturbed.pkl')
with open(gdcs_file, 'rb') as f:
    gdcs = pickle.load(f)

In [None]:
# Order the concept pair keys ascending to make the load into the SQL database faster
concept_pairs_sorted = sorted(gdcs.keys())

# Two output files: per-bin delta counts, and the binning scheme of each concept pair
file_deltas = path.join(dir_data_out, 'deltas.csv')
file_delta_schemes = path.join(dir_data_out, 'delta_schemes.csv')
with open(file_deltas, 'w') as f_deltas, open(file_delta_schemes, 'w') as f_schemes:
    # Header rows
    f_deltas.write('concept_id_1\tconcept_id_2\tbin\tconcept_count\n')
    f_schemes.write('concept_id_1\tconcept_id_2\tbin_width\tn\n')

    for pair in concept_pairs_sorted:
        gdc = gdcs[pair]
        counts = gdc.counts
        n = gdc.n

        # One scheme row per concept pair
        f_schemes.write('{}\t{}\t{}\t{}\n'.format(pair[0], pair[1], gdc.bin_width, n))

        # One count row per bin; the bin label is the position in counts shifted by -n
        # (presumably so that bins are centered on zero -- confirm against DeltaCounter)
        for i, c in enumerate(counts):
            f_deltas.write('{}\t{}\t{}\t{}\n'.format(pair[0], pair[1], i - n, c))