In [21]:
import pandas as pd
import numpy as np
import scipy
from Data_Clean_Room import data_clean_room
import os
from realtabformer import REaLTabFormer
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import wasserstein_distance as wd
from scipy.stats import chisquare
from scipy import stats
import warnings
from tqdm import tqdm
from SIMPRO import simpro
from datetime import datetime
from pathlib import Path
from scipy.cluster import hierarchy
import calendar
import random
import names
import torch

# Sanity check: report whether PyTorch can see a CUDA device before any model is trained.
print(torch.cuda.is_available())

True


# Define helper functions to compute the Cramer's V correlation matrix and to convert timestamps into categorical columns

In [22]:
def is_numeric(x):
    """Tell whether ``pd.to_numeric`` can convert ``x``.

    Parameters
    ----------
    x : scalar, list, tuple, 1-d array or Series
        Passed unchanged to ``pd.to_numeric``.

    Returns
    -------
    bool
        True when the conversion succeeds, False when pandas rejects the
        input with ``ValueError`` (e.g. an unparsable string) or
        ``TypeError`` (e.g. an object pandas cannot interpret at all).
    """
    try:
        pd.to_numeric(x)
    except (ValueError, TypeError):
        # Both exception types mean "not numeric"; neither should escape a predicate.
        return False
    return True
        
def cramer_v(x, y):
    """Bias-corrected Cramer's V association between two categorical series.

    The statistic is derived from the chi-square test on the contingency
    table of ``x`` and ``y`` (``scipy.stats.chi2_contingency`` with its
    default settings) and then corrected with the ``(n - 1)`` terms below.

    Parameters
    ----------
    x, y : array-like of equal length
        Categorical observations; they are cross-tabulated with
        ``pd.crosstab``.

    Returns
    -------
    float
        Value of the corrected statistic. ``0.0`` is returned for degenerate
        inputs (fewer than two observations, or a variable with a single
        level) where the formula would otherwise divide by zero and yield
        NaN.
    """
    confusion_matrix = pd.crosstab(x, y)
    n = confusion_matrix.sum().sum()
    if n <= 1:
        # Every (n - 1) denominator below would be zero (or the table empty).
        return 0.0

    chi2, p, _, _ = scipy.stats.chi2_contingency(confusion_matrix)
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    denominator = min((kcorr - 1), (rcorr - 1))
    if denominator <= 0:
        # A single-level variable carries no association; avoid 0 / 0 -> NaN.
        return 0.0
    return np.sqrt(phi2corr / denominator)
    
def divide_date(number):
    """Break a ``YYYYMMDDHHMM`` timestamp into its calendar components.

    Parameters
    ----------
    number : int or str
        Timestamp whose string form matches ``'%Y%m%d%H%M'``.

    Returns
    -------
    pd.Series
        ``[year, month, day, hour, minute, am_flag]`` where ``hour`` is the
        zero-padded 12-hour clock value as a string (``strftime('%I')``) and
        ``am_flag`` is 1 for AM and 0 otherwise.
    """
    parsed = datetime.strptime(str(number), '%Y%m%d%H%M')

    # 12-hour clock hour (kept as the zero-padded string strftime produces)
    hour_12 = parsed.strftime('%I')
    # 1 -> AM, 0 -> PM
    am_flag = int(parsed.strftime('%p') == 'AM')

    parts = [parsed.year, parsed.month, parsed.day, hour_12, parsed.minute, am_flag]
    return pd.Series(parts)




def convert_date_column(df, column_name):
    """Replace a timestamp column by six component columns.

    Each value of ``df[column_name]`` is split with ``divide_date``; the
    resulting columns are named ``<column_name>_y``, ``_m``, ``_d``, ``_h``,
    ``_min`` and ``_AM`` and appended after the remaining columns of ``df``.
    The input frame is not modified.
    """
    suffixes = ['y', 'm', 'd', 'h', 'min', 'AM']
    parts = df[column_name].apply(divide_date)
    parts.columns = [f"{column_name}_{i}" for i in suffixes]

    remaining = df.drop(columns=[column_name])
    return pd.concat([remaining, parts], axis=1)



def combine_date(df, column_name):
    """Rebuild a datetime column from the six component columns.

    This is the inverse of ``convert_date_column``: the columns
    ``<column_name>_y``, ``_m``, ``_d``, ``_h``, ``_min`` and ``_AM`` are
    merged into a single column named ``column_name``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the six component columns.
    column_name : str
        Prefix of the component columns and name of the rebuilt column.

    Returns
    -------
    pd.DataFrame
        ``df`` without the component columns, plus the rebuilt datetime
        column aligned on ``df``'s index.

    Raises
    ------
    ValueError
        If a row cannot be turned into a date even after clamping the day
        (e.g. month outside 1-12, hour outside the 12-hour clock range, or a
        missing component).
    """
    cols = [f"{column_name}_{i}" for i in ['y', 'm', 'd', 'h', 'min', 'AM']]
    combined_dates = []
    for _, row in df.iterrows():
        # int() also accepts the zero-padded hour strings and integral floats.
        year = int(row[cols[0]])
        month = int(row[cols[1]])
        day = int(row[cols[2]])
        if day == 0:
            day = 1  # synthetic rows may contain day 0; use the first of the month
        hour = int(row[cols[3]])
        minute = int(row[cols[4]])

        if row[cols[5]] == 1:
            if hour == 12:
                hour = 0  # 12 AM is midnight
        elif hour != 12:
            hour += 12  # PM: convert to 24-hour clock (12 PM stays 12)

        try:
            combined_date = datetime(year, month, day, hour, minute)
        except ValueError:
            # Most likely a day past the end of the month: clamp it to the
            # real month length (leap years included) and retry once.
            last_day = calendar.monthrange(year, month)[1]
            combined_date = datetime(year, month, min(day, last_day), hour, minute)

        # The original never stored the result, leaving the column empty.
        combined_dates.append(combined_date)

    # Reuse df's index so the new column lines up even for non-default indexes.
    combined_dates = pd.Series(combined_dates, index=df.index, name=column_name)
    return pd.concat([df.drop(columns = cols), combined_dates], axis = 1)


# Correlation Dimensional Reduction Methods

In [23]:
def direct_combine(c1, c2, key = 'user_id'):
    c = pd.merge(c2, c1, left_on = key, right_on = key)
    return c

def cdr(c1, c2, key = 'user_id', corr = 0.1):
    """Combine two child tables with correlation-based dimension reduction.

    The tables are inner-joined on ``key`` and the joined columns are split
    into three groups:

    * *special* columns - string columns containing a literal ``'^'`` and
      datetime columns; they are excluded from the correlation analysis;
    * *dependent* columns - columns whose Cramer's V with at least one other
      analysed column (the key included) exceeds ``corr``;
    * *independent* columns - the remaining analysed columns.

    The de-duplicated dependent columns form the skeleton of the result.
    Independent and special columns are then re-attached one column at a
    time by sampling, with replacement, from the values observed for the
    same ``key``.

    Parameters
    ----------
    c1, c2 : pd.DataFrame
        Child tables that both contain ``key``.
    key : str
        Join column identifying the parent record.
    corr : float
        Cramer's V threshold at or below which a column counts as
        independent.

    Returns
    -------
    pd.DataFrame
        Combined child table, sorted by ``key``. Sampling uses pandas'
        global random state, so results vary between calls. An empty merge
        is returned unchanged.
    """
    c = pd.merge(c2, c1, left_on = key, right_on = key)
    if c.empty:
        # Nothing to analyse; the type sniffing below needs at least one row.
        return c

    special_col_list = [key]
    normal_col_list = [key]
    for col in c.columns:
        if col == key:
            continue
        first_value = c[col].iloc[0]  # positional: do not rely on an index label 0
        # regex=False: '^' as a regex matches *every* string, which would
        # classify all text columns as special.
        if isinstance(first_value, str) and c[col].str.contains('^', regex=False, na=False).any():
            special_col_list.append(col)
        elif isinstance(first_value, datetime):
            special_col_list.append(col)
        else:
            normal_col_list.append(col)

    special_col = c[special_col_list].drop_duplicates()
    normal_col = c[normal_col_list].drop_duplicates()

    # Pairwise Cramer's V between all analysed columns (key included).
    n_normal = len(normal_col.columns)
    cor = np.zeros([n_normal, n_normal])
    for i in range(n_normal):
        for j in range(n_normal):
            cor[i, j] = cramer_v(normal_col.iloc[:, i], normal_col.iloc[:, j])

    cor = pd.DataFrame(cor, index = normal_col.columns, columns = normal_col.columns)
    # An undefined association (NaN) is treated as "no association"; otherwise
    # a single degenerate column would make every column look dependent.
    cor = cor.fillna(0.0)

    # The key always travels with both groups and is never classified itself.
    independent_col_list = [key]
    dependent_col_list = [key]
    for col in normal_col.columns:
        if col == key:
            continue
        if all(cor[col].drop(col) <= corr):
            independent_col_list.append(col)
        else:
            dependent_col_list.append(col)

    independent_col = c[independent_col_list].drop_duplicates()
    dependent_col = c[dependent_col_list].drop_duplicates()

    # Sort by key so that values generated key-by-key below line up with the
    # rows of the same key (a stable sort keeps the within-key order).
    child = dependent_col.sort_values(key, kind='stable')
    rows_per_key = child[key].value_counts().sort_index()

    def _resample(source, col):
        """Draw, for every key in sorted order, as many values of `col` as `child` has rows."""
        values = []
        for key_value, n_rows in rows_per_key.items():
            pool = source.loc[source[key] == key_value, col]
            values.extend(pool.sample(n_rows, replace = True).values)
        return values

    for col in independent_col.columns:
        if col != key:
            child[col] = _resample(independent_col, col)

    for col in special_col.columns:
        if col != key:
            child[col] = _resample(special_col, col)
    return child



def cdr_hierarchical_cluster(c1, c2, key = 'user_id'):
    """Combine two child tables, grouping correlated columns by clustering.

    The tables are inner-joined on ``key``. String columns containing a
    literal ``'^'`` and datetime columns are set aside as *special*. The
    remaining columns (key included) are clustered hierarchically on their
    pairwise Cramer's V profile (average linkage, cut at half of the largest
    pairwise distance). Each cluster, together with ``key``, is
    de-duplicated into its own sub-table. The sub-table with the most rows
    becomes the base; whole rows of every other sub-table, and single values
    of every special column, are attached by sampling with replacement among
    the rows that share the same ``key``.

    Parameters
    ----------
    c1, c2 : pd.DataFrame
        Child tables that both contain ``key``.
    key : str
        Join column identifying the parent record.

    Returns
    -------
    pd.DataFrame
        Combined child table with a fresh ``RangeIndex``, sorted by ``key``.
        Sampling uses pandas' global random state. An empty merge is
        returned unchanged.
    """
    c = pd.merge(c2, c1, left_on = key, right_on = key)
    if c.empty:
        # Nothing to analyse; the type sniffing below needs at least one row.
        return c

    special_col_list = [key]
    normal_col_list = [key]
    for col in c.columns:
        if col == key:
            continue
        first_value = c[col].iloc[0]  # positional: do not rely on an index label 0
        # regex=False: '^' as a regex matches *every* string, which would
        # classify all text columns as special.
        if isinstance(first_value, str) and c[col].str.contains('^', regex=False, na=False).any():
            special_col_list.append(col)
        elif isinstance(first_value, datetime):
            special_col_list.append(col)
        else:
            normal_col_list.append(col)

    special_col = c[special_col_list].drop_duplicates()
    normal_col = c[normal_col_list].drop_duplicates()

    # Pairwise Cramer's V between all analysed columns (key included).
    n_normal = len(normal_col.columns)
    cor = np.zeros([n_normal, n_normal])
    for i in range(n_normal):
        for j in range(n_normal):
            cor[i, j] = cramer_v(normal_col.iloc[:, i], normal_col.iloc[:, j])

    cor = pd.DataFrame(cor, index = normal_col.columns, columns = normal_col.columns)
    # Linkage rejects non-finite distances; treat undefined association as none.
    cor = cor.fillna(0.0)

    if n_normal < 2:
        # Only the key is left: clustering needs at least two observations.
        idx = np.ones(n_normal, dtype = int)
    else:
        distances = hierarchy.distance.pdist(cor)
        tree = hierarchy.linkage(distances, method='average')
        idx = hierarchy.fcluster(tree, 0.5 * distances.max(), 'distance')

    # One column subset per cluster label; every subset carries the key.
    subsets = {}
    for label in np.unique(idx):
        members = list(cor.columns[idx == label])
        if key not in members:
            members = [key] + members
        subsets[f"Subset_{label}"] = members

    normal_col_subset = {}
    for subset, subset_features in subsets.items():
        normal_col_subset[subset] = normal_col[subset_features].drop_duplicates()

    # The subset with the most distinct rows becomes the base (first one wins ties).
    longest_dataframe_name = None
    longest_dataframe_length = -1
    for name, df in normal_col_subset.items():
        if len(df) > longest_dataframe_length:
            longest_dataframe_length = len(df)
            longest_dataframe_name = name

    # Sort by key so that rows generated key-by-key below line up with the
    # base rows of the same key (a stable sort keeps the within-key order).
    base_col = (
        normal_col_subset[longest_dataframe_name]
        .sort_values(key, kind='stable')
        .reset_index(drop=True)
    )
    rows_per_key = base_col[key].value_counts().sort_index()

    # Attach every other subset (including the last one) by sampling whole
    # rows per key; concat keeps the original column dtypes.
    for name, subset_to_add in normal_col_subset.items():
        if name == longest_dataframe_name:
            continue
        pieces = [
            subset_to_add[subset_to_add[key] == key_value].sample(n_rows, replace = True)
            for key_value, n_rows in rows_per_key.items()
        ]
        sampled = pd.concat(pieces, ignore_index = True)
        for col in sampled.columns:
            if col != key:
                base_col[col] = sampled[col].to_numpy()

    child = base_col
    for col in special_col.columns:
        if col == key:
            continue
        new_col = []
        for key_value, n_rows in rows_per_key.items():
            pool = special_col.loc[special_col[key] == key_value, col]
            new_col.extend(pool.sample(n_rows, replace = True).values)
        child[col] = new_col
    return child

# Synthesis Function

In [24]:
def synthesize(parent, child, parent_n = 20, child_n = 200, join_on = 'user_id',
               save_directory = "fine_tuned_model_demo"):
    """Train parent/child REaLTabFormer models and sample synthetic tables.

    A tabular model is fitted on ``parent`` (without ``join_on``) and saved
    under ``save_directory``. A relational model that points at the most
    recently modified ``id*`` folder of that directory is then fitted on
    ``child`` with ``parent`` as its input table. Finally ``parent_n``
    parent rows are sampled and passed to the relational model to generate
    child rows.

    Parameters
    ----------
    parent, child : pd.DataFrame
        Real parent and child tables; both contain ``join_on``.
    parent_n : int
        Number of synthetic parent rows to sample.
    child_n : int
        Passed as ``n_samples`` to the relational model's ``sample`` call.
    join_on : str
        Column linking child rows to parent rows.
    save_directory : str or path-like
        Folder in which the fitted parent model is stored (created when
        missing).

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        ``parent_samples`` with ``join_on`` as a regular column (taken from
        the sample index) and ``child_samples`` whose index is named
        ``join_on``.
    """
    parent_model = REaLTabFormer(model_type="tabular", epochs = 1, batch_size = 5, train_size = 0.8)
    parent_model.fit(parent.drop(join_on, axis=1), num_bootstrap=5)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(save_directory, exist_ok=True)
    pdir = Path(save_directory)

    parent_model.save(pdir)
    # Pick the newest "id*" sub-folder below save_directory as the parent model path.
    parent_model_path = sorted([p for p in pdir.glob("id*") if p.is_dir()], key=os.path.getmtime)[-1]

    child_model_1 = REaLTabFormer(model_type="relational",
                    parent_realtabformer_path=parent_model_path, epochs=10, batch_size = 5, train_size = 0.8)

    child_model_1.fit(df = child,
                    in_df = parent,
                    join_on = join_on, num_bootstrap = 5)

    # The index of the sampled parents doubles as the synthetic join key.
    parent_samples = parent_model.sample(parent_n)
    parent_samples.index.name = join_on
    parent_samples = parent_samples.reset_index()

    child_samples = child_model_1.sample(n_samples = child_n,
                    input_unique_ids=parent_samples[join_on],
                    input_df=parent_samples.drop(join_on, axis=1),
                    output_max_length = None,
                    gen_batch = 1)

    child_samples.index.name = join_on
    return parent_samples, child_samples

# Correlation Dimension Reduction Only

In [None]:
task_ids = [10005, 10006, 14584, 22100, 31941, 31996, 34382, 34975]

# The CSV reports at the end of every iteration are written into results/.
# Create the folder up front so a missing directory cannot abort the run
# after the model training for a task has already completed.
os.makedirs("results", exist_ok=True)

for task_id in task_ids:
    torch.cuda.empty_cache()
    d1 = pd.read_csv(f"datasets/task_id_{task_id}/feeds.csv")
    d2 = pd.read_csv(f"datasets/task_id_{task_id}/ads.csv")

    # Tasks 10006 and 22100 are run without the list/timestamp columns;
    # every other task gets its timestamps split into component columns.
    if task_id == 10006 or task_id == 22100:
        d2 = d2.drop('ad_close_list_v001', axis = 1)
        d2 = d2.drop('ad_close_list_v002', axis = 1)
        d2 = d2.drop('ad_close_list_v003', axis = 1)
        d2 = d2.drop('pt_d', axis = 1)
        d1 = d1.drop('e_et', axis = 1)
    else:
        d1 = convert_date_column(d1, 'e_et')
        d2 = convert_date_column(d2, 'pt_d')

    if 'log_id' in d2.columns:
        d2 = d2.drop('log_id', axis = 1)

    # data_clean_room comes from the local Data_Clean_Room module (not shown
    # here); derec() and sampling(200) populate the derec_*_small attributes
    # read below.
    dcr = data_clean_room(d1, d2, 'user_id')
    dcr.derec()
    dcr.sampling(200)

    c1 = dcr.derec_child_1_small
    c2 = dcr.derec_child_2_small

    # Extract the parent table produced by the DEREC pipeline
    parent = dcr.derec_parent_small

    # Combine the remaining child tables with one of the correlation dimension reduction methods
    child = cdr(c1, c2, 'user_id', 0.1)
    #child = direct_combine(c1, c2, 'user_id')
    #child = cdr_hierarchical_cluster(c1, c2, 'user_id')

    join_on = 'user_id'

    # Synthesize data
    parent_samples, child_samples = synthesize(parent, child, parent_n = 20, child_n = 200, join_on = join_on)

    # Release cached CUDA memory
    torch.cuda.empty_cache()

    # Reverse the parent-child table structure back to the input data structure:
    # keep, per source table, the synthetic columns that exist in that table.
    new_syn_child_1 = child_samples[[col for col in child_samples.columns if col in dcr.og1.columns]]
    new_syn_child_2 = child_samples[[col for col in child_samples.columns if col in dcr.og2.columns]]
    new_syn_parent_1 = parent_samples[[col for col in parent_samples.columns if col in dcr.og1.columns]]
    new_syn_parent_2 = parent_samples[[col for col in parent_samples.columns if col in dcr.og2.columns]]

    new_syn_1 = pd.merge(new_syn_parent_1, new_syn_child_1, left_on = 'user_id', right_on = 'user_id')
    new_syn_2 = pd.merge(new_syn_parent_2, new_syn_child_2, left_on = 'user_id', right_on = 'user_id')

    # Rebuild the real (sampled) tables in the same column layout for comparison
    og_parent = dcr.derec_parent_small
    og_1_p = og_parent[[col for col in parent_samples.columns if col in dcr.og1.columns]]
    og_2_p = og_parent[[col for col in parent_samples.columns if col in dcr.og2.columns]]

    og_1 = pd.merge(og_1_p, c1, left_on = 'user_id', right_on = 'user_id')
    og_2 = pd.merge(og_2_p, c2, left_on = 'user_id', right_on = 'user_id')

    og_1 = og_1[list(new_syn_1.columns)]
    og_2 = og_2[list(new_syn_2.columns)]

    # Baseline ("old") synthesis provided by the data clean room object itself
    dcr.synthesize(20, 200)

    old_syn_1 = dcr.syn1
    old_syn_2 = dcr.syn2

    old_syn_1 = old_syn_1[list(new_syn_1.columns)]
    old_syn_2 = old_syn_2[list(new_syn_2.columns)]

    # Merge the timestamp components back into single datetime columns
    if task_id != 10006 and task_id != 22100:
        new_syn_1 = combine_date(new_syn_1, 'e_et')
        new_syn_2 = combine_date(new_syn_2, 'pt_d')

        old_syn_1 = combine_date(old_syn_1, 'e_et')
        old_syn_2 = combine_date(old_syn_2, 'pt_d')

        og_1 = combine_date(og_1, 'e_et')
        og_2 = combine_date(og_2, 'pt_d')

    og = {}
    og['d1'] = og_1
    og['d2'] = og_2

    old = {}
    old['d1'] = old_syn_1
    old['d2'] = old_syn_2

    new = {}
    new['d1'] = new_syn_1
    new['d2'] = new_syn_2

    # Evaluate both synthetic versions against the real data with SIMPRO
    old_evaluation = simpro(og, old)
    old_evaluation.cal_marginal_indicators()
    old_evaluation.cal_conditional_indicators()

    new_evaluation = simpro(og, new)
    new_evaluation.cal_marginal_indicators()
    new_evaluation.cal_conditional_indicators()

    old_p = old_evaluation.conditional_indicators['p-values']
    new_p = new_evaluation.conditional_indicators['p-values']
    old_w = old_evaluation.conditional_indicators['w-distance']
    new_w = new_evaluation.conditional_indicators['w-distance']

    # Missing p-values are written as 1
    p_val = pd.DataFrame([old_p, new_p], index = ['Old', 'New']).T
    p_val = p_val.fillna(1)
    p_val.to_csv(f"results/p_val_{task_id}_cdr.csv")

    w_dis = pd.DataFrame([old_w, new_w], index = ['Old', 'New']).T
    w_dis.to_csv(f"results/w_dis_{task_id}_cdr.csv")
    

# Automatic Transforming Class

In [25]:
class qualitative_transformer:
    """Swap the values of selected columns for random first names and back.

    For every column in ``col_names`` each distinct value found in the
    reference frame is paired with a first name that is unique across all
    selected columns. ``transform`` applies the value -> name maps and
    ``inv_transform`` applies the name -> value maps. Values without an
    entry in the relevant map become NaN (behaviour of ``Series.map``).
    """

    def __init__(self, df, col_names):
        # Private copy of the reference frame, so later changes made by the
        # caller cannot alter which values get mapped.
        self.df = df.copy()
        self.col_names = col_names
        # Both map dictionaries are built lazily by create_map().
        self.all_inv_maps = None
        self.all_maps = None

    def generate_list_of_names(self, n):
        """Return ``n`` distinct first names drawn with ``names.get_first_name``.

        NOTE: the loop only ends once ``n`` distinct names were drawn, so
        ``n`` must not exceed the number of names the generator can produce.
        """
        unique_names = set()
        while len(unique_names) < n:
            # A set discards repeats directly; no list/set rebuild per draw.
            unique_names.add(names.get_first_name())
        return list(unique_names)

    def remove_elements(self, original_list, elements_to_remove):
        """Return ``original_list`` without the items found in ``elements_to_remove``."""
        return [x for x in original_list if x not in elements_to_remove]

    def create_map(self):
        """Build ``all_maps`` (value -> name) and ``all_inv_maps`` (name -> value)."""
        # Distinct values per column, computed once and reused below.
        distinct = {col_name: np.unique(self.df[col_name]) for col_name in self.col_names}
        N = 0
        for col_name in self.col_names:
            N += len(distinct[col_name])
        name_list = self.generate_list_of_names(N)

        self.all_maps = {}
        for col_name in self.col_names:
            values = distinct[col_name]
            col_pool = random.sample(name_list, len(values))
            self.all_maps[col_name] = dict(zip(values, col_pool))
            # Used names are removed so no name is shared between columns
            # (a set makes the membership test constant-time).
            name_list = self.remove_elements(name_list, set(col_pool))

        self.all_inv_maps = {}
        for col_name in self.col_names:
            self.all_inv_maps[col_name] = {v: k for k, v in self.all_maps[col_name].items()}

    def transform(self, new_df):
        """Return a copy of ``new_df`` with the selected columns mapped to names.

        The maps are created on first use.
        """
        if self.all_maps is None:
            self.create_map()
        output_df = new_df.copy()
        for col_name in self.col_names:
            output_df[col_name] = output_df[col_name].map(self.all_maps[col_name])
        return output_df

    def inv_transform(self, new_df):
        """Return a copy of ``new_df`` with names mapped back to the original values.

        Raises
        ------
        RuntimeError
            If no map exists yet, i.e. neither ``transform`` nor
            ``create_map`` has been called on this instance.
        """
        if self.all_inv_maps is None:
            raise RuntimeError(
                "inv_transform() needs an existing mapping; call transform() or create_map() first"
            )
        output_df = new_df.copy()
        for col_name in self.col_names:
            output_df[col_name] = output_df[col_name].map(self.all_inv_maps[col_name])
        return output_df
            
        

# Automatic Categorical Mapping

In [26]:
task_ids = [10005, 10006, 14584, 22100, 31941, 31996, 34382, 34975]

# The CSV reports at the end of every iteration are written into results/.
# Create the folder up front so a missing directory cannot abort the run
# after the model training for a task has already completed.
os.makedirs("results", exist_ok=True)

for task_id in task_ids:
    torch.cuda.empty_cache()
    d1 = pd.read_csv(f"datasets/task_id_{task_id}/feeds.csv")
    d2 = pd.read_csv(f"datasets/task_id_{task_id}/ads.csv")

    # Tasks 10006 and 22100 are run without the list/timestamp columns;
    # every other task gets its timestamps split into component columns.
    if task_id == 10006 or task_id == 22100:
        d2 = d2.drop('ad_close_list_v001', axis = 1)
        d2 = d2.drop('ad_close_list_v002', axis = 1)
        d2 = d2.drop('ad_close_list_v003', axis = 1)
        d2 = d2.drop('pt_d', axis = 1)
        d1 = d1.drop('e_et', axis = 1)
    else:
        d1 = convert_date_column(d1, 'e_et')
        d2 = convert_date_column(d2, 'pt_d')

    # Use map function to replace numeric values with categorical terms
    transformer = qualitative_transformer(d2, ['age', 'gender', 'residence'])
    d2 = transformer.transform(d2)

    if 'log_id' in d2.columns:
        d2 = d2.drop('log_id', axis = 1)

    # data_clean_room comes from the local Data_Clean_Room module (not shown
    # here); derec() and sampling(200) populate the derec_*_small attributes
    # read below.
    dcr = data_clean_room(d1, d2, 'user_id')
    dcr.derec()
    dcr.sampling(200)

    c1 = dcr.derec_child_1_small
    c2 = dcr.derec_child_2_small

    # Extract the parent table produced by the DEREC pipeline
    parent = dcr.derec_parent_small
    # Combine the remaining child tables with one of the correlation dimension reduction methods
    child = cdr(c1, c2, 'user_id', 0.1)
    #child = direct_combine(c1, c2, 'user_id')
    #child = cdr_hierarchical_cluster(c1, c2, 'user_id')

    join_on = 'user_id'

    # Synthesize data
    parent_samples, child_samples = synthesize(parent, child, parent_n = 20, child_n = 200, join_on = join_on)

    # Release cached CUDA memory
    torch.cuda.empty_cache()

    # Reverse the parent-child table structure back to the input data structure:
    # keep, per source table, the synthetic columns that exist in that table.
    new_syn_child_1 = child_samples[[col for col in child_samples.columns if col in dcr.og1.columns]]
    new_syn_child_2 = child_samples[[col for col in child_samples.columns if col in dcr.og2.columns]]
    new_syn_parent_1 = parent_samples[[col for col in parent_samples.columns if col in dcr.og1.columns]]
    new_syn_parent_2 = parent_samples[[col for col in parent_samples.columns if col in dcr.og2.columns]]

    new_syn_1 = pd.merge(new_syn_parent_1, new_syn_child_1, left_on = 'user_id', right_on = 'user_id')
    new_syn_2 = pd.merge(new_syn_parent_2, new_syn_child_2, left_on = 'user_id', right_on = 'user_id')

    # Map the generated names back to the original codes
    d2 = transformer.inv_transform(d2)
    new_syn_2 = transformer.inv_transform(new_syn_2)

    # Rebuild the real (sampled) tables in the same column layout for comparison
    og_parent = dcr.derec_parent_small
    og_1_p = og_parent[[col for col in parent_samples.columns if col in dcr.og1.columns]]
    og_2_p = og_parent[[col for col in parent_samples.columns if col in dcr.og2.columns]]

    og_1 = pd.merge(og_1_p, c1, left_on = 'user_id', right_on = 'user_id')
    og_2 = pd.merge(og_2_p, c2, left_on = 'user_id', right_on = 'user_id')

    og_1 = og_1[list(new_syn_1.columns)]
    og_2 = og_2[list(new_syn_2.columns)]

    og_2 = transformer.inv_transform(og_2)

    # Baseline ("old") synthesis provided by the data clean room object itself
    dcr.synthesize(20, 200)

    old_syn_1 = dcr.syn1
    old_syn_2 = dcr.syn2

    old_syn_1 = old_syn_1[list(new_syn_1.columns)]
    old_syn_2 = old_syn_2[list(new_syn_2.columns)]

    old_syn_2 = transformer.inv_transform(old_syn_2)

    # Merge the timestamp components back into single datetime columns
    if task_id != 10006 and task_id != 22100:
        new_syn_1 = combine_date(new_syn_1, 'e_et')
        new_syn_2 = combine_date(new_syn_2, 'pt_d')

        old_syn_1 = combine_date(old_syn_1, 'e_et')
        old_syn_2 = combine_date(old_syn_2, 'pt_d')

        og_1 = combine_date(og_1, 'e_et')
        og_2 = combine_date(og_2, 'pt_d')

    og = {}
    og['d1'] = og_1
    og['d2'] = og_2

    old = {}
    old['d1'] = old_syn_1
    old['d2'] = old_syn_2

    new = {}
    new['d1'] = new_syn_1
    new['d2'] = new_syn_2

    # Evaluate both synthetic versions against the real data with SIMPRO
    old_evaluation = simpro(og, old)
    old_evaluation.cal_marginal_indicators()
    old_evaluation.cal_conditional_indicators()

    new_evaluation = simpro(og, new)
    new_evaluation.cal_marginal_indicators()
    new_evaluation.cal_conditional_indicators()

    old_p = old_evaluation.conditional_indicators['p-values']
    new_p = new_evaluation.conditional_indicators['p-values']
    old_w = old_evaluation.conditional_indicators['w-distance']
    new_w = new_evaluation.conditional_indicators['w-distance']

    # Missing p-values are written as 1
    p_val = pd.DataFrame([old_p, new_p], index = ['Old', 'New']).T
    p_val = p_val.fillna(1)
    p_val.to_csv(f"results/p_val_{task_id}_auto_mapping_reduction.csv")

    w_dis = pd.DataFrame([old_w, new_w], index = ['Old', 'New']).T
    w_dis.to_csv(f"results/w_dis_{task_id}_auto_mapping_reduction.csv")
    

Computing the sensitivity threshold...
Using parallel computation!!!




Bootstrap round:   0%|          | 0/5 [00:00<?, ?it/s]

Sensitivity threshold summary:
count    5.000000
mean     0.014394
std      0.012677
min     -0.006818
25%      0.014394
50%      0.017424
75%      0.020455
max      0.026515
dtype: float64
Sensitivity threshold: 0.0253030303030303 qt_max: 0.05


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
[34m[1mwandb[0m: Currently logged in as: [33mthomask1018[0m ([33mthomask1018-adaptive-investment-solutions[0m). Use [1m`wandb login --relogin`[0m to force relogin


  attn_output = torch.nn.functional.scaled_dot_product_attention(


Step,Training Loss,Validation Loss


  0%|          | 0/99 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 128 samples generated. Sampling efficiency is: 100.0000%
Critic round: 5,                     sensitivity_threshold: 0.0253030303030303,                         val_sensitivity: -0.0241919191919192,                             val_sensitivities: [-0.025, -0.025, -0.025, -0.01893939393939394, -0.025, -0.025, -0.025, -0.025, -0.025, -0.025, -0.01893939393939394, -0.025, -0.025, -0.025, -0.025]




Copying artefacts from: best-disc-model
Copying artefacts from: mean-best-disc-model
Copying artefacts from: not-best-disc-model
Copying artefacts from: last-epoch-model


  torch.load(parent_realtabformer_path / ModelFileName.rtf_model_pt)


Map:   0%|          | 0/35468 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Filter:   0%|          | 0/200 [00:00<?, ? examples/s]



Map:   0%|          | 0/5 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)


Step,Training Loss,Validation Loss


  0%|          | 0/20 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 128 samples generated. Sampling efficiency is: 100.0000%




Map:   0%|          | 0/20 [00:00<?, ? examples/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Computing the sensitivity threshold...
Using parallel computation!!!




Bootstrap round:   0%|          | 0/5 [00:00<?, ?it/s]

Sensitivity threshold summary:
count    5.000000
mean    -0.005000
std      0.018383
min     -0.025000
25%     -0.015909
50%     -0.006818
75%     -0.000758
max      0.023485
dtype: float64
Sensitivity threshold: 0.01863636363636363 qt_max: 0.05


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss


  0%|          | 0/99 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 128 samples generated. Sampling efficiency is: 100.0000%
Critic round: 5,                     sensitivity_threshold: 0.01863636363636363,                         val_sensitivity: -0.0241919191919192,                             val_sensitivities: [-0.025, -0.025, -0.025, -0.01893939393939394, -0.025, -0.025, -0.025, -0.025, -0.025, -0.025, -0.01893939393939394, -0.025, -0.025, -0.025, -0.025]




Copying artefacts from: best-disc-model
Copying artefacts from: mean-best-disc-model
Copying artefacts from: not-best-disc-model
Copying artefacts from: last-epoch-model


  torch.load(parent_realtabformer_path / ModelFileName.rtf_model_pt)


Map:   0%|          | 0/17940 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Filter:   0%|          | 0/200 [00:00<?, ? examples/s]



Map:   0%|          | 0/13 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)


Step,Training Loss,Validation Loss


  0%|          | 0/20 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 128 samples generated. Sampling efficiency is: 100.0000%




Map:   0%|          | 0/20 [00:00<?, ? examples/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Computing the sensitivity threshold...
Using parallel computation!!!




Bootstrap round:   0%|          | 0/5 [00:00<?, ?it/s]

Sensitivity threshold summary:
count    5.000000
mean     0.004091
std      0.019709
min     -0.012879
25%     -0.012879
50%     -0.003788
75%      0.020455
max      0.029545
dtype: float64
Sensitivity threshold: 0.027727272727272725 qt_max: 0.05


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss


  0%|          | 0/99 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 128 samples generated. Sampling efficiency is: 100.0000%
Critic round: 5,                     sensitivity_threshold: 0.027727272727272725,                         val_sensitivity: -0.0241919191919192,                             val_sensitivities: [-0.025, -0.025, -0.025, -0.01893939393939394, -0.025, -0.025, -0.025, -0.025, -0.025, -0.025, -0.01893939393939394, -0.025, -0.025, -0.025, -0.025]




Copying artefacts from: best-disc-model
Copying artefacts from: mean-best-disc-model
Copying artefacts from: not-best-disc-model
Copying artefacts from: last-epoch-model


  torch.load(parent_realtabformer_path / ModelFileName.rtf_model_pt)


Map:   0%|          | 0/375 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Filter:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)


Step,Training Loss,Validation Loss


  0%|          | 0/20 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 128 samples generated. Sampling efficiency is: 100.0000%




Map:   0%|          | 0/20 [00:00<?, ? examples/s]

  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████████████████████████████████████████████████████████████████████████| 1936/1936 [04:50<00:00,  6.67it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1936/1936 [04:38<00:00,  6.96it/s]


Computing the sensitivity threshold...
Using parallel computation!!!


Bootstrap round:   0%|          | 0/5 [00:00<?, ?it/s]

Sensitivity threshold summary:
count    5.000000
mean    -0.003182
std      0.015804
min     -0.025000
25%     -0.012879
50%     -0.000758
75%      0.011364
max      0.011364
dtype: float64
Sensitivity threshold: 0.011363636363636364 qt_max: 0.05


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss


  0%|          | 0/99 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 128 samples generated. Sampling efficiency is: 100.0000%
Critic round: 5,                     sensitivity_threshold: 0.011363636363636364,                         val_sensitivity: -0.021969696969696972,                             val_sensitivities: [-0.01590909090909091, -0.01893939393939394, -0.025, -0.01893939393939394, -0.025, -0.025, -0.0068181818181818205, -0.025, -0.025, -0.025, -0.025, -0.025, -0.01893939393939394, -0.025, -0.025]
Copying artefacts from: best-disc-model
Copying artefacts from: mean-best-disc-model
Copying artefacts from: not-best-disc-model
Copying artefacts from: last-epoch-model


Map:   0%|          | 0/26480 [00:00<?, ? examples/s]

KeyboardInterrupt: 

# Manual Categorical Mapping

In [27]:
task_ids = [10005, 10006, 14584, 22100, 31941, 31996, 34382, 34975]

for task_id in task_ids:
    torch.cuda.empty_cache()
    d1 = pd.read_csv(f"datasets/task_id_{task_id}/feeds.csv")
    d2 = pd.read_csv(f"datasets/task_id_{task_id}/ads.csv")
    
    
    if task_id == 10006 or task_id == 22100:
        d2 = d2.drop('ad_close_list_v001', axis = 1)
        d2 = d2.drop('ad_close_list_v002', axis = 1)
        d2 = d2.drop('ad_close_list_v003', axis = 1)
        d2 = d2.drop('pt_d', axis = 1)
        d1 = d1.drop('e_et', axis = 1)
    else:
        d1 = convert_date_column(d1, 'e_et')
        d2 = convert_date_column(d2, 'pt_d')

    
    # Create the mapping system manually 
    gender_map = {2: 'male', 3: 'female', 4: 'others'}
    age_map = {}

    for i in np.unique(d2['age']):
        age_map[i] = f"Age from {i}0 to {i}9"


    residence_list = ['US', 'China', 'Canada', 'Mexico', 'Japan', 'Korea', 'UK', 'France', 'Italy', 'Spain', 'Russua', 'India', 'Indonesia', 'Australia', 'Brazil', 'Argentina', 'Mexico', 
                      'Portugal', 'Sweden', 'Norway', 'Denmark', 'Finland', 'New Zealand', 'Cambodia', 'Thailand', 'Vietnam', 'Malaysia', 'Philippines', 'Jamaica', 'Egypt', 'Saudi Arabia', 'Iran', 'Israel', 'Kenya', 'Nigeria']

    residence_map = {}
    residence_pool = random.sample(residence_list, len(np.unique(d2['residence'])))
    j = 0
    for i in np.unique(d2['residence']):
        residence_map[i] = residence_pool[j]
        j += 1
    
    # Use map function to replace numeric values with categorical terms
    d2['gender'] = d2['gender'].map(gender_map)
    d2['age'] = d2['age'].map(age_map)
    d2['residence'] = d2['residence'].map(residence_map)
    
    # Create the inverse mapping system to revert synthetic data
    inv_gender_map = {v: k for k, v in gender_map.items()}
    inv_residence_map = {v: k for k, v in residence_map.items()}
    inv_age_map = {v: k for k, v in age_map.items()}
    
    
    
    if 'log_id' in d2.columns:
        d2 = d2.drop('log_id', axis = 1)
    
    dcr = data_clean_room(d1, d2, 'user_id')
    dcr.derec()
    dcr.sampling(200)
    
    c1 = dcr.derec_child_1_small
    c2 = dcr.derec_child_2_small
    
    # Extract the Parent Table with the DEREC pipeline 
    parent = dcr.derec_parent_small
    
    # Combine the remaining child tables with one of the correlation dimensional reduction method
    child = cdr(c1, c2, 'user_id', 0.1)
    #child = direct_combine(c1, c2, 'user_id')
    #child = cdr_hierarchical_cluster(c1, c2, 'user_id')
    
    join_on = 'user_id'              
        
    # Synthesize data
    parent_samples, child_samples = synthesize(parent, child, parent_n = 20, child_n = 200, join_on = join_on)
    
    # Refresh cuda memories
    torch.cuda.empty_cache()
    
    # Reverse the parent-child table structure back to input data structure
    new_syn_child_1 = child_samples[[col for col in child_samples.columns if col in dcr.og1.columns]]
    new_syn_child_2 = child_samples[[col for col in child_samples.columns if col in dcr.og2.columns]]
    new_syn_parent_1 = parent_samples[[col for col in parent_samples.columns if col in dcr.og1.columns]]
    new_syn_parent_2 = parent_samples[[col for col in parent_samples.columns if col in dcr.og2.columns]]
    
    new_syn_1 = pd.merge(new_syn_parent_1, new_syn_child_1, left_on = 'user_id', right_on = 'user_id')
    new_syn_2 = pd.merge(new_syn_parent_2, new_syn_child_2, left_on = 'user_id', right_on = 'user_id')
    

    # Reverse map the categories to return to input data format
    new_syn_2['gender'] = new_syn_2['gender'].map(inv_gender_map)
    new_syn_2['age'] = new_syn_2['age'].map(inv_age_map)
    new_syn_2['residence'] = new_syn_2['residence'].map(inv_residence_map)
    
    d2['gender'] = d2['gender'].map(inv_gender_map)
    d2['age'] = d2['age'].map(inv_age_map)
    d2['residence'] = d2['residence'].map(inv_residence_map)
    
    og_parent = dcr.derec_parent_small
    og_1_p = og_parent[[col for col in parent_samples.columns if col in dcr.og1.columns]]
    og_2_p = og_parent[[col for col in parent_samples.columns if col in dcr.og2.columns]]
    
    og_1 = pd.merge(og_1_p, c1, left_on = 'user_id', right_on = 'user_id')
    og_2 = pd.merge(og_2_p, c2, left_on = 'user_id', right_on = 'user_id')
    
    og_1 = og_1[list(new_syn_1.columns)]
    og_2 = og_2[list(new_syn_2.columns)]
    
    og_2['gender'] = og_2['gender'].map(inv_gender_map)
    og_2['age'] = og_2['age'].map(inv_age_map)
    og_2['residence'] = og_2['residence'].map(inv_residence_map)
    
    
    dcr.synthesize(20, 200)
    
    old_syn_1 = dcr.syn1
    old_syn_2 = dcr.syn2
    
    old_syn_1 = old_syn_1[list(new_syn_1.columns)]
    old_syn_2 = old_syn_2[list(new_syn_2.columns)]
        
    old_syn_2['gender'] = old_syn_2['gender'].map(inv_gender_map)
    old_syn_2['age'] = old_syn_2['age'].map(inv_age_map)
    old_syn_2['residence'] = old_syn_2['residence'].map(inv_residence_map)    
    
    
    if task_id != 10006 and task_id != 22100:
        new_syn_1 = combine_date(new_syn_1, 'e_et')
        new_syn_2 = combine_date(new_syn_2, 'pt_d')

        old_syn_1 = combine_date(old_syn_1, 'e_et')
        old_syn_2 = combine_date(old_syn_2, 'pt_d')

        og_1 = combine_date(og_1, 'e_et')
        og_2 = combine_date(og_2, 'pt_d')
    
    
    og = {}
    og['d1'] = og_1
    og['d2'] = og_2
    
    old = {}
    old['d1'] = old_syn_1
    old['d2'] = old_syn_2
    
    new = {}
    new['d1'] = new_syn_1
    new['d2'] = new_syn_2
    
    old_evaluation = simpro(og, old)
    old_evaluation.cal_marginal_indicators()
    old_evaluation.cal_conditional_indicators()
    
    new_evaluation = simpro(og, new)
    new_evaluation.cal_marginal_indicators()
    new_evaluation.cal_conditional_indicators()
    
    old_p = old_evaluation.conditional_indicators['p-values']
    new_p = new_evaluation.conditional_indicators['p-values']
    old_w = old_evaluation.conditional_indicators['w-distance']
    new_w = new_evaluation.conditional_indicators['w-distance']
    
    p_val = pd.DataFrame([old_p, new_p], index = ['Old', 'New']).T
    p_val = p_val.fillna(1)
    p_val.to_csv(f"results/p_val_{task_id}_manual_mapping_reduction.csv")
    
    w_dis = pd.DataFrame([old_w, new_w], index = ['Old', 'New']).T
    w_dis.to_csv(f"results/w_dis_{task_id}_manual_mapping_reduction.csv")
    

Computing the sensitivity threshold...
Using parallel computation!!!


Bootstrap round:   0%|          | 0/5 [00:00<?, ?it/s]

Sensitivity threshold summary:
count    5.000000
mean     0.031970
std      0.049995
min     -0.006818
25%     -0.003788
50%      0.008333
75%      0.050758
max      0.111364
dtype: float64
Sensitivity threshold: 0.09924242424242423 qt_max: 0.05


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss


  0%|          | 0/99 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 128 samples generated. Sampling efficiency is: 100.0000%
Critic round: 5,                     sensitivity_threshold: 0.09924242424242423,                         val_sensitivity: -0.0241919191919192,                             val_sensitivities: [-0.025, -0.025, -0.025, -0.01893939393939394, -0.01893939393939394, -0.025, -0.025, -0.025, -0.025, -0.025, -0.025, -0.025, -0.025, -0.025, -0.025]
Copying artefacts from: best-disc-model
Copying artefacts from: mean-best-disc-model
Copying artefacts from: not-best-disc-model
Copying artefacts from: last-epoch-model


Map:   0%|          | 0/35468 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Filter:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss


  0%|          | 0/20 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 128 samples generated. Sampling efficiency is: 100.0000%


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Computing the sensitivity threshold...
Using parallel computation!!!


Bootstrap round:   0%|          | 0/5 [00:00<?, ?it/s]

Sensitivity threshold summary:
count    5.000000
mean    -0.002576
std      0.009959
min     -0.018939
25%     -0.003788
50%     -0.000758
75%      0.005303
max      0.005303
dtype: float64
Sensitivity threshold: 0.0053030303030303025 qt_max: 0.05


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss


  0%|          | 0/99 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 128 samples generated. Sampling efficiency is: 100.0000%
Critic round: 5,                     sensitivity_threshold: 0.0053030303030303025,                         val_sensitivity: -0.0241919191919192,                             val_sensitivities: [-0.025, -0.025, -0.025, -0.01893939393939394, -0.01893939393939394, -0.025, -0.025, -0.025, -0.025, -0.025, -0.025, -0.025, -0.025, -0.025, -0.025]
Copying artefacts from: best-disc-model
Copying artefacts from: mean-best-disc-model
Copying artefacts from: not-best-disc-model
Copying artefacts from: last-epoch-model


Map:   0%|          | 0/17940 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Filter:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss


  0%|          | 0/20 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 128 samples generated. Sampling efficiency is: 100.0000%


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Computing the sensitivity threshold...
Using parallel computation!!!


Bootstrap round:   0%|          | 0/5 [00:00<?, ?it/s]

Sensitivity threshold summary:
count    5.000000
mean     0.003485
std      0.026556
min     -0.025000
25%     -0.012879
50%     -0.006818
75%      0.023485
max      0.038636
dtype: float64
Sensitivity threshold: 0.0356060606060606 qt_max: 0.05


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss


  0%|          | 0/99 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 128 samples generated. Sampling efficiency is: 100.0000%
Critic round: 5,                     sensitivity_threshold: 0.0356060606060606,                         val_sensitivity: -0.0241919191919192,                             val_sensitivities: [-0.025, -0.025, -0.025, -0.01893939393939394, -0.01893939393939394, -0.025, -0.025, -0.025, -0.025, -0.025, -0.025, -0.025, -0.025, -0.025, -0.025]
Copying artefacts from: best-disc-model
Copying artefacts from: mean-best-disc-model
Copying artefacts from: not-best-disc-model
Copying artefacts from: last-epoch-model


Map:   0%|          | 0/375 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Filter:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss


  0%|          | 0/20 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 128 samples generated. Sampling efficiency is: 100.0000%


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████████████████████████████████████████████████████████████████████████| 1936/1936 [04:43<00:00,  6.82it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1936/1936 [04:47<00:00,  6.73it/s]


Computing the sensitivity threshold...
Using parallel computation!!!


Bootstrap round:   0%|          | 0/5 [00:00<?, ?it/s]

Sensitivity threshold summary:
count    5.000000
mean     0.023485
std      0.020215
min     -0.000758
25%      0.014394
50%      0.020455
75%      0.029545
max      0.053788
dtype: float64
Sensitivity threshold: 0.048939393939393935 qt_max: 0.05


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss


  0%|          | 0/99 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 128 samples generated. Sampling efficiency is: 100.0000%
Critic round: 5,                     sensitivity_threshold: 0.048939393939393935,                         val_sensitivity: -0.02217171717171717,                             val_sensitivities: [-0.025, -0.025, -0.025, -0.025, -0.025, -0.025, -0.0068181818181818205, -0.025, -0.025, -0.025, -0.01893939393939394, -0.01893939393939394, -0.025, -0.01893939393939394, -0.01893939393939394]
Copying artefacts from: best-disc-model
Copying artefacts from: mean-best-disc-model
Copying artefacts from: not-best-disc-model
Copying artefacts from: last-epoch-model


Map:   0%|          | 0/26480 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Filter:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss


  0%|          | 0/20 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 128 samples generated. Sampling efficiency is: 100.0000%


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Computing the sensitivity threshold...
Using parallel computation!!!


Bootstrap round:   0%|          | 0/5 [00:00<?, ?it/s]

Sensitivity threshold summary:
count    5.000000
mean    -0.001970
std      0.026985
min     -0.025000
25%     -0.012879
50%     -0.009848
75%     -0.006818
max      0.044697
dtype: float64
Sensitivity threshold: 0.034393939393939386 qt_max: 0.05


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss


  0%|          | 0/99 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 128 samples generated. Sampling efficiency is: 100.0000%
Critic round: 5,                     sensitivity_threshold: 0.034393939393939386,                         val_sensitivity: -0.02217171717171717,                             val_sensitivities: [-0.025, -0.025, -0.025, -0.025, -0.025, -0.025, -0.0068181818181818205, -0.025, -0.025, -0.025, -0.01893939393939394, -0.01893939393939394, -0.025, -0.01893939393939394, -0.01893939393939394]
Copying artefacts from: best-disc-model
Copying artefacts from: mean-best-disc-model
Copying artefacts from: not-best-disc-model
Copying artefacts from: last-epoch-model


Map:   0%|          | 0/22385 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Filter:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss


  0%|          | 0/20 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 128 samples generated. Sampling efficiency is: 100.0000%


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

  0%|          | 0/20 [00:00<?, ?it/s]

KeyboardInterrupt: 

# Convert '^' to ' and '

In [28]:
def change_symbol_to_word(value, symbol = '^'):
    """Replace every occurrence of ``symbol`` in ``value`` with " and ".

    Parameters
    ----------
    value : any
        Cell value to inspect; it is converted with ``str()`` for the check.
    symbol : str, default '^'
        Substring to look for and replace.

    Returns
    -------
    str or the original object
        The string form of ``value`` with ``symbol`` replaced by " and " when
        the symbol is present; otherwise ``value`` itself, unchanged and with
        its original type.
    """
    text = str(value)
    # Guard clause: values without the symbol are passed through untouched.
    if symbol not in text:
        return value
    return text.replace(symbol, " and ")
    
def inv_change_symbol_to_word(value, symbol = " and "):
    """Replace every occurrence of ``symbol`` in ``value`` with "^".

    Parameters
    ----------
    value : any
        Cell value to inspect; it is converted with ``str()`` for the check.
    symbol : str, default " and "
        Substring to look for and replace with "^".

    Returns
    -------
    str or the original object
        The string form of ``value`` with ``symbol`` replaced by "^" when the
        symbol is present; otherwise ``value`` itself, unchanged and with its
        original type.
    """
    as_text = str(value)
    return as_text.replace(symbol, "^") if symbol in as_text else value

In [29]:
task_ids = [10005, 10006, 14584, 22100, 31941, 31996, 34382, 34975]

# Tasks for which the timestamp columns (and the ad_close_list_* columns) are
# dropped instead of being split into date-part columns.
tasks_without_dates = {10006, 22100}

# Columns in which '^' is replaced by ' and ' before synthesis and restored
# afterwards.
col_to_change = ['u_newsCatInterests', 'u_newsCatDislike', 'u_newsCatInterestsST', 'u_click_ca2_news']

join_on = 'user_id'

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before the long-running loop starts.
os.makedirs("results", exist_ok = True)

for task_id in task_ids:
    # Release cached GPU memory left over from the previous task.
    torch.cuda.empty_cache()
    d1 = pd.read_csv(f"datasets/task_id_{task_id}/feeds.csv")
    d2 = pd.read_csv(f"datasets/task_id_{task_id}/ads.csv")

    has_dates = task_id not in tasks_without_dates
    if has_dates:
        d1 = convert_date_column(d1, 'e_et')
        d2 = convert_date_column(d2, 'pt_d')
    else:
        d2 = d2.drop(['ad_close_list_v001', 'ad_close_list_v002', 'ad_close_list_v003', 'pt_d'], axis = 1)
        d1 = d1.drop('e_et', axis = 1)

    # Forward replacement: '^' -> ' and ' in the selected columns.
    for frame in (d1, d2):
        for col in col_to_change:
            if col in frame.columns:
                frame[col] = frame[col].apply(change_symbol_to_word)

    if 'log_id' in d2.columns:
        d2 = d2.drop('log_id', axis = 1)

    dcr = data_clean_room(d1, d2, join_on)
    dcr.derec()
    dcr.sampling(200)

    c1 = dcr.derec_child_1_small
    c2 = dcr.derec_child_2_small

    # Extract the Parent Table with the DEREC pipeline
    parent = dcr.derec_parent_small

    # Combine the remaining child tables with one of the correlation
    # dimensional reduction methods (alternatives kept for reference).
    # NOTE(review): 0.1 is presumably a correlation threshold -- confirm in cdr.
    child = cdr(c1, c2, join_on, 0.1)
    #child = direct_combine(c1, c2, 'user_id')
    #child = cdr_hierarchical_cluster(c1, c2, 'user_id')

    # Synthesize data with the new (reduced) pipeline
    parent_samples, child_samples = synthesize(parent, child, parent_n = 20, child_n = 200, join_on = join_on)

    torch.cuda.empty_cache()

    # Reverse the parent-child table structure back to the two input tables
    # by keeping, for each side, only the columns present in its original.
    og1_cols = dcr.og1.columns
    og2_cols = dcr.og2.columns
    parent_cols_1 = [col for col in parent_samples.columns if col in og1_cols]
    parent_cols_2 = [col for col in parent_samples.columns if col in og2_cols]

    new_syn_child_1 = child_samples[[col for col in child_samples.columns if col in og1_cols]]
    new_syn_child_2 = child_samples[[col for col in child_samples.columns if col in og2_cols]]
    new_syn_parent_1 = parent_samples[parent_cols_1]
    new_syn_parent_2 = parent_samples[parent_cols_2]

    new_syn_1 = pd.merge(new_syn_parent_1, new_syn_child_1, on = join_on)
    new_syn_2 = pd.merge(new_syn_parent_2, new_syn_child_2, on = join_on)

    # Rebuild the sampled original tables with the same column layout as the
    # synthetic ones so they can be compared column by column.
    og_parent = dcr.derec_parent_small
    og_1 = pd.merge(og_parent[parent_cols_1], c1, on = join_on)
    og_2 = pd.merge(og_parent[parent_cols_2], c2, on = join_on)

    # .copy() so the column assignments below act on independent frames
    # rather than on slices (avoids SettingWithCopyWarning).
    og_1 = og_1[list(new_syn_1.columns)].copy()
    og_2 = og_2[list(new_syn_2.columns)].copy()

    # Inverse replacement: ' and ' -> '^'.  d1/d2 are restored here, before
    # dcr.synthesize, exactly as in the original cell ordering.
    for frame in (new_syn_1, new_syn_2, og_1, og_2, d1, d2):
        for col in col_to_change:
            if col in frame.columns:
                frame[col] = frame[col].apply(inv_change_symbol_to_word)

    # Baseline ("old") synthesis produced by the data clean room itself.
    dcr.synthesize(20, 200)

    old_syn_1 = dcr.syn1[list(new_syn_1.columns)].copy()
    old_syn_2 = dcr.syn2[list(new_syn_2.columns)].copy()

    # The baseline must be converted back as well; otherwise it would be
    # evaluated against og_* using a different encoding than new_syn_*.
    for frame in (old_syn_1, old_syn_2):
        for col in col_to_change:
            if col in frame.columns:
                frame[col] = frame[col].apply(inv_change_symbol_to_word)

    if has_dates:
        # Merge the split date-part columns back into single timestamp columns.
        new_syn_1 = combine_date(new_syn_1, 'e_et')
        new_syn_2 = combine_date(new_syn_2, 'pt_d')

        old_syn_1 = combine_date(old_syn_1, 'e_et')
        old_syn_2 = combine_date(old_syn_2, 'pt_d')

        og_1 = combine_date(og_1, 'e_et')
        og_2 = combine_date(og_2, 'pt_d')

    og = {'d1': og_1, 'd2': og_2}
    old = {'d1': old_syn_1, 'd2': old_syn_2}
    new = {'d1': new_syn_1, 'd2': new_syn_2}

    # Evaluate both synthetic data sets against the sampled originals.
    old_evaluation = simpro(og, old)
    old_evaluation.cal_marginal_indicators()
    old_evaluation.cal_conditional_indicators()

    new_evaluation = simpro(og, new)
    new_evaluation.cal_marginal_indicators()
    new_evaluation.cal_conditional_indicators()

    old_p = old_evaluation.conditional_indicators['p-values']
    new_p = new_evaluation.conditional_indicators['p-values']
    old_w = old_evaluation.conditional_indicators['w-distance']
    new_w = new_evaluation.conditional_indicators['w-distance']

    # One row per indicator entry, one column per pipeline; missing p-values
    # are filled with 1 before saving.
    p_val = pd.DataFrame([old_p, new_p], index = ['Old', 'New']).T
    p_val = p_val.fillna(1)
    p_val.to_csv(f"results/p_val_{task_id}_replace_special_symbol_reduction.csv")

    w_dis = pd.DataFrame([old_w, new_w], index = ['Old', 'New']).T
    w_dis.to_csv(f"results/w_dis_{task_id}_replace_special_symbol_reduction.csv")
    

Computing the sensitivity threshold...
Using parallel computation!!!


Bootstrap round:   0%|          | 0/5 [00:00<?, ?it/s]

Sensitivity threshold summary:
count    5.000000
mean     0.011364
std      0.016735
min     -0.003788
25%     -0.000758
50%      0.011364
75%      0.011364
max      0.038636
dtype: float64
Sensitivity threshold: 0.03318181818181817 qt_max: 0.05


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss


  0%|          | 0/99 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 128 samples generated. Sampling efficiency is: 100.0000%
Critic round: 5,                     sensitivity_threshold: 0.03318181818181817,                         val_sensitivity: -0.018333333333333333,                             val_sensitivities: [-0.01893939393939394, -0.01893939393939394, -0.025, -0.025, -0.025, -0.025, -0.025, -0.025, -0.01893939393939394, -0.01287878787878788, -0.01287878787878788, -0.01287878787878788, -0.025, -0.003787878787878789, -0.0007575757575757599]
Copying artefacts from: best-disc-model
Copying artefacts from: mean-best-disc-model
Copying artefacts from: not-best-disc-model
Copying artefacts from: last-epoch-model


Map:   0%|          | 0/35468 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Filter:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss


  0%|          | 0/20 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 128 samples generated. Sampling efficiency is: 100.0000%


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Computing the sensitivity threshold...
Using parallel computation!!!


Bootstrap round:   0%|          | 0/5 [00:00<?, ?it/s]

Sensitivity threshold summary:
count    5.000000
mean     0.033788
std      0.046779
min     -0.025000
25%     -0.000758
50%      0.050758
75%      0.050758
max      0.093182
dtype: float64
Sensitivity threshold: 0.08469696969696969 qt_max: 0.05


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss


  0%|          | 0/99 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 128 samples generated. Sampling efficiency is: 100.0000%
Critic round: 5,                     sensitivity_threshold: 0.08469696969696969,                         val_sensitivity: -0.018333333333333333,                             val_sensitivities: [-0.01893939393939394, -0.01893939393939394, -0.025, -0.025, -0.025, -0.025, -0.025, -0.025, -0.01893939393939394, -0.01287878787878788, -0.01287878787878788, -0.01287878787878788, -0.025, -0.003787878787878789, -0.0007575757575757599]
Copying artefacts from: best-disc-model
Copying artefacts from: mean-best-disc-model
Copying artefacts from: not-best-disc-model
Copying artefacts from: last-epoch-model


Map:   0%|          | 0/17940 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Filter:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss


  0%|          | 0/20 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 128 samples generated. Sampling efficiency is: 100.0000%


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Computing the sensitivity threshold...
Using parallel computation!!!


Bootstrap round:   0%|          | 0/5 [00:00<?, ?it/s]

Sensitivity threshold summary:
count    5.000000
mean     0.030152
std      0.020708
min      0.011364
25%      0.014394
50%      0.026515
75%      0.035606
max      0.062879
dtype: float64
Sensitivity threshold: 0.05742424242424242 qt_max: 0.05


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss


  0%|          | 0/99 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 128 samples generated. Sampling efficiency is: 100.0000%
Critic round: 5,                     sensitivity_threshold: 0.05742424242424242,                         val_sensitivity: -0.018333333333333333,                             val_sensitivities: [-0.01893939393939394, -0.01893939393939394, -0.025, -0.025, -0.025, -0.025, -0.025, -0.025, -0.01893939393939394, -0.01287878787878788, -0.01287878787878788, -0.01287878787878788, -0.025, -0.003787878787878789, -0.0007575757575757599]
Copying artefacts from: best-disc-model
Copying artefacts from: mean-best-disc-model
Copying artefacts from: not-best-disc-model
Copying artefacts from: last-epoch-model


Map:   0%|          | 0/375 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Filter:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss


  0%|          | 0/20 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 128 samples generated. Sampling efficiency is: 100.0000%


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████████████████████████████████████████████████████████████████████████| 1936/1936 [04:51<00:00,  6.64it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1936/1936 [04:51<00:00,  6.65it/s]


KeyboardInterrupt: 