In [None]:
import pandas as pd
import random
from datasets import Dataset

# --- Paths (empty placeholders: fill these in before running) ---------------
# NOTE: while data_folder is '', the two sub-folders below resolve to
# '/train' and '/test'.
data_folder      = ''
sorted_dict_path = ''
data_info_path   = ''

train_folder = f'{data_folder}/train'
test_folder  = f'{data_folder}/test'

# PAFT column orders. The cells below index this by dataset name and expect
# each value to be a comma-separated column list, or '' when no order exists.
# NOTE(review): eval() executes whatever the file contains. If the file is a
# plain dict literal, ast.literal_eval would be the safer reader - confirm the
# file format before switching.
with open(sorted_dict_path, 'r') as order_file:
    sorted_order_dict = eval(order_file.read())

# Per-dataset metadata: indexed by `df_name`, sorted ascending by `row_number`.
data_info = pd.read_csv(data_info_path, index_col='df_name')
data_info = data_info.sort_values('row_number')

data_info

# Datasets for standard training

In [None]:
# Build the standard fine-tuning sets: every training row of every dataset
# becomes one "col is value, col is value, ..." sentence (the `output`), paired
# with a fixed instruction and the column order (the `input`).
PAFT_ORDER = True    # True -> PAFT order from sorted_order_dict, False -> CSV column order (GREAT)
SHUFFLE_SEED = 0     # seeds the fallback shuffle so re-running reproduces the same order

output_folder = f"./data/datasets_for_standard_FT/{'PAFT' if PAFT_ORDER else 'GREAT'}"

for df_name in data_info.index:
    df_train = pd.read_csv(f'{train_folder}/{df_name}.csv')

    # Keep the column order as a list and derive the comma-joined string from
    # it. Only an explicit PAFT order string has to be split, so column names
    # containing a comma survive the GREAT and fallback paths.
    paft_order = sorted_order_dict[df_name] if PAFT_ORDER else ''
    if paft_order != '':
        order = paft_order
        order_cols = order.split(',')
    else:
        order_cols = df_train.columns.tolist()
        if PAFT_ORDER:
            # No df in calculated func dependencies: use one random order for
            # all samples. A dedicated, seeded generator keeps that order
            # reproducible per dataset without touching the global RNG state.
            random.Random(f'{SHUFFLE_SEED}:{df_name}').shuffle(order_cols)
        order = ','.join(order_cols)

    # Multiplying by 1 turns boolean columns into integers and leaves numeric
    # columns unchanged.
    df_train = df_train[order_cols] * 1

    cols = df_train.columns
    df_text = []
    # One to_numpy() call yields the same per-row values as df_train.loc[i].values
    # without building a Series for every row.
    for vals in df_train.to_numpy():
        parts = []
        for col, val in zip(cols, vals):
            is_integer = getattr(val, 'is_integer', None)
            if is_integer is None:
                # Values without is_integer() (e.g. strings) are written as-is
                # instead of raising AttributeError.
                shown = val
            elif is_integer():
                shown = int(val)          # 3.0 -> 3
            else:
                shown = round(val, 3)     # keep at most 3 decimals
            parts.append(f'{col} is {shown}')
        df_text.append(', '.join(parts))

    df_for_tuning_pandas = pd.DataFrame({'output' : df_text})
    df_for_tuning_pandas['instruction'] = 'Generate a synthetic sample giving an order of the columns. There must be no columns that were not provided in order!'
    df_for_tuning_pandas['input'] = order

    dataset_DS = Dataset.from_pandas(df_for_tuning_pandas)

    dataset_DS.save_to_disk(f'{output_folder}/{df_name}_prompts_train')

# Datasets for batch training

In [None]:
# Build the batch fine-tuning sets: training rows are rendered as
# "col is value, ..." sentences and grouped N_batch at a time into one
# multi-line `output` ("Sample 0: ...\nSample 1: ...\n").
PAFT_ORDER = True    # True -> PAFT order from sorted_order_dict, False -> CSV column order (GREAT)
SHUFFLE_SEED = 0     # seeds the fallback shuffle so re-running reproduces the same order
output_folder = f"./data/datasets_for_batch_FT/{'PAFT' if PAFT_ORDER else 'GREAT'}"

N_batch = 10         # number of samples packed into one training example

for df_name in data_info.index:
    df_train = pd.read_csv(f'{train_folder}/{df_name}.csv')

    # Drop the trailing remainder so the row count is a multiple of N_batch.
    df_train = df_train.head(len(df_train) - len(df_train) % N_batch)

    # Keep the column order as a list and derive the comma-joined string from
    # it. Only an explicit PAFT order string has to be split, so column names
    # containing a comma survive the GREAT and fallback paths.
    paft_order = sorted_order_dict[df_name] if PAFT_ORDER else ''
    if paft_order != '':
        order = paft_order
        order_cols = order.split(',')
    else:
        order_cols = df_train.columns.tolist()
        if PAFT_ORDER:
            # No df in calculated func dependencies: use one random order for
            # all samples. A dedicated, seeded generator keeps that order
            # reproducible per dataset without touching the global RNG state.
            random.Random(f'{SHUFFLE_SEED}:{df_name}').shuffle(order_cols)
        order = ','.join(order_cols)

    # Multiplying by 1 turns boolean columns into integers and leaves numeric
    # columns unchanged.
    df_train = df_train[order_cols] * 1

    cols = df_train.columns
    df_text = []
    # One to_numpy() call yields the same per-row values as df_train.loc[i].values
    # without building a Series for every row.
    for vals in df_train.to_numpy():
        parts = []
        for col, val in zip(cols, vals):
            is_integer = getattr(val, 'is_integer', None)
            if is_integer is None:
                # Values without is_integer() (e.g. strings) are written as-is
                # instead of raising AttributeError.
                shown = val
            elif is_integer():
                shown = int(val)          # 3.0 -> 3
            else:
                shown = round(val, 3)     # keep at most 3 decimals
            parts.append(f'{col} is {shown}')
        df_text.append(', '.join(parts))

    # len(df_text) is a multiple of N_batch (see the truncation above), so
    # stepping by N_batch yields only full groups.
    df_text_Nbatch = []
    for start in range(0, len(df_text), N_batch):
        df_text_slice = df_text[start:start + N_batch]
        df_text_Nbatch.append(''.join(
            f'Sample {sample_idx}: {sample}\n'
            for sample_idx, sample in enumerate(df_text_slice)
        ))

    df_for_tuning_pandas = pd.DataFrame({'output' : df_text_Nbatch})
    df_for_tuning_pandas['instruction'] = f'Generate {N_batch} synthetic samples giving an order of the columns. There must be no columns that were not provided in order!'
    df_for_tuning_pandas['input'] = order

    dataset_DS = Dataset.from_pandas(df_for_tuning_pandas)

    dataset_DS.save_to_disk(f'{output_folder}/{df_name}_prompts_train')

# Datasets for batch anon training

In [None]:
# Build the anonymised batch fine-tuning sets: same grouping as the batch
# sets, but the real column names are replaced by col1..colN in both the
# rendered samples and the `input` column list.
PAFT_ORDER = True    # True -> PAFT order from sorted_order_dict, False -> CSV column order (GREAT)
SHUFFLE_SEED = 0     # seeds the fallback shuffle so re-running reproduces the same order

N_batch = 10         # number of samples packed into one training example

output_folder = f"./data/datasets_for_batch_anon_FT/{'PAFT' if PAFT_ORDER else 'GREAT'}"

for df_name in data_info.index:
    df_train = pd.read_csv(f'{train_folder}/{df_name}.csv')
    old_shape = df_train.shape

    # Drop the trailing remainder so the row count is a multiple of N_batch.
    df_train = df_train.head(len(df_train) - len(df_train) % N_batch)
    print(df_name, ", old shape:", old_shape, 'new shape:', df_train.shape)

    # Keep the column order as a list and derive the comma-joined string from
    # it. Only an explicit PAFT order string has to be split, so column names
    # containing a comma survive the GREAT and fallback paths.
    paft_order = sorted_order_dict[df_name] if PAFT_ORDER else ''
    if paft_order != '':
        order = paft_order
        order_cols = order.split(',')
    else:
        order_cols = df_train.columns.tolist()
        if PAFT_ORDER:
            # No df in calculated func dependencies: use one random order for
            # all samples. A dedicated, seeded generator keeps that order
            # reproducible per dataset without touching the global RNG state.
            random.Random(f'{SHUFFLE_SEED}:{df_name}').shuffle(order_cols)
        order = ','.join(order_cols)

    # Multiplying by 1 turns boolean columns into integers and leaves numeric
    # columns unchanged.
    df_train = df_train[order_cols] * 1

    # The real order is only logged; the saved dataset carries the
    # anonymised names.
    print(df_name, ":", order)

    df_text = []
    # Positional placeholder names, 1-based: col1, col2, ...
    cols = [f'col{i}' for i in range(1, len(df_train.columns) + 1)]

    # One to_numpy() call yields the same per-row values as df_train.loc[i].values
    # without building a Series for every row.
    for vals in df_train.to_numpy():
        parts = []
        for col, val in zip(cols, vals):
            is_integer = getattr(val, 'is_integer', None)
            if is_integer is None:
                # Values without is_integer() (e.g. strings) are written as-is
                # instead of raising AttributeError.
                shown = val
            elif is_integer():
                shown = int(val)          # 3.0 -> 3
            else:
                shown = round(val, 3)     # keep at most 3 decimals
            parts.append(f'{col} is {shown}')
        df_text.append(', '.join(parts))

    # len(df_text) is a multiple of N_batch (see the truncation above), so
    # stepping by N_batch yields only full groups.
    df_text_Nbatch = []
    for start in range(0, len(df_text), N_batch):
        df_text_slice = df_text[start:start + N_batch]
        df_text_Nbatch.append(''.join(
            f'Sample {sample_idx}: {sample}\n'
            for sample_idx, sample in enumerate(df_text_slice)
        ))

    df_for_tuning_pandas = pd.DataFrame({'output' : df_text_Nbatch})
    df_for_tuning_pandas['instruction'] = f'Generate {N_batch} synthetic samples giving an order of the columns. There must be no columns that were not provided in order!'
    df_for_tuning_pandas['input'] = ','.join(cols)

    dataset_DS = Dataset.from_pandas(df_for_tuning_pandas)

    dataset_DS.save_to_disk(f'{output_folder}/{df_name}_prompts_train')