In [1]:
import math
from random import shuffle

import numpy as np
import pandas as pd
from numpy.random import randint
from scipy.stats import ortho_group

In [2]:
# Global configuration and accumulators shared by the generator functions below.
# NOTE(review): no random seed is set in this notebook, so each run is expected
# to produce a different dataset -- confirm whether that is intended.
max_seq_len = 100               # tokens are drawn from range(max_seq_len)
encoding = "dense_orthonormal"  # token encoding: "one_hot" or "dense_orthonormal"
eos_decoder = 2                 # label written after the last token of a sequence

# samples_seq[n] records how many samples were requested for sequence length n
samples_seq = [0 for _ in range(max_seq_len + 1)]
# token tuples that were already added to the dataset (used to skip duplicates)
seq_dict = dict()

# Parallel per-example accumulators filled by aggregate_inputs
x = []                # encoded sequences (one vector per token plus an EOS vector)
y = []                # per-position labels from generate_labels
y_mlp = []            # sequence-level flag: 1 for a repeat, 0 for none
raw_sequence = []     # un-encoded token sequences
token_repeated = []   # repeated token of each example (-1 when there is none)
pos_first_token = []  # index of the repeated token's first occurrence (-1 if none)
sequence_len = []     # length of each sequence

In [3]:
def generate_labels(sequence):
    """
    Build the per-position repeat labels for a token sequence.

    :param sequence: iterable of hashable tokens (e.g. ints)
    :return: list with one entry per token plus one; entry ``i`` is 1 when the
        token at position ``i`` already occurred earlier in the sequence and 0
        otherwise, and the final entry is the module-level ``eos_decoder``
    """
    # A set gives constant-time membership checks instead of rescanning a list
    # of all previous tokens for every position.
    seen = set()
    label = []
    for token in sequence:
        label.append(1 if token in seen else 0)
        seen.add(token)

    label.append(eos_decoder)
    return label

In [4]:
def generate_seq(seq_len, num_repeat, num_tokens_rep, positive, orthonormal_vectors):
    """
    Draw one random token sequence, optionally containing one repeated token.

    ``seq_len`` distinct tokens are sampled from ``range(max_seq_len)`` (the
    module-level constant).  For a positive example the last token is then
    copied into one randomly chosen earlier position, so the repeat always
    ends the sequence and nothing follows it.

    :param seq_len: length of the sequence
    :param num_repeat: unused here; kept so existing callers keep working
    :param num_tokens_rep: unused here; kept so existing callers keep working
    :param positive: truthy to plant a repeated token, falsy for no repeat
    :param orthonormal_vectors: unused here; kept so existing callers keep working
    :return: tuple ``(seq_list, rep_token, first_pos, seq_len)``: the token
        array, the repeated token (-1 if none), the index of its first
        occurrence (-1 if none) and the requested length
    :raises ValueError: if a positive example is requested with ``seq_len < 2``
    """
    # np.random.permutation returns a shuffled arange directly, which avoids
    # running the pure-Python random.shuffle swap loop over a numpy array.
    seq_list = np.random.permutation(max_seq_len)[:seq_len]

    if not positive:
        # none of the tokens repeat
        return seq_list, -1, -1, seq_len

    if seq_len < 2:
        raise ValueError("a positive example needs seq_len >= 2, got " + str(seq_len))

    # randint's upper bound is exclusive, so first_pos is never the last index
    first_pos = randint(0, seq_len - 1)
    seq_list[first_pos] = seq_list[-1]
    rep_token = seq_list[first_pos]
    return seq_list, rep_token, first_pos, seq_len

In [5]:
def aggregate_inputs(sequence, rep_token, first_token_pos, seq_len, positive, orthonormal_vectors):
    """
    Encode one sequence and append it, with its metadata, to the global lists.

    A sequence whose token tuple is already a key of ``seq_dict`` is ignored.
    Otherwise every token is encoded according to the module-level
    ``encoding``, an end-of-sequence vector is appended, and the example is
    pushed onto ``x``, ``raw_sequence``, ``y``, ``y_mlp``, ``token_repeated``,
    ``pos_first_token`` and ``sequence_len``.

    :param sequence: iterable of integer tokens
    :param rep_token: stored as-is in ``token_repeated``
    :param first_token_pos: stored as-is in ``pos_first_token``
    :param seq_len: stored as-is in ``sequence_len``
    :param positive: stored as-is in ``y_mlp``
    :param orthonormal_vectors: indexable of dense vectors, read only for the
        "dense_orthonormal" encoding; entry ``token`` encodes a token and the
        last entry encodes end-of-sequence
    :return: 1 if the sequence was a duplicate and skipped, 0 if it was added
    :raises ValueError: if ``encoding`` is not a supported value
    """
    key = tuple(sequence)
    if key in seq_dict:
        return 1

    # Reject an unknown encoding before any global state is modified.
    if encoding not in ("one_hot", "dense_orthonormal"):
        raise ValueError("unsupported encoding: " + str(encoding))
    seq_dict[key] = 1

    # proceed to append the sequence
    if encoding == "one_hot":
        width = max_seq_len + 1
        encoded = []
        for token in sequence:
            one_hot = [0] * width
            one_hot[token] = 1
            encoded.append(one_hot)
        # NOTE(review): the original appended an undefined name `eos_seq_ip`
        # here. Assumed: EOS is the one-hot vector of the extra last slot,
        # which tokens from range(max_seq_len) never use -- confirm.
        eos_vector = [0] * width
        eos_vector[-1] = 1
        encoded.append(eos_vector)
    else:
        encoded = [orthonormal_vectors[token] for token in sequence]
        encoded.append(orthonormal_vectors[-1])
    x.append(encoded)

    raw_sequence.append(sequence)
    y.append(generate_labels(sequence))
    y_mlp.append(positive)

    token_repeated.append(rep_token)
    pos_first_token.append(first_token_pos)
    sequence_len.append(seq_len)

    return 0

In [6]:
def generate_dataset(max_seq_len=26, num_tokens_rep=1, num_instances_per_seq_len=5000):
    """
    Generate positive (one repeated token) and negative (no repeat) sequences
    for every length from 2 up to ``max_seq_len``.

    For each length, ``num_samples`` positive sequences are drawn.  Every
    positive that is actually added (i.e. not a duplicate) is followed by one
    attempt to add a negative sequence of the same length.  Examples are
    appended to the module-level accumulator lists by ``aggregate_inputs``.

    NOTE(review): the ``max_seq_len`` parameter shadows the module-level
    ``max_seq_len`` that ``generate_seq`` and ``aggregate_inputs`` read; the
    two presumably need to agree -- confirm against callers.

    :param max_seq_len: longest sequence length to generate
    :param num_tokens_rep: forwarded unchanged to ``generate_seq``
    :param num_instances_per_seq_len: upper bound on positive draws per length
    :return: tuple ``(x, y, y_mlp, raw_sequence, token_repeated,
        pos_first_token, sequence_len, orthonormal_vectors)``; the first seven
        are the module-level accumulator lists
    """
    # Random 512x512 orthogonal matrix; aggregate_inputs reads its rows when
    # the "dense_orthonormal" encoding is selected.
    orthonormal_vectors = ortho_group.rvs(dim=512)
    # min seq_len is always 2 as we do not consider 1 length sequence
    min_seq_len = 2
    num_repeat = 1

    num_positive_examples = 0
    num_negative_examples = 0

    for seq_len in range(min_seq_len, max_seq_len + 1):
        # math.factorial replaces np.math.factorial: the np.math alias was
        # deprecated in NumPy 1.25 and removed in NumPy 2.0.
        num_samples = min(max_seq_len * math.factorial(seq_len - 1), num_instances_per_seq_len)
        # number of samples requested for this length (positives + negatives)
        samples_seq[seq_len] = num_samples * 2

        for _ in range(num_samples):
            # positive example with a repetition
            sequence, rep_token, first_token_pos, out_len = generate_seq(
                seq_len, num_repeat, num_tokens_rep, 1, orthonormal_vectors)

            # A missing sequence counts as skipped, so `skipped` is never read
            # unbound or left over from the previous iteration.
            skipped = 1
            if sequence is not None:
                # while aggregating inputs do not add repeating samples
                skipped = aggregate_inputs(sequence, rep_token, first_token_pos,
                                           out_len, 1, orthonormal_vectors)
            if skipped != 0:
                continue
            num_positive_examples += 1

            # negative sample, only when a positive sample has been added
            sequence, rep_token, first_token_pos, out_len = generate_seq(
                seq_len, num_repeat, num_tokens_rep, 0, orthonormal_vectors)
            if aggregate_inputs(sequence, rep_token, first_token_pos,
                                out_len, 0, orthonormal_vectors) == 0:
                num_negative_examples += 1

    print("Number of positive examples are: " + str(num_positive_examples))
    print("Number of negative examples are: " + str(num_negative_examples))

    return x, y, y_mlp, raw_sequence, token_repeated, pos_first_token, sequence_len, orthonormal_vectors


In [7]:
# Build the dataset: sequence lengths up to 100, num_tokens_rep of 1 and at
# most 1000 positive draws per sequence length.
(x, y, y_mlp, raw_sequence, token_repeated,
 pos_first_token, sequence_len, orthonormal_vectors) = generate_dataset(
    max_seq_len=100,
    num_tokens_rep=1,
    num_instances_per_seq_len=1000,
)

Number of positive examples are: 96860
Number of negative examples are: 96860


In [8]:
# separate out the query and the rest of the seq and account for length
# Split each stored sequence into its query (the final token) and its context
# (everything before the final token); the context is one token shorter.
query = [tokens[-1] for tokens in raw_sequence]
raw_seq = [tokens[:len(tokens) - 1] for tokens in raw_sequence]
len_seq = [full_len - 1 for full_len in sequence_len]
# directly use token_repeated, pos_first_token, y_mlp

In [12]:
# Spot-check: display the stored sequence (query token removed) at index 1000
raw_seq[1000]

array([82,  1, 73])

In [9]:
# create a dataframe with these entries
# Assemble one row per example in a single constructor call; `columns` fixes
# the column order of the resulting frame.
columns = ['sequence', 'query', 'sequence_len', 'first_rep_pos', 'label', 'token_repeated']
syn_df = pd.DataFrame(
    {
        'sequence': raw_seq,                # tokens before the query
        'query': query,                     # final token of the sequence
        'sequence_len': len_seq,            # length without the query token
        'first_rep_pos': pos_first_token,   # -1 when nothing repeats
        'label': y_mlp,                     # 1 = repeat present, 0 = none
        'token_repeated': token_repeated,   # -1 when nothing repeats
    },
    columns=columns,
)

In [None]:
# Persist the frame as a pickle file in the current working directory
syn_df.to_pickle(path='synthetic_inputs.pkl')

In [10]:
# Persist the frame as a JSON file in the current working directory
syn_df.to_json(path_or_buf='synthetic_inputs.json')