In [1]:
from thirdai import data, dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Single-sentence corpus that every encoder in this notebook is run against.
_SAMPLE_SENTENCE = "Lord of the Rings by JRR Tolkien is the greatest piece of fiction written."
text_samples = [_SAMPLE_SENTENCE]

In [3]:
# Stand-alone encoders; each one is applied on its own further down so the
# resulting token lists can be compared side by side.
ngram_encoder = dataset.NGramEncoder(n=1)
ngram_encoder2 = dataset.NGramEncoder(n=2)
pairgram_encoder = dataset.PairGramEncoder()
fixed_dim_encoder = dataset.FixedDimEncoder(max_tokens=100)

# Composite encoder wrapping fresh instances of the same three encoder types
# (the stand-alone objects above are deliberately not shared with it).
composite_encoder = dataset.CompositeEncoder(
    max_tokens=100,
    encoders=[
        dataset.NGramEncoder(n=1),
        dataset.NGramEncoder(n=2),
        dataset.PairGramEncoder(),
    ],
    sampling_strategy="fifo",
)

In [4]:
# Tokenizer shared by every transform() call below; configured with a single
# space as its delimiter.
tokenizer = dataset.NaiveSplitTokenizer(delimiter=" ")

In [13]:
def transform(tokenizer, encoder, samples=None):
    """Run a ``data.transformations.Text`` step over a list of strings.

    Args:
        tokenizer: Tokenizer object forwarded to the ``Text`` transformation.
        encoder: Encoder object forwarded to the ``Text`` transformation.
        samples: Optional list of strings to encode. When omitted, the
            notebook-level ``text_samples`` list is used, which is what this
            helper always did before the parameter existed.

    Returns:
        The object produced by applying the transformation to a column map
        whose only column, ``"input"``, holds ``samples``. Later cells read
        the ``"tokens"`` column from it.
    """
    if samples is None:
        # Resolved at call time, so re-running the samples cell is picked up
        # without having to redefine this function.
        samples = text_samples

    transformation = data.transformations.Text(
        input_column="input",
        output_indices="tokens",
        output_values=None,
        tokenizer=tokenizer,
        encoder=encoder,
        # 0xFFFFFFFF == 2**32 - 1. NOTE(review): presumably chosen so token ids
        # span the full unsigned 32-bit range -- confirm against the library.
        dim=0xFFFFFFFF,
    )
    columns = data.ColumnMap({"input": data.columns.StringColumn(samples)})
    return transformation(columns)

In [14]:
# Apply the shared tokenizer with each encoder in turn. The tuple order fixes
# the call order: the four stand-alone encoders first, the composite one last.
(
    ngram_output,
    ngram_2_output,
    pairgram_output,
    fixed_dim_output,
    composite_output,
) = [
    transform(tokenizer, each_encoder)
    for each_encoder in (
        ngram_encoder,
        ngram_encoder2,
        pairgram_encoder,
        fixed_dim_encoder,
        composite_encoder,
    )
]

generated encodings size: 14
current encoding size: 14
generated encodings size: 27
current encoding size: 41
generated encodings size: 105
current encoding size: 146


In [15]:
# Fetch the first sample's token row once and reuse it, rather than repeating
# the column lookup and .data() call for every statement.
ngram_tokens = ngram_output["tokens"].data()[0]
print(ngram_tokens)
print(len(ngram_tokens))
# Set form of the row: repeated token ids collapse to a single entry.
ngram1_set = set(ngram_tokens)

[2404589427, 3169505401, 4174496210, 3059272593, 2913386271, 1569519593, 1634292382, 2174608854, 4174496210, 598481703, 3533171813, 3169505401, 1679377089, 1262999890]
14


In [16]:
# Fetch the first sample's token row once and reuse it, rather than repeating
# the column lookup and .data() call for every statement.
ngram_2_tokens = ngram_2_output["tokens"].data()[0]
print(ngram_2_tokens)
print(len(ngram_2_tokens))
# Set form of the row: repeated token ids collapse to a single entry.
ngram2_set = set(ngram_2_tokens)

[2404589427, 3169505401, 4174496210, 3059272593, 2913386271, 1569519593, 1634292382, 2174608854, 4174496210, 598481703, 3533171813, 3169505401, 1679377089, 1262999890, 3706581437, 3155352976, 1071967212, 3120530541, 1031493174, 667874168, 4092397864, 2675429206, 3428431878, 1198337408, 3881087726, 3503644321, 1825509690]
27


In [17]:
# Fetch the first sample's token row once and reuse it, rather than repeating
# the column lookup and .data() call for every statement.
pairgram_tokens = pairgram_output["tokens"].data()[0]
print(pairgram_tokens)
print(len(pairgram_tokens))
# Set form of the row: repeated token ids collapse to a single entry.
pairgram_set = set(pairgram_tokens)

[2842960571, 3706581437, 2046545129, 16187220, 3155352976, 4051689901, 3263166101, 43370449, 1071967212, 2919340031, 3408511495, 153606991, 1179835422, 3120530541, 1297157952, 2066876749, 3646526585, 2528679748, 164971351, 1031493174, 736714293, 2003726208, 3580186316, 2324983961, 1966156010, 958201025, 667874168, 321297120, 2535529304, 892264852, 1789625681, 1439087906, 424175625, 3355074592, 4092397864, 640396114, 16187220, 3155352976, 4051689901, 3968506174, 2172914701, 1277592620, 1758553388, 2675429206, 4051689901, 903030287, 2537076551, 3428431878, 871655317, 4217090936, 1627929331, 1442562969, 3361302019, 3428431878, 2524544590, 3862894025, 1716840453, 463833792, 2193667795, 2823709370, 3069995441, 2182557915, 2033037509, 463833792, 1198337408, 2582091282, 3706581437, 2046545129, 890695412, 2830346951, 1570602662, 2286908805, 2897745095, 1654578417, 890695412, 1823998364, 3881087726, 2046545129, 1975418469, 3503644321, 2351194300, 1883314191, 986483486, 561840989, 362211903, 195

In [18]:
# Fetch the first sample's token row once and reuse it, rather than repeating
# the column lookup and .data() call for every statement.
composite_tokens = composite_output["tokens"].data()[0]
print(composite_tokens)
print(len(composite_tokens))
# Set form of the row: repeated token ids collapse to a single entry.
composite_set = set(composite_tokens)

[2404589427, 3169505401, 4174496210, 3059272593, 2913386271, 1569519593, 1634292382, 2174608854, 4174496210, 598481703, 3533171813, 3169505401, 1679377089, 1262999890, 2404589427, 3169505401, 4174496210, 3059272593, 2913386271, 1569519593, 1634292382, 2174608854, 4174496210, 598481703, 3533171813, 3169505401, 1679377089, 1262999890, 3706581437, 3155352976, 1071967212, 3120530541, 1031493174, 667874168, 4092397864, 2675429206, 3428431878, 1198337408, 3881087726, 3503644321, 1825509690, 2842960571, 3706581437, 2046545129, 16187220, 3155352976, 4051689901, 3263166101, 43370449, 1071967212, 2919340031, 3408511495, 153606991, 1179835422, 3120530541, 1297157952, 2066876749, 3646526585, 2528679748, 164971351, 1031493174, 736714293, 2003726208, 3580186316, 2324983961, 1966156010, 958201025, 667874168, 321297120, 2535529304, 892264852, 1789625681, 1439087906, 424175625, 3355074592, 4092397864, 640396114, 16187220, 3155352976, 4051689901, 3968506174, 2172914701, 1277592620, 1758553388, 267542920