In [1]:
import numpy as np
import pandas as pd
from datasets import load_dataset
import sentencepiece as spm
import gensim
from gensim.models import Word2Vec
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def generate_inputs_and_labels(df, sos_token='<sos>', eos_token='</sos>'):
    """Build shifted input/label token sequences for next-token prediction.

    Each value of the ``text`` column is split on whitespace. The input
    sequence is those tokens prefixed with ``sos_token``; the label sequence
    is the same tokens suffixed with ``eos_token``. Both sequences therefore
    have equal length and ``labels[k][j]`` is the token that follows
    ``inputs[k][j]``.

    Args:
        df: DataFrame with a ``text`` column holding strings.
        sos_token: Marker placed at the start of every input sequence.
        eos_token: Marker placed at the end of every label sequence.
            The defaults reproduce the markers this function has always
            emitted; pass the tokenizer's own control symbols if they differ.

    Returns:
        A tuple ``(inputs, labels)`` of two lists of token lists, with one
        entry per row of ``df`` in row order.
    """
    inputs = []
    labels = []

    # Only the 'text' column is read, so iterate it directly instead of
    # materialising a full row Series per record with iterrows().
    for text in df['text']:
        tokens = text.split()  # whitespace tokenization

        inputs.append([sos_token] + tokens)
        labels.append(tokens + [eos_token])

    return inputs, labels

In [4]:
# Load the training set; lines=True reads the file as one JSON record per line
train_df = pd.read_json('train_data.json', lines=True)

In [5]:
# Preview the first rows of the loaded frame
train_df.head()

Unnamed: 0,text
0,"One day, a little girl named Lily found a need..."
1,"Once upon a time, there was a little car named..."
2,"One day, a little fish named Fin was swimming ..."
3,"Once upon a time, in a land full of trees, the..."
4,"Once upon a time, there was a little girl name..."


In [8]:
# Keep only the first row for a quick experiment.
# Note: this rebinds train_df, so the full frame is gone until the load cell is re-run.
train_df = train_df.iloc[:1]
train_df

Unnamed: 0,text
0,"One day, a little girl named Lily found a need..."


In [9]:
# Build the whitespace-tokenized input and label sequences for every row of train_df
inputs, labels = generate_inputs_and_labels(train_df)

In [10]:
# Show up to five samples: the leading and trailing five tokens of each sequence
for i, input_seq in enumerate(inputs[:5]):
    print("Input:", input_seq[:5], "...", input_seq[-5:])
    print("Label:", labels[i][:5], "...", labels[i][-5:])
    print()

Input: ['<sos>', 'One', 'day,', 'a', 'little'] ... ['had', 'shared', 'and', 'worked', 'together.']
Label: ['One', 'day,', 'a', 'little', 'girl'] ... ['shared', 'and', 'worked', 'together.', '</sos>']



In [11]:
# Create a SentencePiece processor and load the model file 'small_m.model'.
# load() is the last expression, so its return value is shown as the cell output.
sp = spm.SentencePieceProcessor()
sp.load('small_m.model')

True

In [28]:
# Convert input and label sequences to strings
input_strings = [' '.join(sequence) for sequence in inputs]
label_strings = [' '.join(sequence) for sequence in labels]

# Tokenize input and label strings
tokenized_inputs = [sp.encode_as_pieces(sequence) for sequence in input_strings]
tokenized_labels = [sp.encode_as_pieces(sequence) for sequence in label_strings]

# Print tokenized input and label sequences
for i in range(len(inputs)):
    print("Tokenized Input:", tokenized_inputs[i])
    print("Tokenized Label:", tokenized_labels[i])

Tokenized Input: ['▁', '<', 'so', 's', '>', '▁one', '▁day', ',', '▁a', '▁little', '▁girl', '▁name', 'd', '▁lily', '▁found', '▁a', '▁needle', '▁in', '▁her', '▁room', '.', '▁she', '▁knew', '▁it', '▁was', '▁difficult', '▁to', '▁play', '▁with', '▁it', '▁', 'because', '▁it', '▁was', '▁sharp', '.', '▁lily', '▁wanted', '▁to', '▁share', '▁the', '▁needle', '▁with', '▁her', '▁mom', ',', '▁so', '▁she', '▁could', '▁sew', '▁a', '▁button', '▁on', '▁her', '▁shirt', '.', '▁lily', '▁went', '▁to', '▁her', '▁mom', '▁and', '▁said', ',', '▁"', 'mom', ',', '▁i', '▁found', '▁this', '▁needle', '.', '▁can', '▁you', '▁share', '▁it', '▁with', '▁me', '▁and', '▁sew', '▁my', '▁shirt', '?"', '▁her', '▁mom', '▁smiled', '▁and', '▁said', ',', '▁"', 'yes', ',', '▁lily', ',', '▁we', '▁can', '▁share', '▁the', '▁needle', '▁and', '▁fix', '▁your', '▁shirt', '."', '▁together', ',', '▁they', '▁shared', '▁the', '▁needle', '▁and', '▁sew', 'ed', '▁the', '▁button', '▁on', '▁lily', "'", 's', '▁shirt', '.', '▁it', '▁was', '▁not', '▁

In [29]:
# Convert tokenized sequences to IDs
input_ids = [sp.encode_as_ids(sequence) for sequence in tokenized_inputs]
label_ids = [sp.encode_as_ids(sequence) for sequence in tokenized_labels]

# Print token IDs
print("Input Token IDs:", input_ids)
print("Label Token IDs:", label_ids)

Input Token IDs: [[[], [19, 0], [23], [298], [19, 0], [19, 38], [19, 28], [19, 6], [19, 8], [19, 37], [19, 53], [19, 86], [1233], [19, 31], [19, 119], [19, 8], [19, 1614], [19, 21], [19, 14], [19, 198], [19, 3], [19, 11], [19, 185], [19, 12], [19, 9], [19, 1455], [19, 7], [19, 54], [19, 24], [19, 12], [], [19, 230], [19, 12], [19, 9], [19, 1316], [19, 3], [19, 31], [19, 59], [19, 7], [19, 259], [19, 4], [19, 1614], [19, 24], [19, 14], [19, 43], [19, 6], [19, 23], [19, 11], [19, 94], [19, 2599], [19, 8], [19, 1293], [19, 32], [19, 14], [19, 802], [19, 3], [19, 31], [19, 68], [19, 7], [19, 14], [19, 43], [19, 5], [19, 18], [19, 6], [19, 16], [43], [19, 6], [19, 49], [19, 119], [19, 149], [19, 1614], [19, 3], [19, 66], [19, 25], [19, 259], [19, 12], [19, 24], [19, 145], [19, 5], [19, 2599], [19, 140], [19, 802], [19, 82], [19, 14], [19, 43], [19, 76], [19, 5], [19, 18], [19, 6], [19, 16], [542], [19, 6], [19, 31], [19, 6], [19, 96], [19, 66], [19, 259], [19, 4], [19, 1614], [19, 5], [19, 