In [None]:
! pip install datasets



In [None]:
from datasets import load_dataset

# Load the 'train' split of the Wikipedia snippets dataset from Hugging Face in
# streaming mode, then shuffle it with a fixed seed.
wiki_data = load_dataset(
    'vblagoje/wikipedia_snippets_streamed',
    split='train',
    streaming=True
).shuffle(seed=960)

In [None]:
# keep only documents whose section_title starts with 'Technology'
# NOTE(review): the variable is named `history`, but the filter selects
# 'Technology' sections.
history = wiki_data.filter(
    lambda d: d['section_title'].startswith('Technology')
)

In [None]:
# Display the filtered dataset object (its repr only).
history

<datasets.iterable_dataset.IterableDataset at 0x7fb0a0d0e8f0>

In [None]:
!pip install tqdm



In [None]:
from tqdm.auto import tqdm  # progress bar

# Number of filtered documents to collect from the stream.
total_doc_count = 500

counter = 0
docs = []
# Iterate through the filtered stream, keeping only the fields we need.
for d in tqdm(history, total=total_doc_count):
    # extract the fields we need
    doc = {
        "article_title": d["article_title"],
        "section_title": d["section_title"],
        "passage_text": d["passage_text"]
    }
    # add the dict containing fields we need to docs list
    docs.append(doc)

    # Count the document just stored *before* testing the limit; checking
    # first and incrementing afterwards let one extra document through.
    counter += 1

    # stop iteration once total_doc_count documents have been collected
    if counter >= total_doc_count:
        break

  0%|          | 0/500 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Create an unfitted Keras tokenizer.
# NOTE(review): the next cell rebinds `tokenizer` to a fresh Tokenizer()
# before fitting, so this instance is never used.
tokenizer = Tokenizer()

In [None]:
# Start from a fresh tokenizer, gather the passage text of every collected
# document, and fit the tokenizer's vocabulary on those passages.
tokenizer = Tokenizer()
passage_texts = list(map(lambda record: record['passage_text'], docs))
tokenizer.fit_on_texts(passage_texts)

In [None]:
# Vocabulary size: number of distinct words the fitted tokenizer has indexed.
unique_words_count = len(tokenizer.word_index)

print("Number of unique words:", unique_words_count)

Number of unique words: 4830


In [None]:
# Build next-word training examples: for each passage, emit every prefix of
# its token-id sequence that contains at least two tokens, so the last token
# of a prefix can serve as the target for the tokens before it.
input_sequences = []

for token_ids in tokenizer.texts_to_sequences(passage_texts):
    input_sequences.extend(
        token_ids[:end] for end in range(2, len(token_ids) + 1)
    )

In [None]:
# Inspect the generated prefix sequences.
# NOTE(review): this renders the entire list; a slice such as
# input_sequences[:5] would keep the output readable.
input_sequences

[[3, 937],
 [3, 937, 188],
 [3, 937, 188, 31],
 [3, 937, 188, 31, 2176],
 [3, 937, 188, 31, 2176, 12],
 [3, 937, 188, 31, 2176, 12, 938],
 [3, 937, 188, 31, 2176, 12, 938, 73],
 [3, 937, 188, 31, 2176, 12, 938, 73, 13],
 [3, 937, 188, 31, 2176, 12, 938, 73, 13, 4],
 [3, 937, 188, 31, 2176, 12, 938, 73, 13, 4, 1326],
 [3, 937, 188, 31, 2176, 12, 938, 73, 13, 4, 1326, 5],
 [3, 937, 188, 31, 2176, 12, 938, 73, 13, 4, 1326, 5, 2177],
 [3, 937, 188, 31, 2176, 12, 938, 73, 13, 4, 1326, 5, 2177, 3],
 [3, 937, 188, 31, 2176, 12, 938, 73, 13, 4, 1326, 5, 2177, 3, 939],
 [3, 937, 188, 31, 2176, 12, 938, 73, 13, 4, 1326, 5, 2177, 3, 939, 511],
 [3, 937, 188, 31, 2176, 12, 938, 73, 13, 4, 1326, 5, 2177, 3, 939, 511, 2],
 [3,
  937,
  188,
  31,
  2176,
  12,
  938,
  73,
  13,
  4,
  1326,
  5,
  2177,
  3,
  939,
  511,
  2,
  213],
 [3,
  937,
  188,
  31,
  2176,
  12,
  938,
  73,
  13,
  4,
  1326,
  5,
  2177,
  3,
  939,
  511,
  2,
  213,
  940],
 [3,
  937,
  188,
  31,
  2176,
  12,
  93

In [None]:
# Length of the longest prefix sequence; used as the common padded width.
max_len = max(map(len, input_sequences))
max_len

115

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Left-pad ('pre') every prefix to max_len so the final column always holds
# the prefix's last token, which is sliced off as the target further down.
padded_input_sequences = pad_sequences(input_sequences, maxlen = max_len, padding = 'pre')

In [None]:
# Inspect the padded array (NumPy abbreviates the repr).
padded_input_sequences

array([[  0,   0,   0, ...,   0,   3, 937],
       [  0,   0,   0, ...,   3, 937, 188],
       [  0,   0,   0, ..., 937, 188,  31],
       ...,
       [  0,   0,   0, ..., 117,   6,   1],
       [  0,   0,   0, ...,   6,   1, 330],
       [  0,   0,   0, ...,   1, 330,   2]], dtype=int32)

In [None]:
# Model inputs: every column except the last (the context tokens).
X = padded_input_sequences[:,:-1]

In [None]:
# Targets: the last column, i.e. the token id that follows each context.
y = padded_input_sequences[:,-1]

In [None]:
# Number of training examples (rows of the padded array).
len(padded_input_sequences)

19210

In [None]:
from tensorflow.keras.utils import to_categorical
# One-hot encode the target token ids.
# NOTE(review): 19211 equals len(padded_input_sequences) + 1 (19210 rows), not
# the vocabulary size. The tokenizer reported 4830 unique words, so
# len(tokenizer.word_index) + 1 classes would presumably suffice. The model's
# Embedding and Dense layers use the same constant, so all three have to be
# changed together.
# NOTE(review): this rebinds `y`, so re-running this cell on its own feeds an
# already one-hot array back into to_categorical; re-run the cell that slices
# `y` first.
y = to_categorical(y, num_classes = 19211)

In [None]:
# Inspect the one-hot encoded targets.
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]], dtype=float32)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [None]:
# Next-word model: start with an embedding layer over the token ids.
model = Sequential()
# Sizes are read from the training arrays instead of repeating literals, so
# the layer cannot drift out of sync with the data:
#   y.shape[1] -> number of one-hot classes (ids the embedding must accept)
#   X.shape[1] -> length of each padded context sequence
model.add(Embedding(y.shape[1], 150, input_length=X.shape[1]))

In [None]:
# Recurrent layer that summarises the embedded context into a 150-unit vector.
model.add(LSTM(150))
# One softmax output per one-hot class in y, so the output width always
# matches the targets rather than relying on a repeated literal.
model.add(Dense(y.shape[1], activation='softmax'))

In [None]:
# Categorical cross-entropy matches the one-hot targets and the softmax
# output; accuracy is tracked as an additional metric.
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [None]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 114, 150)          2881650   
                                                                 
 lstm_1 (LSTM)               (None, 150)               180600    
                                                                 
 dense_1 (Dense)             (None, 19211)             2900861   
                                                                 
Total params: 5,963,111
Trainable params: 5,963,111
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train the next-word model on the (context, one-hot target) pairs for 100 epochs.
# NOTE(review): the recorded output below ends with a KeyboardInterrupt during
# epoch 1.
model.fit(X,y, epochs = 100)

Epoch 1/100
 15/601 [..............................] - ETA: 5:04 - loss: 0.2471 - accuracy: 0.9792

KeyboardInterrupt: ignored

In [None]:
# Seed text for a single next-word prediction.
text = "logic"

# Convert the seed into token ids with the fitted tokenizer; [0] unwraps the
# single-element batch.
tokenized_text = tokenizer.texts_to_sequences([text])[0]

In [None]:
# Left-pad the seed to the model's input width. X.shape[1] is the context
# length the model was trained on, so this stays correct if the longest
# training sequence changes.
padded_token_text = pad_sequences([tokenized_text], maxlen=X.shape[1], padding='pre')

In [None]:
# Inspect the padded seed: zeros on the left, the seed's token id(s) at the end.
padded_token_text

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0, 1828]], dtype=int32)

In [None]:
# Softmax output for the padded seed: one score per output class.
model.predict(padded_token_text)



array([[1.3752584e-10, 4.5193069e-02, 4.7626084e-01, ..., 1.2494267e-10,
        1.4039646e-10, 1.0764426e-10]], dtype=float32)

In [None]:
import numpy as np
# Id of the highest-scoring next word. np.argmax without an axis indexes the
# flattened prediction array, which equals the word id only because a single
# sequence is predicted here.
max_prob_word_embed = np.argmax(model.predict(padded_token_text))



In [None]:
# Show the predicted token id.
max_prob_word_embed

2

In [None]:
# Translate the predicted id back to its word. The tokenizer keeps an
# id -> word dict (index_word), so a direct lookup replaces scanning every
# (word, index) pair of word_index.
word = tokenizer.index_word.get(int(max_prob_word_embed))
# Ids without a vocabulary entry (e.g. the padding id 0) print nothing,
# matching the behaviour of the original scan.
if word is not None:
    print(word)

of


In [None]:
# Greedy text generation: starting from the seed, repeatedly predict the most
# likely next word for the text so far, append it, and print the running text.
text = "financial"
for i in range(15):
    tokenized_text = tokenizer.texts_to_sequences([text])[0]
    # Pad to the context width the model was trained on (X.shape[1]) instead
    # of repeating the literal.
    padded_token_text = pad_sequences([tokenized_text], maxlen=X.shape[1], padding='pre')
    # verbose=0 keeps the per-step Keras progress output out of the cell.
    max_prob_word_embed = int(np.argmax(model.predict(padded_token_text, verbose=0)))
    # Direct id -> word lookup instead of scanning word_index on every step.
    word = tokenizer.index_word.get(max_prob_word_embed)
    if word is None:
        # No vocabulary entry for this id (e.g. padding id 0): the text would
        # no longer change, so further steps would repeat the same prediction.
        break
    text = text + " " + word
    print(text)

financial compliance
financial compliance funding
financial compliance funding as
financial compliance funding as the
financial compliance funding as the technology
financial compliance funding as the technology to
financial compliance funding as the technology to 2000
financial compliance funding as the technology to 2000 on
financial compliance funding as the technology to 2000 on 1995
financial compliance funding as the technology to 2000 on 1995 he
financial compliance funding as the technology to 2000 on 1995 he coordinates
financial compliance funding as the technology to 2000 on 1995 he coordinates the
financial compliance funding as the technology to 2000 on 1995 he coordinates the work
financial compliance funding as the technology to 2000 on 1995 he coordinates the work of
financial compliance funding as the technology to 2000 on 1995 he coordinates the work of a


In [None]:
# Greedy text generation: starting from the seed, repeatedly predict the most
# likely next word for the text so far, append it, and print the running text.
text = "Computers"
for i in range(15):
    tokenized_text = tokenizer.texts_to_sequences([text])[0]
    # Pad to the context width the model was trained on (X.shape[1]) instead
    # of repeating the literal.
    padded_token_text = pad_sequences([tokenized_text], maxlen=X.shape[1], padding='pre')
    # verbose=0 keeps the per-step Keras progress output out of the cell.
    max_prob_word_embed = int(np.argmax(model.predict(padded_token_text, verbose=0)))
    # Direct id -> word lookup instead of scanning word_index on every step.
    word = tokenizer.index_word.get(max_prob_word_embed)
    if word is None:
        # No vocabulary entry for this id (e.g. padding id 0): the text would
        # no longer change, so further steps would repeat the same prediction.
        break
    text = text + " " + word
    print(text)

Computers with
Computers with a
Computers with a specific
Computers with a specific tape
Computers with a specific tape looping
Computers with a specific tape looping 5
Computers with a specific tape looping 5 and
Computers with a specific tape looping 5 and be
Computers with a specific tape looping 5 and be used
Computers with a specific tape looping 5 and be used in
Computers with a specific tape looping 5 and be used in an
Computers with a specific tape looping 5 and be used in an increasing
Computers with a specific tape looping 5 and be used in an increasing or
Computers with a specific tape looping 5 and be used in an increasing or other
Computers with a specific tape looping 5 and be used in an increasing or other other


In [None]:
# Greedy text generation: starting from the seed, repeatedly predict the most
# likely next word for the text so far, append it, and print the running text.
text = "continue"
for i in range(15):
    tokenized_text = tokenizer.texts_to_sequences([text])[0]
    # Pad to the context width the model was trained on (X.shape[1]) instead
    # of repeating the literal.
    padded_token_text = pad_sequences([tokenized_text], maxlen=X.shape[1], padding='pre')
    # verbose=0 keeps the per-step Keras progress output out of the cell.
    max_prob_word_embed = int(np.argmax(model.predict(padded_token_text, verbose=0)))
    # Direct id -> word lookup instead of scanning word_index on every step.
    word = tokenizer.index_word.get(max_prob_word_embed)
    if word is None:
        # No vocabulary entry for this id (e.g. padding id 0): the text would
        # no longer change, so further steps would repeat the same prediction.
        break
    text = text + " " + word
    print(text)

continue to
continue to live
continue to live underwater
continue to live underwater sentient
continue to live underwater sentient species
continue to live underwater sentient species mysteries
continue to live underwater sentient species mysteries every
continue to live underwater sentient species mysteries every sentient
continue to live underwater sentient species mysteries every sentient species
continue to live underwater sentient species mysteries every sentient species has
continue to live underwater sentient species mysteries every sentient species has certain
continue to live underwater sentient species mysteries every sentient species has certain mysteries
continue to live underwater sentient species mysteries every sentient species has certain mysteries that
continue to live underwater sentient species mysteries every sentient species has certain mysteries that are
continue to live underwater sentient species mysteries every sentient species has certain mysteries that are un

In [None]:
# Greedy text generation: starting from the seed, repeatedly predict the most
# likely next word for the text so far, append it, and print the running text.
text = "capable"
for i in range(15):
    tokenized_text = tokenizer.texts_to_sequences([text])[0]
    # Pad to the context width the model was trained on (X.shape[1]) instead
    # of repeating the literal.
    padded_token_text = pad_sequences([tokenized_text], maxlen=X.shape[1], padding='pre')
    # verbose=0 keeps the per-step Keras progress output out of the cell.
    max_prob_word_embed = int(np.argmax(model.predict(padded_token_text, verbose=0)))
    # Direct id -> word lookup instead of scanning word_index on every step.
    word = tokenizer.index_word.get(max_prob_word_embed)
    if word is None:
        # No vocabulary entry for this id (e.g. padding id 0): the text would
        # no longer change, so further steps would repeat the same prediction.
        break
    text = text + " " + word
    print(text)

capable of
capable of capodimonte
capable of capodimonte naples
capable of capodimonte naples that
capable of capodimonte naples that same
capable of capodimonte naples that same year
capable of capodimonte naples that same year he
capable of capodimonte naples that same year he coordinates
capable of capodimonte naples that same year he coordinates the
capable of capodimonte naples that same year he coordinates the work
capable of capodimonte naples that same year he coordinates the work of
capable of capodimonte naples that same year he coordinates the work of analysis
capable of capodimonte naples that same year he coordinates the work of analysis of
capable of capodimonte naples that same year he coordinates the work of analysis of the
capable of capodimonte naples that same year he coordinates the work of analysis of the angel


In [None]:
# Greedy text generation: starting from the seed, repeatedly predict the most
# likely next word for the text so far, append it, and print the running text.
text = "ideal"
for i in range(15):
    tokenized_text = tokenizer.texts_to_sequences([text])[0]
    # Pad to the context width the model was trained on (X.shape[1]) instead
    # of repeating the literal.
    padded_token_text = pad_sequences([tokenized_text], maxlen=X.shape[1], padding='pre')
    # verbose=0 keeps the per-step Keras progress output out of the cell.
    max_prob_word_embed = int(np.argmax(model.predict(padded_token_text, verbose=0)))
    # Direct id -> word lookup instead of scanning word_index on every step.
    word = tokenizer.index_word.get(max_prob_word_embed)
    if word is None:
        # No vocabulary entry for this id (e.g. padding id 0): the text would
        # no longer change, so further steps would repeat the same prediction.
        break
    text = text + " " + word
    print(text)

ideal for
ideal for comparison
ideal for comparison shopping
ideal for comparison shopping and
ideal for comparison shopping and 2d
ideal for comparison shopping and 2d barcode
ideal for comparison shopping and 2d barcode scanning
ideal for comparison shopping and 2d barcode scanning and
ideal for comparison shopping and 2d barcode scanning and finding
ideal for comparison shopping and 2d barcode scanning and finding information
ideal for comparison shopping and 2d barcode scanning and finding information related
ideal for comparison shopping and 2d barcode scanning and finding information related to
ideal for comparison shopping and 2d barcode scanning and finding information related to products
ideal for comparison shopping and 2d barcode scanning and finding information related to products and
ideal for comparison shopping and 2d barcode scanning and finding information related to products and services
