In [21]:
import random

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import skipgrams


# Toy corpus: the opening lines of the King James Bible, already split into
# tokens. Each inner list is one sentence; punctuation marks are kept as
# separate tokens and the original capitalisation is preserved.
data = [
    ["The", "King", "James", "Bible"],
    ["The", "Old", "Testament", "of", "the", "King", "James", "Bible"],
    ["The", "First", "Book", "of", "Moses", ":", "Called", "Genesis"],
    ["In", "the", "beginning", "God", "created", "the", "heaven", "and",
     "the", "earth"],
    ["And", "the", "earth", "was", "without", "form", ",", "and", "void", ";",
     "and", "darkness", "was", "upon", "the", "face", "of", "the", "deep", "."],
    ["And", "the", "Spirit", "of", "God", "moved", "upon", "the", "face", "of",
     "the", "waters", "."],
    ["And", "God", "said", ",", "Let", "there", "be", "light", ":", "and",
     "there", "was", "light", "."],
    ["And", "God", "saw", "the", "light", ",", "that", "it", "was", "good", ":",
     "and", "God", "divided", "the", "light", "from", "the", "darkness", "."],
    ["And", "God", "called", "the", "light", "Day", ",", "and", "the",
     "darkness", "he", "called", "Night", "."],
    ["And", "the", "evening", "and", "the", "morning", "were", "the", "first",
     "day", "."],
]

# Learn the vocabulary from the pre-tokenized sentences.
t = Tokenizer()
t.fit_on_texts(data)

# Lookup tables in both directions: token -> id and id -> token.
word2id = t.word_index
id2word = dict(zip(word2id.values(), word2id.keys()))

# The ids shown below start at 1, so the vocabulary size is one more than the
# number of distinct tokens (slot 0 stays unused).
# embed_size is not referenced by the visible cells -- presumably the embedding
# dimensionality for a model defined later; confirm against its use.
embed_size = 100
vocab_size = 1 + len(word2id)

word2id

{'the': 1,
 'and': 2,
 'god': 3,
 '.': 4,
 'of': 5,
 'light': 6,
 'was': 7,
 ',': 8,
 ':': 9,
 'called': 10,
 'darkness': 11,
 'king': 12,
 'james': 13,
 'bible': 14,
 'first': 15,
 'earth': 16,
 'upon': 17,
 'face': 18,
 'there': 19,
 'day': 20,
 'old': 21,
 'testament': 22,
 'book': 23,
 'moses': 24,
 'genesis': 25,
 'in': 26,
 'beginning': 27,
 'created': 28,
 'heaven': 29,
 'without': 30,
 'form': 31,
 'void': 32,
 ';': 33,
 'deep': 34,
 'spirit': 35,
 'moved': 36,
 'waters': 37,
 'said': 38,
 'let': 39,
 'be': 40,
 'saw': 41,
 'that': 42,
 'it': 43,
 'good': 44,
 'divided': 45,
 'from': 46,
 'he': 47,
 'night': 48,
 'evening': 49,
 'morning': 50,
 'were': 51}

In [20]:
# Encode every sentence as a list of vocabulary ids. Tokens are lower-cased
# before the lookup because the keys of word2id are all lower-case.
wids = [
    [word2id[token.lower()] for token in sentence]
    for sentence in data
]

wids

[[1, 12, 13, 14],
 [1, 21, 22, 5, 1, 12, 13, 14],
 [1, 15, 23, 5, 24, 9, 10, 25],
 [26, 1, 27, 3, 28, 1, 29, 2, 1, 16],
 [2, 1, 16, 7, 30, 31, 8, 2, 32, 33, 2, 11, 7, 17, 1, 18, 5, 1, 34, 4],
 [2, 1, 35, 5, 3, 36, 17, 1, 18, 5, 1, 37, 4],
 [2, 3, 38, 8, 39, 19, 40, 6, 9, 2, 19, 7, 6, 4],
 [2, 3, 41, 1, 6, 8, 42, 43, 7, 44, 9, 2, 3, 45, 1, 6, 46, 1, 11, 4],
 [2, 3, 10, 1, 6, 20, 8, 2, 1, 11, 47, 10, 48, 4],
 [2, 1, 49, 2, 1, 50, 51, 1, 15, 20, 4]]

In [26]:
# generate skip-grams
# One (pairs, labels) result per sentence, using a context window of 3 tokens
# on either side of the target word.
# Keras' skipgrams draws its negative samples and shuffles the pairs through
# Python's global `random` module, and its own `seed` argument only affects
# the final shuffle. Seeding the global generator first makes this cell give
# the same pairs on every run (Restart & Run All included).
random.seed(42)
skip_grams = [skipgrams(wid, vocabulary_size=vocab_size, window_size=3) for wid in wids]

In [29]:
# Preview the pairs generated for the first sentence: the first [0] selects the
# sentence, the second [0] selects its list of (target, context) id pairs.
# The sample mixes true context pairs with randomly drawn negative ones (words
# that never occur in this sentence can show up as context); per the Keras
# docs the matching 1/0 labels are in skip_grams[0][1].
pairs = skip_grams[0][0]

# Slice instead of indexing over a fixed range(10): a sentence that yields
# fewer than ten pairs prints what it has rather than raising IndexError.
for target_id, context_id in pairs[:10]:
    print("({:s} ({:d}), {:s} ({:d}))".format(
          id2word[target_id], target_id,
          id2word[context_id], context_id))

(the (1), bible (14))
(the (1), , (8))
(the (1), be (40))
(bible (14), beginning (27))
(king (12), there (19))
(king (12), he (47))
(king (12), upon (17))
(king (12), the (1))
(james (13), testament (22))
(james (13), bible (14))
