In [1]:
import numpy as np
import tensorflow as tf

In [2]:
#  Text to Word : Split

sentence = "The brown fox is quick and he is jumping over the lazy dog"

words = sentence.split()

print(words)

['The', 'brown', 'fox', 'is', 'quick', 'and', 'he', 'is', 'jumping', 'over', 'the', 'lazy', 'dog']


In [3]:
# Length of the senetence

print(len(words))

13


In [4]:
# Max length of the word in document
np.max([len(x) for x in words])

7

In [5]:
# Vocabulary List

from keras.preprocessing.text import text_to_word_sequence

sentence = "The brown fox is quick and he is jumping over the lazy dog"

vocab = set(text_to_word_sequence(sentence))
vocab_size = len(vocab)

print("Vocabulary:",vocab)
print("Vocabulary size:",vocab_size)

Using TensorFlow backend.


Vocabulary: {'is', 'brown', 'the', 'he', 'over', 'jumping', 'lazy', 'dog', 'and', 'quick', 'fox'}
Vocabulary size: 11


In [7]:
# Text to word Sequence

from keras.preprocessing.text import text

norm_bible = ['king james bible',
             'old testament king james bible',
             'first book moses called genesis',
             'beginning god created heaven earth',
             'earth without form void darkness upon face deep',
             'spirit god moved upon face waters',
             'god said let light light',
             'god saw light good god divided light darkness',
             'god called light day darkness called night',
             'evening morning first day']

[[w for w in text.text_to_word_sequence(doc)] for doc in norm_bible]


[['king', 'james', 'bible'],
 ['old', 'testament', 'king', 'james', 'bible'],
 ['first', 'book', 'moses', 'called', 'genesis'],
 ['beginning', 'god', 'created', 'heaven', 'earth'],
 ['earth', 'without', 'form', 'void', 'darkness', 'upon', 'face', 'deep'],
 ['spirit', 'god', 'moved', 'upon', 'face', 'waters'],
 ['god', 'said', 'let', 'light', 'light'],
 ['god', 'saw', 'light', 'good', 'god', 'divided', 'light', 'darkness'],
 ['god', 'called', 'light', 'day', 'darkness', 'called', 'night'],
 ['evening', 'morning', 'first', 'day']]

In [8]:
# Word Frequency (Word Count)

word_freq = {}

for tok in sentence.split():
    if tok in word_freq:
        word_freq[tok] +=1
    else:
        word_freq[tok] = 1

print(word_freq)

{'The': 1, 'brown': 1, 'fox': 1, 'is': 2, 'quick': 1, 'and': 1, 'he': 1, 'jumping': 1, 'over': 1, 'the': 1, 'lazy': 1, 'dog': 1}


In [9]:
# Word Frequency (Word Count)

import collections

counter = collections.Counter(sentence.split())

print(counter)

Counter({'is': 2, 'The': 1, 'brown': 1, 'fox': 1, 'quick': 1, 'and': 1, 'he': 1, 'jumping': 1, 'over': 1, 'the': 1, 'lazy': 1, 'dog': 1})


In [10]:
# Find Most Common word
counter.most_common(8)

[('is', 2),
 ('The', 1),
 ('brown', 1),
 ('fox', 1),
 ('quick', 1),
 ('and', 1),
 ('he', 1),
 ('jumping', 1)]

In [11]:
# Word Index

corpus = sentence.split()

uniq_text = set(corpus)

text_to_int = {}

for i, c in enumerate (uniq_text):
    text_to_int.update({c: i})
    
print(text_to_int)

{'is': 0, 'brown': 1, 'The': 2, 'the': 3, 'he': 4, 'over': 5, 'jumping': 6, 'lazy': 7, 'dog': 8, 'and': 9, 'quick': 10, 'fox': 11}


In [12]:
# Unique words in Senetnce

print(set(sentence.lower().split()))

{'is', 'brown', 'the', 'he', 'over', 'jumping', 'lazy', 'dog', 'and', 'quick', 'fox'}


In [13]:
# N Gram (n =2): Bigram

words =sentence.split()

n = 2
output= []
for i in range(len(words) - n+1):
    output.append(words[i:i+n])
    
output

[['The', 'brown'],
 ['brown', 'fox'],
 ['fox', 'is'],
 ['is', 'quick'],
 ['quick', 'and'],
 ['and', 'he'],
 ['he', 'is'],
 ['is', 'jumping'],
 ['jumping', 'over'],
 ['over', 'the'],
 ['the', 'lazy'],
 ['lazy', 'dog']]

In [14]:
# N Gram (n =3): Trigram

words =sentence.split()

n = 3
output= []
for i in range(len(words) - n+1):
    output.append(words[i:i+n])
    
output

[['The', 'brown', 'fox'],
 ['brown', 'fox', 'is'],
 ['fox', 'is', 'quick'],
 ['is', 'quick', 'and'],
 ['quick', 'and', 'he'],
 ['and', 'he', 'is'],
 ['he', 'is', 'jumping'],
 ['is', 'jumping', 'over'],
 ['jumping', 'over', 'the'],
 ['over', 'the', 'lazy'],
 ['the', 'lazy', 'dog']]

In [15]:
# Text to Integer

list1 = ['Located', 'on', 'the', 'southern', 'tip', 'of', 'Lake', 'Union,', 'the', 'Hilton']

text_int ={}

for i,j in enumerate(list1):
    text_int.update({j:(i+1)})
    
text_int

{'Located': 1,
 'on': 2,
 'the': 9,
 'southern': 4,
 'tip': 5,
 'of': 6,
 'Lake': 7,
 'Union,': 8,
 'Hilton': 10}

In [16]:
# Text to Integer

from keras.preprocessing import text

norm_bible = ['king james bible',
             'old testament king james bible',
             'first book moses called genesis',
             'beginning god created heaven earth',
             'earth without form void darkness upon face deep',
             'spirit god moved upon face waters',
             'god said let light light',
             'god saw light good god divided light darkness',
             'god called light day darkness called night',
             'evening morning first day']

tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(norm_bible)
word2id = tokenizer.word_index

word2id['PAD'] = 0
id2word = {v:k for k, v in word2id.items()}
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in norm_bible]

wids

[[5, 6, 7],
 [13, 14, 5, 6, 7],
 [8, 15, 16, 3, 17],
 [18, 1, 19, 20, 9],
 [9, 21, 22, 23, 4, 10, 11, 24],
 [25, 1, 26, 10, 11, 27],
 [1, 28, 29, 2, 2],
 [1, 30, 2, 31, 1, 32, 2, 4],
 [1, 3, 2, 12, 4, 3, 33],
 [34, 35, 8, 12]]

In [19]:
# Binary One Hot Encoder for One Senetence

from sklearn.preprocessing import OneHotEncoder
from keras.preprocessing.text import Tokenizer

# Bag of Word

# One Hot Encoding

doc = "Can I eat the Pizza".lower().split()

t = Tokenizer()
# fit the tokenizer on the documents
t.fit_on_texts(doc)

# integer encode documents
encoded_docs = t.texts_to_matrix(doc, mode='count')

encoded_docs

array([[0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.]])

In [20]:
# One Hot Ending for Multiple Sentence

doc = [['can', 'i', 'eat', 'the', 'pizza'],['can', 'i', 'eat', 'the', 'pizza']]


t = Tokenizer()
# fit the tokenizer on the documents
t.fit_on_texts(doc)

# integer encode documents
encoded_docs = t.texts_to_matrix(doc, mode='count')

encoded_docs

array([[0., 1., 1., 1., 1., 1.],
       [0., 1., 1., 1., 1., 1.]])

In [21]:
# One Hot Encoding : Binary

from keras.preprocessing.text import Tokenizer

# define 5 documents

docs = ['Well done!',
        'Good work',
        'Great effort',
        'nice work',
        'Excellent!']

# create the tokenizer
t = Tokenizer()

# fit the tokenizer on the documents
t.fit_on_texts(docs)

# integer encode documents
encoded_docs = t.texts_to_matrix(docs, mode='count')

print(encoded_docs)

# Word Index

#'work': 1,
# 'well': 2,
# 'done': 3,
# 'good': 4,
# 'great': 5,
# 'effort': 6,
# 'nice': 7,
# 'excellent': 8}

[[0. 0. 1. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1.]]


In [22]:
# Multi Dimenstional One Hot Encosing

from keras.utils import to_categorical

x = [[1,2,3,4,5],
     [1,4,8,4,4],
     [1,2,3,3,5]]

y = to_categorical(x, num_classes=10)

y

array([[[0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]],

       [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]],

       [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]]], dtype=float32)

In [23]:
from sklearn.preprocessing import LabelEncoder

doc =  ['S','C','Q','S','C','Q','S','C','Q','Q','C','Q','C','Q','Q']

label_encoder = LabelEncoder()

label_encoder.fit(['S','C','Q'])

label_encoder.transform(doc)

array([2, 0, 1, 2, 0, 1, 2, 0, 1, 1, 0, 1, 0, 1, 1], dtype=int64)

In [24]:
# OrdinalEncoder converts each string value to a whole number. 
# The first unique value in your column becomes 1, the second becomes 2, the third becomes 3, and so on.

from sklearn.preprocessing import OrdinalEncoder
import numpy as np

doc =  [['S'],['C'],['Q'],['S'],['C'],['Q'],['S'],['C'],['Q'],['Q'],['C'],['Q'],['C'],['Q'],['Q']]

encoder = OrdinalEncoder()

encoder.fit_transform(doc)

array([[2.],
       [0.],
       [1.],
       [2.],
       [0.],
       [1.],
       [2.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [1.]])

In [25]:
# OrdinalEncoder

import numpy as np

X = np.array([["good", "london"],
             ["good", "tokyo"],
             ["bad", "paris"],
             ["average", "so so"],
             ["good", "tokyo"]])

encoder = OrdinalEncoder()
encoder.fit_transform(X)


array([[2., 0.],
       [2., 3.],
       [1., 1.],
       [0., 2.],
       [2., 3.]])

In [26]:
# Count Vectorizer

from sklearn.feature_extraction.text import CountVectorizer

docs = np.array(['The sun is shining',
                 'The weather is sweet',
                 'The sun is shining and the weather is sweet'])

count = CountVectorizer()
bag = count.fit_transform(docs)

bag.toarray()

# Index

# and = 0
# is =1
# shining = 2
# sun =3
# sweet =4
# the =5
# wether =6

array([[0, 1, 1, 1, 0, 1, 0],
       [0, 1, 0, 0, 1, 1, 1],
       [1, 2, 1, 1, 1, 2, 1]], dtype=int64)

In [27]:
# TFIDF

# tf-idf = tf(i,d) * idf(t,d)

# td(i,d) = term frequency

# idf(t,d) = 	log(n/1+df(t,d)) :  inverse document frequency  df(t,d) : no. of documents d that contain the term t

import pandas as pd

corpus = ["Data Science is an overlap between Arts and Science",
          "Generally,Arts graduates are right-brained and Science graduates are left-brained",
          "Excelling in both Arts and Science at a time becomes difficult",
          "Natural Language Processing is a part of Data Science"]

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_model = TfidfVectorizer()
features = tfidf_model.fit_transform(corpus)
df = pd.DataFrame(features.todense(),columns= sorted(tfidf_model.vocabulary_))

df

Unnamed: 0,an,and,are,arts,at,becomes,between,both,brained,data,...,language,left,natural,of,overlap,part,processing,right,science,time
0,0.403328,0.257439,0.0,0.257439,0.0,0.0,0.403328,0.0,0.0,0.317989,...,0.0,0.0,0.0,0.0,0.403328,0.0,0.0,0.0,0.420947,0.0
1,0.0,0.159139,0.498644,0.159139,0.0,0.0,0.0,0.0,0.498644,0.0,...,0.0,0.249322,0.0,0.0,0.0,0.0,0.0,0.249322,0.130107,0.0
2,0.0,0.224449,0.0,0.224449,0.351643,0.351643,0.0,0.351643,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.183502,0.351643
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.308872,...,0.391765,0.0,0.391765,0.391765,0.0,0.391765,0.391765,0.0,0.204439,0.0


In [28]:
# Sequence from List

In [29]:
# Pad Sequence (Pre)

from tensorflow.keras.preprocessing.sequence import pad_sequences

raw_inputs = [[83, 91, 1, 645, 1253, 927],
              [73, 8, 3215, 55, 927],
              [711, 632, 71]]

padded_inputs = pad_sequences(raw_inputs,padding='pre')

padded_inputs

array([[  83,   91,    1,  645, 1253,  927],
       [   0,   73,    8, 3215,   55,  927],
       [   0,    0,    0,  711,  632,   71]])

In [30]:
# Pad Sequence (Post)

raw_inputs = [[83, 91, 1, 645, 1253, 927],
              [73, 8, 3215, 55, 927],
              [711, 632, 71]]

padded_inputs = pad_sequences(raw_inputs,padding='post')

padded_inputs

array([[  83,   91,    1,  645, 1253,  927],
       [  73,    8, 3215,   55,  927,    0],
       [ 711,  632,   71,    0,    0,    0]])

In [31]:
a = [[24],
     [24, 34],
     [24, 34, 1],
     [24, 34, 1, 9],
     [24, 34, 1, 9, 56],
     [24, 34, 1, 9, 56, 76],
     [24, 34, 1, 9, 56, 76, 90],
     [24, 34, 1, 9, 56, 76, 90, 11],
     [24, 34, 1, 9, 56, 76, 90, 11, 67],
     [24, 34, 1, 9, 56, 76, 90, 11, 67, 54],
     [24, 34, 1, 9, 56, 76, 90, 11, 67, 54, 14]]

import numpy as np
np.max([len(x) for x in a])

row = 11
column = len(a)

tensor = np.zeros((row,column))

for i in range(len(a)):
    lst = a[i]
    for j in range(len(lst)):
        tensor[i][j] = lst[j]

In [32]:
# N Gram from Sequence List

lst = [24,34,1,9,56,76,90,11,67,54,14]

data = []

for i in range(0,len(lst)):
    print(lst[:i+1])
    data.append(lst[:i+1])

# Padding
pad_sequences(data,padding='post')

[24]
[24, 34]
[24, 34, 1]
[24, 34, 1, 9]
[24, 34, 1, 9, 56]
[24, 34, 1, 9, 56, 76]
[24, 34, 1, 9, 56, 76, 90]
[24, 34, 1, 9, 56, 76, 90, 11]
[24, 34, 1, 9, 56, 76, 90, 11, 67]
[24, 34, 1, 9, 56, 76, 90, 11, 67, 54]
[24, 34, 1, 9, 56, 76, 90, 11, 67, 54, 14]


array([[24,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [24, 34,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [24, 34,  1,  0,  0,  0,  0,  0,  0,  0,  0],
       [24, 34,  1,  9,  0,  0,  0,  0,  0,  0,  0],
       [24, 34,  1,  9, 56,  0,  0,  0,  0,  0,  0],
       [24, 34,  1,  9, 56, 76,  0,  0,  0,  0,  0],
       [24, 34,  1,  9, 56, 76, 90,  0,  0,  0,  0],
       [24, 34,  1,  9, 56, 76, 90, 11,  0,  0,  0],
       [24, 34,  1,  9, 56, 76, 90, 11, 67,  0,  0],
       [24, 34,  1,  9, 56, 76, 90, 11, 67, 54,  0],
       [24, 34,  1,  9, 56, 76, 90, 11, 67, 54, 14]])

In [33]:
# Truncating Sequence (Pre)

import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

data_vec = [['3', '18', '9', '3', '11', '5', '20'],
            ['3', '8', '1', '12'],
            ['18', '1', '8', '1'],
            ['8', '1', '9', '14'],
            ['25', '1', '8', '1'],
            ['9','3']]

# truncate sequence
truncated= pad_sequences(data_vec, maxlen=2)
print(truncated)

[[ 5 20]
 [ 1 12]
 [ 8  1]
 [ 9 14]
 [ 8  1]
 [ 9  3]]


In [34]:
# Truncating Sequence (Post)

# truncate sequence
truncated= pad_sequences(data_vec, maxlen=2,truncating='post')
print(truncated)

[[ 3 18]
 [ 3  8]
 [18  1]
 [ 8  1]
 [25  1]
 [ 9  3]]


In [35]:
# Frequency Distribution

In [36]:
# Union

document1 = ['Hi', 'How','are','You', 'I','am']
document2 = ['Hello','I','am','fine','How','about','You']

#Union : words from two documents
print("Union",set(document1) | set(document2))

Union {'are', 'You', 'Hello', 'about', 'Hi', 'am', 'fine', 'How', 'I'}


In [37]:
# Intersection

document1 = ['Hi', 'How','are','You', 'I','am']
document2 = ['Hello','I','am','fine','How','about','You']

#Intersection : common words across two documents
print("Intersection:",set(document1) & set(document2))       
            

Intersection: {'You', 'am', 'How', 'I'}


In [38]:
# Sequence to Matrix (mode = binary and num_words = 10)

from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=10)

x_train = [[1,2,3,4,1],
           [4,5,],
           [6,7,8]]

x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')

x_train

array([[0., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 1., 1., 0.]])

In [39]:
# Sequences to Matrix (mode = count and num_words = 10)
tokenizer = Tokenizer(num_words=10)

x_train = [[1,2,3,4,1],
           [4,5,],
           [6,7,8]]

x_train = tokenizer.sequences_to_matrix(x_train, mode='count')

x_train

array([[0., 2., 1., 1., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 1., 1., 0.]])

In [40]:
# Sequences to Matrix (mode = freq and num_words = 10)
tokenizer = Tokenizer(num_words=10)

x_train = [[1,2,3,4,1],
           [4,5,],
           [6,7,8]]

x_train = tokenizer.sequences_to_matrix(x_train, mode='freq')

import pandas as pd

pd.DataFrame(x_train)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.0,0.4,0.2,0.2,0.2,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.5,0.5,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.333333,0.333333,0.0


In [46]:
import string

all_letters = string.ascii_letters + ".,;''"

print("all_letters = {0}".format(all_letters))

n_letters = len(all_letters)

all_letters = abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,;''


In [47]:
# Genrate Input for the RNN
import numpy as np

def name_intensor(name):
    name_in_tensor = np.zeros((len(name),1,n_letters))
    
    for i,letter in enumerate(name):
        name_in_tensor[i][0][all_letters.find(letter)] = 1
    
    return name_in_tensor

In [48]:
print(name_intensor('a'))
print(name_intensor('a').shape)

[[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
   0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
   0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]]
(1, 1, 57)


In [49]:
print(name_intensor('af'))
print(name_intensor('af').shape)

[[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
   0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
   0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

 [[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
   0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
   0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]]
(2, 1, 57)


In [50]:
print(name_intensor('anand'))
print(name_intensor('anand').shape)

[[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
   0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
   0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

 [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
   0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
   0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

 [[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
   0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
   0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

 [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
   0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
   0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

 [[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
   0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
   0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]]
(5, 1, 57)


In [52]:
# CBOW

from keras.preprocessing import text

norm_bible = ['beginning god created heaven earth',
             'earth without form void darkness upon face deep',
             'spirit god moved upon face waters']

tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(norm_bible)
word2id = tokenizer.word_index

wids = [[w for w in text.text_to_word_sequence(doc)] for doc in norm_bible]

wids

[['beginning', 'god', 'created', 'heaven', 'earth'],
 ['earth', 'without', 'form', 'void', 'darkness', 'upon', 'face', 'deep'],
 ['spirit', 'god', 'moved', 'upon', 'face', 'waters']]

In [53]:
from keras.preprocessing import text

tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(norm_bible)
word2id = tokenizer.word_index

word2id

# build vocabulary of unique words

word2id['PAD'] = 0
id2word = {v:k for k, v in word2id.items()}
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in norm_bible]

wids

[[5, 1, 6, 7, 2], [2, 8, 9, 10, 11, 3, 4, 12], [13, 1, 14, 3, 4, 15]]

In [60]:
vocab_size = len(word2id)
embed_size = 100
window_size = 2 # context window size

vocab_size

16

In [61]:
window_size = 2

for words in wids:
    print(words)
    sentence_length = len(words)
    for index,word in enumerate(words):
        print("sentence_length:",sentence_length)
        print("WindowSize:",window_size)
        print("index:",index)
        print("Word at index {0} : '{1}'".format(index,word))
        
        context_words =[]
        context_words1 = []
       
        print("start = index - window_size")
        print("end = index + window_size + 1")
        
        start = index - window_size
        end = index + window_size + 1
        

        print("start:",start)
        print("end:",end)
       
        print("Choosing context word based on condition:")
        
        print([x for x in range(start, end)])
        for i in range(start, end):
            
            print("range index:",i)
            
            if 0 <= i:
                print("Statisfied the condition '0 <= i' ")
                
                if i < sentence_length:
                    print("Statisfied the condition 'i < sentence_length' ")
                    
                    if i != index:
                        print("Statisfied the condition 'i != index'")
                        print("Selected Word: '{0}'".format(words[i]))
            
            if  0 <= i < sentence_length and i != index:
                print("Word at range index at {0} is '{1}'".format(i,words[i]))
                context_words.append(words[i])
                
        
        print("Input:",context_words)   
        print("Output:",words[index])
        
        print("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")

    
        context_words1.append([words[i] for i in range(start, end) if 0 <= i < sentence_length and i != index])
        
        
        
        print("context_words :",context_words)
        print("context_words1 :",context_words1)
        print("*****************************")
    print("###############################################################################")

[5, 1, 6, 7, 2]
sentence_length: 5
WindowSize: 2
index: 0
Word at index 0 : '5'
start = index - window_size
end = index + window_size + 1
start: -2
end: 3
Choosing context word based on condition:
[-2, -1, 0, 1, 2]
range index: -2
range index: -1
range index: 0
Statisfied the condition '0 <= i' 
Statisfied the condition 'i < sentence_length' 
range index: 1
Statisfied the condition '0 <= i' 
Statisfied the condition 'i < sentence_length' 
Statisfied the condition 'i != index'
Selected Word: '1'
Word at range index at 1 is '1'
range index: 2
Statisfied the condition '0 <= i' 
Statisfied the condition 'i < sentence_length' 
Statisfied the condition 'i != index'
Selected Word: '6'
Word at range index at 2 is '6'
Input: [1, 6]
Output: 5
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
context_words : [1, 6]
context_words1 : [[1, 6]]
*****************************
sentence_length: 5
WindowSize: 2
index: 1
Word at index 1 : '1'
start = index - window_size
end =

In [62]:
from keras.preprocessing import sequence
from keras.utils import np_utils

window_size = 2

def generator():
    for words in wids:
        sentence_length = len(words)
        for index,word in enumerate(words):
            context_words =[]
            label_word = []

            start = index - window_size
            end = index + window_size + 1

            context_words.append([words[i] for i in range(start, end) if 0 <= i < sentence_length and i != index])

            label_word.append(word)
            
            x = sequence.pad_sequences(context_words, maxlen=7)
            y = np_utils.to_categorical(label_word, vocab_size)

            yield (x, y)

In [63]:
for x ,y in generator():
    print("input: {0}  output: {1}".format(x,y))

input: [[0 0 0 0 0 1 6]]  output: [[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
input: [[0 0 0 0 5 6 7]]  output: [[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
input: [[0 0 0 5 1 7 2]]  output: [[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
input: [[0 0 0 0 1 6 2]]  output: [[0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]]
input: [[0 0 0 0 0 6 7]]  output: [[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
input: [[0 0 0 0 0 8 9]]  output: [[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
input: [[ 0  0  0  0  2  9 10]]  output: [[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]]
input: [[ 0  0  0  2  8 10 11]]  output: [[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]]
input: [[ 0  0  0  8  9 11  3]]  output: [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]]
input: [[ 0  0  0  9 10  3  4]]  output: [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]]
input: [[ 0  0  0 10 11  4 12]]  output: [[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
input: [[ 0  0  0  

In [None]:
# Skip Gram

In [64]:
from tensorflow.keras.preprocessing.sequence import skipgrams

# generate skip-grams
skip_grams = [skipgrams(wid, vocabulary_size=vocab_size, window_size=10) for wid in wids]

# view sample skip-grams
pairs, labels = skip_grams[0][0], skip_grams[0][1]
for i in range(10):
    print("({:s} ({:d}), {:s} ({:d})) -> {:d}".format(
          id2word[pairs[i][0]], pairs[i][0], 
          id2word[pairs[i][1]], pairs[i][1], 
          labels[i]))

(heaven (7), beginning (5)) -> 1
(heaven (7), created (6)) -> 1
(god (1), beginning (5)) -> 1
(heaven (7), earth (2)) -> 1
(beginning (5), heaven (7)) -> 1
(beginning (5), god (1)) -> 0
(beginning (5), void (10)) -> 0
(beginning (5), god (1)) -> 1
(beginning (5), earth (2)) -> 1
(god (1), created (6)) -> 1


In [65]:
from tensorflow.keras.preprocessing import text

tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(norm_bible)

word2id = tokenizer.word_index
id2word = {v:k for k, v in word2id.items()}

vocab_size = len(word2id) + 1 
embed_size = 100

wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in norm_bible]
print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:10])

Vocabulary Size: 16
Vocabulary Sample: [('god', 1), ('earth', 2), ('upon', 3), ('face', 4), ('beginning', 5), ('created', 6), ('heaven', 7), ('without', 8), ('form', 9), ('void', 10)]
