#### Word Embedding

In [6]:
import os
from keras.preprocessing.text import one_hot
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [7]:
docs = ['Well done!',
 'Good work',
 'Great effort',
 'nice work',
 'Excellent!',
 'Weak',
 'Poor effort!',
 'not good',
 'poor work',
 'Could have done better.']
# define class labels
labels = array([1,1,1,1,1,0,0,0,0,0])

In [11]:
one_hot('Hello how are you', 3)

[2, 2, 1, 1]

In [12]:
vocab_size = 50
encoded_docs = [one_hot(d, vocab_size) for d in docs]
encoded_docs

[[41, 11],
 [30, 3],
 [1, 32],
 [35, 3],
 [8],
 [15],
 [6, 32],
 [14, 30],
 [6, 3],
 [32, 20, 11, 27]]

In [13]:
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
padded_docs

array([[41, 11,  0,  0],
       [30,  3,  0,  0],
       [ 1, 32,  0,  0],
       [35,  3,  0,  0],
       [ 8,  0,  0,  0],
       [15,  0,  0,  0],
       [ 6, 32,  0,  0],
       [14, 30,  0,  0],
       [ 6,  3,  0,  0],
       [32, 20, 11, 27]])

In [21]:
model = Sequential()
model.add(Embedding(vocab_size, 3, input_length=max_length,name='emb'))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

In [22]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print(model.summary())

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 emb (Embedding)             (None, 4, 3)              150       
                                                                 
 flatten_3 (Flatten)         (None, 12)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 13        
                                                                 
Total params: 163
Trainable params: 163
Non-trainable params: 0
_________________________________________________________________
None


In [23]:
model.fit(padded_docs, labels, epochs=50, verbose=0)

<keras.callbacks.History at 0x2bd6c12b580>

In [24]:
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 69.999999


In [27]:
weights=model.get_layer('emb').get_weights()[0]

In [28]:
weights[41]

array([ 0.09815478, -0.04210135, -0.08211587], dtype=float32)

In [29]:
weights[1]

array([ 0.0942912 , -0.06128442, -0.02740877], dtype=float32)

#### Word2Vec

In [30]:
import gensim
import pandas as pd

In [31]:
df = pd.read_json("E:/DS/Datasets/Cell_Phones_and_Accessories_5.json", lines=True)
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A30TL5EWN6DFXT,120401325X,christina,"[0, 0]",They look good and stick good! I just don't li...,4,Looks Good,1400630400,"05 21, 2014"
1,ASY55RVNIL0UD,120401325X,emily l.,"[0, 0]",These stickers work like the review says they ...,5,Really great product.,1389657600,"01 14, 2014"
2,A2TMXE2AFO7ONB,120401325X,Erica,"[0, 0]",These are awesome and make my phone look so st...,5,LOVE LOVE LOVE,1403740800,"06 26, 2014"
3,AWJ0WZQYMYFQ4,120401325X,JM,"[4, 4]",Item arrived in great time and was in perfect ...,4,Cute!,1382313600,"10 21, 2013"
4,ATX7CZYFXI1KW,120401325X,patrice m rogoza,"[2, 3]","awesome! stays on, and looks great. can be use...",5,leopard home button sticker for iphone 4s,1359849600,"02 3, 2013"


In [35]:
len(df.reviewText[0])

189

#### Simple Preprocessing & Tokenization
The first thing to do for any data science task is to clean the data. For NLP, we apply various processing like converting all the words to lower case, trimming spaces, removing punctuations. This is something we will do over here too.

Additionally, we can also remove stop words like 'and', 'or', 'is', 'the', 'a', 'an' and convert words to their root forms like 'running' to 'run'.

In [34]:
len(gensim.utils.simple_preprocess(df.reviewText[0]))

33

In [36]:
review_text = df.reviewText.apply(gensim.utils.simple_preprocess)
review_text

0         [they, look, good, and, stick, good, just, don...
1         [these, stickers, work, like, the, review, say...
2         [these, are, awesome, and, make, my, phone, lo...
3         [item, arrived, in, great, time, and, was, in,...
4         [awesome, stays, on, and, looks, great, can, b...
                                ...                        
194434    [works, great, just, like, my, original, one, ...
194435    [great, product, great, packaging, high, quali...
194436    [this, is, great, cable, just, as, good, as, t...
194437    [really, like, it, becasue, it, works, well, w...
194438    [product, as, described, have, wasted, lot, of...
Name: reviewText, Length: 194439, dtype: object

In [37]:
review_text.loc[0]

['they',
 'look',
 'good',
 'and',
 'stick',
 'good',
 'just',
 'don',
 'like',
 'the',
 'rounded',
 'shape',
 'because',
 'was',
 'always',
 'bumping',
 'it',
 'and',
 'siri',
 'kept',
 'popping',
 'up',
 'and',
 'it',
 'was',
 'irritating',
 'just',
 'won',
 'buy',
 'product',
 'like',
 'this',
 'again']

#### Training the Word2Vec Model
Train the model for reviews. Use a window of size 10 i.e. 10 words before the present word and 10 words ahead. A sentence with at least 2 words should only be considered, configure this using min_count parameter.

Workers define how many CPU threads to be used.

In [38]:
model = gensim.models.Word2Vec(
    window=10,
    min_count=2,
    workers=4,
)

In [39]:
model.build_vocab(review_text, progress_per=1000)

In [40]:
model.corpus_count

194439

In [41]:
model.epochs

5

In [42]:
model.train(review_text, total_examples=model.corpus_count, epochs=model.epochs)

(61507941, 83868975)

In [43]:
model.save("E:/DS/Datasets/word2vec-amazon-cell-accessories-reviews-short.model")

In [44]:
model.wv.most_similar("bad")

[('terrible', 0.6802547574043274),
 ('shabby', 0.6135326027870178),
 ('horrible', 0.5993013381958008),
 ('good', 0.5745214819908142),
 ('awful', 0.5598459243774414),
 ('crappy', 0.5416638851165771),
 ('cheap', 0.5270234942436218),
 ('okay', 0.5225250720977783),
 ('funny', 0.5218287706375122),
 ('legit', 0.5145692825317383)]

In [45]:
model.wv.similarity(w1="cheap", w2="inexpensive")

0.4994413

In [46]:
model.wv.similarity(w1="great", w2="good")

0.7878908

##### Tensorflow Pipeline

In [47]:
import tensorflow as tf

In [48]:
daily_sales_numbers = [21, 22, -108, 31, -1, 32, 34,31]

tf_dataset = tf.data.Dataset.from_tensor_slices(daily_sales_numbers)
tf_dataset

<TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int32, name=None)>

In [49]:
for sales in tf_dataset:
    print(sales.numpy())

21
22
-108
31
-1
32
34
31


In [50]:
for sales in tf_dataset.as_numpy_iterator():
    print(sales)

21
22
-108
31
-1
32
34
31


In [51]:
for sales in tf_dataset.take(3):
    print(sales.numpy())

21
22
-108


In [52]:
tf_dataset = tf_dataset.filter(lambda x: x>0)
for sales in tf_dataset.as_numpy_iterator():
    print(sales)

21
22
31
32
34
31


In [53]:
tf_dataset = tf_dataset.map(lambda x: x*72)
for sales in tf_dataset.as_numpy_iterator():
    print(sales)

1512
1584
2232
2304
2448
2232


In [55]:
tf_dataset = tf_dataset.shuffle(2)
for sales in tf_dataset.as_numpy_iterator():
    print(sales)

1584
2304
2232
2232
1512
2448


In [56]:
for sales_batch in tf_dataset.batch(2):
    print(sales_batch.numpy())

[1512 2304]
[2448 2232]
[1584 2232]


In [57]:
tf_dataset = tf.data.Dataset.from_tensor_slices(daily_sales_numbers)

tf_dataset = tf_dataset.filter(lambda x: x>0).map(lambda y: y*72).shuffle(2).batch(2)
for sales in tf_dataset.as_numpy_iterator():
    print(sales)

[1584 2232]
[1512 2448]
[2304 2232]


In [58]:
images_ds = tf.data.Dataset.list_files('E:DS/Datasets/catdog/*/*', shuffle=False)

In [59]:
image_count = len(images_ds)
image_count

198

In [60]:
for file in images_ds.take(3):
    print(file.numpy())

b'E:DS\\Datasets\\catdog\\cat\\00tb-cats1-mediumSquareAt3X.jpg'
b'E:DS\\Datasets\\catdog\\cat\\016f72c5812e1b8f71bdbf19d8c7558b.jpg'
b'E:DS\\Datasets\\catdog\\cat\\07CAT-STRIPES-mediumSquareAt3X-v2.jpg'


In [61]:
images_ds = images_ds.shuffle(200)
for file in images_ds.take(3):
    print(file.numpy())

b'E:DS\\Datasets\\catdog\\cat\\maxresdefault (2).jpg'
b'E:DS\\Datasets\\catdog\\dog\\15014.jpg'
b'E:DS\\Datasets\\catdog\\dog\\golden-retriever-sitting-in-front-of-a-white-background.jpg'


In [62]:
class_names = ["cat","dog"]

In [63]:
train_size = int(image_count*0.8)
train_ds = images_ds.take(train_size)
test_ds = images_ds.skip(train_size)

In [64]:
len(train_ds)

158

In [65]:
len(test_ds)

40

In [68]:
s='Hi\\are\\you'
s.split('\\')[-2]

'are'

In [69]:
def get_label(file_path):
    import os
    parts = tf.strings.split(file_path, os.path.sep)
    return parts[-2]

In [70]:
def process_image(file_path):
    label = get_label(file_path)
    img = tf.io.read_file(file_path) # load the raw data from the file as a string
    img = tf.image.decode_jpeg(img)
    img = tf.image.resize(img, [128, 128])
    return img, label

In [71]:
train_ds = train_ds.map(process_image)
test_ds = test_ds.map(process_image)

In [72]:
def scale(image, label):
    return image/255, label

In [73]:
train_ds = train_ds.map(scale)

#### Optimize tensorflow pipeline performance with prefetch and caching

In [74]:
import tensorflow as tf
import time

In [76]:
class FileDataset(tf.data.Dataset):
    def read_file_in_batches(num_samples):
        # Opening the file
        time.sleep(0.03)

        for sample_idx in range(num_samples):
            # Reading data (line, record) from the file
            time.sleep(0.015)

            yield (sample_idx,)

    def __new__(cls, num_samples=3):
        return tf.data.Dataset.from_generator(
            cls.read_file_in_batches,
            output_signature = tf.TensorSpec(shape = (1,), dtype = tf.int64),
            args=(num_samples,)
        )

In [77]:
def benchmark(dataset, num_epochs=2):
    for epoch_num in range(num_epochs):
        for sample in dataset:
            # Performing a training step
            time.sleep(0.01)

In [78]:
%%timeit
benchmark(FileDataset())

574 ms ± 34.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [79]:
%%timeit
benchmark(FileDataset().prefetch(1))

526 ms ± 38 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [80]:
%%timeit
benchmark(FileDataset().prefetch(tf.data.AUTOTUNE))

515 ms ± 91.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [81]:
dataset = tf.data.Dataset.range(5)
dataset = dataset.map(lambda x: x**2)
dataset = dataset.cache("mycache.txt")
# The first time reading through the data will generate the data using
# `range` and `map`.
list(dataset.as_numpy_iterator())

[0, 1, 4, 9, 16]

In [82]:
def mapped_function(s):
    # Do some hard pre-processing
    tf.py_function(lambda: time.sleep(0.03), [], ())
    return s

In [83]:
%%timeit -r1 -n1
benchmark(FileDataset().map(mapped_function), 5)

1.93 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [84]:
%%timeit -r1 -n1
benchmark(FileDataset().map(mapped_function).cache(), 5)

628 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
