# Task 1 – Basic Word Tokenization + Embedding Layer
1. Take a short set of sentences (5–6 sentences).
2. Tokenize them using the Keras `Tokenizer`.
3. Create an `Embedding` layer (e.g. `output_dim=8`) and run the sentences through a simple neural network.
4. Print the embedding output for each word.

📌 **Goal:** Understand how words turn into vectors inside the embedding layer.

In [13]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense


In [14]:
# Toy corpus: six sentences that the tokenizer is fitted on and that are later
# converted to integer sequences.
docs = [
    "Creating an immersive world for a story is like building a house from the ground up, brick by brick.",
    "Every detail, from the scent of the air to the texture of the cobblestones underfoot, contributes to its foundation.",
    "The goal is to make the setting feel so real that the reader can step inside, becoming a part of the landscape.",
    "A well-crafted world isnt just a backdrop its a character in its own right, with a history, a culture, and secrets waiting to be unearthed.",
    "When a story world is thoughtfully constructed, it enriches the plot and allows the characters to feel more authentic.",
    "Its this deep level of engagement that keeps readers turning pages long into the night.",
]

In [15]:
# Sanity check: the corpus is a plain Python list of strings.
print(type(docs))

<class 'list'>


In [16]:
# Peek at the first raw sentence before any tokenization.
docs[0]

'Creating an immersive world for a story is like building a house from the ground up, brick by brick.'

In [17]:
# Create a Keras Tokenizer with default settings and fit it on the corpus so it
# learns the vocabulary (word -> integer index) and per-word counts.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(docs)


In [18]:
# Word -> integer index mapping learned by the tokenizer. Indices start at 1 and
# the most frequent words receive the smallest indices.
tokenizer.word_index

{'the': 1,
 'a': 2,
 'to': 3,
 'of': 4,
 'its': 5,
 'world': 6,
 'is': 7,
 'story': 8,
 'from': 9,
 'brick': 10,
 'feel': 11,
 'that': 12,
 'and': 13,
 'creating': 14,
 'an': 15,
 'immersive': 16,
 'for': 17,
 'like': 18,
 'building': 19,
 'house': 20,
 'ground': 21,
 'up': 22,
 'by': 23,
 'every': 24,
 'detail': 25,
 'scent': 26,
 'air': 27,
 'texture': 28,
 'cobblestones': 29,
 'underfoot': 30,
 'contributes': 31,
 'foundation': 32,
 'goal': 33,
 'make': 34,
 'setting': 35,
 'so': 36,
 'real': 37,
 'reader': 38,
 'can': 39,
 'step': 40,
 'inside': 41,
 'becoming': 42,
 'part': 43,
 'landscape': 44,
 'well': 45,
 'crafted': 46,
 'isnt': 47,
 'just': 48,
 'backdrop': 49,
 'character': 50,
 'in': 51,
 'own': 52,
 'right': 53,
 'with': 54,
 'history': 55,
 'culture': 56,
 'secrets': 57,
 'waiting': 58,
 'be': 59,
 'unearthed': 60,
 'when': 61,
 'thoughtfully': 62,
 'constructed': 63,
 'it': 64,
 'enriches': 65,
 'plot': 66,
 'allows': 67,
 'characters': 68,
 'more': 69,
 'authentic': 70,

In [19]:
# Number of distinct words in the fitted vocabulary.
len(tokenizer.word_index)

81

In [20]:
# Word -> number of occurrences across the corpus, in order of first appearance.
tokenizer.word_counts

OrderedDict([('creating', 1),
             ('an', 1),
             ('immersive', 1),
             ('world', 3),
             ('for', 1),
             ('a', 9),
             ('story', 2),
             ('is', 3),
             ('like', 1),
             ('building', 1),
             ('house', 1),
             ('from', 2),
             ('the', 12),
             ('ground', 1),
             ('up', 1),
             ('brick', 2),
             ('by', 1),
             ('every', 1),
             ('detail', 1),
             ('scent', 1),
             ('of', 4),
             ('air', 1),
             ('to', 5),
             ('texture', 1),
             ('cobblestones', 1),
             ('underfoot', 1),
             ('contributes', 1),
             ('its', 4),
             ('foundation', 1),
             ('goal', 1),
             ('make', 1),
             ('setting', 1),
             ('feel', 2),
             ('so', 1),
             ('real', 1),
             ('that', 2),
             ('reader', 1),
 

In [21]:
# Number of documents (sentences) the tokenizer was fitted on.
tokenizer.document_count

6

In [22]:
# Encode each sentence as a list of integer word indices using the fitted
# vocabulary; the resulting lists have different lengths, one per sentence.
sequence=tokenizer.texts_to_sequences(docs)
sequence

[[14, 15, 16, 6, 17, 2, 8, 7, 18, 19, 2, 20, 9, 1, 21, 22, 10, 23, 10],
 [24, 25, 9, 1, 26, 4, 1, 27, 3, 1, 28, 4, 1, 29, 30, 31, 3, 5, 32],
 [1,
  33,
  7,
  3,
  34,
  1,
  35,
  11,
  36,
  37,
  12,
  1,
  38,
  39,
  40,
  41,
  42,
  2,
  43,
  4,
  1,
  44],
 [2,
  45,
  46,
  6,
  47,
  48,
  2,
  49,
  5,
  2,
  50,
  51,
  5,
  52,
  53,
  54,
  2,
  55,
  2,
  56,
  13,
  57,
  58,
  3,
  59,
  60],
 [61, 2, 8, 6, 7, 62, 63, 64, 65, 1, 66, 13, 67, 1, 68, 3, 11, 69, 70],
 [5, 71, 72, 73, 4, 74, 12, 75, 76, 77, 78, 79, 80, 1, 81]]

In [23]:
# One encoded sequence per input sentence.
len(sequence)

6

In [24]:
# Pad every encoded sentence with trailing zeros (padding='post') up to the
# length of the longest sentence so the batch becomes a rectangular array.
# The result gets its own name so it does not overwrite the `pad_sequences`
# function imported at the top of the notebook.
padded_sequences = pad_sequences(sequence, padding='post')
print(padded_sequences)


[[14 15 16  6 17  2  8  7 18 19  2 20  9  1 21 22 10 23 10  0  0  0  0  0
   0  0]
 [24 25  9  1 26  4  1 27  3  1 28  4  1 29 30 31  3  5 32  0  0  0  0  0
   0  0]
 [ 1 33  7  3 34  1 35 11 36 37 12  1 38 39 40 41 42  2 43  4  1 44  0  0
   0  0]
 [ 2 45 46  6 47 48  2 49  5  2 50 51  5 52 53 54  2 55  2 56 13 57 58  3
  59 60]
 [61  2  8  6  7 62 63 64 65  1 66 13 67  1 68  3 11 69 70  0  0  0  0  0
   0  0]
 [ 5 71 72 73  4 74 12 75 76 77 78 79 80  1 81  0  0  0  0  0  0  0  0  0
   0  0]]


In [25]:
# Derive the model dimensions from the fitted tokenizer and the encoded
# sentences instead of hard-coding values that do not match this data.
#
# Token ids run from 1 to len(word_index), and 0 is used for padding, so the
# embedding table needs len(word_index) + 1 rows; with one row fewer the
# highest token id would be out of range.
vocab_size = len(tokenizer.word_index) + 1
# Padding extends every sentence to the longest one, so that length is the
# time dimension the model receives.
max_len = max(len(seq) for seq in sequence)

model = Sequential()
# The sequence length is supplied through model.build() below, so the
# `input_length` argument is not needed on the Embedding layer.
# NOTE(review): the task description suggests output_dim=8; 128 is kept here.
model.add(Embedding(input_dim=vocab_size, output_dim=128))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))

model.build(input_shape=(None, max_len))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

