In [2]:
# Ensure these installations are done if not already:
# !pip install gensim transformers torch

import nltk
import re
import string
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('punkt_tab')

sentences = [
    "Turn on the living room light",
    "Increase the thermostat temperature",
    "Sensor reading delayed due to network issue",
    "Check humidity levels in the greenhouse"
]

# Basic preprocessing: lowercase, remove punctuation, tokenize
def simple_preprocess(text):
    text = text.lower()
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    tokens = word_tokenize(text)
    return tokens

tokenized_sentences = [simple_preprocess(s) for s in sentences]
print("Tokenized Sentences:")
print(tokenized_sentences)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...


Tokenized Sentences:
[['turn', 'on', 'the', 'living', 'room', 'light'], ['increase', 'the', 'thermostat', 'temperature'], ['sensor', 'reading', 'delayed', 'due', 'to', 'network', 'issue'], ['check', 'humidity', 'levels', 'in', 'the', 'greenhouse']]


[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [3]:
# Get vocabulary
all_tokens = [token for sent in tokenized_sentences for token in sent]
vocab = list(set(all_tokens))
vocab.sort()
print("Vocabulary:", vocab)

# Create a mapping from word to index
word_to_index = {w: i for i, w in enumerate(vocab)}

def one_hot_encode(token, word_to_index):
    vector = [0]*(len(word_to_index))
    if token in word_to_index:
        vector[word_to_index[token]] = 1
    return vector

# Example: encode "thermostat"
thermostat_vector = one_hot_encode("thermostat", word_to_index)
print("One-hot for 'thermostat':", thermostat_vector)


Vocabulary: ['check', 'delayed', 'due', 'greenhouse', 'humidity', 'in', 'increase', 'issue', 'levels', 'light', 'living', 'network', 'on', 'reading', 'room', 'sensor', 'temperature', 'the', 'thermostat', 'to', 'turn']
One-hot for 'thermostat': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]


In [4]:
from gensim.models import Word2Vec

# Train a Word2Vec model
w2v_model = Word2Vec(sentences=tokenized_sentences, vector_size=50, window=5, min_count=1, workers=4)
w2v_model.train(tokenized_sentences, total_examples=w2v_model.corpus_count, epochs=10)

# Get embedding for a word
thermostat_embedding = w2v_model.wv["thermostat"]
print("Word2Vec embedding for 'thermostat':\n", thermostat_embedding)




Word2Vec embedding for 'thermostat':
 [-0.01427785  0.00248062 -0.01435238 -0.00448586  0.00743716  0.01166521
  0.00240026  0.00420425 -0.00822123  0.01444707 -0.01261343  0.00929421
 -0.01643505  0.00406795 -0.0099551  -0.00849571 -0.00621964  0.01130703
  0.01159536 -0.00995255  0.0015448  -0.01698785  0.01562211  0.0185174
 -0.00548613  0.00159988  0.00148695  0.01095375 -0.01721281  0.00116747
  0.01374174  0.00446148  0.00225555 -0.01864281  0.01696442 -0.0125275
 -0.00598699  0.00698934 -0.00154697  0.00282321  0.00356525 -0.01365327
 -0.01945076  0.01807933  0.01240023 -0.01382502  0.00680445  0.00041468
  0.00950904 -0.01424117]


In [5]:
print("Words similar to 'temperature':", w2v_model.wv.most_similar("temperature"))


Words similar to 'temperature': [('room', 0.22449137270450592), ('due', 0.12598390877246857), ('to', 0.11852198094129562), ('the', 0.09997450560331345), ('turn', 0.09713273495435715), ('light', 0.08990946412086487), ('in', 0.058561909943819046), ('check', 0.0489499568939209), ('greenhouse', 0.0013737966073676944), ('increase', -0.0013596273493021727)]


In [6]:
from transformers import BertTokenizer, BertModel
import torch

# Load a pretrained BERT model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Encode a single sentence
sentence = "Turn on the living room light"
inputs = tokenizer(sentence, return_tensors='pt')

# Get hidden states from BERT
with torch.no_grad():
    outputs = model(**inputs)
    # outputs is a tuple: (last_hidden_state, pooler_output, hidden_states)
    last_hidden_state = outputs.last_hidden_state

# last_hidden_state[0] is a matrix: [number_of_tokens x embedding_size]
# For 'bert-base-uncased', embedding_size=768
print("Shape of contextual embeddings:", last_hidden_state.shape)

# Retrieve the embedding for the token "light"
tokens = tokenizer.tokenize(sentence)
light_index = tokens.index("light") + 1  # +1 for CLS token offset
light_embedding = last_hidden_state[0, light_index, :].numpy()
print("Contextual embedding for 'light':\n", light_embedding[:10], "...")  # Show first 10 dimensions for brevity


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Shape of contextual embeddings: torch.Size([1, 8, 768])
Contextual embedding for 'light':
 [ 0.16429132 -0.39750677 -0.19628845 -0.12849177  0.9441986  -0.34701523
  0.19516525 -0.20958914 -0.24018592 -1.0789583 ] ...
