## Skip Gram Query Check experimental Implementation

## Library 

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from collections import Counter
import re

## TextProcessor -> Count,Filter and get embedding information with word and its id

In [19]:
class TextProcessor:
    def __init__(self, min_count=5):
        self.min_count = min_count
        self.word2idx = {}
        self.idx2word = {}
        self.vocabulary_size = 0

    def build_vocab(self, texts):
        word_counts = Counter()
        for text in texts:
            words = self._preprocess_text(text)
            word_counts.update(words)

        filtered_words = [word for word, count in word_counts.items() if count >= self.min_count]

        for idx, word in enumerate(filtered_words):
            self.word2idx[word] = idx
            self.idx2word[idx] = word

        self.vocabulary_size = len(self.word2idx)

    def _preprocess_text(self, text):
        text = text.lower()
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        return text.split()

    def text_to_indices(self, text):
        words = self._preprocess_text(text)
        return [self.word2idx[word] for word in words if word in self.word2idx]

## Dataset processor

In [4]:
class SkipGramDataset(Dataset):
    def __init__(self, texts, text_processor, window_size=2):
        self.window_size = window_size
        self.text_processor = text_processor
        self.data = []

        for text in texts:
            indices = text_processor.text_to_indices(text)

        for i in range(len(indices)):
            for w in range(-window_size, window_size + 1):
                if w == 0:
                    continue

                context_pos = i + w
                if 0 <= context_pos < len(indices):
                    self.data.append((indices[i], indices[context_pos]))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        target, context = self.data[idx]
        return torch.tensor(target), torch.tensor(context)

## SkipGram NN architecture

In [9]:
class SkipGramModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SkipGramModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.output = nn.Linear(embedding_dim, vocab_size)

    def forward(self, x):
        x = self.embeddings(x)
        x = self.output(x)
        return x

## Training Sequence

In [26]:
def train_skipgram(model, train_loader, num_epochs, learning_rate=0.001):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        total_loss = 0
        for batch_size, (target, context) in enumerate(train_loader):
            optimizer.zero_grad()
            output = model(target)
            loss = criterion(output, context)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')

## Similar words extraction

In [29]:
def get_similar_words(model, word, text_processor, top_k=5):
    if word not in text_processor.word2idx:
        return []

    word_idx = text_processor.word2idx[word]
    word_vector = model.embeddings(torch.tensor([word_idx])).detach() #temp detach

    similarities = []
    for idx in range(text_processor.vocabulary_size):
        if idx == word_idx:
            continue


        other_vector = model.embeddings(torch.tensor([idx])).detach()
        similarity = torch.cosine_similarity(word_vector, other_vector)
        similarities.append((text_processor.idx2word[idx], similarity.item()))

    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities[:top_k]

## Testing

In [82]:
sample_texts = [
    "wireless bluetooth earbuds with noise isolation",
    "premium leather bag with hidden compartment",
    "stainless steel travel mug double walled",
    "ergonomic office chair with lumbar support",
    "portable solar charger with dual outputs",
    "durable hiking boots with waterproof lining",
    "high-speed USB drive with encryption software",
    "compact mirrorless camera with zoom lens",
    "smart fitness tracker with sleep monitor",
    "noise-cancelling headphones with wireless connectivity",
    "luxury silk pillowcase with cooling properties",
    "rechargeable electric toothbrush with timer",
    "adjustable standing desk with memory settings",
    "multi-purpose blender with glass pitcher",
    "heavy-duty toolset with torque wrench",
    "cordless vacuum cleaner with HEPA filter",
    "LED desk lamp with wireless charging pad",
    "waterproof phone case with screen protector",
    "stainless steel cookware set induction ready",
    "4K ultra HD smart TV with voice assistant",
    "wireless mechanical keyboard with tactile feedback",
    "gaming mouse with customizable RGB lighting",
    "home security system with motion detection",
    "electric kettle with temperature presets",
    "single-serve coffee maker with reusable filter",
    "digital kitchen scale with tare function",
    "portable camping stove with piezo ignition",
    "foldable electric scooter with LED display",
    "lightweight luggage set with expandable zippers",
    "baby stroller with adjustable reclining seat",
    "handheld massage gun with deep tissue capability",
    "outdoor patio heater with propane tank",
    "robotic vacuum cleaner with smart mapping",
    "wireless charging dock with phone stand",
    "inflatable kayak with reinforced seams",
    "professional drone with 4K video recording",
    "noise-isolating headset with detachable microphone",
    "energy-efficient refrigerator with ice dispenser",
    "compact air purifier with HEPA filter",
    "ergonomic gaming chair with footrest extension",
    "smart thermostat with app integration",
    "premium yoga mat with alignment lines",
    "fitness smartwatch with music streaming",
    "adjustable dumbbells with easy locking mechanism",
    "electric hair clipper with ceramic blades",
    "automatic pet feeder with voice recording",
    "video doorbell with live streaming features",
    "camping table with aluminum frame",
    "solar-powered garden lanterns with sensors",
    "memory foam mattress topper with gel cooling",
    "outdoor hammock with steel stand",
    "smart light bulbs with color changing modes",
    "fire-resistant document bag with lock",
    "high-performance GPU with ray tracing technology",
    "waterproof hiking jacket with zip pockets",
    "stainless steel pressure cooker with timer",
    "leather laptop sleeve with magnetic closure",
    "soundproof curtains with thermal insulation",
    "adjustable baby carrier with padded straps",
    "mini projector with built-in speaker",
    "smart water bottle with LED reminders",
    "LED ring light with adjustable height tripod",
    "space heater with automatic shutoff feature",
    "multifunction printer with duplex scanning",
    "cordless leaf blower with turbo button",
    "power bank with fast wireless charging",
    "camping lantern with USB port",
    "resistance bands with door anchor",
    "portable air compressor with LED gauge",
    "touchscreen laptop with 360-degree hinge",
    "fire pit with mesh safety screen",
    "kids tablet with educational apps",
    "dual-band WiFi router with parental controls",
    "lightweight sleeping bag with waterproof shell",
    "electric chainsaw with safety brake",
    "weatherproof tent with double-layer design",
    "monitor riser with cable management slots",
    "LED flashlight with zoomable focus",
    "folding treadmill with heart rate monitor",
    "smart door lock with fingerprint scanner",
    "ionic hair dryer with diffuser attachment",
    "electric pressure washer with adjustable nozzle",
    "noise-reducing wireless earbuds with bass boost",
    "gaming monitor with ultra-fast response time",
    "high-capacity SSD with shock resistance",
    "ceramic frying pan with nonstick coating",
    "water-resistant speaker with deep bass",
    "robot vacuum cleaner with self-emptying bin",
    "shoe organizer with adjustable tiers",
    "portable jump starter with safety features",
    "foldable electric scooter with safety lights",
    "electric blanket with digital controller",
    "home gym set with adjustable weights",
    "programmable coffee maker with milk frother",
    "smart plug with voice control compatibility",
    "air mattress with built-in electric pump",
    "food storage containers with airtight seals",
    "sewing machine with automatic threader",
    "cloud-connected digital photo frame",
    "compact binoculars with waterproof coating",
    "spacious tent with removable dividers",
    "solar power bank with multiple ports",
    "hammock with mosquito net and rain fly",
    "pressure washer with detergent tank",
    "ergonomic wireless mouse with silent clicks",
    "laptop cooling pad with dual fans",
    "portable fridge with freezer section",
    "cordless drill with multi-speed settings",
    "air fryer with touch screen controls",
    "LED headlamp with adjustable brightness",
    "car charger with quick charge feature",
    "hiking poles with foam grips",
    "standing desk converter with keyboard tray",
    "solar panel with foldable design",
    "plush towel set with cotton blend",
    "smart smoke detector with real-time alerts",
    "alarm clock with sunrise simulation",
    "hand vacuum with crevice tool",
    "fitness watch with calorie counter",
    "outdoor floodlight with motion detection",
    "Bluetooth watch with waterproof case",
    "wireless keyboard with silent keys",
    "ceramic knife set with block",
    "inflatable paddleboard with repair kit",
    "indoor grill with removable plates",
    "baby gate with safety lock",
    "tripod stand with phone mount",
]


In [83]:
text_processor = TextProcessor(min_count=1)
text_processor.build_vocab(sample_texts)

In [84]:
dataset = SkipGramDataset(sample_texts, text_processor, window_size=3)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

In [85]:
model = SkipGramModel(vocab_size=text_processor.vocabulary_size, embedding_dim=500)
train_skipgram(model, dataloader, num_epochs=500)

Epoch [1/500], Loss: 6.3083
Epoch [2/500], Loss: 5.8922
Epoch [3/500], Loss: 5.4841
Epoch [4/500], Loss: 5.0837
Epoch [5/500], Loss: 4.6912
Epoch [6/500], Loss: 4.3078
Epoch [7/500], Loss: 3.9356
Epoch [8/500], Loss: 3.5779
Epoch [9/500], Loss: 3.2387
Epoch [10/500], Loss: 2.9231
Epoch [11/500], Loss: 2.6360
Epoch [12/500], Loss: 2.3818
Epoch [13/500], Loss: 2.1633
Epoch [14/500], Loss: 1.9811
Epoch [15/500], Loss: 1.8338
Epoch [16/500], Loss: 1.7176
Epoch [17/500], Loss: 1.6272
Epoch [18/500], Loss: 1.5570
Epoch [19/500], Loss: 1.5020
Epoch [20/500], Loss: 1.4582
Epoch [21/500], Loss: 1.4231
Epoch [22/500], Loss: 1.3949
Epoch [23/500], Loss: 1.3727
Epoch [24/500], Loss: 1.3557
Epoch [25/500], Loss: 1.3433
Epoch [26/500], Loss: 1.3348
Epoch [27/500], Loss: 1.3293
Epoch [28/500], Loss: 1.3258
Epoch [29/500], Loss: 1.3235
Epoch [30/500], Loss: 1.3216
Epoch [31/500], Loss: 1.3197
Epoch [32/500], Loss: 1.3176
Epoch [33/500], Loss: 1.3154
Epoch [34/500], Loss: 1.3130
Epoch [35/500], Loss: 1

In [88]:
test_word = "LED"
similar_words = get_similar_words(model, test_word, text_processor)

In [89]:
print(f"\nWords similar to '{test_word}':")
for word, similarity in similar_words:
    print(f"{word}: {similarity:.4f}")


Words similar to 'LED':


In [44]:
import random

# Predefined lists of words for generating sample texts
nouns = [
    "headphones", "wallet", "bottle", "laptop", "phone", "watch",
    "camera", "tablet", "charger", "monitor", "keyboard", "mouse"
]

adjectives = [
    "wireless", "premium", "stainless steel", "leather", "waterproof",
    "noise cancelling", "compact", "portable", "vaccum insulated",
    "lightweight", "durable", "high quality", "stylish", "eco-friendly"
]

verbs = [
    "with", "and", "for", "of", "from", "by", "using", "to", "in", "on"
]

sample_texts = []

# Generate 500 lines of text
for _ in range(500):
    # Randomly select words from the predefined lists
    noun = random.choice(nouns)
    adjective1 = random.choice(adjectives)
    adjective2 = random.choice(adjectives) if random.random() < 0.3 else ""
    verb = "with" if adjective2 else ""  # Use "with" only if there's a second adjective
    adjective3 = random.choice(adjectives) if adjective2 and random.random() < 0.5 else ""
    
    # Create the sentence with exactly 5 words
    text = f"{adjective1} {adjective2} {noun} {verb} {adjective3}".strip()
    sample_texts.append(text)

# Print a few samples to verify the output
for i in range(100):
    print(sample_texts[i])


compact  bottle
stylish  bottle
premium  wallet
premium  laptop
eco-friendly  headphones
portable  mouse
leather  laptop
noise cancelling  headphones
noise cancelling stainless steel bottle with
compact  keyboard
stainless steel  phone
premium  tablet
eco-friendly  camera
vaccum insulated  keyboard
wireless  monitor
noise cancelling  mouse
high quality  laptop
noise cancelling portable charger with stainless steel
high quality  tablet
durable portable camera with
stainless steel  camera
durable leather charger with
stylish  camera
vaccum insulated  keyboard
high quality  wallet
compact  bottle
stainless steel  mouse
durable portable monitor with stylish
durable  watch
vaccum insulated premium phone with
waterproof stainless steel keyboard with
premium  tablet
durable  tablet
durable leather bottle with lightweight
eco-friendly  bottle
waterproof  watch
wireless  bottle
leather lightweight wallet with wireless
lightweight compact wallet with
eco-friendly eco-friendly watch with stainles