# Program 1
***Design and implement a neural-network-based model for generating word embeddings for the words in a document corpus.***

In [None]:
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

In [None]:
# Toy document corpus: one raw sentence per entry. Every later cell
# (tokenization, vocabulary, both embedding models) is derived from this list.
corpus = [
    "Deep learning is a core subject of artificial intelligence",
    "Machine learning is a subbranch of deep learning",
    "Convolutional Neural Network (CNN) is a basic deep neural network in deep learning",
    "Alex and Visual Geometry Group (VGG) neural networks are pre trained deep neural networks",
    "Deep residual network is used in image recognition",
]

In [None]:
# Run gensim's simple_preprocess over every sentence, producing one token
# list per corpus entry (see the printed result in the next cell).
tokenized_corpus = list(map(simple_preprocess, corpus))

In [None]:
# Inspect the tokenization result: one list of tokens per corpus sentence.
print(tokenized_corpus)

[['deep', 'learning', 'is', 'core', 'subject', 'of', 'artificial', 'intelligence'], ['machine', 'learning', 'is', 'subbranch', 'of', 'deep', 'learning'], ['convolutional', 'neural', 'network', 'cnn', 'is', 'basic', 'deep', 'neural', 'network', 'in', 'deep', 'learning'], ['alex', 'and', 'visual', 'geometry', 'group', 'vgg', 'neural', 'networks', 'are', 'pre', 'trained', 'deep', 'neural', 'networks'], ['deep', 'residual', 'network', 'is', 'used', 'in', 'image', 'recognition']]


In [None]:
# Train a gensim Word2Vec model on the tokenized corpus.
# NOTE(review): the name `model` is rebound to the PyTorch WordEmbeddingModel
# in a later cell, after which this gensim model is no longer reachable.
model = Word2Vec(
    sentences=tokenized_corpus,  # iterable of token lists
    vector_size=300,             # dimensionality of each word vector
    window=3,                    # max distance between the current and the predicted word
    min_count=1,                 # keep every word, even those that occur only once
    sg=1,                        # 1 = skip-gram training (0 would select CBOW)
    epochs=100                   # number of passes over the corpus
)

In [None]:
# Flatten the tokenized sentences into a single token stream (duplicates kept).
words = [word for sentence in tokenized_corpus for word in sentence]

# Distinct tokens. Kept as a set so membership tests stay O(1).
vocab = set(words)

# Assign integer ids from the *sorted* vocabulary. Iterating the raw set would
# hand out ids in hash order, which for strings changes between interpreter
# runs (hash randomization), so the same word would get a different embedding
# row after every kernel restart.
word2idx = {word: idx for idx, word in enumerate(sorted(vocab))}

# Reverse lookup: integer id -> token.
idx2word = {idx: word for word, idx in word2idx.items()}

# Number of distinct tokens; this is also the size of the model's output layer.
vocab_size = len(vocab)

In [None]:
# Show the flattened token stream (repeated tokens are retained).
print(words)

['deep', 'learning', 'is', 'core', 'subject', 'of', 'artificial', 'intelligence', 'machine', 'learning', 'is', 'subbranch', 'of', 'deep', 'learning', 'convolutional', 'neural', 'network', 'cnn', 'is', 'basic', 'deep', 'neural', 'network', 'in', 'deep', 'learning', 'alex', 'and', 'visual', 'geometry', 'group', 'vgg', 'neural', 'networks', 'are', 'pre', 'trained', 'deep', 'neural', 'networks', 'deep', 'residual', 'network', 'is', 'used', 'in', 'image', 'recognition']


In [None]:
# Show the distinct tokens; `vocab` is a set, so the printed order is arbitrary.
print(vocab)

{'cnn', 'group', 'used', 'neural', 'learning', 'core', 'subject', 'machine', 'recognition', 'networks', 'pre', 'trained', 'network', 'image', 'intelligence', 'geometry', 'residual', 'subbranch', 'are', 'basic', 'vgg', 'in', 'and', 'of', 'alex', 'is', 'deep', 'artificial', 'convolutional', 'visual'}


In [None]:
# Number of distinct tokens in the corpus.
print(vocab_size)

30


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

class WordEmbeddingModel(nn.Module):
    """Embedding lookup followed by a linear projection back onto the vocabulary.

    Attributes:
        embeddings: ``nn.Embedding(vocab_size, embedding_dim)`` lookup table;
            its weight matrix holds one vector per word id.
        linear: ``nn.Linear(embedding_dim, vocab_size)`` layer that turns an
            embedding into one unnormalised score (logit) per vocabulary entry.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Create the two layers.

        Args:
            vocab_size: number of distinct word ids the model can look up and
                the number of logits it emits.
            embedding_dim: length of each word vector.
        """
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):
        """Map word ids to vocabulary logits.

        Args:
            inputs: integer tensor of word ids.

        Returns:
            Tensor with the shape of ``inputs`` plus a trailing dimension of
            size ``vocab_size`` holding raw (pre-softmax) scores.
        """
        return self.linear(self.embeddings(inputs))

# Seed PyTorch's RNG before any layer is created so the randomly initialised
# embedding/linear weights -- and therefore the loss values and the printed
# vectors -- are identical on every Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

# Embedding dimensionality (same value as the gensim Word2Vec cell uses).
vector_size = 300

# NOTE: this rebinds `model`, replacing the gensim Word2Vec object created in
# an earlier cell.
model = WordEmbeddingModel(vocab_size, vector_size)

# CrossEntropyLoss takes the raw logits from the model together with the
# integer class id of the expected word.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Build (context_idx, target_idx) training pairs.
#
# For every position i in a sentence, each *other* token within `window_size`
# positions of i is paired with the token at i. The previous version derived
# both indices from the same word, so every pair was (w, w) and the network
# could only learn to echo its input instead of learning from co-occurrence.
window_size = 3  # same neighbourhood radius as the gensim Word2Vec cell
training_data = []
for sentence in tokenized_corpus:
    for i, target_word in enumerate(sentence):
        target_idx = word2idx[target_word]
        # Clamp the window to the sentence boundaries.
        start = max(0, i - window_size)
        stop = min(len(sentence), i + window_size + 1)
        for j in range(start, stop):
            if j == i:
                continue  # a position is not its own context
            context_idx = word2idx[sentence[j]]
            training_data.append((context_idx, target_idx))

# Train with one optimizer step per (context, target) pair.
epochs = 100
for epoch in range(epochs):
    total_loss = 0.0
    for context_idx, target_idx in training_data:
        # Batch of size 1: the model input is the context word id, the label
        # is the target word id.
        context_tensor = torch.LongTensor([context_idx])
        target_tensor = torch.LongTensor([target_idx])

        outputs = model(context_tensor)
        loss = criterion(outputs, target_tensor)

        # Clear gradients left over from the previous step before backprop.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Report once per 10th epoch, after all pairs have been seen. Previously
    # this check lived inside the inner loop, so it printed one line per
    # training pair with a partially accumulated loss.
    if (epoch + 1) % 10 == 0:
        # max(..., 1) keeps the average defined when training_data is empty.
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {total_loss/max(len(training_data), 1):.4f}')

# The embedding layer's weight matrix holds one row per word id; grab the
# underlying tensor so rows can be converted to NumPy for display.
word_embeddings = model.embeddings.weight.data

print("\nWord embeddings:")
for word in ("deep", "learning", "intelligence", "network"):
    # Skip any probe word that is not part of the vocabulary.
    if word not in word2idx:
        continue
    idx = word2idx[word]
    print(f"{word}: {word_embeddings[idx].numpy()}")

Epoch [10/100], Loss: 0.0001
Epoch [10/100], Loss: 0.0004
Epoch [10/100], Loss: 0.0006
Epoch [10/100], Loss: 0.0017
Epoch [10/100], Loss: 0.0027
Epoch [10/100], Loss: 0.0031
Epoch [10/100], Loss: 0.0043
Epoch [10/100], Loss: 0.0050
Epoch [10/100], Loss: 0.0060
Epoch [10/100], Loss: 0.0063
Epoch [10/100], Loss: 0.0065
Epoch [10/100], Loss: 0.0074
Epoch [10/100], Loss: 0.0078
Epoch [10/100], Loss: 0.0079
Epoch [10/100], Loss: 0.0081
Epoch [10/100], Loss: 0.0092
Epoch [10/100], Loss: 0.0095
Epoch [10/100], Loss: 0.0098
Epoch [10/100], Loss: 0.0107
Epoch [10/100], Loss: 0.0110
Epoch [10/100], Loss: 0.0118
Epoch [10/100], Loss: 0.0119
Epoch [10/100], Loss: 0.0122
Epoch [10/100], Loss: 0.0125
Epoch [10/100], Loss: 0.0131
Epoch [10/100], Loss: 0.0132
Epoch [10/100], Loss: 0.0134
Epoch [10/100], Loss: 0.0141
Epoch [10/100], Loss: 0.0151
Epoch [10/100], Loss: 0.0158
Epoch [10/100], Loss: 0.0167
Epoch [10/100], Loss: 0.0175
Epoch [10/100], Loss: 0.0182
Epoch [10/100], Loss: 0.0185
Epoch [10/100]