In [1]:
import gensim
import numpy as np
import tensorflow as tf
from gensim.models import KeyedVectors
from tensorflow.keras.layers import Dense, Embedding, Lambda
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

import string

# Sample English paragraph (5-10 sentences)
paragraph = "Natural Language Processing (NLP) is a field of artificial intelligence that enables computers to understand, interpret, and generate human language. It combines computational linguistics with machine learning and deep learning techniques. Applications of NLP include sentiment analysis, machine translation, chatbots, and text summarization. Researchers are continuously improving models to make machines understand context and semantics more accurately. The continuous bag-of-words (CBOW) model is one technique used for learning word embeddings. It predicts a target word from its surrounding context words and is widely used in NLP tasks."

# Split the paragraph into sentences on '.', lower-case each one and tokenize
# it on whitespace. Punctuation is stripped from both ends of every token so
# that "understand," / "understand" and "(nlp)" / "nlp" share one vocabulary
# entry; inner characters (the hyphens in "bag-of-words") are kept.
sentences = []
for fragment in paragraph.strip().split('.'):
    tokens = [token.strip(string.punctuation) for token in fragment.lower().split()]
    # Drop tokens that were punctuation only.
    tokens = [token for token in tokens if token]
    # Skip fragments with no words (e.g. the empty piece after the final '.')
    # so they are not counted as sentences.
    if tokens:
        sentences.append(tokens)

# Flatten all sentences into a single token list (duplicates kept).
all_words = [word for sentence in sentences for word in sentence]

# Total number of sentences and words
total_sentences = len(sentences)
total_words = len(all_words)
print("Total Sentences:", total_sentences)
print("Total Words:", total_words)

from keras.preprocessing.sequence import pad_sequences
def cbow_model(data, window_size, total_vocab):
    """Yield one CBOW training pair (context window, one-hot target) per token.

    Args:
        data: Iterable of sentences, each a sequence of integer word ids in
            ``[0, total_vocab)``. String tokens must be mapped to ids before
            calling this function: ``pad_sequences`` builds an integer array
            and ``to_categorical`` needs class indices.
        window_size: Number of neighbouring tokens taken on each side of the
            target token.
        total_vocab: Number of classes of the one-hot encoded target.

    Yields:
        ``(contextual, final_target)`` for every token of every sentence.
        ``contextual`` is the ``pad_sequences`` result for the single context
        window, padded/truncated to ``2 * window_size`` entries (Keras pads
        with 0 by default, so id 0 is indistinguishable from padding).
        ``final_target`` is the one-hot encoding of the target id with a
        leading batch axis of size 1.
    """
    total_length = window_size * 2
    for text in data:
        text_len = len(text)
        for idx, word in enumerate(text):
            # Clamp the window to the sentence bounds; the target itself is
            # excluded from its own context.
            begin = max(idx - window_size, 0)
            end = min(idx + window_size + 1, text_len)
            context_word = [text[i] for i in range(begin, end) if i != idx]

            # Pad sequences and one-hot encode. The target id is wrapped in a
            # list so the encoded target carries the same batch axis as the
            # padded context.
            contextual = pad_sequences([context_word], maxlen=total_length)
            final_target = to_categorical([word], num_classes=total_vocab)
            yield (contextual, final_target)

# Vocabulary size: number of distinct tokens in the corpus.
vocab_size = len(set(all_words))

# Model parameters
embedding_dim = 50  # width of each learned word vector
window_size = 2     # context tokens taken on each side of the target

# CBOW network: embed the 2 * window_size context ids, average the embedded
# vectors over the context axis, then score every vocabulary entry with a
# softmax layer.
# NOTE: the averaging layer calls `tf.reduce_mean`, so `tensorflow` has to be
# imported as `tf` in the file's import block; otherwise the Lambda raises
# NameError the first time it is traced.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=window_size * 2),
    Lambda(lambda x: tf.reduce_mean(x, axis=1)),
    Dense(vocab_size, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.summary()

# Example: create random word vectors and write to file
dimensions = 100
weights = np.random.rand(vocab_size, dimensions)

# word2vec text format: a "<vocab size> <dimensions>" header followed by one
# "<word> <v1> ... <vN>" line per vocabulary entry.
# The file is written as UTF-8 explicitly because gensim decodes it as UTF-8
# by default; the `with` block closes the handle even if a write fails.
with open('vectors.txt', 'w', encoding='utf-8') as vect_file:
    vect_file.write(f"{vocab_size} {dimensions}\n")
    for i, word in enumerate(list(set(all_words))):
        final_vec = ' '.join(map(str, weights[i, :]))
        vect_file.write(f"{word} {final_vec}\n")

# Load vectors into Gensim and similarity search
cbow_output = gensim.models.KeyedVectors.load_word2vec_format('vectors.txt', binary=False)
similar_words = cbow_output.most_similar(positive=['language'], topn=5)
print(similar_words)


Total Sentences: 6
Total Words: 87




[('natural', 0.8157430291175842), ('is', 0.8093864917755127), ('techniques', 0.7974379062652588), ('a', 0.7912761569023132), ('technique', 0.7876584529876709)]
