In [178]:
from gensim.models import Word2Vec

def find_similar_words(input_word, model, topn=5):
    """
    Find and print similar words to the input word using a pre-trained Word2Vec model.

    Parameters:
    - input_word (str): The input word for which to find similar words.
    - model (Word2Vec): The pre-trained Word2Vec model.
    - topn (int): Number of similar words to retrieve (default is 5).
    """
    try:
        # Find similar words to the input word
        similar_words = model.wv.most_similar(positive=[input_word], topn=topn)

        # Print the results
        print(f"Similar words to '{input_word}':")
        for word, similarity in similar_words:
            print(f"{word}: {similarity}")

        # Find the most similar word and its similarity score
        most_similar_word, max_similarity = max(similar_words, key=lambda x: x[1])
        print(f"\nMost similar word: {most_similar_word} (Similarity: {max_similarity})")

    except KeyError:
        print(f"Word '{input_word}' not found in the vocabulary.")

# Example usage:
# Load your pre-trained Word2Vec model (replace 'your_model_path' with the actual path to your model file)
word2vec_model = Word2Vec.load('bbc.model')

# Input word for which to find similar words
input_word = "software"

# Call the function
find_similar_words(input_word, word2vec_model)


Similar words to 'software':
computer: 0.9748510122299194
search: 0.9716047644615173
internet: 0.9667606353759766
access: 0.9660167694091797
user: 0.9633614420890808

Most similar word: computer (Similarity: 0.9748510122299194)


In [18]:
from gensim.models import Word2Vec
from gensim.models import CoherenceModel
import numpy as np
import random
from nltk.tokenize import word_tokenize
from gensim import corpora

# Function to preprocess the text
def preprocess_text(text):
    return word_tokenize(text.lower())

# Function to find similar words
def find_similar_words(input_word, model, topn=5):
    try:
        # Find similar words to the input word
        similar_words = model.wv.most_similar(positive=[input_word], topn=topn)
        return [word for word, _ in similar_words]
    except KeyError:
        print(f"Word '{input_word}' not found in the vocabulary.")
        return []

# Function to extract topic distributions
def extract_topic_distributions(model, input_text, num_topics=1):
    topic_distributions = []

    # Tokenize the input text
    input_tokens = preprocess_text(input_text)

    for _ in range(num_topics):
        # Choose a random word from the input text
        random_word = np.random.choice(input_tokens)

        # Find similar words to the random word
        similar_words = find_similar_words(random_word, model)

        # Normalize the distribution
        topic_distribution = {word: 1.0 / len(similar_words) for word in similar_words}
        topic_distributions.append(topic_distribution)

    return topic_distributions

# Perform Gibbs sampling to reveal hidden topics
def gibbs_sampling(text, model, topic_distributions, num_iterations=100):
    tokenized_text = preprocess_text(text)
    num_topics = len(topic_distributions)

    for _ in range(num_iterations):
        for i, word in enumerate(tokenized_text):
            if word in model.wv.key_to_index and random.random() < 0.5:
                sampled_topic = np.random.choice(num_topics)
                sampled_word = np.random.choice(list(topic_distributions[sampled_topic].keys()), p=list(topic_distributions[sampled_topic].values()))
                tokenized_text[i] = sampled_word

    return ' '.join(tokenized_text)

# Example usage:
# Load your pre-trained Word2Vec model
word2vec_model = Word2Vec.load('bbc.model')

# Input text for which to extract topic distributions
input_text = "digital technology"

# Call the function to extract topic distributions
topic_distributions = extract_topic_distributions(word2vec_model, input_text)

# Print the extracted topic distributions
print("Extracted Topic Distributions:")
for i, topic_distribution in enumerate(topic_distributions, 1):
    print(f"Topic {i}: {topic_distribution}")

# Perform Gibbs sampling to reveal hidden topics
sampled_text = gibbs_sampling(input_text, word2vec_model, topic_distributions)

# Print original and sampled text
print("\nOriginal Text:")
print(input_text)
print("\nSampled Text with Revealed Topics:")
print(sampled_text)

# Calculate Coherence Score (C_v)
#texts = [preprocess_text(sampled_text)]  # Assuming you want to calculate coherence for the sampled text
#dictionary = corpora.Dictionary(texts)
#corpus = [dictionary.doc2bow(text) for text in texts]

#coherence_model = CoherenceModel(topics=[list(topic.keys()) for topic in topic_distributions], texts=texts, dictionary=dictionary, coherence='c_v')
#coherence_score = coherence_model.get_coherence()

#print(f"\nC_v Coherence Score: {coherence_score}")


Extracted Topic Distributions:
Topic 1: {'search': 0.2, 'software': 0.2, 'information': 0.2, 'internet': 0.2, 'computer': 0.2}

Original Text:
digital technology

Sampled Text with Revealed Topics:
internet computer


In [65]:
from gensim.models import Word2Vec
from gensim import corpora
from gensim.models import LdaModel
from nltk.tokenize import word_tokenize
from gensim import models
import gensim
import pandas as pd

# Load your pre-trained Word2Vec model
word2vec_model = Word2Vec.load('bbc.model')

# Load the BBC News train dataset 
dataset_path = 'bbc News Train.csv'
df = pd.read_csv(dataset_path)

# Preprocess the text (tokenize)
texts = [word_tokenize(text.lower()) for text in df['Text']]

# Create a Gensim dictionary from the Word2Vec vocabulary
dictionary = corpora.Dictionary(texts)

# Create a corpus of bag-of-words vectors
corpus = [dictionary.doc2bow(text) for text in texts]

# Train the LDA model
train_lda_model = gensim.models.LdaModel(corpus=corpus,
                                   id2word=dictionary,
                                   num_topics=5,  # Specify the number of topics
                                   passes=20)   # Number of training iterations
# Assuming lda_model is your trained LDA model
train_lda_model.save("train_lda_model")
# Print the topics identified by the model
#print(lda_model.print_topics())

# Analyze a new document
new_doc = "I enjoy playing sports and listening to music."
new_doc_bow = dictionary.doc2bow(word_tokenize(new_doc.lower()))
print(train_lda_model.get_document_topics(new_doc_bow))


[(0, 0.020024076), (1, 0.4085462), (2, 0.5305317), (3, 0.020530822), (4, 0.020367196)]


In [126]:
from gensim.models import Word2Vec, CoherenceModel
from gensim import corpora
from gensim.models import LdaModel
from nltk.tokenize import word_tokenize
import numpy as np
import random

# Function to preprocess the text
def preprocess_text(text):
    return word_tokenize(text.lower())

# Function to find similar words
def find_similar_words(input_word, model, topn=5):
    try:
        # Find similar words to the input word
        similar_words = model.wv.most_similar(positive=[input_word], topn=topn)
        return [word for word, _ in similar_words]
    except KeyError:
        print(f"Word '{input_word}' not found in the vocabulary.")
        return []

# Function to extract topic distributions using LDA
def extract_lda_topic_distributions(text, num_topics=2):
    tokenized_text = preprocess_text(text)
    dictionary = corpora.Dictionary([tokenized_text])
    corpus = [dictionary.doc2bow(tokenized_text)]

    #lda_model = LdaModel(corpus, num_topics=num_topics)
    topic_distributions = [{word: probability for word, probability in lda_model.show_topic(topic)} for topic in range(num_topics)]

    return lda_model, topic_distributions

# Perform Gibbs sampling to reveal hidden topics using LDA
def gibbs_sampling(text, word2vec_model, lda_model, topic_distributions, num_iterations=100):
    tokenized_text = preprocess_text(text)
    num_topics = len(topic_distributions)
    tokenized_text = preprocess_text(text)
    dictionary = corpora.Dictionary([tokenized_text])
    corpus = [dictionary.doc2bow(tokenized_text)]
    for _ in range(num_iterations):
        for i, word in enumerate(tokenized_text):
            if word in word2vec_model.wv.key_to_index and random.random() < 0.5:
                # Sample a topic using LDA's word-topic distribution
                topic_distribution = lda_model.get_document_topics(dictionary.doc2bow(tokenized_text))[0]
                normalized_probs = np.array(topic_distribution) / np.sum(topic_distribution)
                sampled_topic = np.random.choice(num_topics, p=normalized_probs)

                # Sample a word from the chosen topic distribution
                #sampled_word = np.random.choice(list(topic_distributions[sampled_topic].keys()),
                 #                               p=list(topic_distributions[sampled_topic].values()))
                topic_probs = list(topic_distributions[sampled_topic].values())
                normalized_probs = np.array(topic_probs) / np.sum(topic_probs)
                sampled_word = np.random.choice(list(topic_distributions[sampled_topic].keys()), p=normalized_probs)

                tokenized_text[i] = sampled_word

    return ' '.join(tokenized_text)

# Example usage:
# Load your pre-trained Word2Vec model
word2vec_model = Word2Vec.load('bbc.model')

# Input text for which to extract LDA topic distributions
input_text = "Technology software"

# Call the function to extract LDA topic distributions
lda_model, lda_topic_distributions = extract_lda_topic_distributions(input_text)

# Print the extracted LDA topic distributions
print("Extracted LDA Topic Distributions:")
for i, topic_distribution in enumerate(lda_topic_distributions, 1):
    print(f"Topic {i}: {topic_distribution}")

# Perform Gibbs sampling to reveal hidden topics using LDA
sampled_text_lda = gibbs_sampling(input_text, word2vec_model, lda_model, lda_topic_distributions)

# Print original and sampled text
print("\nText:")
print(input_text)

# Calculate Coherence Score (C_v)
#texts_lda = [preprocess_text(sampled_text_lda)]
#dictionary_lda = corpora.Dictionary(texts_lda)
#corpus_lda = [dictionary_lda.doc2bow(text) for text in texts_lda]

#coherence_model_lda = CoherenceModel(topics=[list(topic.keys()) for topic in lda_topic_distributions], texts=texts_lda, dictionary=dictionary_lda, coherence='c_v')
#coherence_score_lda = coherence_model_lda.get_coherence()

#print(f"\nC_v Coherence Score using LDA: {coherence_score_lda}")


Extracted LDA Topic Distributions:
Topic 1: {'.': 8.848936e-05, 'the': 7.940277e-05, 'to': 6.110881e-05, 'of': 5.836123e-05, 'in': 5.4996082e-05, 'and': 5.475499e-05, 'a': 4.9052138e-05, 'on': 4.844017e-05, 'for': 4.8261707e-05, 'is': 4.822316e-05}
Topic 2: {'the': 0.05383811, '.': 0.043967824, 'to': 0.022821352, 'and': 0.02087702, 'a': 0.020793112, 'in': 0.01971501, 'of': 0.015657177, 's': 0.012384042, 'for': 0.01030647, 'i': 0.009409825}

Text:
Technology software


In [131]:
# Function to extract topic distributions using LDA
def topic_distributions(text, num_topics=4):
    tokenized_text = preprocess_text(text)
    dictionary = corpora.Dictionary([tokenized_text])
    corpus = [dictionary.doc2bow(tokenized_text)]

    #lda_model = LdaModel(corpus, num_topics=num_topics)
    topic_distributions = [{word: probability for word, probability in train_lda_model.show_topic(topic)} for topic in range(num_topics)]

    return topic_distributions
new_doc = "I enjoy playing sports and listening to music."
new_doc_bow = dictionary.doc2bow(word_tokenize(new_doc.lower()))
#print(train_lda_model.get_document_topics(new_doc_bow))

lda_topic_distributions = topic_distributions(new_doc)
doc_word_matrix = lda_topic_distributions
print(lda_topic_distributions)

[{'.': 8.848936e-05, 'the': 7.940277e-05, 'to': 6.110881e-05, 'of': 5.836123e-05, 'in': 5.4996082e-05, 'and': 5.475499e-05, 'a': 4.9052138e-05, 'on': 4.844017e-05, 'for': 4.8261707e-05, 'is': 4.822316e-05}, {'the': 0.05383811, '.': 0.043967824, 'to': 0.022821352, 'and': 0.02087702, 'a': 0.020793112, 'in': 0.01971501, 'of': 0.015657177, 's': 0.012384042, 'for': 0.01030647, 'i': 0.009409825}, {'the': 0.051936768, '.': 0.041534677, 'to': 0.028651876, 'of': 0.024257185, 'and': 0.01992346, 'a': 0.018725405, 'in': 0.018152766, 'is': 0.010437443, 'that': 0.010417253, 'it': 0.009499688}, {'the': 0.046005614, '.': 0.03853317, 'to': 0.024643684, 'of': 0.021529086, 'and': 0.020040892, 'a': 0.019108659, 'in': 0.017469147, 'is': 0.009071474, 'that': 0.00905134, 'mr': 0.008966147}]


In [196]:
def gibbs_sampling(text, lda_model, word2vec_model, nn, lambda_, num_iterations=1):
    tokenized_text = word_tokenize(text.lower())
    num_topics = lda_model.num_topics

    for iteration in range(num_iterations):
        print(f"Iteration {iteration+1}:")

        # Estimate topic distributions
        topic_distributions = lda_model.get_document_topics(
            corpora.Dictionary([tokenized_text]).doc2bow(tokenized_text)
        )
        print("Topic Distributions:", topic_distributions)

        # Extract probabilities and normalize
        probabilities = np.array([topic[1] for topic in topic_distributions])
        normalized_probs = probabilities / np.sum(probabilities)

        for i, word in enumerate(tokenized_text):
            if word in word2vec_model.wv.key_to_index and random.random() < lambda_:
                # Sample a topic
                sampled_topic = np.random.choice(num_topics, p=normalized_probs)
                print(f"Sampled Topic: {sampled_topic}")
                # Sample a word vector
                p = [prob for _, prob in lda_model.get_topic_terms(sampled_topic, topn=len(word2vec_model.wv))]
                p = [prob / np.sum(p) for _, prob in lda_model.get_topic_terms(sampled_topic, topn=len(word2vec_model.wv))]
                p = np.asarray(p).astype('float64')
                p = p / np.sum(p)
                sampled_word_vector = word2vec_model.wv[np.random.choice(word2vec_model.wv.index_to_key, p=p)]

                # Find the nearest word
                nearest_word_index = nn.kneighbors(sampled_word_vector.reshape(1, -1), return_distance=False)[0][0]
                
                # Check if the nearest word is an actual word
                if nearest_word_index < len(word2vec_model.wv.index_to_key):
                    nearest_word = word2vec_model.wv.index_to_key[nearest_word_index]
                    print(f"Nearest Word: {nearest_word}")

                    # Update counts (implementation depends on your LDA model)
                    # lda_model.update_counts(tokenized_text, i, sampled_topic)

                    # Replace the original word
                    tokenized_text[i] = nearest_word

    print("\nFinal Sampled Text:")
    # Convert elements in tokenized_text to strings before joining
    tokenized_text_str = [str(token) for token in tokenized_text]
    return ' '.join(tokenized_text_str)

# Example usage
input_text = "education of science is great"
sampled_text = gibbs_sampling(input_text, lda_model, word2vec_model, nn, lambda_=0.5)
print(sampled_text)


Iteration 1:
Topic Distributions: [(0, 0.033351753), (1, 0.034304705), (2, 0.57570213), (3, 0.3227982), (4, 0.03384328)]
Sampled Topic: 3
Nearest Word: its
Sampled Topic: 2
Nearest Word: aggression
Sampled Topic: 4
Nearest Word: would

Final Sampled Text:
its aggression science is would


In [189]:
import numpy as np
from gensim.models import LdaModel, Word2Vec
from gensim import corpora
from nltk.tokenize import word_tokenize
import random
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
# Load pre-trained models (replace with actual paths)
lda_model = train_lda_model
word2vec_model = Word2Vec.load('bbc.model')

# Initialize LSH or k-d tree
nn = NearestNeighbors(n_neighbors=1, algorithm='auto')
nn.fit(word2vec_model.wv.vectors)

def gibbs_sampling(text, lda_model, word2vec_model, nn, lambda_, num_iterations=2):
    tokenized_text = word_tokenize(text.lower())
    num_topics = lda_model.num_topics

    for iteration in range(num_iterations):
        print(f"Iteration {iteration+1}:")

        # Estimate topic distributions
        topic_distributions = lda_model.get_document_topics(
            corpora.Dictionary([tokenized_text]).doc2bow(tokenized_text)
        )
        print("Topic Distributions:", topic_distributions)

        # Extract probabilities and normalize
        probabilities = np.array([topic[1] for topic in topic_distributions])
        normalized_probs = probabilities / np.sum(probabilities)

        for i, word in enumerate(tokenized_text):
            if word in word2vec_model.wv.key_to_index and random.random() < lambda_:
                # Sample a topic
                sampled_topic = np.random.choice(num_topics, p=normalized_probs)
                print(f"Sampled Topic: {sampled_topic}")
                # Sample a word vector
                p = [prob for _, prob in lda_model.get_topic_terms(sampled_topic, topn=len(word2vec_model.wv))]
                p = [prob / np.sum(p) for _, prob in lda_model.get_topic_terms(sampled_topic, topn=len(word2vec_model.wv))]
                p = np.asarray(p).astype('float64')
                p = p / np.sum(p)
                sampled_word_vector = word2vec_model.wv[np.random.choice(word2vec_model.wv.index_to_key, p=p)]

                # Find the nearest word
                nearest_word_index = nn.kneighbors(sampled_word_vector.reshape(1, -1), return_distance=False)[0][0]
                
                # Check if the nearest word is an actual word
                if nearest_word_index < len(word2vec_model.wv.index_to_key):
                    nearest_word = word2vec_model.wv.index_to_key[nearest_word_index]
                    print(f"Nearest Word: {nearest_word}")

                    # Update counts (implementation depends on your LDA model)
                    # lda_model.update_counts(tokenized_text, i, sampled_topic)

                    # Replace the original word
                    tokenized_text[i] = nearest_word
                
    print("\nFinal Sampled Text:")
    # Convert elements in tokenized_text to strings before joining
    tokenized_text_str = [str(token) for token in tokenized_text]
    return ' '.join(tokenized_text_str)

# Example usage
input_text = "software digital technology is outstanding"
sampled_text = gibbs_sampling(input_text, lda_model, word2vec_model, nn, lambda_=0.5)
print(sampled_text)


Iteration 1:
Topic Distributions: [(0, 0.033351745), (1, 0.034304738), (2, 0.57568455), (3, 0.32281566), (4, 0.033843286)]
Sampled Topic: 2
Nearest Word: in
Sampled Topic: 2
Nearest Word: of
Sampled Topic: 2
Nearest Word: microsoft
Iteration 2:
Topic Distributions: [(0, 0.03335176), (1, 0.03430451), (2, 0.57580173), (3, 0.32269874), (4, 0.033843216)]
Sampled Topic: 3
Nearest Word: actor

Final Sampled Text:
software actor technology of microsoft


In [185]:
import numpy as np
from gensim.models import LdaModel, Word2Vec
from gensim import corpora
from nltk.tokenize import word_tokenize
import random
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity

# Assuming you have loaded your LDA and Word2Vec models
lda_model = train_lda_model
word2vec_model = Word2Vec.load('bbc.model')

# Initialize Nearest Neighbors
nn = NearestNeighbors(n_neighbors=1, algorithm='auto')
nn.fit(word2vec_model.wv.vectors)

def find_similar_words(input_word, model, topn=5):
    """
    Find and print similar words to the input word using a pre-trained Word2Vec model.

    Parameters:
    - input_word (str): The input word for which to find similar words.
    - model (Word2Vec): The pre-trained Word2Vec model.
    - topn (int): Number of similar words to retrieve (default is 5).
    """
    try:
        # Find similar words to the input word
        similar_words = model.wv.most_similar(positive=[input_word], topn=topn)

        # Print the results
        print(f"Similar words to '{input_word}':")
        for word, similarity in similar_words:
            print(f"{word}: {similarity}")

        # Find the most similar word and its similarity score
        most_similar_word, max_similarity = max(similar_words, key=lambda x: x[1])
        print(f"\nMost similar word: {most_similar_word} (Similarity: {max_similarity})")

        return most_similar_word

    except KeyError:
        print(f"Word '{input_word}' not found in the vocabulary.")
        return None

def gibbs_sampling(text, lda_model, word2vec_model, nn, lambda_, num_iterations=1):
    tokenized_text = word_tokenize(text.lower())
    num_topics = lda_model.num_topics

    for iteration in range(num_iterations):
        print(f"Iteration {iteration+1}:")

        # Estimate topic distributions
        topic_distributions = lda_model.get_document_topics(
            corpora.Dictionary([tokenized_text]).doc2bow(tokenized_text)
        )
        print("Topic Distributions:", topic_distributions)

        # Extract probabilities and normalize
        probabilities = np.array([topic[1] for topic in topic_distributions])
        normalized_probs = probabilities / np.sum(probabilities)

        for i, word in enumerate(tokenized_text):
            if word in word2vec_model.wv.key_to_index and random.random() < lambda_:
                # Sample a topic
                sampled_topic = np.random.choice(num_topics, p=normalized_probs)
                print(f"Sampled Topic: {sampled_topic}")

                # Sample a word vector
                p = [prob for _, prob in lda_model.get_topic_terms(sampled_topic, topn=len(word2vec_model.wv))]
                p = [prob / np.sum(p) for _, prob in lda_model.get_topic_terms(sampled_topic, topn=len(word2vec_model.wv))]
                p = np.asarray(p).astype('float64')
                p = p / np.sum(p)
                sampled_word_vector = word2vec_model.wv[np.random.choice(word2vec_model.wv.index_to_key, p=p)]

                # Find the most similar word using the find_similar_words function
                most_similar_word = find_similar_words(word, word2vec_model, topn=1)
                print(f"Most Similar Word: {most_similar_word}")

                if most_similar_word:
                    # Replace the original word
                    tokenized_text[i] = most_similar_word

    print("\nFinal Sampled Text:")
    # Convert elements in tokenized_text to strings before joining
    tokenized_text_str = [str(token) for token in tokenized_text]
    return ' '.join(tokenized_text_str)

# Example usage
input_text = "software digital technology is outstanding"
sampled_text = gibbs_sampling(input_text, lda_model, word2vec_model, nn, lambda_=0.5)
print(sampled_text)


Iteration 1:
Topic Distributions: [(0, 0.03335175), (1, 0.03430472), (2, 0.5756946), (3, 0.32280558), (4, 0.03384328)]
Sampled Topic: 2
Similar words to 'software':
computer: 0.9748510122299194

Most similar word: computer (Similarity: 0.9748510122299194)
Most Similar Word: computer
Sampled Topic: 1
Similar words to 'digital':
recorders: 0.9725573658943176

Most similar word: recorders (Similarity: 0.9725573658943176)
Most Similar Word: recorders
Sampled Topic: 4
Similar words to 'technology':
search: 0.9623100161552429

Most similar word: search (Similarity: 0.9623100161552429)
Most Similar Word: search

Final Sampled Text:
computer recorders search is outstanding
