#**GENSIM EXPLORATION**

**NAME :** SAHANA RAO

**SRN:** PES1UG20CS588

**SECTION:** J

In [1]:
!pip install gensim

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


#**Tokenising text using Gensim** 

In [6]:
#Text Preprocessing
from gensim.utils import tokenize

text = "My name is Sahana Rao."
tokens = list(tokenize(text))
print(tokens)

['My', 'name', 'is', 'Sahana', 'Rao']


#**SIMILARITY RETRIVAL USING GENSIM**

It is a  process of finding documents or words that are similar to a given query document or word based on some similarity metric. 

In [10]:
from gensim import corpora, models, similarities

# Create a corpus of documents
corpus = [
    ['rose', 'lily', 'tulip'],
    ['tulip', 'garland'],
    ['rose', 'garland', 'tulip', 'hibiscus'],
    ['lily', 'garland']
]

# Create a dictionary from the corpus
dictionary = corpora.Dictionary(corpus)

# Convert the corpus into a bag-of-words representation
bow_corpus = [dictionary.doc2bow(text) for text in corpus]

# Train a TF-IDF model on the corpus
tfidf_model = models.TfidfModel(bow_corpus)
tfidf_corpus = tfidf_model[bow_corpus]


matrix_sim = similarities.MatrixSimilarity(tfidf_corpus)
sparse_sim = similarities.SparseMatrixSimilarity(tfidf_corpus, num_features=len(dictionary))

# Compute the similarities between the documents
query = ['rose', 'garland']
query_bow = dictionary.doc2bow(query)
query_tfidf = tfidf_model[query_bow]


sims1 = matrix_sim[query_tfidf]
print(list(enumerate(sims1)))



[(0, 0.6266618), (1, 0.27105728), (2, 0.46833566), (3, 0.14694409)]


#**models.Doc2Vec**

models.Doc2Vec.most_similar is a method in the Doc2Vec model class in Gensim that returns the top-N most similar documents to a given query document based on their cosine similarity scores in the vector space model.

In [3]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess


documents = [
    "The quick brown fox jumps over the lazy dog.",
    "The lazy dog is very lazy.",
    "Hello world."
]


tagged_documents = [TaggedDocument(simple_preprocess(doc), [i]) for i, doc in enumerate(documents)]

# Train a Doc2Vec model on the TaggedDocument objects
model = Doc2Vec(tagged_documents, vector_size=100, window=5, min_count=1, epochs=50)


query_doc = "The brown fox is quick and the dog is lazy."
query_vector = model.infer_vector(simple_preprocess(query_doc))


similar_docs = model.docvecs.most_similar(positive=[query_vector], topn=2)


for doc_id, similarity in similar_docs:
    print(documents[doc_id])



The quick brown fox jumps over the lazy dog.
Hello world.


  similar_docs = model.docvecs.most_similar(positive=[query_vector], topn=2)


#**models.Word2Vec.most_similar**

This method returns the most similar words to a given word in a pre-trained Word2Vec model. It computes cosine similarity between the word's vector representation and the vectors of all other words in the model.

In [5]:
from gensim.models import Word2Vec

# define a list of sentences
sentences = [["this", "is", "the", "first", "sentence", "for", "word2vec"],
             ["this", "is", "the", "second", "sentence"],
             ["yet", "another", "sentence"],
             ["one", "more", "sentence"],
             ["and", "the", "final", "sentence"]]

# train a Word2Vec model on the sentences
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, epochs=50)

# find the most similar words to a given word
similar_words = model.wv.most_similar("sentence",topn=5)
print(similar_words)
for word, similarity in similar_words:
    print(word)

[('yet', 0.21735575795173645), ('for', 0.09428801387548447), ('one', 0.09294721484184265), ('word2vec', 0.08002333343029022), ('second', 0.0633990541100502)]
yet
for
one
word2vec
second


# **Topic Modeling**

Topic modeling is a natural language processing technique that allows you to identify the main topics or themes in a collection of documents. Gensim provides a simple and powerful way to perform topic modeling using the Latent Dirichlet Allocation (LDA) algorithm.

In [1]:
# Topic Modeling
import gensim
from gensim import corpora
from pprint import pprint

# Sample corpus
corpus = [
    "Machine learning is a subset of artificial intelligence that focuses on building systems that can learn from data.",
    "Natural language processing is a field of study focused on making computers understand human language.",
    "Deep learning is a subset of machine learning that uses neural networks with many layers to model complex data.",
    "Computer vision is the field of study focused on teaching machines to interpret and understand visual data.",
    "Reinforcement learning is a type of machine learning that trains agents to take actions in an environment to maximize a reward."
]


# Create dictionary
dictionary = corpora.Dictionary([doc.split() for doc in corpus])

# Create bag of words corpus
bow_corpus = [dictionary.doc2bow(doc.split()) for doc in corpus]

# Train LDA model
lda_model = gensim.models.LdaModel(
    corpus=bow_corpus,
    id2word=dictionary,
    num_topics=3, 
    random_state=42,
    passes=10,
    alpha='auto',
    per_word_topics=True
)

# Print topics
pprint(lda_model.print_topics())

[(0,
  '0.037*"machines" + 0.037*"vision" + 0.037*"study" + 0.037*"focused" + '
  '0.037*"field" + 0.037*"of" + 0.037*"interpret" + 0.037*"data." + '
  '0.037*"Computer" + 0.037*"teaching"'),
 (1,
  '0.069*"learning" + 0.056*"a" + 0.056*"that" + 0.043*"to" + 0.043*"of" + '
  '0.043*"is" + 0.030*"machine" + 0.030*"data." + 0.030*"subset" + '
  '0.017*"Reinforcement"'),
 (2,
  '0.039*"on" + 0.039*"is" + 0.039*"of" + 0.039*"understand" + 0.039*"field" + '
  '0.039*"focused" + 0.039*"a" + 0.039*"study" + 0.039*"computers" + '
  '0.039*"human"')]


In [2]:
# Infer topics for new document
new_doc = "Artificial intelligence is the field of study focused on building intelligent machines."

new_doc_bow = dictionary.doc2bow(new_doc.split())
new_doc_topics, word_topics, phi_values = lda_model.get_document_topics(new_doc_bow, per_word_topics=True)
pprint(new_doc_topics)

[(0, 0.7400925), (1, 0.25486773)]


#**Word Embeddings**

Word embeddings are a powerful technique for representing words as dense vectors in a high-dimensional space. Gensim provides an easy-to-use implementation of several popular word embedding algorithms, including Word2Vec and FastText. 

In [8]:
# Word Embeddings
import gensim

# Sample corpus
corpus = [
    "Machine learning is a subset of artificial intelligence that focuses on building systems that can learn from data.",
    "Natural language processing is a field of study focused on making computers understand human language.",
    "Deep learning is a subset of machine learning that uses neural networks with many layers to model complex data.",
    "Computer vision is the field of study focused on teaching machines to interpret and understand visual data.",
    "Reinforcement learning is a type of machine learning that trains agents to take actions in an environment to maximize a reward."
]

# Preprocessing the corpus
preprocessed_corpus = [doc.split() for doc in corpus]

# Train Word2Vec model
model = gensim.models.Word2Vec(preprocessed_corpus, vector_size=100, window=5, min_count=1, workers=4)

# Get word embeddings
print(model.wv['Machine'])


[-6.4630997e-03  7.3367576e-03  6.1177979e-03 -4.9100812e-03
 -1.7541874e-03 -2.5121467e-03  3.1211844e-03 -4.5212783e-04
 -2.7682818e-03 -9.0516107e-03  6.4337696e-03 -9.6248072e-03
 -8.6518833e-03  1.3884846e-03  2.9792865e-03 -7.4751347e-07
  3.8094976e-04  2.5881811e-03  1.8030650e-03  7.5679389e-03
 -3.4414798e-03 -7.2295824e-03 -7.8792162e-03  7.9229185e-03
  1.9487789e-03 -5.8045038e-03  6.3820719e-03  8.4539428e-03
  7.9488838e-03 -6.8302597e-03 -5.1375949e-03 -2.0091264e-03
 -6.6235662e-03  4.7205924e-03  5.6654154e-03 -5.9954687e-03
  7.2007733e-03 -7.6522226e-03  6.2051262e-03 -4.6260231e-03
  1.9630603e-03 -3.2784708e-03  3.3939371e-03  7.7177049e-03
 -1.3956031e-03 -4.9823239e-03 -8.5704457e-03  3.6361234e-03
  4.2371978e-03  6.6932952e-03 -4.2884005e-03 -9.7401831e-03
  6.8214135e-03 -4.5232675e-03  4.4192504e-03  8.0564367e-03
 -3.9324570e-03  1.6356074e-03 -3.5863242e-03 -6.6093006e-03
 -5.6845113e-03 -1.3819299e-04 -7.5054220e-03 -1.9887215e-03
 -2.4843570e-03  6.71127

#**corpora.Dictionary**

 The corpora.Dictionary class in gensim is a utility for creating and manipulating a mapping between words and their integer ids. This is useful for many natural language processing tasks, including topic modeling and text classification.

In [9]:
# Corpus to Dictionary
from gensim import corpora

# List of documents
documents = ['rose is a flower', 'tulip is also a flower', 'Frog is an animal']

# Create dictionary
dictionary = corpora.Dictionary([doc.split() for doc in documents])

# Print dictionary
print(dictionary.token2id)

{'a': 0, 'flower': 1, 'is': 2, 'rose': 3, 'also': 4, 'tulip': 5, 'Frog': 6, 'an': 7, 'animal': 8}
