In [1]:
! pip install scikit-learn


Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Cell 1: Import Libraries
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Sample corpus
# Eight short English sentences (AI, machine learning, data science,
# healthcare, finance) that serve as the documents vectorized below.
corpus = [
    "Deep learning and artificial intelligence are advancing rapidly.",
    "Data science and machine learning are transforming industries.",
    "Natural language processing is a subfield of AI.",
    "The application of AI in healthcare is growing.",
    "AI in finance is a game-changer.",
    "Machine learning is a subset of artificial intelligence.",
    "Data science involves the use of machine learning algorithms.",
    "Healthcare data is becoming more valuable with AI tools."
]

# Text Preprocessing (Vectorization using TF-IDF)
# English stop words are filtered out; the vocabulary is learned from the
# corpus and the document-term matrix is produced in the same call.
vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(raw_documents=corpus)


In [3]:
# Cell 2: Apply NMF for Topic Modeling
n_topics = 2  # Choose the number of topics

# Vocabulary terms, used to label the topics when they are displayed
feature_names = vectorizer.get_feature_names_out()

# Factorize the TF-IDF matrix into two factors (X ~ W . H); the fixed
# random_state keeps the result reproducible from run to run.
nmf_model = NMF(random_state=42, n_components=n_topics)
W = nmf_model.fit_transform(X)  # Document-Topic Matrix
H = nmf_model.components_  # Topic-Term Matrix

def display_topics(model, feature_names, no_top_words=5):
    """Print the highest-weighted terms of every topic in ``model``.

    Args:
        model: Object exposing ``components_``, iterated row by row; each row
            must provide ``argsort()`` (e.g. a NumPy array of term weights).
        feature_names: Indexable mapping from a column index to its term.
        no_top_words: Number of top-weighted terms printed per topic.

    For each topic a ``Topic #k:`` header (numbered from 1) is printed,
    followed by one line with the terms, heaviest first, separated by spaces.
    """
    for topic_number, weights in enumerate(model.components_, start=1):
        # Ascending argsort, reversed, then truncated: largest weights first.
        ranked_idx = weights.argsort()[::-1][:no_top_words]
        print(f"Topic #{topic_number}:")
        print(" ".join(feature_names[idx] for idx in ranked_idx))

# Print the top terms of each topic found by the fitted NMF model
display_topics(model=nmf_model, feature_names=feature_names)


Topic #1:
learning machine artificial intelligence science
Topic #2:
ai healthcare growing application valuable


In [4]:
# Cell 3: Evaluate Reconstruction Error
# Rebuild the matrix from the two NMF factors and compare it with the
# densified TF-IDF matrix.
X_reconstructed = W @ H
reconstruction_error = mean_squared_error(
    y_true=X.toarray(),
    y_pred=X_reconstructed,
)

print(f"Reconstruction Error (Mean Squared Error): {reconstruction_error:.4f}")


Reconstruction Error (Mean Squared Error): 0.0211


B) Word Sense Disambiguation using WordNet

In [5]:
! pip install nltk

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
# Cell 1: Import Required Libraries
import nltk
from nltk.corpus import wordnet as wn
from nltk.wsd import lesk

nltk.download('wordnet')  # WordNet lexicon queried by the Lesk algorithm
nltk.download('omw-1.4')  # Open Multilingual WordNet data
# nltk.word_tokenize (called on the sentence further down) needs the Punkt
# tokenizer models; without them a fresh environment raises LookupError.
# Older NLTK releases look for 'punkt', newer ones for 'punkt_tab', so fetch
# both. nltk.download is a no-op when a package is already up to date.
nltk.download('punkt')
nltk.download('punkt_tab')

# Example sentence
sentence = "The bank will not allow withdrawal after hours."


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...


In [7]:
# Cell 2: Word Sense Disambiguation using Lesk Algorithm
def disambiguate_word(context, word, pos=None):
    """Pick a WordNet sense for ``word`` with the Lesk algorithm.

    Args:
        context: Tokens of the sentence in which ``word`` occurs.
        word: The ambiguous word to resolve.
        pos: Optional WordNet part-of-speech tag (e.g. ``'n'`` or ``'v'``)
            forwarded to ``lesk`` to restrict the candidate senses. The
            default ``None`` keeps the previous behaviour of considering
            every part of speech.

    Returns:
        ``(sense.name(), sense.definition())`` for the synset chosen by
        ``lesk``, or ``("No sense found", None)`` when it returns nothing.
    """
    # Use the Lesk algorithm to get the most likely sense of the word
    sense = lesk(context, word, pos=pos)
    if not sense:
        return "No sense found", None
    return sense.name(), sense.definition()

# Disambiguate the word 'bank' in the context of the sentence
word = 'bank'
context = nltk.word_tokenize(sentence)  # token list handed to Lesk as context
sense_name, sense_definition = disambiguate_word(context=context, word=word)

# Report the chosen synset name and its definition, one per line
print(f"Word Sense: {sense_name}", f"Definition: {sense_definition}", sep="\n")


Word Sense: deposit.v.02
Definition: put into a bank account
