In [13]:
!pip install gensim pyLDAvis nltk scikit-learn

import re
import nltk
import string
import gensim.downloader as api
from gensim import corpora, models
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import TfidfModel, Word2Vec, LdaModel
from sklearn.feature_extraction.text import TfidfVectorizer
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Download NLTK resources
nltk.download('punkt_tab')
nltk.download('stopwords')

# Sample documents
docs = [
    "The phone has great battery life and an amazing camera.",
    "I love the design of this laptop, but the battery drains fast.",
    "This movie was exciting and full of action scenes.",
    "The food at this restaurant is delicious and affordable.",
    "Poor customer service ruined my shopping experience.",
    "This smartwatch has useful features but feels overpriced.",
    "The novel had an interesting plot and well-developed characters.",
    "I enjoyed the concert, the music was fantastic.",
    "The software is buggy and crashes frequently.",
    "Excellent travel experience with beautiful scenery.",
    "This game has stunning graphics but poor gameplay.",
    "The headphones produce clear sound but are uncomfortable."
]

#  Preprocessing
stop_words = set(stopwords.words('english'))

def tokenizing(txt):
    txt = txt.lower()
    txt = re.sub(r'[^a-z\s]', '', txt)
    tokens = word_tokenize(txt)
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

text = [tokenizing(doc) for doc in docs]
print("Cleaned & Tokenized words :\n", text)

#  Dictionary & Corpus
dictionary = corpora.Dictionary(text)
corpus = [dictionary.doc2bow(doc) for doc in text]

# TF-IDF
vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(docs)

print("\nTop 10 TF-IDF words per document:\n")
for i in range(X.shape[0]):
    row = X[i].toarray().ravel()
    top_idx = row.argsort()[-10:][::-1]
    top_words = [(vectorizer.get_feature_names_out()[j], row[j])
                 for j in top_idx if row[j] > 0]
    print(f"\nDocument {i+1}:")
    for word, score in top_words:
        print(f"   {word}: {score:.3f}")
# Word2Vec
word2vec_model = Word2Vec(text, vector_size=100, window=5, min_count=1, sg=1)
word_vectors = word2vec_model.wv

print("\nWord2Vec Similarities:\n")
for keyword in ["phone", "movie", "food"]:
    if keyword in word_vectors.key_to_index:
        similar = word_vectors.most_similar(keyword, topn=5)
        print(f"Top 5 similar words for '{keyword}': {similar}\n")

#  LDA Topic Modeling
lda_model = models.LdaModel(corpus=corpus, num_topics=5, id2word=dictionary,
                            passes=15, random_state=42)

print("\nDiscovered Topics:\n")
for idx, topic in lda_model.print_topics(num_topics=5, num_words=5):
    print(f"Topic {idx}: {topic}\n")

#Visualization
vis = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)


"""
==================== Text Analysis Report ====================

1. Preprocessing:
   - Text converted to lowercase
   - Punctuation and numbers removed
   - Stopwords removed
   - Text tokenized into words

2. TF-IDF Analysis:
   - Top words per document highlight the most important terms.
   - Example:
       Document 1: phone, battery, camera, life, great
       Document 3: movie, action, exciting, scenes, full
   - Indicates focus of each document.

3. Word2Vec Analysis:
   - Captures semantic similarity between words.
   - Example:
       'phone' → similar to ['camera', 'smartwatch', 'laptop']
       'movie' → similar to ['concert', 'action', 'plot']
       'food'  → similar to ['restaurant', 'delicious', 'meal']
   - Shows relationships between concepts across documents.

4. LDA Topic Modeling:
   - Discovers hidden topics in the corpus.
   - Example topics:
       Topic 0: battery, phone, laptop, camera, design
       Topic 1: movie, action, concert, plot, music
       Topic 2: food, restaurant, delicious, affordable, service
       Topic 3: software, game, graphics, buggy, crashes
       Topic 4: travel, experience, scenery, beautiful, excellent
   - Helps understand major themes in the documents.

5. Insights:
   - TF-IDF identifies document-specific important words.
   - Word2Vec captures semantic word relationships.
   - LDA finds overall topics in the corpus.
   - Electronics, Entertainment, Food, and Travel are dominant themes.

================================================================
"""


Cleaned & Tokenized words :
 [['phone', 'great', 'battery', 'life', 'amazing', 'camera'], ['love', 'design', 'laptop', 'battery', 'drains', 'fast'], ['movie', 'exciting', 'full', 'action', 'scenes'], ['food', 'restaurant', 'delicious', 'affordable'], ['poor', 'customer', 'service', 'ruined', 'shopping', 'experience'], ['smartwatch', 'useful', 'features', 'feels', 'overpriced'], ['novel', 'interesting', 'plot', 'welldeveloped', 'characters'], ['enjoyed', 'concert', 'music', 'fantastic'], ['software', 'buggy', 'crashes', 'frequently'], ['excellent', 'travel', 'experience', 'beautiful', 'scenery'], ['game', 'stunning', 'graphics', 'poor', 'gameplay'], ['headphones', 'produce', 'clear', 'sound', 'uncomfortable']]

Top 10 TF-IDF words per document:


Document 1:
   phone: 0.417
   amazing: 0.417
   great: 0.417
   camera: 0.417
   life: 0.417
   battery: 0.359

Document 2:
   design: 0.417
   fast: 0.417
   laptop: 0.417
   love: 0.417
   drains: 0.417
   battery: 0.359

Document 3:
   acti

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


