**PRACTICAL 5**

***Implement code for aspect mining and topic modeling.***

**1. ASPECT MINING**

In [None]:
import spacy
from textblob import TextBlob

# spaCy's small English pipeline supplies the POS tags and dependency labels
# that the extraction loop below relies on.
sp = spacy.load("en_core_web_sm")

# Sample sentences, each expressing an opinion about one thing.
mixed_sen = [
    "This chocolate truffle cake is really tasty",
    "This party is amazing!",
    "My mom is the best!",
    "App response is very slow!",
    "The trip to India was very enjoyable",
]

# One {'aspect': ..., 'description': ...} record is appended per sentence.
ext_aspects = []

# Aspect extraction.
# aspect      -> text of a NOUN token whose dependency label is 'nsubj'
# description -> an ADJ token, prefixed by the ADV tokens among its children
# When a sentence has several matches, the last one seen wins; when it has
# none, the corresponding field stays an empty string.
for sentence in mixed_sen:
    subject_noun = ''
    opinion_phrase = ''

    for tok in sp(sentence):
        if tok.pos_ == 'NOUN' and tok.dep_ == 'nsubj':
            subject_noun = tok.text
        elif tok.pos_ == 'ADJ':
            # Each adverb modifier keeps a trailing space so the pieces join
            # into a phrase such as "very slow".
            adverbs = [child.text + ' ' for child in tok.children if child.pos_ == 'ADV']
            opinion_phrase = ''.join(adverbs) + tok.text

    ext_aspects.append({'aspect': subject_noun, 'description': opinion_phrase})

print("ASPECT EXTRACTION\n")
print(ext_aspects)

# Sentiment association: score each description with TextBlob and store the
# result on the same record. This runs after the first print on purpose, so
# the first listing shows the records without a sentiment field.
for record in ext_aspects:
    record['sentiment'] = TextBlob(record['description']).sentiment

print("\nSENTIMENT ASSOCIATION\n")
print(ext_aspects)

# Two blank lines to separate this output from whatever is printed next.
print("")
print("")

import spacy
import gensim
import gensim.corpora as corpora
from nltk.corpus import stopwords
from pprint import pprint
import nltk

# NOTE(review): from this point on, this cell repeats the topic-modeling code
# that also appears in the cell under "2. TOPIC MODELING".

# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

# spaCy English pipeline, used by preprocess() for tokenizing and lemmatizing.
nlp = spacy.load("en_core_web_sm")

# Toy corpus for topic modeling: five short opinionated sentences.
documents = [
    "This chocolate truffle cake is really tasty",
    "The party was amazing and everyone enjoyed it!",
    "My mom is the best and she loves me so much",
    "The app response is very slow, and it frustrates me",
    "The trip to India was very enjoyable and the experience was unforgettable",
]

# 1. Preprocessing (tokenization, stopwords removal, lemmatization)
# English stop words held in a set for fast membership tests.
stop_words = set(stopwords.words("english"))

def preprocess(doc):
    """Turn one raw sentence into a list of lemmas for topic modeling.

    The text is parsed with the module-level spaCy pipeline ``nlp``. A token
    is kept only if it is purely alphabetic and its lower-cased surface form
    is not in ``stop_words``; each kept token contributes its lemma.

    Args:
        doc: Raw text of a single document.

    Returns:
        List of lemma strings, in token order.
    """
    lemmas = []
    for token in nlp(doc):
        if not token.is_alpha:
            continue
        if token.text.lower() in stop_words:
            continue
        lemmas.append(token.lemma_)
    return lemmas

# Run every sample document through the preprocessing step.
processed_docs = [preprocess(doc) for doc in documents]

# 2. Create Dictionary and Corpus
# Map each unique lemma to an integer id.
id2word = corpora.Dictionary(processed_docs)

# Bag-of-words corpus: one doc2bow() result per document.
corpus = [id2word.doc2bow(text) for text in processed_docs]

# 3. Applying LDA Model (Topic Modeling)
# Train with the single-process LdaModel instead of LdaMulticore. The outputs
# recorded in this notebook show the multicore model (workers=2) producing
# different topic weights for identical code and random_state=42, so the seed
# alone did not make it reproducible. A five-document corpus gains nothing
# from worker processes, and LdaModel with a fixed random_state is expected
# to give the same topics on every Restart & Run All.
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=3,
    passes=10,
    random_state=42,
)

# 4. Output the topics
pprint(lda_model.print_topics())

# Show the topic distribution for each document (numbered from 1).
for doc_number, topic_distribution in enumerate(lda_model[corpus], start=1):
    print(f"\nDocument {doc_number} Topic Distribution:")
    print(topic_distribution)


ASPECT EXTRACTION

[{'aspect': 'cake', 'description': 'really tasty'}, {'aspect': 'party', 'description': 'amazing'}, {'aspect': 'mom', 'description': 'best'}, {'aspect': 'response', 'description': 'very slow'}, {'aspect': 'trip', 'description': 'very enjoyable'}]

SENTIMENT ASSOCIATION

[{'aspect': 'cake', 'description': 'really tasty', 'sentiment': Sentiment(polarity=0.2, subjectivity=0.2)}, {'aspect': 'party', 'description': 'amazing', 'sentiment': Sentiment(polarity=0.6000000000000001, subjectivity=0.9)}, {'aspect': 'mom', 'description': 'best', 'sentiment': Sentiment(polarity=1.0, subjectivity=0.3)}, {'aspect': 'response', 'description': 'very slow', 'sentiment': Sentiment(polarity=-0.39000000000000007, subjectivity=0.52)}, {'aspect': 'trip', 'description': 'very enjoyable', 'sentiment': Sentiment(polarity=0.65, subjectivity=0.78)}]




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[(0,
  '0.117*"amazing" + 0.117*"everyone" + 0.117*"enjoy" + 0.117*"party" + '
  '0.029*"truffle" + 0.029*"much" + 0.029*"response" + 0.029*"frustrate" + '
  '0.029*"slow" + 0.029*"app"'),
 (1,
  '0.082*"experience" + 0.082*"enjoyable" + 0.082*"India" + '
  '0.082*"unforgettable" + 0.082*"trip" + 0.082*"app" + 0.082*"slow" + '
  '0.082*"response" + 0.082*"frustrate" + 0.020*"much"'),
 (2,
  '0.082*"chocolate" + 0.082*"really" + 0.082*"cake" + 0.082*"tasty" + '
  '0.082*"truffle" + 0.082*"good" + 0.082*"love" + 0.082*"mom" + 0.082*"much" '
  '+ 0.020*"frustrate"')]

Document 1 Topic Distribution:
[(0, 0.05613074), (1, 0.055941958), (2, 0.8879273)]

Document 2 Topic Distribution:
[(0, 0.8660328), (1, 0.0669837), (2, 0.06698354)]

Document 3 Topic Distribution:
[(0, 0.06736673), (1, 0.067134984), (2, 0.86549824)]

Document 4 Topic Distribution:
[(0, 0.06736115), (1, 0.86550134), (2, 0.06713752)]

Document 5 Topic Distribution:
[(0, 0.05612808), (1, 0.8879305), (2, 0.055941414)]


**2. TOPIC MODELING**

In [None]:
import spacy
import gensim
import gensim.corpora as corpora
from nltk.corpus import stopwords
from pprint import pprint
import nltk

# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

# spaCy English pipeline, used by preprocess() for tokenizing and lemmatizing.
nlp = spacy.load("en_core_web_sm")

# Toy corpus for topic modeling: five short opinionated sentences.
documents = [
    "This chocolate truffle cake is really tasty",
    "The party was amazing and everyone enjoyed it!",
    "My mom is the best and she loves me so much",
    "The app response is very slow, and it frustrates me",
    "The trip to India was very enjoyable and the experience was unforgettable",
]

# 1. Preprocessing (tokenization, stopwords removal, lemmatization)
# English stop words held in a set for fast membership tests.
stop_words = set(stopwords.words("english"))

def preprocess(doc):
    """Turn one raw sentence into a list of lemmas for topic modeling.

    The text is parsed with the module-level spaCy pipeline ``nlp``. A token
    is kept only if it is purely alphabetic and its lower-cased surface form
    is not in ``stop_words``; each kept token contributes its lemma.

    Args:
        doc: Raw text of a single document.

    Returns:
        List of lemma strings, in token order.
    """
    lemmas = []
    for token in nlp(doc):
        if not token.is_alpha:
            continue
        if token.text.lower() in stop_words:
            continue
        lemmas.append(token.lemma_)
    return lemmas

# Run every sample document through the preprocessing step.
processed_docs = [preprocess(doc) for doc in documents]

# 2. Create Dictionary and Corpus
# Map each unique lemma to an integer id.
id2word = corpora.Dictionary(processed_docs)

# Bag-of-words corpus: one doc2bow() result per document.
corpus = [id2word.doc2bow(text) for text in processed_docs]

# 3. Applying LDA Model (Topic Modeling)
# Train with the single-process LdaModel instead of LdaMulticore. The outputs
# recorded in this notebook show the multicore model (workers=2) producing
# different topic weights for identical code and random_state=42, so the seed
# alone did not make it reproducible. A five-document corpus gains nothing
# from worker processes, and LdaModel with a fixed random_state is expected
# to give the same topics on every Restart & Run All.
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=3,
    passes=10,
    random_state=42,
)

# 4. Output the topics
pprint(lda_model.print_topics())

# Show the topic distribution for each document (numbered from 1).
for doc_number, topic_distribution in enumerate(lda_model[corpus], start=1):
    print(f"\nDocument {doc_number} Topic Distribution:")
    print(topic_distribution)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


[(0,
  '0.115*"amazing" + 0.115*"everyone" + 0.115*"enjoy" + 0.115*"party" + '
  '0.031*"truffle" + 0.031*"cake" + 0.031*"tasty" + 0.031*"really" + '
  '0.031*"chocolate" + 0.030*"much"'),
 (1,
  '0.082*"experience" + 0.082*"enjoyable" + 0.082*"India" + '
  '0.082*"unforgettable" + 0.082*"trip" + 0.079*"app" + 0.079*"slow" + '
  '0.079*"response" + 0.079*"frustrate" + 0.021*"much"'),
 (2,
  '0.080*"good" + 0.080*"love" + 0.080*"mom" + 0.080*"much" + '
  '0.080*"chocolate" + 0.080*"really" + 0.080*"cake" + 0.080*"tasty" + '
  '0.079*"truffle" + 0.024*"frustrate"')]

Document 1 Topic Distribution:
[(0, 0.056265045), (1, 0.05598232), (2, 0.8877526)]

Document 2 Topic Distribution:
[(0, 0.86596537), (1, 0.067020714), (2, 0.067013904)]

Document 3 Topic Distribution:
[(0, 0.06741821), (1, 0.06718809), (2, 0.86539376)]

Document 4 Topic Distribution:
[(0, 0.067449905), (1, 0.8651176), (2, 0.06743247)]

Document 5 Topic Distribution:
[(0, 0.056152932), (1, 0.88788563), (2, 0.0559614)]
