**Topic Modeling**

In [27]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [28]:
# Toy corpus: five short English sentences used as the input documents
# for the topic-modelling cells that follow.
doc1 = "Sugar is bad to consume. My sister likes to have sugar, but not my father."
doc2 = "My father spends a lot of time driving my sister around to dance practice."
doc3 = "Doctors suggest that driving may cause increased stress and blood pressure."
doc4 = "Sometimes I feel pressure to perform well at school, but my father never seems to do."
doc5 = "Health experts say that Sugar is not good for your lifestyle."

# Collect the individual documents into a single list (the corpus).
doc_complete = [doc1, doc2, doc3, doc4, doc5]


In [29]:
# NLTK's English stopword list, held as a set for fast membership tests.
stop = set(stopwords.words('english'))
# The characters of string.punctuation, i.e. what clean() strips from the text.
exclude = set(string.punctuation)
# WordNet lemmatizer that clean() applies to every remaining word.
lemma = WordNetLemmatizer()


In [30]:
def clean(doc):
    """Normalise one raw document for topic modelling.

    The text is lower-cased and split on whitespace. Each token is dropped if
    it is a stopword; otherwise its punctuation characters are removed and the
    result is lemmatised.

    Stopwords are tested both before and after punctuation removal. Testing
    only the raw token lets a stopword that touches punctuation (for example
    "do." at the end of a sentence) slip through, while testing only the
    stripped token would miss stopwords that themselves contain an
    apostrophe.

    Relies on the notebook-level ``stop``, ``exclude`` and ``lemma`` objects.

    Args:
        doc: The document text as a single string.

    Returns:
        The cleaned, lemmatised tokens joined by single spaces.
    """
    kept = []
    for token in doc.lower().split():
        if token in stop:
            continue
        bare = ''.join(ch for ch in token if ch not in exclude)
        # Skip tokens that were pure punctuation, or that turn out to be
        # stopwords once the attached punctuation is gone.
        if not bare or bare in stop:
            continue
        kept.append(lemma.lemmatize(bare))
    return " ".join(kept)



In [37]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [38]:
# Clean every document and split it on whitespace: one token list per document.
doc_clean = [clean(doc).split() for doc in doc_complete] 

In [39]:
# Importing Gensim
import gensim
from gensim import corpora


In [40]:
# Create the term dictionary of our corpus, where every unique term is assigned an index
dictionary = corpora.Dictionary(doc_clean)


In [41]:
# Convert the list of tokenised documents (the corpus) into a document-term matrix:
# each document becomes its bag-of-words representation under the dictionary built above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

In [42]:
# Short alias for gensim's LDA model class; no model is created or trained here.
Lda = gensim.models.ldamodel.LdaModel


In [43]:
# Train the LDA model on the document-term matrix.
# random_state fixes the model's random initialisation so that the topics
# printed below are the same on every run of the notebook.
ldamodel = Lda(doc_term_matrix, num_topics=2, id2word=dictionary, passes=50, random_state=42)

In [44]:
# Print both topics, each summarised by two of its words and their weights.
print(ldamodel.print_topics(num_topics=2, num_words=2))


[(0, '0.072*"father" + 0.072*"sugar"'), (1, '0.059*"driving" + 0.058*"pressure"')]


**Aspect Mining**

In [58]:
pip install stanza


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [59]:
pip install stanfordnlp

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting stanfordnlp
  Downloading stanfordnlp-0.2.0-py3-none-any.whl (158 kB)
[K     |████████████████████████████████| 158 kB 5.7 MB/s 
Installing collected packages: stanfordnlp
Successfully installed stanfordnlp-0.2.0


In [None]:
#pip install stanza
# Make sure you have downloaded the StanfordNLP English model and other essential tools using the commands below.
#stanfordnlp.download('en')
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#stanza.download('en')


In [60]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import stanfordnlp
import stanza


In [61]:
txt = "The Sound Quality is great but the battery life is very bad."

In [63]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [64]:
txt = txt.lower() # Lower-case the whole text
sentList = nltk.sent_tokenize(txt) # Split the text into a list of sentences

In [65]:
# Start every container used by the aspect-mining cells below from an empty state.
fcluster = []  # [feature word, [related words]] entries
totalfeatureList = []  # every [word, tag] feature appended by the feature-extraction cell
finalcluster = []  # the entries of fcluster that are kept as the final result
categories = []  # the feature words on their own
dic = {}  # feature word -> POS tag

In [67]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [68]:
# POS-tag every sentence and collect the (word, tag) pairs in one list.
# Starting from an empty list keeps the cell from failing when there are no
# sentences, and extending the list (instead of rebinding it per sentence)
# keeps the tags of every sentence rather than only the last one.
taggedList = []
for line in sentList:
    txt_list = nltk.word_tokenize(line)  # Split the sentence into words
    taggedList.extend(nltk.pos_tag(txt_list))  # Part-of-speech tag each word
print(taggedList)


[('the', 'DT'), ('sound', 'NN'), ('quality', 'NN'), ('is', 'VBZ'), ('great', 'JJ'), ('but', 'CC'), ('the', 'DT'), ('battery', 'NN'), ('life', 'NN'), ('is', 'VBZ'), ('very', 'RB'), ('bad', 'JJ'), ('.', '.')]


In [71]:
def _merge_noun_pairs(tagged):
    """Join each pair of adjacent "NN"-tagged words into one token.

    Walks the (word, tag) pairs left to right. When a word and its successor
    are both tagged "NN", their texts are concatenated into a single token and
    both words are consumed; every other word is copied through unchanged.

    Because both words of a pair are consumed, no word is emitted twice in a
    run of three or more nouns, and the token that follows a pair (including
    the very last token of the input) is never skipped. A one-element input
    is returned as a one-element list.

    Args:
        tagged: Sequence of (word, tag) pairs, such as the notebook's
            taggedList.

    Returns:
        List of word strings with adjacent "NN" pairs merged.
    """
    merged = []
    pos = 0
    count = len(tagged)
    while pos < count:
        has_next = pos + 1 < count
        if has_next and tagged[pos][1] == "NN" and tagged[pos + 1][1] == "NN":
            merged.append(tagged[pos][0] + tagged[pos + 1][0])
            pos += 2  # both words of the pair have been used
        else:
            merged.append(tagged[pos][0])
            pos += 1
    return merged


newwordList = _merge_noun_pairs(taggedList)
finaltxt = ' '.join(newwordList)
print(finaltxt)

the soundquality is great but the batterylife is very bad .


In [72]:
# Remove English stopwords from the merged text, then POS-tag what is left.
# Note that taggedList is rebound here: from this point on it holds the tags
# of the stopword-free word list.
stop_words = set(stopwords.words('english'))
new_txt_list = nltk.word_tokenize(finaltxt)
wordsList = list(filter(lambda token: token not in stop_words, new_txt_list))
taggedList = nltk.pos_tag(wordsList)


In [74]:
nlp = stanza.Pipeline('en')  # Initialise the English neural pipeline
doc = nlp(finaltxt)  # Annotated document for the merged text

# Collect the dependency relations between the words as
# [dependent text, head text, relation] entries.
# The head text is read straight from the head word of each edge instead of
# being looked up in newwordList by the head's index: that lookup is only
# correct while Stanza's tokenisation matches newwordList one-to-one, and it
# raises IndexError when Stanza produces more tokens. Every sentence of the
# document is visited, so an empty document simply yields an empty list.
# For the root edge the head slot keeps the artificial head's id (0).
dep_node = []
for sentence in doc.sentences:
    for head, relation, dependent in sentence.dependencies:
        head_label = head.id if int(head.id) == 0 else head.text
        dep_node.append([dependent.text, head_label, relation])

INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.1/models/tokenize/combined.pt:   0%|    …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.1/models/pos/combined.pt:   0%|         …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.1/models/lemma/combined.pt:   0%|       …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.1/models/depparse/combined.pt:   0%|    …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.1/models/sentiment/sstplus.pt:   0%|    …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.1/models/constituency/wsj.pt:   0%|     …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.1/models/ner/ontonotes.pt:   0%|        …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.1/models/forward_charlm/1billion.pt:   0…

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.1/models/pretrain/combined.pt:   0%|    …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.1/models/pretrain/fasttextcrawl.pt:   0%…

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.1/models/backward_charlm/1billion.pt:   …

INFO:stanza:Loading these models for language: en (English):
| Processor    | Package   |
----------------------------
| tokenize     | combined  |
| pos          | combined  |
| lemma        | combined  |
| depparse     | combined  |
| sentiment    | sstplus   |
| constituency | wsj       |
| ner          | ontonotes |

INFO:stanza:Use device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Loading: sentiment
INFO:stanza:Loading: constituency
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


In [75]:
# POS tags that qualify a word as a feature.
feature_tags = ('JJ', 'NN', 'JJR', 'NNS', 'RB')

featureList = []
categories = []
for word, tag in taggedList:
    if tag not in feature_tags:
        continue
    featureList.append([word, tag])  # features found by this run of the cell
    totalfeatureList.append([word, tag])  # running list that is never reset by this cell
    categories.append(word)
print(featureList)


[['soundquality', 'NN'], ['great', 'JJ'], ['batterylife', 'NN'], ['bad', 'JJ']]


In [83]:
# Dependency relations that count as a link between a feature and another word.
linking_relations = ["nsubj", "acl:relcl", "obj", "dobj", "agent", "advmod", "amod", "neg", "prep_of", "acomp", "xcomp", "compound"]

# For every feature word, gather the words on the other end of each linking
# dependency edge that the feature takes part in (as dependent or as head).
fcluster = []
for feature in featureList:
    term = feature[0]
    related = []
    for dependent, head, relation in dep_node:
        if relation not in linking_relations:
            continue
        if dependent == term:
            related.append(head)
        elif head == term:
            related.append(dependent)
    fcluster.append([term, related])
print(fcluster)

[['soundquality', ['great']], ['great', ['soundquality']], ['batterylife', ['bad']], ['bad', ['batterylife', 'very']]]


In [85]:
# Map each feature word to its POS tag, then keep only the clusters whose
# leading word is tagged "NN".
# NOTE(review): "NNS" words are accepted as features earlier on but are
# filtered out here - confirm that this is intended.
dic = {word: tag for word, tag in featureList}
finalcluster = [cluster for cluster in fcluster if dic[cluster[0]] == "NN"]
print(finalcluster)

[['soundquality', ['great']], ['batterylife', ['bad']]]
