In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer 
import stanza
stanza.download('en') # download English model
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.2.0.json: 128kB [00:00, 10.6MB/s]                    
2021-03-03 13:52:18 INFO: Downloading default packages for language: en (English)...
2021-03-03 13:52:20 INFO: File exists: /Users/vibhukrovvidi/stanza_resources/en/default.zip.
2021-03-03 13:52:24 INFO: Finished downloading models and saved to /Users/vibhukrovvidi/stanza_resources.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vibhukrovvidi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vibhukrovvidi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/vibhukrovvidi/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
def feature_extraction(txt, nlp):
    try:
        txt = txt.lower()
    except:
        pass;

    sentList = nltk.sent_tokenize(txt)

    retlist = [];
    
    for line in sentList:
        txt_list = nltk.word_tokenize(line)
        taggedList = nltk.pos_tag(txt_list)
        
        newwordList = []
        flag = 0
        for i in range(0,len(taggedList)-1):
            if(taggedList[i][1]=="NN" and taggedList[i+1][1]=="NN"):
                newwordList.append(taggedList[i][0]+taggedList[i+1][0])
                flag=1
            else:
                if(flag==1):
                    flag=0
                    continue
                newwordList.append(taggedList[i][0])
                if(i==len(taggedList)-2):
                    newwordList.append(taggedList[i+1][0])
        finaltxt = ' '.join(word for word in newwordList)
    
    
        stop_words = set(stopwords.words('english'))
        new_txt_list = nltk.word_tokenize(finaltxt)
        wordsList = [w for w in new_txt_list if not w in stop_words]
        taggedList = nltk.pos_tag(wordsList)
        
        doc = nlp(finaltxt)
        dep_node = []
        try:
            for dep_edge in doc.sentences[0].dependencies:
                dep_node.append([dep_edge[2].text, dep_edge[0].id, dep_edge[1]])
            for i in range(0, len(dep_node)):
                if (int(dep_node[i][1]) != 0):
                    dep_node[i][1] = newwordList[(int(dep_node[i][1]) - 1)]
        except:
            pass;
        
        #print(dep_node)
        
        featureList = []
        categories = []
        for i in taggedList:
            if(i[1]=='JJ' or i[1]=='NN' or i[1]=='JJR' or i[1]=='NNS' or i[1]=='RB'):
                featureList.append(list(i))
                categories.append(i[0])
        #print(featureList)
        #print(categories)
        
        
        fcluster = []
        for i in featureList:
            filist = []
            for j in dep_node:
                if((j[0]==i[0] or j[1]==i[0]) and (j[2] in ["nsubj", "acl:relcl", "obj", "dobj", "agent", "advmod", "amod", "neg", "prep_of", "acomp", "xcomp", "compound"])):
                    if(j[0]==i[0]):
                        filist.append(j[1])
                    else:
                        filist.append(j[0])
            fcluster.append([i[0], filist])
        print(fcluster) 
        
        # Remove all features with no sentiment word:
        
        retlist.append(fcluster)
    return retlist;
    

In [32]:
def do_extraction(df, nlp, content_str = "Content"):
    idx = 0;
    review_list = df[content_str].to_list()
    feat_count = dict()
    feat_sent = dict()
    #nlp = stanza.Pipeline('en')
    
    # Replace "" with nan's for removal
    df[content_str].replace('', np.nan, inplace=True)
    df.dropna(subset=[content_str], inplace=True)
    print(" Processing : " , df.shape[0], "rows of data")
    for review in review_list:
        print("Review Number : ", idx);
        idx += 1;
        if idx >= df.shape[0]:
            break;
        try:
            output = feature_extraction(review, nlp);
        except:
            pass;
        for sent in output:
            for pair in sent:
                print(pair)
                if pair[0] in feat_sent:
                    if pair[1] is not None:
                        flist = feat_sent[pair[0]]
                        if isinstance(pair[1], list):
                            for i in pair[1]:
                                flist.append(i)
                        else:
                            flist.append(pair[1])
                        feat_sent[pair[0]] = flist;
                else:
                    if pair[1] is not None:
                        flist = pair[1]
                    else:
                        flist = list()
                    feat_sent[pair[0]] = flist;
                
                if pair[0] in feat_count:
                    feat_count[pair[0]] = feat_count[pair[0]] + 1;
                else:
                    feat_count[pair[0]] = 1

    #print(feat_count);
    return feat_count, feat_sent;

In [35]:
def get_sentiment(a, b, nlp):

    sentiment_score = dict()

    # Delete features with no descriptors
    cob = b.copy()
    for feat in cob.keys():
        #print(cob[feat])
        if cob[feat] == []:
            del b[feat]

    # Run pre-built sentiment score and take avg of all descriptors
    for f in b.keys():
        #print(f);
        ssum = 0;
        for g in b[f]:
            try:
                doc = nlp(g);

                for i in doc.sentences:

                        #print(i.sentiment)
                        ssum += i.sentiment;
            except:
                pass;

        sentiment_score[f] = ssum / len(b[f])

        adf = pd.DataFrame.from_dict(a, orient='index', columns=['Freq'])
    adf.sort_values(by="Freq", ascending=False, inplace = True)



    avg_sent = pd.DataFrame.from_dict(sentiment_score, orient='index', columns=["Avg_sent"])

    final_sent = avg_sent.merge(adf, left_index=True, right_index=True)
    final_sent.sort_values(by="Freq", ascending=False, inplace=True)
    return final_sent;

In [36]:
rdr = pd.read_csv('../../GReviewsData/cmpb.csv')

nlp = stanza.Pipeline('en')

a, b = do_extraction(rdr, nlp)
final_sent = get_sentiment(a, b, nlp)

2021-03-03 14:13:43 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| sentiment | sstplus   |
| ner       | ontonotes |

2021-03-03 14:13:43 INFO: Use device: cpu
2021-03-03 14:13:43 INFO: Loading: tokenize
2021-03-03 14:13:43 INFO: Loading: pos
2021-03-03 14:13:43 INFO: Loading: lemma
2021-03-03 14:13:43 INFO: Loading: depparse
2021-03-03 14:13:44 INFO: Loading: sentiment
2021-03-03 14:13:45 INFO: Loading: ner
2021-03-03 14:13:47 INFO: Done loading processors!


 Processing :  52 rows of data
Review Number :  0
[['professional', ['very', 'people']], ['people', ['professional', 'there']]]
[['recommended⭐⭐⭐⭐⭐', ['kind']], ['patient', ['very', 'kind']], ['kind', ['recommended⭐⭐⭐⭐⭐', 'patient', 'respectful']], ['respectful', ['kind']], ['..', []], ['smooth', ['very']], ['medical', [14]], ['check-up', []]]
['professional', ['very', 'people']]
['people', ['professional', 'there']]
['recommended⭐⭐⭐⭐⭐', ['kind']]
['patient', ['very', 'kind']]
['kind', ['recommended⭐⭐⭐⭐⭐', 'patient', 'respectful']]
['respectful', ['kind']]
['..', []]
['smooth', ['very']]
['medical', [14]]
['check-up', []]
Review Number :  1
[['reviews', ['other', 'suggest']], ['people', ['friendly', 'here']], ['adequately', ['friendly']], ['friendly', ['people', 'adequately']], ['anywhere', ['else']], ['else', ['anywhere']], ['sg', []]]
[['place', ['clean']], ['really', ['clean']], ['clean', ['place', 'really']], ['efficient', []]]
['reviews', ['other', 'suggest']]
['people', ['friendl

[['mo', ['seemed']], ['hand', ['other']], ['si', []], ['pehbuay', ['buaysong']], ['buaysong', ['pehbuay', 'seemed']]]
[['probably', ['this']], ['…', []]]
['ok', []]
['lah', []]
['review', []]
['visit', []]
['january', []]
['maybe', ['sikit']]
['sikit', ['so', 'maybe']]
['date', []]
['nsf', ['staff']]
['staff', ['nsf', 'ok']]
['ok', ['staff']]
['typical', ['bochap']]
['bochap', ['typical']]
['happy', []]
['bird', []]
['tio', ['switch', 'vocation']]
['switch', ['tio']]
['vocation', ['tio']]
['mo', ['seemed']]
['hand', ['other']]
['si', []]
['pehbuay', ['buaysong']]
['buaysong', ['pehbuay', 'seemed']]
['probably', ['this']]
['…', []]
Review Number :  14
[['tbh', []], ['bad', ['not', 'that']], ['place', []], ['visit', []], ['contrary', []], ['others', ['saying']]]
['tbh', []]
['bad', ['not', 'that']]
['place', []]
['visit', []]
['contrary', []]
['others', ['saying']]
Review Number :  15
[['enlisting', ['here']], ['sure', ['make']], ['medical', ['conditions']], ['conditions', ['medical', 'd

[['hard', ['so']], ['get', ['there']], ['mrt', ['stations']], ['stations', ['not', 'mrt']]]
['highly', ['inaccessible']]
['inaccessible', ['highly']]
['hard', ['so']]
['get', ['there']]
['mrt', ['stations']]
['stations', ['not', 'mrt']]
Review Number :  36
[['bane', []], ['existence', []]]
['bane', []]
['existence', []]
Review Number :  37
[['cookhouse', []], ['book', ['get', 'everyday']], ['everyday', ['book']]]
[['troublesome', ['most', 'thing']], ['thing', ['troublesome', 'discussing']], ['discussing', ['thing', 'what']], ['lunch', []]]
[['sidenote', []], ['菜贩', ['canteenb']], ['@', []], ['canteen', []], ['canteenb', ['菜贩', '@canteen', 'bad']], ['bad', ['canteenb', 'real']], ['real', ['bad']], ['bad', ['canteenb', 'real']]]
['cookhouse', []]
['book', ['get', 'everyday']]
['everyday', ['book']]
['troublesome', ['most', 'thing']]
['thing', ['troublesome', 'discussing']]
['discussing', ['thing', 'what']]
['lunch', []]
['sidenote', []]
['菜贩', ['canteenb']]
['@', []]
['canteen', []]
['ca

In [37]:
final_sent

Unnamed: 0,Avg_sent,Freq
staff,0.909091,11
medical,1.200000,10
people,1.250000,10
bad,1.058824,8
n't,1.300000,8
...,...,...
directions,0.000000,1
expect,1.000000,1
whole,1.000000,1
process,1.000000,1
