In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer 
import stanza
stanza.download('en') # download English model
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.2.0.json: 128kB [00:00, 14.5MB/s]                    
2021-03-03 16:38:00 INFO: Downloading default packages for language: en (English)...
2021-03-03 16:38:01 INFO: File exists: /Users/vibhukrovvidi/stanza_resources/en/default.zip.
2021-03-03 16:38:07 INFO: Finished downloading models and saved to /Users/vibhukrovvidi/stanza_resources.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vibhukrovvidi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vibhukrovvidi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/vibhukrovvidi/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
def feature_extraction(txt, nlp):
    try:
        txt = txt.lower()
    except:
        pass;

    sentList = nltk.sent_tokenize(txt)

    retlist = [];
    
    for line in sentList:
        txt_list = nltk.word_tokenize(line)
        taggedList = nltk.pos_tag(txt_list)
        
        newwordList = []
        flag = 0
        for i in range(0,len(taggedList)-1):
            if(taggedList[i][1]=="NN" and taggedList[i+1][1]=="NN"):
                newwordList.append(taggedList[i][0]+taggedList[i+1][0])
                flag=1
            else:
                if(flag==1):
                    flag=0
                    continue
                newwordList.append(taggedList[i][0])
                if(i==len(taggedList)-2):
                    newwordList.append(taggedList[i+1][0])
        finaltxt = ' '.join(word for word in newwordList)
    
    
        stop_words = set(stopwords.words('english'))
        new_txt_list = nltk.word_tokenize(finaltxt)
        wordsList = [w for w in new_txt_list if not w in stop_words]
        taggedList = nltk.pos_tag(wordsList)
        
        doc = nlp(finaltxt)
        dep_node = []
        try:
            for dep_edge in doc.sentences[0].dependencies:
                dep_node.append([dep_edge[2].text, dep_edge[0].id, dep_edge[1]])
            for i in range(0, len(dep_node)):
                if (int(dep_node[i][1]) != 0):
                    dep_node[i][1] = newwordList[(int(dep_node[i][1]) - 1)]
        except:
            pass;
        
        #print(dep_node)
        
        featureList = []
        categories = []
        for i in taggedList:
            if(i[1]=='JJ' or i[1]=='NN' or i[1]=='JJR' or i[1]=='NNS' or i[1]=='RB'):
                featureList.append(list(i))
                categories.append(i[0])
        #print(featureList)
        #print(categories)
        
        
        fcluster = []
        for i in featureList:
            filist = []
            for j in dep_node:
                if((j[0]==i[0] or j[1]==i[0]) and (j[2] in ["nsubj", "acl:relcl", "obj", "dobj", "agent", "advmod", "amod", "neg", "prep_of", "acomp", "xcomp", "compound"])):
                    if(j[0]==i[0]):
                        filist.append(j[1])
                    else:
                        filist.append(j[0])
            fcluster.append([i[0], filist])
        print(fcluster) 
        
        # Remove all features with no sentiment word:
        
        retlist.append(fcluster)
    return retlist;
    

In [3]:
def do_extraction(df, nlp, content_str = "Content"):
    idx = 0;
    review_list = df[content_str].to_list()
    feat_count = dict()
    feat_sent = dict()
    #nlp = stanza.Pipeline('en')
    
    # Replace "" with nan's for removal
    df[content_str].replace('', np.nan, inplace=True)
    df.dropna(subset=[content_str], inplace=True)
    print(" Processing : " , df.shape[0], "rows of data")
    for review in review_list:
        print("Review Number : ", idx);
        idx += 1;
        if idx >= df.shape[0]:
            break;
        try:
            output = feature_extraction(review, nlp);
        except:
            pass;
        for sent in output:
            for pair in sent:
                print(pair)
                if pair[0] in feat_sent:
                    if pair[1] is not None:
                        flist = feat_sent[pair[0]]
                        if isinstance(pair[1], list):
                            for i in pair[1]:
                                flist.append(i)
                        else:
                            flist.append(pair[1])
                        feat_sent[pair[0]] = flist;
                else:
                    if pair[1] is not None:
                        flist = pair[1]
                    else:
                        flist = list()
                    feat_sent[pair[0]] = flist;
                
                if pair[0] in feat_count:
                    feat_count[pair[0]] = feat_count[pair[0]] + 1;
                else:
                    feat_count[pair[0]] = 1

    #print(feat_count);
    return feat_count, feat_sent;

In [4]:
def get_sentiment(a, b, nlp):

    sentiment_score = dict()

    # Delete features with no descriptors
    cob = b.copy()
    for feat in cob.keys():
        #print(cob[feat])
        if cob[feat] == []:
            del b[feat]

    # Run pre-built sentiment score and take avg of all descriptors
    for f in b.keys():
        print(f);
        ssum = 0;
        for g in b[f]:
            try:
                doc = nlp(g);

                for i in doc.sentences:

                        #print(i.sentiment)
                        ssum += i.sentiment;
            except:
                pass;

        sentiment_score[f] = ssum / len(b[f])

        adf = pd.DataFrame.from_dict(a, orient='index', columns=['Freq'])
    adf.sort_values(by="Freq", ascending=False, inplace = True)



    avg_sent = pd.DataFrame.from_dict(sentiment_score, orient='index', columns=["Avg_sent"])

    final_sent = avg_sent.merge(adf, left_index=True, right_index=True)
    final_sent.sort_values(by="Freq", ascending=False, inplace=True)
    return final_sent;

In [5]:
rdr = pd.read_csv('../ScrapedOutput/cmpb.csv')

nlp = stanza.Pipeline('en')

a, b = do_extraction(rdr, nlp)
final_sent = get_sentiment(a, b, nlp)

2021-03-03 16:38:08 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| sentiment | sstplus   |
| ner       | ontonotes |

2021-03-03 16:38:08 INFO: Use device: cpu
2021-03-03 16:38:08 INFO: Loading: tokenize
2021-03-03 16:38:08 INFO: Loading: pos
2021-03-03 16:38:08 INFO: Loading: lemma
2021-03-03 16:38:08 INFO: Loading: depparse
2021-03-03 16:38:09 INFO: Loading: sentiment
2021-03-03 16:38:09 INFO: Loading: ner
2021-03-03 16:38:10 INFO: Done loading processors!


 Processing :  52 rows of data
Review Number :  0
[['professional', ['very', 'people']], ['people', ['professional', 'there']]]
[['recommended⭐⭐⭐⭐⭐', ['kind']], ['patient', ['very', 'kind']], ['kind', ['recommended⭐⭐⭐⭐⭐', 'patient', 'respectful']], ['respectful', ['kind']], ['..', []], ['smooth', ['very']], ['medical', [14]], ['check-up', []]]
['professional', ['very', 'people']]
['people', ['professional', 'there']]
['recommended⭐⭐⭐⭐⭐', ['kind']]
['patient', ['very', 'kind']]
['kind', ['recommended⭐⭐⭐⭐⭐', 'patient', 'respectful']]
['respectful', ['kind']]
['..', []]
['smooth', ['very']]
['medical', [14]]
['check-up', []]
Review Number :  1
[['reviews', ['other', 'suggest']], ['people', ['friendly', 'here']], ['adequately', ['friendly']], ['friendly', ['people', 'adequately']], ['anywhere', ['else']], ['else', ['anywhere']], ['sg', []]]
[['place', ['clean']], ['really', ['clean']], ['clean', ['place', 'really']], ['efficient', []]]
['reviews', ['other', 'suggest']]
['people', ['friendl

[['mo', ['seemed']], ['hand', ['other']], ['si', []], ['pehbuay', ['buaysong']], ['buaysong', ['pehbuay', 'seemed']]]
[['probably', ['this']], ['…', []]]
['ok', []]
['lah', []]
['review', []]
['visit', []]
['january', []]
['maybe', ['sikit']]
['sikit', ['so', 'maybe']]
['date', []]
['nsf', ['staff']]
['staff', ['nsf', 'ok']]
['ok', ['staff']]
['typical', ['bochap']]
['bochap', ['typical']]
['happy', []]
['bird', []]
['tio', ['switch', 'vocation']]
['switch', ['tio']]
['vocation', ['tio']]
['mo', ['seemed']]
['hand', ['other']]
['si', []]
['pehbuay', ['buaysong']]
['buaysong', ['pehbuay', 'seemed']]
['probably', ['this']]
['…', []]
Review Number :  14
[['tbh', []], ['bad', ['not', 'that']], ['place', []], ['visit', []], ['contrary', []], ['others', ['saying']]]
['tbh', []]
['bad', ['not', 'that']]
['place', []]
['visit', []]
['contrary', []]
['others', ['saying']]
Review Number :  15
[['enlisting', ['here']], ['sure', ['make']], ['medical', ['conditions']], ['conditions', ['medical', 'd

[['highly', ['inaccessible']], ['inaccessible', ['highly']]]
[['hard', ['so']], ['get', ['there']], ['mrt', ['stations']], ['stations', ['not', 'mrt']]]
['highly', ['inaccessible']]
['inaccessible', ['highly']]
['hard', ['so']]
['get', ['there']]
['mrt', ['stations']]
['stations', ['not', 'mrt']]
Review Number :  36
[['bane', []], ['existence', []]]
['bane', []]
['existence', []]
Review Number :  37
[['cookhouse', []], ['book', ['get', 'everyday']], ['everyday', ['book']]]
[['troublesome', ['most', 'thing']], ['thing', ['troublesome', 'discussing']], ['discussing', ['thing', 'what']], ['lunch', []]]
[['sidenote', []], ['菜贩', ['canteenb']], ['@', []], ['canteen', []], ['canteenb', ['菜贩', '@canteen', 'bad']], ['bad', ['canteenb', 'real']], ['real', ['bad']], ['bad', ['canteenb', 'real']]]
['cookhouse', []]
['book', ['get', 'everyday']]
['everyday', ['book']]
['troublesome', ['most', 'thing']]
['thing', ['troublesome', 'discussing']]
['discussing', ['thing', 'what']]
['lunch', []]
['siden

In [6]:
final_sent

Unnamed: 0,Avg_sent,Freq
staff,0.909091,11
medical,1.200000,10
people,1.250000,10
bad,1.058824,8
n't,1.300000,8
...,...,...
directions,0.000000,1
expect,1.000000,1
whole,1.000000,1
process,1.000000,1


In [7]:
fdr = pd.read_csv('../ScrapedOutput/nationalservicesg_comments.csv')
nlp = stanza.Pipeline('en')

c, d = do_extraction(fdr[0:50], nlp, "body")
final_sent2 = get_sentiment(c, d, nlp)

2021-03-03 16:39:31 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| sentiment | sstplus   |
| ner       | ontonotes |

2021-03-03 16:39:31 INFO: Use device: cpu
2021-03-03 16:39:31 INFO: Loading: tokenize
2021-03-03 16:39:31 INFO: Loading: pos
2021-03-03 16:39:31 INFO: Loading: lemma
2021-03-03 16:39:31 INFO: Loading: depparse
2021-03-03 16:39:32 INFO: Loading: sentiment
2021-03-03 16:39:32 INFO: Loading: ner
2021-03-03 16:39:34 INFO: Done loading processors!
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stabl

 Processing :  50 rows of data
Review Number :  0
[['mo', []]]
[['ever', ['have']], ['haunt', ['you']], ['age', []]]
['mo', []]
['ever', ['have']]
['haunt', ['you']]
['age', []]
Review Number :  1
[['legit', ['means']], ['rsi', ['ba']], ['ba', ['rsi']]]
[['u', ['want']], ['wait', ['want']], ['bookout', ['you']], ['also', ['diff']], ['diff', ['also']], ['superiors', []]]
[['rsi', ['just']]]
['legit', ['means']]
['rsi', ['ba']]
['ba', ['rsi']]
['u', ['want']]
['wait', ['want']]
['bookout', ['you']]
['also', ['diff']]
['diff', ['also']]
['superiors', []]
['rsi', ['just']]
Review Number :  2
[['mean', ['i']], ['really', ['worried']], ['bad', ['impression']], ['impression', ['bad']], ['weekend', []], ['let', ['them', 'know']], ['’', []], ['bookout', ['you']], ['keng', ['time']], ['time', ['keng', 'trying']]]
[['mean', ['i']], ['health', ['neglect']], ['impression', []], ['’', []], ['superiors', []]]
['mean', ['i']]
['really', ['worried']]
['bad', ['impression']]
['impression', ['bad']]
['we

[['’', []], ['even', ['possible']], ['possible', ['even']]]
['’', []]
['even', ['possible']]
['possible', ['even']]
Review Number :  20
[['draw', ['had', 'ic']], ['ic', ['draw', 'new']], ['new', ['ic']], ['ic', ['draw', 'new']], ['change', []], ['particulars', []]]
[['icdepartment/cmpb', []], ['enquire', []], ['ur', []], ['ic', ['ic']], ['valid', ['relevant', 'reason']], ['reason', ['valid']], ['relevant', ['valid', 'do']], ['document', []]]
['draw', ['had', 'ic']]
['ic', ['draw', 'new']]
['new', ['ic']]
['ic', ['draw', 'new']]
['change', []]
['particulars', []]
['icdepartment/cmpb', []]
['enquire', []]
['ur', []]
['ic', ['ic']]
['valid', ['relevant', 'reason']]
['reason', ['valid']]
['relevant', ['valid', 'do']]
['document', []]
Review Number :  21
[['long', ['not', 'enough']], ['enough', ['long']], ['complain', []]]
[['guy', ['encountered']], ['didnt', []], ['complain', ['he']], ['abt', []], ['collectiontime', []], ['id', ['assume']], ['<', []], ['=15mins', []]]
['long', ['not', 'eno

[['engineers', []], ['first', ['weeks']], ['weeks', ['first']], ['nee', []], ['soon', ['camp']], ['camp', ['soon']], ['counter', ['training']], ['training', ['counter', 'ied', 'undergoing']]]
[['badge', ['cied', 'get']], ['weeks', []]]
[['learn', ['you']], ['ignite', []], ['live', ['charges']], ['charges', ['live', 'prepare']], ['time', []]]
[['outfields', []], ['packsense', ['fieldpack']], ['outfield', ['training']], ['training', ['outfield', 'is']]]
[['average', ['day']], ['day', ['average']], ['time', []], ['last', ['year']], ['year', ['last']]]
[['high', ['keys']], ['keys', ['high', 'include']], ['demo', ['firing']], ['live', ['firing']], ['firing', ['demo', 'live', 'include']], ['summex', []]]
[['weeks', []], ['split', []], ['specializations', ['further']], ['armour', []], ['field', []], ['plant', []], ['bridging', []], ['cbrd', []], ['eod', []]]
[['batch', ['bridging', 'cbre']], ['bridging', ['batch']], ['cbre', ['batch']], ['batch', ['bridging', 'cbre']]]
[['next', ['batch']], [

[['personally', ['rejected']], ['command', ['going', 'school']], ['school', ['command']], ['leadership', ['system']], ['system', ['leadership', 'hated']], ['saf', []]]
[['proven', ['right']], ['right', ['proven']], ['unit', ['go']], ['men', ['seeing', 'lead']], ['better', ['lead']], ['commanders', ['it']], ['sgt', []], ['officer', []]]
['thing', ['same', 'applies']]
['applies', ['thing']]
['result', ['also']]
['also', ['result']]
['ippt', []]
['sit', ['then', 'then', 'test', 'appraisal']]
['test', ['sit']]
['peer', ['appraisal']]
['appraisal', ['then', 'peer', 'sit']]
['leadership', ['qualities']]
['qualities', ['leadership', 'impt']]
['impt', ['qualities', 'also']]
['also', ['impt']]
['form', ['just']]
['initiative', ['taking']]
['point', ['not', 'find']]
['section', ['mates']]
['annoying', ['find']]
['fieldcamp', ['leadership']]
['leadership', ['fieldcamp', 'important']]
['important', ['leadership', 'more']]
['normal', ['days']]
['days', ['normal']]
['commanders', ['keep']]
['really'

In [8]:
final_sent2

Unnamed: 0,Avg_sent,Freq
ic,1.152174,21
also,0.857143,7
bmt,1.000000,6
back,1.250000,5
u,1.000000,5
...,...,...
reminder,1.500000,1
friendly,1.000000,1
care,1.000000,1
mental,1.000000,1
