In [1]:
import pandas as pd
import numpy as np
import nltk
import regex
import re
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer 
from sklearn.feature_extraction.text import TfidfVectorizer
import stanza
stanza.download('en') # download English model
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.2.0.json: 128kB [00:00, 64.1MB/s]
2021-03-13 16:08:45 INFO: Downloading default packages for language: en (English)...
2021-03-13 16:08:46 INFO: File exists: C:\Users\vibkr\stanza_resources\en\default.zip.
2021-03-13 16:08:49 INFO: Finished downloading models and saved to C:\Users\vibkr\stanza_resources.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vibkr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vibkr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\vibkr\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
def feature_extraction(txt, nlp):

    sentList = nltk.sent_tokenize(txt)

    retlist = [];
    
    for line in sentList:
        
        txt_list = nltk.word_tokenize(line)
        taggedList = nltk.pos_tag(txt_list)
        
        newwordList = []
        flag = 0
        for i in range(0,len(taggedList)-1):
            if(taggedList[i][1]=="NN" and taggedList[i+1][1]=="NN"):
                newwordList.append(taggedList[i][0]+taggedList[i+1][0])
                flag=1
            else:
                if(flag==1):
                    flag=0
                    continue
                newwordList.append(taggedList[i][0])
                if(i==len(taggedList)-2):
                    newwordList.append(taggedList[i+1][0])
        finaltxt = ' '.join(word for word in newwordList)
    
    
        stop_words = set(stopwords.words('english'))
        new_txt_list = nltk.word_tokenize(finaltxt)
        wordsList = [w for w in new_txt_list if not w in stop_words]
        taggedList = nltk.pos_tag(wordsList)
        
        doc = nlp(finaltxt)
        dep_node = []
        try:
            for dep_edge in doc.sentences[0].dependencies:
                dep_node.append([dep_edge[2].text, dep_edge[0].id, dep_edge[1]])
            for i in range(0, len(dep_node)):
                if (int(dep_node[i][1]) != 0):
                    dep_node[i][1] = newwordList[(int(dep_node[i][1]) - 1)]
        except:
            pass;
        
        #print(dep_node)
        
        featureList = []
        categories = []
        for i in taggedList:
            if(i[1]=='JJ' or i[1]=='NN' or i[1]=='JJR' or i[1]=='NNS' or i[1]=='RB'):
                featureList.append(list(i))
                categories.append(i[0])
        #print(featureList)
        #print(categories)
        
        
        fcluster = []
        for i in featureList:
            filist = []
            for j in dep_node:
                if((j[0]==i[0] or j[1]==i[0]) and (j[2] in [
                    # Different types of words that are identified as potential features
                    "nsubj",
                    "acl:relcl",
                    "obj",
                    "dobj",
                    "agent",
                    "advmod",
                    "amod",
                    "neg",
                    "prep_of",
                    "acomp",
                    "xcomp",
                    "compound"
                ])):
                    if(j[0]==i[0]):
                        filist.append(j[1])
                    else:
                        filist.append(j[0])
            fcluster.append([i[0], filist])
        print(fcluster) 
        
        # Remove all features with no sentiment word:
        
        retlist.append(fcluster)
    return retlist;
    

In [3]:
def do_extraction(df, nlp, feat_count, feat_sent, content_str = "Content"):
    idx = 0;
    # Replace "" with nan's for removal
    df[content_str].replace('', np.nan, inplace=True)
    df.dropna(subset=[content_str], inplace=True)
    review_list = df[content_str].to_list()
    #feat_count = dict()
    #feat_sent = dict()
    #nlp = stanza.Pipeline('en')
    
    
    
    
    print(" Processing : " , df.shape[0], "rows of data")
    for review in tqdm(review_list):
        print("Review Number : ", idx);
        
        # Some data pre-processing
        
        review = review.lower()
        
        # Merge hyphenated words
        separate = review.split('-')
        review = ''.join(separate)
        
        # Remove non-alphabets
        review = re.sub(r'[^a-z\s\t.!?]', '', review)
        
        idx += 1;
        if idx >= df.shape[0]:
            break;
        try:
            output = feature_extraction(review, nlp);
        except:
            pass;
        for sent in output:
            for pair in sent:
                print(pair)
                if pair[0] in feat_sent:
                    if pair[1] is not None:
                        flist = feat_sent[pair[0]]
                        if isinstance(pair[1], list):
                            for i in pair[1]:
                                flist.append(i)
                        else:
                            flist.append(pair[1])
                        feat_sent[pair[0]] = flist;
                else:
                    if pair[1] is not None:
                        flist = pair[1]
                    else:
                        flist = list()
                    feat_sent[pair[0]] = flist;
                
                if pair[0] in feat_count:
                    feat_count[pair[0]] = feat_count[pair[0]] + 1;
                else:
                    feat_count[pair[0]] = 1
    
    return feat_count, feat_sent;

In [4]:
def get_sentiment(feat_count, feat_sent, nlp):

    sentiment_score = dict()

    # Delete features with no descriptors
    cob = feat_sent.copy()
    for feat in cob.keys():
        #print(cob[feat])
        
        if cob[feat] == []:
            del feat_sent[feat]
        else:
            feat_sent[feat] = ' ,'.join(feat_sent[feat])

    # Run pre-built sentiment score and take avg of all descriptors
    for f in tqdm(feat_sent.keys()):
        print("Calculating Sentiment for: ", f);
        ssum = 0;
        for g in feat_sent[f]:
            try:
                doc = nlp(g);

                for i in doc.sentences:

                        #print(i.sentiment)
                        ssum += i.sentiment;
            except:
                pass;

        sentiment_score[f] = ssum / len(b[f])

    adf = pd.DataFrame.from_dict(feat_count, orient='index', columns=['Freq'])
    adf.sort_values(by="Freq", ascending=False, inplace = True)

    

    avg_sent = pd.DataFrame.from_dict(sentiment_score, orient='index', columns=["Avg_sent"])
    desc_words = pd.DataFrame.from_dict(feat_sent, orient="index", columns=["Descriptors"])
    
    avg_sent = avg_sent.merge(desc_words, left_index=True, right_index=True)
    
    
    final_sent = avg_sent.merge(adf, left_index=True, right_index=True)
    final_sent.sort_values(by="Freq", ascending=False, inplace=True)
    return final_sent;

In [5]:
rdr = pd.read_csv('../ScrapedOutput/cmpb.csv')

nlp = stanza.Pipeline('en')
a = dict()
b = dict()
a, b = do_extraction(rdr, nlp, a, b)

2021-03-13 16:08:49 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| sentiment | sstplus   |
| ner       | ontonotes |

2021-03-13 16:08:49 INFO: Use device: cpu
2021-03-13 16:08:49 INFO: Loading: tokenize
2021-03-13 16:08:49 INFO: Loading: pos
2021-03-13 16:08:50 INFO: Loading: lemma
2021-03-13 16:08:50 INFO: Loading: depparse
2021-03-13 16:08:50 INFO: Loading: sentiment
2021-03-13 16:08:50 INFO: Loading: ner
2021-03-13 16:08:51 INFO: Done loading processors!
  0%|                                                                                           | 0/52 [00:00<?, ?it/s]

 Processing :  52 rows of data
Review Number :  0
[['professional', ['very', 'people']], ['people', ['professional', 'there']]]


  2%|█▌                                                                                 | 1/52 [00:00<00:25,  1.99it/s]

[['patient', ['very', 'recommended']], ['kind', ['respectful']], ['respectful', ['kind']], ['..', []], ['smooth', ['very']], ['medical', ['checkup']], ['checkup', ['medical']]]
['professional', ['very', 'people']]
['people', ['professional', 'there']]
['patient', ['very', 'recommended']]
['kind', ['respectful']]
['respectful', ['kind']]
['..', []]
['smooth', ['very']]
['medical', ['checkup']]
['checkup', ['medical']]
Review Number :  1


  4%|███▏                                                                               | 2/52 [00:00<00:23,  2.13it/s]

[['reviews', ['other', 'suggest']], ['people', ['suggest', 'here']], ['adequately', ['friendlylike']], ['anywhere', ['friendlylike', 'else']], ['else', ['anywhere']], ['sg', []]]
[['place', ['clean']], ['really', ['clean']], ['clean', ['place', 'really']], ['efficient', []]]
['reviews', ['other', 'suggest']]
['people', ['suggest', 'here']]
['adequately', ['friendlylike']]
['anywhere', ['friendlylike', 'else']]
['else', ['anywhere']]
['sg', []]
['place', ['clean']]
['really', ['clean']]
['clean', ['place', 'really']]
['efficient', []]
Review Number :  2
[['place', ['fine']], ['overall', ['fine']], ['fine', ['place', 'overall']]]
[['good', ['experience']], ['experience', ['good']]]
[['tip', ['just']], ['u', []], ['medical', ['checkup']], ['checkup', ['medical']]]


  6%|████▊                                                                              | 3/52 [00:01<00:26,  1.87it/s]

[['dont', []], ['late', ['go']], ['else', []], ['u', ['have']], ['back', ['come']], ['day', ['other']], ['complete', ['rest']], ['rest', ['complete']]]
['place', ['fine']]
['overall', ['fine']]
['fine', ['place', 'overall']]
['good', ['experience']]
['experience', ['good']]
['tip', ['just']]
['u', []]
['medical', ['checkup']]
['checkup', ['medical']]
['dont', []]
['late', ['go']]
['else', []]
['u', ['have']]
['back', ['come']]
['day', ['other']]
['complete', ['rest']]
['rest', ['complete']]
Review Number :  3


  8%|██████▍                                                                            | 4/52 [00:02<00:27,  1.74it/s]

[['nscheckup', []], ['checkuptoday', []], ['august', ['am']], ['gateentrance', ['checkcounter']], ['entrancesecurity', ['checkcounter']], ['checkcounter', ['gateentrance', 'entrancesecurity', 'securitycheck']], ['stickerpass', ['take']], ['walk', ['just']], ['gate', []], ['dont', []], ['thinking', []], ['scan', ['i', 'the']]]
['nscheckup', []]
['checkuptoday', []]
['august', ['am']]
['gateentrance', ['checkcounter']]
['entrancesecurity', ['checkcounter']]
['checkcounter', ['gateentrance', 'entrancesecurity', 'securitycheck']]
['stickerpass', ['take']]
['walk', ['just']]
['gate', []]
['dont', []]
['thinking', []]
['scan', ['i', 'the']]
Review Number :  4
[['dontbother', ['end']], ['youll', []], ['end', ['dontbother', 'waiting']], ['hours', ['more']], ['realise', ['only', 'that', 'youre', 'person']], ['youre', ['realise']], ['last', ['person']], ['person', ['last', 'realise']], ['line', []]]


 10%|███████▉                                                                           | 5/52 [00:02<00:30,  1.56it/s]

[['doctors', ['care']], ['really', ['care', 'have']], ['fair', ['which']], ['dont', []], ['really', ['care', 'have']], ['choice', ['have']]]
[['place', ['waste']], ['complete', ['waste']], ['waste', ['place', 'complete']], ['space', []], ['time', []]]
['dontbother', ['end']]
['youll', []]
['end', ['dontbother', 'waiting']]
['hours', ['more']]
['realise', ['only', 'that', 'youre', 'person']]
['youre', ['realise']]
['last', ['person']]
['person', ['last', 'realise']]
['line', []]
['doctors', ['care']]
['really', ['care', 'have']]
['fair', ['which']]
['dont', []]
['really', ['care', 'have']]
['choice', ['have']]
['place', ['waste']]
['complete', ['waste']]
['waste', ['place', 'complete']]
['space', []]
['time', []]
Review Number :  5


 12%|█████████▌                                                                         | 6/52 [00:03<00:21,  2.10it/s]

[['nsf', ['reviews']], ['reviews', ['nsf']], ['lol', []]]
['nsf', ['reviews']]
['reviews', ['nsf']]
['lol', []]
Review Number :  6
[['extremely', ['long']], ['long', ['extremely', 'time']], ['time', ['long', 'takes']], ['due', []], ['waiting', []]]


 13%|███████████▏                                                                       | 7/52 [00:03<00:18,  2.45it/s]

[['overall', ['waste']], ['complete', ['waste']], ['waste', ['overall', 'complete']], ['time', []]]
['extremely', ['long']]
['long', ['extremely', 'time']]
['time', ['long', 'takes']]
['due', []]
['waiting', []]
['overall', ['waste']]
['complete', ['waste']]
['waste', ['overall', 'complete']]
['time', []]
Review Number :  7
[['staff', ['professional']], ['professional', ['staff']], ['knows', []]]


 15%|████████████▊                                                                      | 8/52 [00:03<00:19,  2.27it/s]

[['idk', []], ['bad', ['reviews']], ['personal', ['experienceeveryone']], ['experienceeveryone', ['personal']], ['helpful', ['very', 'is']], ['initiative', ['take']], ['help', ['me']]]
['staff', ['professional']]
['professional', ['staff']]
['knows', []]
['idk', []]
['bad', ['reviews']]
['personal', ['experienceeveryone']]
['experienceeveryone', ['personal']]
['helpful', ['very', 'is']]
['initiative', ['take']]
['help', ['me']]
Review Number :  8
[['unfriendly', ['staff']], ['staff', ['unfriendly']]]
[['guards', ['doing']], ['job', ['doing']], ['staff', ['keep']], ['stuff', ['more', 'do', 'suppose']], ['suppose', ['they', 'not', 'stuff']]]
[['absolutely', ['atrocious']], ['atrocious', ['absolutely']]]
[['woman', ['is']], ['keeps', ['who', 'vac', 'changing']], ['tone', ['changing']], ['talks', ['she']], ['people', ['threatens']], ['people', ['threatens']]]


 17%|██████████████▎                                                                    | 9/52 [00:04<00:26,  1.63it/s]

[['always', ['removes']], ['mask', ['removes']], ['talk', []], ['people', []], ['expressions', ['show']]]
['unfriendly', ['staff']]
['staff', ['unfriendly']]
['guards', ['doing']]
['job', ['doing']]
['staff', ['keep']]
['stuff', ['more', 'do', 'suppose']]
['suppose', ['they', 'not', 'stuff']]
['absolutely', ['atrocious']]
['atrocious', ['absolutely']]
['woman', ['is']]
['keeps', ['who', 'vac', 'changing']]
['tone', ['changing']]
['talks', ['she']]
['people', ['threatens']]
['people', ['threatens']]
['always', ['removes']]
['mask', ['removes']]
['talk', []]
['people', []]
['expressions', ['show']]
Review Number :  9


 19%|███████████████▊                                                                  | 10/52 [00:05<00:20,  2.01it/s]

[['staff', ['rude']], ['medical', ['screening']], ['screening', ['medical', 'station']], ['station', ['screening']], ['weight', []], ['extremely', ['rude']], ['rude', ['staff', 'extremely']], ['unfriendly', []]]
['staff', ['rude']]
['medical', ['screening']]
['screening', ['medical', 'station']]
['station', ['screening']]
['weight', []]
['extremely', ['rude']]
['rude', ['staff', 'extremely']]
['unfriendly', []]
Review Number :  10
[['staff', ['impatient']], ['serious', ['impatient']], ['impatient', ['staff', 'not', 'serious']]]


 21%|█████████████████▎                                                                | 11/52 [00:05<00:19,  2.14it/s]

[['undesirably', ['long']], ['long', ['undesirably', 'times']], ['times', ['long', 'waiting']]]
[['cmpb', ['recommend']], ['friend', []]]
['staff', ['impatient']]
['serious', ['impatient']]
['impatient', ['staff', 'not', 'serious']]
['undesirably', ['long']]
['long', ['undesirably', 'times']]
['times', ['long', 'waiting']]
['cmpb', ['recommend']]
['friend', []]
Review Number :  11
[['sent', ['just', 'son']], ['son', ['sent']], ['preenlistment', ['enlistmentcheckup']], ['enlistmentcheckup', ['preenlistment']], ['morning', []]]
[['guard', ['give']], ['give', ['guard', 'instructions']], ['clear', ['instructions']], ['instructions', ['clear', 'give']]]


 23%|██████████████████▉                                                               | 12/52 [00:06<00:22,  1.78it/s]

[['son', ['got']], ['alight', ['got']], ['couldnt', []], ['drive', ['we', 'in']]]
[['helloplease', []], ['train', ['army']], ['army', ['train']]]
['sent', ['just', 'son']]
['son', ['sent']]
['preenlistment', ['enlistmentcheckup']]
['enlistmentcheckup', ['preenlistment']]
['morning', []]
['guard', ['give']]
['give', ['guard', 'instructions']]
['clear', ['instructions']]
['instructions', ['clear', 'give']]
['son', ['got']]
['alight', ['got']]
['couldnt', []]
['drive', ['we', 'in']]
['helloplease', []]
['train', ['army']]
['army', ['train']]
Review Number :  12
[['inconvenient', ['most', 'locations']], ['locations', ['inconvenient', 'seen']], ['ever', ['seen']]]
[['terrible', ['directions']], ['directions', ['terrible']]]
[['rude', ['staff']], ['staff', ['rude']]]


 25%|████████████████████▌                                                             | 13/52 [00:06<00:21,  1.77it/s]

[['expect', ['process']], ['whole', ['process']], ['process', ['whole', 'expect']], ['take', ['hours']], ['hours', ['take']]]
[]
['inconvenient', ['most', 'locations']]
['locations', ['inconvenient', 'seen']]
['ever', ['seen']]
['terrible', ['directions']]
['directions', ['terrible']]
['rude', ['staff']]
['staff', ['rude']]
['expect', ['process']]
['whole', ['process']]
['process', ['whole', 'expect']]
['take', ['hours']]
['hours', ['take']]
Review Number :  13
[['ok', []], ['lah', []], ['review', []], ['visitjanuary', []], ['maybe', ['sikit']], ['sikit', ['so', 'maybe']], ['date', []]]
[['nsf', ['staff']], ['staff', ['nsf', 'ok']], ['ok', ['staff', 'typical']], ['typical', ['ok']], ['bochap', []], ['happy', []], ['bird', []], ['tio', ['switch', 'vocation']], ['switch', ['tio']], ['vocation', ['tio']]]


 27%|██████████████████████                                                            | 14/52 [00:07<00:22,  1.67it/s]

[['mo', ['seemed']], ['hand', ['other']], ['si', []], ['pehbuay', ['buaysong']], ['buaysong', ['pehbuay', 'seemed']]]
[['probably', ['this']]]
['ok', []]
['lah', []]
['review', []]
['visitjanuary', []]
['maybe', ['sikit']]
['sikit', ['so', 'maybe']]
['date', []]
['nsf', ['staff']]
['staff', ['nsf', 'ok']]
['ok', ['staff', 'typical']]
['typical', ['ok']]
['bochap', []]
['happy', []]
['bird', []]
['tio', ['switch', 'vocation']]
['switch', ['tio']]
['vocation', ['tio']]
['mo', ['seemed']]
['hand', ['other']]
['si', []]
['pehbuay', ['buaysong']]
['buaysong', ['pehbuay', 'seemed']]
['probably', ['this']]
Review Number :  14


 29%|███████████████████████▋                                                          | 15/52 [00:07<00:18,  2.04it/s]

[['tbh', []], ['bad', ['not', 'that']], ['place', []], ['visit', ['contrary']], ['contrary', ['visit']], ['others', ['saying']]]
['tbh', []]
['bad', ['not', 'that']]
['place', []]
['visit', ['contrary']]
['contrary', ['visit']]
['others', ['saying']]
Review Number :  15
[['please', []], ['sure', ['make']], ['medical', ['conditions']], ['conditions', ['medical', 'declare']]]
[['severe', []], ['minor', []], ['medical', ['officer']], ['officer', ['medical']], ['checkup', []]]


 31%|█████████████████████████▏                                                        | 16/52 [00:08<00:19,  1.83it/s]

[['believe', ['you', 'not']], ['well', ['fare']], ['combatpesfitbmt', []], ['medical', ['specialistletter']], ['specialistletter', ['medical', 'get']]]
['please', []]
['sure', ['make']]
['medical', ['conditions']]
['conditions', ['medical', 'declare']]
['severe', []]
['minor', []]
['medical', ['officer']]
['officer', ['medical']]
['checkup', []]
['believe', ['you', 'not']]
['well', ['fare']]
['combatpesfitbmt', []]
['medical', ['specialistletter']]
['specialistletter', ['medical', 'get']]
Review Number :  16
[['guards', ['rude']], ['rude', ['guards', 'very']]]
[['ask', ['question']], ['question', ['ask', 'ignore']], ['ignore', ['they', 'question', 'you']]]


 33%|██████████████████████████▊                                                       | 17/52 [00:08<00:17,  2.01it/s]

[['rest', ['nice']], ['staff', []], ['nice', ['rest', 'though']], ['friendly', []]]
['guards', ['rude']]
['rude', ['guards', 'very']]
['ask', ['question']]
['question', ['ask', 'ignore']]
['ignore', ['they', 'question', 'you']]
['rest', ['nice']]
['staff', []]
['nice', ['rest', 'though']]
['friendly', []]
Review Number :  17


 35%|████████████████████████████▍                                                     | 18/52 [00:09<00:13,  2.45it/s]

[['overall', ['experience']], ['great', ['experience']], ['experience', ['overall', 'great']], ['medic', ['professional']], ['professional', ['medic']], ['blooddraw', []]]
['overall', ['experience']]
['great', ['experience']]
['experience', ['overall', 'great']]
['medic', ['professional']]
['professional', ['medic']]
['blooddraw', []]
Review Number :  18
[['kind', ['very', 'people']], ['people', ['kind']], ['cmpd', []], ['medical', ['check']], ['check', ['medical', 'up']]]


 37%|█████████████████████████████▉                                                    | 19/52 [00:09<00:12,  2.67it/s]

[['constantly', ['greeted']], ['smile', []], ['patience', []]]
['kind', ['very', 'people']]
['people', ['kind']]
['cmpd', []]
['medical', ['check']]
['check', ['medical', 'up']]
['constantly', ['greeted']]
['smile', []]
['patience', []]
Review Number :  19
[['dont', []], ['bully', ['me']]]
['dont', []]
['bully', ['me']]
Review Number :  20
[['meh', []]]


 40%|█████████████████████████████████                                                 | 21/52 [00:09<00:07,  3.92it/s]

[['staff', ['nice']], ['pretty', ['nice']], ['nice', ['staff', 'pretty']]]
['meh', []]
['staff', ['nice']]
['pretty', ['nice']]
['nice', ['staff', 'pretty']]
Review Number :  21
[['lousy', ['dk']], ['service', ['dk']], ['dk', ['lousy', 'service', 'help']], ['help', ['dk', 'people']], ['people', ['help']]]


 42%|██████████████████████████████████▋                                               | 22/52 [00:09<00:08,  3.73it/s]

[['ask', ['question', 'taiji']], ['question', ['ask']], ['also', ['say']], ['dont', []], ['ask', ['question', 'taiji']], ['taiji', ['not', 'ask']]]
['lousy', ['dk']]
['service', ['dk']]
['dk', ['lousy', 'service', 'help']]
['help', ['dk', 'people']]
['people', ['help']]
['ask', ['question', 'taiji']]
['question', ['ask']]
['also', ['say']]
['dont', []]
['ask', ['question', 'taiji']]
['taiji', ['not', 'ask']]
Review Number :  22
[['tuesday', []], ['negative', ['reviews']], ['reviews', ['negative']], ['share', ['i', 'opinion']], ['quick', ['opinion']], ['personal', ['opinion']], ['opinion', ['quick', 'personal', 'share']]]


 44%|████████████████████████████████████▎                                             | 23/52 [00:10<00:10,  2.75it/s]

[['perhaps', ['varies']], ['varies', ['perhaps', 'it']], ['person', []], ['person', []], ['trip', ['great']], ['cmpb', []], ['ultimately', ['great']], ['great', ['trip', 'ultimately']], ['definitely', ['experience']], ['memorable', ['experience']], ['experience', ['definitely', 'memorable']]]
[['medical', []]]
['tuesday', []]
['negative', ['reviews']]
['reviews', ['negative']]
['share', ['i', 'opinion']]
['quick', ['opinion']]
['personal', ['opinion']]
['opinion', ['quick', 'personal', 'share']]
['perhaps', ['varies']]
['varies', ['perhaps', 'it']]
['person', []]
['person', []]
['trip', ['great']]
['cmpb', []]
['ultimately', ['great']]
['great', ['trip', 'ultimately']]
['definitely', ['experience']]
['memorable', ['experience']]
['experience', ['definitely', 'memorable']]
['medical', []]
Review Number :  23


 48%|███████████████████████████████████████▍                                          | 25/52 [00:10<00:06,  3.95it/s]

[['medical', ['check']], ['check', ['medical', 'place']], ['place', ['check']], ['saf', []]]
['medical', ['check']]
['check', ['medical', 'place']]
['place', ['check']]
['saf', []]
Review Number :  24
[['others', []], ['dirt', []]]
['others', []]
['dirt', []]
Review Number :  25


 50%|█████████████████████████████████████████                                         | 26/52 [00:11<00:07,  3.57it/s]

[['preenlistment', ['screening']], ['sessions', ['few', 'counselling', 'screening']], ['charge', ['answer']]]
[['inconvenient', ['location']], ['location', ['inconvenient']]]
['preenlistment', ['screening']]
['sessions', ['few', 'counselling', 'screening']]
['charge', ['answer']]
['inconvenient', ['location']]
['location', ['inconvenient']]
Review Number :  26
[['cookhouse', []]]


 52%|██████████████████████████████████████████▌                                       | 27/52 [00:11<00:06,  3.68it/s]

[['nsf', ['need']], ['meagre', ['pay']], ['pay', ['meagre']]]
['cookhouse', []]
['nsf', ['need']]
['meagre', ['pay']]
['pay', ['meagre']]
Review Number :  27
[['officerattitude', ['good']], ['good', ['officerattitude', 'not']], ['patience', []], ['service', []]]


 54%|████████████████████████████████████████████▏                                     | 28/52 [00:11<00:07,  3.02it/s]

[['dont', []], ['understand', ['i', 'just', 'what']], ['use', ['he', 'tone']], ['unfriendly', ['tone']], ['tone', ['unfriendly', 'use']], ['repeat', []], ['language', []]]
['officerattitude', ['good']]
['good', ['officerattitude', 'not']]
['patience', []]
['service', []]
['dont', []]
['understand', ['i', 'just', 'what']]
['use', ['he', 'tone']]
['unfriendly', ['tone']]
['tone', ['unfriendly', 'use']]
['repeat', []]
['language', []]
Review Number :  28


 58%|███████████████████████████████████████████████▎                                  | 30/52 [00:12<00:06,  3.38it/s]

[['idk', []], ['many', ['so', 'people']], ['people', ['many', 'give']], ['negative', ['reviews']], ['reviews', ['negative', 'give']], ['medical', ['check']], ['check', ['medical', 'up']], ['staff', ['friendly']], ['friendly', ['staff']], ['nsf', ['cool']], ['cool', ['nsf']], ['overall', ['had']], ['good', ['experience']], ['experience', ['good', 'had', 'there']]]
['idk', []]
['many', ['so', 'people']]
['people', ['many', 'give']]
['negative', ['reviews']]
['reviews', ['negative', 'give']]
['medical', ['check']]
['check', ['medical', 'up']]
['staff', ['friendly']]
['friendly', ['staff']]
['nsf', ['cool']]
['cool', ['nsf']]
['overall', ['had']]
['good', ['experience']]
['experience', ['good', 'had', 'there']]
Review Number :  29
[['gold', ['star']], ['star', ['gold']], ['public', ['service']], ['service', ['public']]]
['gold', ['star']]
['star', ['gold']]
['public', ['service']]
['service', ['public']]
Review Number :  30
[['hrs', ['here']], ['form', []], ['meeting', []]]
[['even', ['bor

 62%|██████████████████████████████████████████████████▍                               | 32/52 [00:12<00:05,  3.70it/s]

[['cold', ['conditioning']], ['air', ['conditioning']], ['wifi', []]]
['hrs', ['here']]
['form', []]
['meeting', []]
['even', ['bored']]
['really', ['bored']]
['cold', ['conditioning']]
['air', ['conditioning']]
['wifi', []]
Review Number :  31
[['accessible', ['not']]]
['accessible', ['not']]
Review Number :  32


 65%|█████████████████████████████████████████████████████▌                            | 34/52 [00:13<00:03,  4.99it/s]

[['extremely', ['poor']], ['poor', ['extremely', 'customerservice']], ['rude', []], ['customerservice', ['poor']]]
['extremely', ['poor']]
['poor', ['extremely', 'customerservice']]
['rude', []]
['customerservice', ['poor']]
Review Number :  33
[['worst', ['day']], ['day', ['worst']], ['life', []]]
['worst', ['day']]
['day', ['worst']]
['life', []]
Review Number :  34


 67%|███████████████████████████████████████████████████████▏                          | 35/52 [00:13<00:03,  5.18it/s]

[['wooo', []], ['real', ['edgy']], ['edgy', ['real', 'ziyuan']], ['ziyuan', ['edgy', 'writer']], ['novel', ['writer']], ['writer', ['ziyuan', 'you', 'novel']]]
['wooo', []]
['real', ['edgy']]
['edgy', ['real', 'ziyuan']]
['ziyuan', ['edgy', 'writer']]
['novel', ['writer']]
['writer', ['ziyuan', 'you', 'novel']]
Review Number :  35
[['highly', ['inaccessible']], ['inaccessible', ['highly']]]


 71%|██████████████████████████████████████████████████████████▎                       | 37/52 [00:13<00:02,  5.72it/s]

[['hard', ['so']], ['get', ['there']], ['mrt', ['stations']], ['stations', ['not', 'mrt']]]
['highly', ['inaccessible']]
['inaccessible', ['highly']]
['hard', ['so']]
['get', ['there']]
['mrt', ['stations']]
['stations', ['not', 'mrt']]
Review Number :  36
[['bane', []], ['existence', []]]
['bane', []]
['existence', []]
Review Number :  37
[['cookhouse', []], ['book', ['get', 'everyday']], ['everyday', ['book']]]
[['troublesome', ['most', 'thing']], ['thing', ['troublesome', 'discussing']], ['whats', ['discussing']], ['lunch', []]]


 75%|█████████████████████████████████████████████████████████████▌                    | 39/52 [00:14<00:03,  4.12it/s]

[['sidenotecanteen', ['canteenb']], ['canteenb', ['sidenotecanteen', 'bad']], ['bad', ['canteenb', 'real']], ['real', ['bad']], ['bad', ['canteenb', 'real']]]
['cookhouse', []]
['book', ['get', 'everyday']]
['everyday', ['book']]
['troublesome', ['most', 'thing']]
['thing', ['troublesome', 'discussing']]
['whats', ['discussing']]
['lunch', []]
['sidenotecanteen', ['canteenb']]
['canteenb', ['sidenotecanteen', 'bad']]
['bad', ['canteenb', 'real']]
['real', ['bad']]
['bad', ['canteenb', 'real']]
Review Number :  38
[['people', ['go']], ['even', ['go']], ['place', []], ['middle', []], ['nowhere', []]]
['people', ['go']]
['even', ['go']]
['place', []]
['middle', []]
['nowhere', []]
Review Number :  39


 77%|███████████████████████████████████████████████████████████████                   | 40/52 [00:14<00:03,  3.85it/s]

[['officertalk', ['prepared']], ['money', ['own']], ['hard', ['very']], ['middle', []], ['village', []], ['something', []]]
['officertalk', ['prepared']]
['money', ['own']]
['hard', ['very']]
['middle', []]
['village', []]
['something', []]
Review Number :  40
[['rude', ['staff']], ['staff', ['rude']]]
['rude', ['staff']]
['staff', ['rude']]
Review Number :  41


 83%|███████████████████████████████████████████████████████████████████▊              | 43/52 [00:15<00:01,  5.29it/s]

[['inaccessible', ['need']], ['need', ['inaccessible']], ['h', []], ['time', ['travel']], ['waste', ['more']], ['time', ['travel']]]
['inaccessible', ['need']]
['need', ['inaccessible']]
['h', []]
['time', ['travel']]
['waste', ['more']]
['time', ['travel']]
Review Number :  42
[['far', ['so', 'away']], ['away', ['far']], ['middle', []], ['nowhere', []]]
['far', ['so', 'away']]
['away', ['far']]
['middle', []]
['nowhere', []]
Review Number :  43


 87%|██████████████████████████████████████████████████████████████████████▉           | 45/52 [00:15<00:01,  5.78it/s]

[['bad', ['very', 'troopers', 'very', 'attitude']], ['security', ['troopers']], ['troopers', ['bad', 'security', 'have']], ['bad', ['very', 'troopers', 'very', 'attitude']], ['attitude', ['bad', 'have']], ['towards', []], ['public', []]]
['bad', ['very', 'troopers', 'very', 'attitude']]
['security', ['troopers']]
['troopers', ['bad', 'security', 'have']]
['bad', ['very', 'troopers', 'very', 'attitude']]
['attitude', ['bad', 'have']]
['towards', []]
['public', []]
Review Number :  44
[['middle', []], ['nowhere', []]]
['middle', []]
['nowhere', []]
Review Number :  45


 88%|████████████████████████████████████████████████████████████████████████▌         | 46/52 [00:15<00:01,  5.38it/s]

[['place', []], ['well', ['kept']], ['people', ['kept']], ['unbelievably', ['rude']], ['rude', ['unbelievably']]]
['place', []]
['well', ['kept']]
['people', ['kept']]
['unbelievably', ['rude']]
['rude', ['unbelievably']]
Review Number :  46
[['bad', ['service']], ['service', ['bad']]]
['bad', ['service']]
['service', ['bad']]
Review Number :  47


 92%|███████████████████████████████████████████████████████████████████████████▋      | 48/52 [00:15<00:00,  6.12it/s]

[['interestingly', ['enough']], ['enough', ['interestingly', 'removed']], ['negative', ['reviews']], ['reviews', ['negative']]]
['interestingly', ['enough']]
['enough', ['interestingly', 'removed']]
['negative', ['reviews']]
['reviews', ['negative']]
Review Number :  48
[['bad', ['service']], ['service', ['bad']]]
['bad', ['service']]
['service', ['bad']]
Review Number :  49
[['sheat', []]]


 98%|████████████████████████████████████████████████████████████████████████████████▍ | 51/52 [00:16<00:00,  3.11it/s]

[['dirty', ['pigs']], ['pigs', ['dirty', 'training']], ['step', []], ['minefields', []]]
['sheat', []]
['dirty', ['pigs']]
['pigs', ['dirty', 'training']]
['step', []]
['minefields', []]
Review Number :  50
[['gncpresent', []]]
['gncpresent', []]
Review Number :  51





In [None]:
fin = get_sentiment(a, b, nlp)

  0%|                                                                                          | 0/197 [00:00<?, ?it/s]

Calculating Sentiment for:  professional


  1%|▍                                                                                 | 1/197 [00:01<03:32,  1.08s/it]

Calculating Sentiment for:  people


  1%|▊                                                                                 | 2/197 [00:05<09:16,  2.85s/it]

Calculating Sentiment for:  patient


  2%|█▏                                                                                | 3/197 [00:05<06:09,  1.91s/it]

Calculating Sentiment for:  kind


  3%|██                                                                                | 5/197 [00:07<03:29,  1.09s/it]

Calculating Sentiment for:  respectful
Calculating Sentiment for:  smooth


  3%|██▍                                                                               | 6/197 [00:07<02:32,  1.25it/s]

Calculating Sentiment for:  medical


  4%|██▉                                                                               | 7/197 [00:11<05:54,  1.87s/it]

Calculating Sentiment for:  checkup


  4%|███▎                                                                              | 8/197 [00:12<04:46,  1.52s/it]

Calculating Sentiment for:  reviews


  5%|███▋                                                                              | 9/197 [00:14<05:40,  1.81s/it]

Calculating Sentiment for:  adequately


  5%|████                                                                             | 10/197 [00:15<04:30,  1.45s/it]

Calculating Sentiment for:  anywhere


  6%|████▌                                                                            | 11/197 [00:16<03:56,  1.27s/it]

Calculating Sentiment for:  else


  6%|████▉                                                                            | 12/197 [00:16<03:07,  1.01s/it]

Calculating Sentiment for:  place


  7%|█████▎                                                                           | 13/197 [00:17<03:16,  1.07s/it]

Calculating Sentiment for:  really


  7%|█████▊                                                                           | 14/197 [00:19<03:47,  1.24s/it]

Calculating Sentiment for:  clean


  8%|██████▏                                                                          | 15/197 [00:20<03:17,  1.08s/it]

Calculating Sentiment for:  overall


  8%|██████▌                                                                          | 16/197 [00:21<03:34,  1.19s/it]

Calculating Sentiment for:  fine


  9%|██████▉                                                                          | 17/197 [00:22<03:04,  1.03s/it]

Calculating Sentiment for:  good


  9%|███████▍                                                                         | 18/197 [00:24<04:02,  1.35s/it]

Calculating Sentiment for:  experience


 10%|████████▏                                                                        | 20/197 [00:27<03:39,  1.24s/it]

Calculating Sentiment for:  tip


 11%|█████████                                                                        | 22/197 [00:27<01:58,  1.47it/s]

Calculating Sentiment for:  u
Calculating Sentiment for:  late
Calculating Sentiment for:  back


 12%|█████████▍                                                                       | 23/197 [00:27<01:32,  1.88it/s]

Calculating Sentiment for:  day


 12%|█████████▊                                                                       | 24/197 [00:28<01:30,  1.92it/s]

Calculating Sentiment for:  complete


 13%|██████████▎                                                                      | 25/197 [00:28<01:42,  1.67it/s]

Calculating Sentiment for:  rest


 13%|██████████▋                                                                      | 26/197 [00:29<01:41,  1.68it/s]

Calculating Sentiment for:  august
Calculating Sentiment for:  gateentrance


 14%|███████████▌                                                                     | 28/197 [00:30<01:19,  2.12it/s]

Calculating Sentiment for:  entrancesecurity


 15%|███████████▉                                                                     | 29/197 [00:30<01:23,  2.01it/s]

Calculating Sentiment for:  checkcounter


 16%|████████████▋                                                                    | 31/197 [00:32<01:56,  1.42it/s]

Calculating Sentiment for:  stickerpass


 16%|█████████████▏                                                                   | 32/197 [00:33<01:33,  1.77it/s]

Calculating Sentiment for:  walk


 17%|█████████████▉                                                                   | 34/197 [00:33<01:02,  2.63it/s]

Calculating Sentiment for:  scan
Calculating Sentiment for:  dontbother
Calculating Sentiment for:  end


 18%|██████████████▍                                                                  | 35/197 [00:34<01:24,  1.93it/s]

Calculating Sentiment for:  hours


 18%|██████████████▊                                                                  | 36/197 [00:34<01:17,  2.07it/s]

Calculating Sentiment for:  realise


 19%|███████████████▏                                                                 | 37/197 [00:35<01:41,  1.58it/s]

Calculating Sentiment for:  youre


 19%|███████████████▌                                                                 | 38/197 [00:36<01:26,  1.83it/s]

Calculating Sentiment for:  last


 20%|████████████████                                                                 | 39/197 [00:36<01:15,  2.10it/s]

Calculating Sentiment for:  person


 21%|████████████████▊                                                                | 41/197 [00:37<01:03,  2.47it/s]

Calculating Sentiment for:  doctors
Calculating Sentiment for:  fair


 22%|█████████████████▋                                                               | 43/197 [00:37<00:47,  3.27it/s]

Calculating Sentiment for:  choice


 22%|██████████████████                                                               | 44/197 [00:39<01:51,  1.37it/s]

Calculating Sentiment for:  waste
Calculating Sentiment for:  time


 23%|██████████████████▌                                                              | 45/197 [00:40<02:10,  1.17it/s]

Calculating Sentiment for:  nsf


 23%|██████████████████▉                                                              | 46/197 [00:41<02:19,  1.08it/s]

Calculating Sentiment for:  extremely


 24%|███████████████████▎                                                             | 47/197 [00:42<02:09,  1.16it/s]

Calculating Sentiment for:  long


 24%|███████████████████▋                                                             | 48/197 [00:43<02:39,  1.07s/it]

Calculating Sentiment for:  staff


 25%|████████████████████▏                                                            | 49/197 [00:47<04:26,  1.80s/it]

Calculating Sentiment for:  bad


 25%|████████████████████▌                                                            | 50/197 [00:52<07:08,  2.91s/it]

Calculating Sentiment for:  personal


In [None]:
fin
fin.to_csv("onenineseven.csv")

In [None]:
def get_tfidf_features(df, content_str = "Content", min_ = 2, max_ = 0.5, ngramrange = (1,2)):
    
    # Replace "" with nan's for removal
    #df[content_str].replace('', np.nan, inplace=True)
    #df.dropna(subset=[content_str], inplace=True)
    #stop_words = set(stopwords.words('english'))
    #df[content_str] = df[content_str].apply(lambda x: ''.join([word for word in x.split() if word not in (stop_words)]))
    
    review_list = df[content_str].to_list()
    #feat_count = dict()
    #feat_sent = dict()
    #nlp = stanza.Pipeline('en')

        
    #print(review_list)
    tfidf = TfidfVectorizer(min_df = min_, max_df = max_, ngram_range = ngramrange);
    features = tfidf.fit_transform(review_list);
    q = pd.DataFrame(features.todense(), columns=tfidf.get_feature_names())
    
    return list(q.columns)

In [None]:
def refine_features(originaldf, sentimentdf):
    tfidf_output = get_tfidf_features(originaldf)
    sentimentdf = sentimentdf.reset_index()
    ft_extract = set(sentimentdf['index']);
    tfidf_extract = set(tfidf_output)
    
    intersecting_features = ft_extract.intersection(tfidf_extract)
    
    return_df = sentimentdf
    return_df = return_df.loc[return_df['index'].isin(list(intersecting_features))]
    print("Number of extracted features:")
    print("Initial = ", len(ft_extract), " TFIDF = ", len(intersecting_features), " Final after intersection = ", return_df.shape[0])
    return return_df
    
    

In [None]:
rt = refine_features(rdr, fin)
rt

In [None]:
#rt.to_csv("finalFeatures.csv")