# Full Workflow

Feature Extraction + Word embedding + Clustering + Evaluation

# 1. Feature Extraction

In [1]:
import pandas as pd
import numpy as np
import nltk
import regex
import re
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer 
from sklearn.feature_extraction.text import TfidfVectorizer
import stanza
stanza.download('en') # download English model
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.2.0.json: 128kB [00:00, 1.46MB/s]
2021-03-25 23:44:52 INFO: Downloading default packages for language: en (English)...
2021-03-25 23:44:54 INFO: File exists: C:\Users\TzeMin\stanza_resources\en\default.zip.
2021-03-25 23:45:00 INFO: Finished downloading models and saved to C:\Users\TzeMin\stanza_resources.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\TzeMin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\TzeMin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\TzeMin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
def feature_extraction(txt, nlp):
    
    sentList = nltk.sent_tokenize(txt)
    retlist = [];
    
    for line in sentList:
        txt_list = nltk.word_tokenize(line)
        taggedList = nltk.pos_tag(txt_list)
        newwordList = []
        flag = 0
        
        for i in range(0,len(taggedList)-1):
            if(taggedList[i][1]=="NN" and taggedList[i+1][1]=="NN"):
                newwordList.append(taggedList[i][0]+taggedList[i+1][0])
                flag=1
            else:
                if(flag==1):
                    flag=0
                    continue
                newwordList.append(taggedList[i][0])
                if(i==len(taggedList)-2):
                    newwordList.append(taggedList[i+1][0])
        finaltxt = ' '.join(word for word in newwordList)
    
        stop_words = set(stopwords.words('english'))
        new_txt_list = nltk.word_tokenize(finaltxt)
        wordsList = [w for w in new_txt_list if not w in stop_words]
        taggedList = nltk.pos_tag(wordsList)
        
        doc = nlp(finaltxt)
        dep_node = []
        try:
            for dep_edge in doc.sentences[0].dependencies:
                dep_node.append([dep_edge[2].text, dep_edge[0].id, dep_edge[1]])
            for i in range(0, len(dep_node)):
                if (int(dep_node[i][1]) != 0):
                    dep_node[i][1] = newwordList[(int(dep_node[i][1]) - 1)]
        except:
            pass;
                
        featureList = []
        categories = []
        for i in taggedList:
            if(i[1]=='JJ' or i[1]=='NN' or i[1]=='JJR' or i[1]=='NNS' or i[1]=='RB'):
                featureList.append(list(i))
                categories.append(i[0])        
        
        fcluster = []
        for i in featureList:
            filist = []
            for j in dep_node:
                if((j[0]==i[0] or j[1]==i[0]) and (j[2] in [
                    # Different types of words that are identified as potential features
                    "nsubj",
                    #"acl:relcl",
                    "obj",
                    "dobj",
                    #"agent",
                    #"advmod",
                    #"amod",
                    #"neg",
                    #"prep_of",
                    #"acomp",
                    #"xcomp",
                    #"compound"
                ])):
                    if(j[0]==i[0]):
                        filist.append(j[1])
                    else:
                        filist.append(j[0])
            fcluster.append([i[0], filist])
        print(fcluster) 
        
        # Remove all features with no sentiment word:
        retlist.append(fcluster)
    return retlist;

def do_extraction(df, nlp, feat_count, feat_sent, content_str = "Content"):
    idx = 0;
    # Replace "" with nan's for removal
    df[content_str].replace('', np.nan, inplace=True)
    df.dropna(subset=[content_str], inplace=True)
    review_list = df[content_str].to_list()     
    print(" Processing : " , df.shape[0], "rows of data")
    
    for review in tqdm(review_list):
        print("Review Number : ", idx);
        
        # Some data pre-processing
        review = review.lower()
        
        # Merge hyphenated words
        separate = review.split('-')
        review = ''.join(separate)
        
        # Remove non-alphabets
        review = re.sub(r'[^a-z\s\t]', '', review)
        
        idx += 1;
        if idx >= df.shape[0]:
            break;
        try:
            output = feature_extraction(review, nlp);
        except:
            pass;
        for sent in output:
            for pair in sent:
                print(pair)
                if pair[0] in feat_sent:
                    if pair[1] is not None:
                        flist = feat_sent[pair[0]]
                        if isinstance(pair[1], list):
                            for i in pair[1]:
                                flist.append(i)
                        else:
                            flist.append(pair[1])
                        feat_sent[pair[0]] = flist;
                else:
                    if pair[1] is not None:
                        flist = pair[1]
                    else:
                        flist = list()
                    feat_sent[pair[0]] = flist;
                
                if pair[0] in feat_count:
                    feat_count[pair[0]] = feat_count[pair[0]] + 1;
                else:
                    feat_count[pair[0]] = 1
    
    return feat_count, feat_sent;

def get_sentiment(feat_count, feat_sent, nlp):

    sentiment_score = dict()

    # Delete features with no descriptors
    cob = feat_sent.copy()
    for feat in cob.keys():
        if cob[feat] == []:
            del feat_sent[feat]
        else:
            feat_sent[feat] = ' ,'.join(feat_sent[feat])

    # Run pre-built sentiment score and take avg of all descriptors
    for f in tqdm(feat_sent.keys()):
        print("Calculating Sentiment for: ", f);
        ssum = 0;
        for g in feat_sent[f]:
            try:
                doc = nlp(g);
                for i in doc.sentences:
                        ssum += i.sentiment;
            except:
                pass;
        sentiment_score[f] = ssum / len(b[f])

    adf = pd.DataFrame.from_dict(feat_count, orient='index', columns=['Freq'])
    adf.sort_values(by="Freq", ascending=False, inplace = True)

    avg_sent = pd.DataFrame.from_dict(sentiment_score, orient='index', columns=["Avg_sent"])
    desc_words = pd.DataFrame.from_dict(feat_sent, orient="index", columns=["Descriptors"])
    
    avg_sent = avg_sent.merge(desc_words, left_index=True, right_index=True)
    final_sent = avg_sent.merge(adf, left_index=True, right_index=True)
    final_sent.sort_values(by="Freq", ascending=False, inplace=True)
    return final_sent;

In [3]:
rdr = pd.read_csv('../../output/scraped-ns/cmpb.csv')

nlp = stanza.Pipeline('en')
a = dict()
b = dict()
a, b = do_extraction(rdr, nlp, a, b)

2021-03-25 23:45:00 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| sentiment | sstplus   |
| ner       | ontonotes |

2021-03-25 23:45:00 INFO: Use device: cpu
2021-03-25 23:45:00 INFO: Loading: tokenize
2021-03-25 23:45:00 INFO: Loading: pos
2021-03-25 23:45:01 INFO: Loading: lemma
2021-03-25 23:45:01 INFO: Loading: depparse
2021-03-25 23:45:01 INFO: Loading: sentiment
2021-03-25 23:45:02 INFO: Loading: ner
2021-03-25 23:45:03 INFO: Done loading processors!
  0%|                                                                                           | 0/52 [00:00<?, ?it/s]

 Processing :  52 rows of data
Review Number :  0


  2%|█▌                                                                                 | 1/52 [00:00<00:29,  1.74it/s]

[['professional', []], ['people', ['recommended']], ['patient', []], ['kind', ['recommended']], ['respectful', []], ['smooth', []], ['medical', []], ['checkup', []]]
['professional', []]
['people', ['recommended']]
['patient', []]
['kind', ['recommended']]
['respectful', []]
['smooth', []]
['medical', []]
['checkup', []]
Review Number :  1


  4%|███▏                                                                               | 2/52 [00:01<00:26,  1.88it/s]

[['reviews', ['suggest']], ['people', ['suggest']], ['adequately', []], ['anywhere', []], ['else', []], ['sg', []], ['place', ['clean']], ['really', []], ['clean', ['place']], ['efficient', []]]
['reviews', ['suggest']]
['people', ['suggest']]
['adequately', []]
['anywhere', []]
['else', []]
['sg', []]
['place', ['clean']]
['really', []]
['clean', ['place']]
['efficient', []]
Review Number :  2


  6%|████▊                                                                              | 3/52 [00:01<00:27,  1.79it/s]

[['place', ['experience']], ['overall', []], ['fine', []], ['good', []], ['experience', ['place']], ['tip', []], ['u', ['have']], ['medical', []], ['checkupdont', []], ['late', []], ['else', []], ['u', ['have']], ['back', []], ['day', []], ['complete', ['rest']], ['rest', ['complete']]]
['place', ['experience']]
['overall', []]
['fine', []]
['good', []]
['experience', ['place']]
['tip', []]
['u', ['have']]
['medical', []]
['checkupdont', []]
['late', []]
['else', []]
['u', ['have']]
['back', []]
['day', []]
['complete', ['rest']]
['rest', ['complete']]
Review Number :  3


  8%|██████▍                                                                            | 4/52 [00:02<00:31,  1.54it/s]

[['nscheckup', []], ['checkuptoday', []], ['gateentrance', []], ['entrancesecurity', []], ['checkcounter', []], ['stickerpass', ['take']], ['walk', []], ['gatedont', []], ['mei', ['stood']], ['thinking', []], ['scan', ['i', 'the']]]
['nscheckup', []]
['checkuptoday', []]
['gateentrance', []]
['entrancesecurity', []]
['checkcounter', []]
['stickerpass', ['take']]
['walk', []]
['gatedont', []]
['mei', ['stood']]
['thinking', []]
['scan', ['i', 'the']]
Review Number :  4


 10%|███████▉                                                                           | 5/52 [00:03<00:35,  1.34it/s]

[['dontbother', ['end']], ['youll', []], ['end', ['dontbother']], ['hours', []], ['realise', ['that', 'youre']], ['youre', ['realise']], ['last', []], ['person', ['care']], ['line', []], ['doctors', []], ['really', []], ['fair', ['which']], ['dont', []], ['really', []], ['choice', ['have']], ['place', ['waste']], ['complete', []], ['waste', ['place']], ['space', []], ['time', []]]
['dontbother', ['end']]
['youll', []]
['end', ['dontbother']]
['hours', []]
['realise', ['that', 'youre']]
['youre', ['realise']]
['last', []]
['person', ['care']]
['line', []]
['doctors', []]
['really', []]
['fair', ['which']]
['dont', []]
['really', []]
['choice', ['have']]
['place', ['waste']]
['complete', []]
['waste', ['place']]
['space', []]
['time', []]
Review Number :  5


 12%|█████████▌                                                                         | 6/52 [00:03<00:25,  1.77it/s]

[['nsf', []], ['reviews', []], ['lol', []]]
['nsf', []]
['reviews', []]
['lol', []]
Review Number :  6


 13%|███████████▏                                                                       | 7/52 [00:03<00:21,  2.05it/s]

[['extremely', []], ['long', []], ['time', ['takes']], ['due', []], ['overall', []], ['complete', []], ['waste', ['waiting']], ['time', ['takes']]]
['extremely', []]
['long', []]
['time', ['takes']]
['due', []]
['overall', []]
['complete', []]
['waste', ['waiting']]
['time', ['takes']]
Review Number :  7


 15%|████████████▊                                                                      | 8/52 [00:04<00:23,  1.88it/s]

[['staff', ['professional']], ['professional', ['staff']], ['idk', []], ['bad', []], ['reviews', ['what']], ['personal', []], ['experienceeveryone', []], ['helpful', ['is']], ['initiative', ['take']], ['help', ['me']]]
['staff', ['professional']]
['professional', ['staff']]
['idk', []]
['bad', []]
['reviews', ['what']]
['personal', []]
['experienceeveryone', []]
['helpful', ['is']]
['initiative', ['take']]
['help', ['me']]
Review Number :  8


 17%|██████████████▎                                                                    | 9/52 [00:05<00:29,  1.45it/s]

[['unfriendly', []], ['staff', ['keep']], ['guards', ['doing']], ['job', ['doing']], ['staff', ['keep']], ['stuff', ['do']], ['suppose', ['which', 'they']], ['absolutely', []], ['atrocious', []], ['woman', ['is']], ['keeps', ['who']], ['tone', ['changing']], ['talks', ['she']], ['people', ['threatens']], ['people', ['threatens']], ['always', []], ['mask', ['removes']], ['talk', []], ['people', ['threatens']], ['expressions', ['show']]]
['unfriendly', []]
['staff', ['keep']]
['guards', ['doing']]
['job', ['doing']]
['staff', ['keep']]
['stuff', ['do']]
['suppose', ['which', 'they']]
['absolutely', []]
['atrocious', []]
['woman', ['is']]
['keeps', ['who']]
['tone', ['changing']]
['talks', ['she']]
['people', ['threatens']]
['people', ['threatens']]
['always', []]
['mask', ['removes']]
['talk', []]
['people', ['threatens']]
['expressions', ['show']]
Review Number :  9


 19%|███████████████▊                                                                  | 10/52 [00:05<00:24,  1.74it/s]

[['staff', ['rude']], ['medical', []], ['screening', []], ['station', []], ['weight', []], ['extremely', []], ['rude', ['staff']], ['unfriendly', []]]
['staff', ['rude']]
['medical', []]
['screening', []]
['station', []]
['weight', []]
['extremely', []]
['rude', ['staff']]
['unfriendly', []]
Review Number :  10


 21%|█████████████████▎                                                                | 11/52 [00:06<00:21,  1.93it/s]

[['staff', ['impatient']], ['serious', []], ['impatient', ['staff']], ['undesirably', []], ['long', []], ['times', ['recommend']], ['cmpb', ['recommend']], ['friend', []]]
['staff', ['impatient']]
['serious', []]
['impatient', ['staff']]
['undesirably', []]
['long', []]
['times', ['recommend']]
['cmpb', ['recommend']]
['friend', []]
Review Number :  11


 23%|██████████████████▉                                                               | 12/52 [00:07<00:23,  1.70it/s]

[['sent', ['son']], ['son', ['sent', 'got']], ['preenlistment', []], ['enlistmentcheckup', []], ['morning', []], ['guard', ['give']], ['entrancecouldnt', []], ['clear', []], ['instructions', ['give']], ['son', ['sent', 'got']], ['alight', []], ['couldnt', []], ['drive', ['we']], ['hello', []], ['please', []], ['train', ['army']], ['army', ['train']]]
['sent', ['son']]
['son', ['sent', 'got']]
['preenlistment', []]
['enlistmentcheckup', []]
['morning', []]
['guard', ['give']]
['entrancecouldnt', []]
['clear', []]
['instructions', ['give']]
['son', ['sent', 'got']]
['alight', []]
['couldnt', []]
['drive', ['we']]
['hello', []]
['please', []]
['train', ['army']]
['army', ['train']]
Review Number :  12


 25%|████████████████████▌                                                             | 13/52 [00:07<00:21,  1.79it/s]

[['inconvenient', []], ['locations', []], ['ever', []], ['terrible', []], ['directions', ['seen']], ['staff', ['expect']], ['whole', []], ['process', ['expect']], ['take', ['hours']], ['hours', ['take']]]
['inconvenient', []]
['locations', []]
['ever', []]
['terrible', []]
['directions', ['seen']]
['staff', ['expect']]
['whole', []]
['process', ['expect']]
['take', ['hours']]
['hours', ['take']]
Review Number :  13


 27%|██████████████████████                                                            | 14/52 [00:08<00:23,  1.60it/s]

[['ok', ['staff']], ['lah', []], ['review', []], ['visitjanuary', []], ['maybe', []], ['sikit', []], ['date', []], ['nsf', []], ['staff', ['ok']], ['ok', ['staff']], ['typical', []], ['bochap', []], ['happy', []], ['bird', []], ['tio', ['vocationmo']], ['switch', []], ['vocationmo', ['tio']], ['hand', []], ['si', []], ['pehbuay', []], ['buaysong', []], ['probably', ['this']]]
['ok', ['staff']]
['lah', []]
['review', []]
['visitjanuary', []]
['maybe', []]
['sikit', []]
['date', []]
['nsf', []]
['staff', ['ok']]
['ok', ['staff']]
['typical', []]
['bochap', []]
['happy', []]
['bird', []]
['tio', ['vocationmo']]
['switch', []]
['vocationmo', ['tio']]
['hand', []]
['si', []]
['pehbuay', []]
['buaysong', []]
['probably', ['this']]
Review Number :  14


 29%|███████████████████████▋                                                          | 15/52 [00:08<00:19,  1.89it/s]

[['tbh', []], ['bad', ['its']], ['place', []], ['visit', []], ['contrary', []], ['others', ['saying']]]
['tbh', []]
['bad', ['its']]
['place', []]
['visit', []]
['contrary', []]
['others', ['saying']]
Review Number :  15


 31%|█████████████████████████▏                                                        | 16/52 [00:09<00:21,  1.68it/s]

[['please', []], ['sure', []], ['medical', []], ['conditions', ['declare']], ['severe', []], ['minor', []], ['medical', []], ['officer', []], ['checkup', []], ['fare', ['you']], ['well', []], ['medical', []], ['specialistletter', ['get']]]
['please', []]
['sure', []]
['medical', []]
['conditions', ['declare']]
['severe', []]
['minor', []]
['medical', []]
['officer', []]
['checkup', []]
['fare', ['you']]
['well', []]
['medical', []]
['specialistletter', ['get']]
Review Number :  16


 33%|██████████████████████████▊                                                       | 17/52 [00:09<00:18,  1.89it/s]

[['guards', ['ask']], ['rude', []], ['ask', ['guards', 'question']], ['question', ['ask']], ['ignore', ['they']], ['rest', ['nice']], ['staff', []], ['nice', ['rest']], ['friendly', []]]
['guards', ['ask']]
['rude', []]
['ask', ['guards', 'question']]
['question', ['ask']]
['ignore', ['they']]
['rest', ['nice']]
['staff', []]
['nice', ['rest']]
['friendly', []]
Review Number :  17


 35%|████████████████████████████▍                                                     | 18/52 [00:10<00:15,  2.22it/s]

[['overall', []], ['great', []], ['experience', []], ['medic', ['professional']], ['professional', ['medic']], ['blooddraw', []]]
['overall', []]
['great', []]
['experience', []]
['medic', ['professional']]
['professional', ['medic']]
['blooddraw', []]
Review Number :  18


 38%|███████████████████████████████▌                                                  | 20/52 [00:10<00:10,  3.02it/s]

[['kind', []], ['people', []], ['cmpd', []], ['medical', []], ['check', []], ['constantly', []], ['smile', []], ['patience', []]]
['kind', []]
['people', []]
['cmpd', []]
['medical', []]
['check', []]
['constantly', []]
['smile', []]
['patience', []]
Review Number :  19
[['dont', []], ['bully', ['me']]]
['dont', []]
['bully', ['me']]
Review Number :  20


 40%|█████████████████████████████████                                                 | 21/52 [00:10<00:08,  3.59it/s]

[['meh', []], ['staff', ['nice']], ['pretty', []], ['nice', ['staff']]]
['meh', []]
['staff', ['nice']]
['pretty', []]
['nice', ['staff']]
Review Number :  21


 42%|██████████████████████████████████▋                                               | 22/52 [00:10<00:08,  3.38it/s]

[['lousy', []], ['service', []], ['dk', []], ['help', ['ask']], ['people', ['ask']], ['question', ['ask']], ['also', []], ['dont', []], ['ask', ['help', 'people', 'question', 'taiji']], ['taiji', ['ask']]]
['lousy', []]
['service', []]
['dk', []]
['help', ['ask']]
['people', ['ask']]
['question', ['ask']]
['also', []]
['dont', []]
['ask', ['help', 'people', 'question', 'taiji']]
['taiji', ['ask']]
Review Number :  22


 46%|█████████████████████████████████████▊                                            | 24/52 [00:11<00:09,  2.94it/s]

[['tuesday', []], ['negative', []], ['reviews', []], ['share', ['i', 'opinion']], ['quick', []], ['personal', []], ['opinion', ['share']], ['perhaps', []], ['varies', ['it']], ['person', []], ['person', []], ['trip', ['great']], ['cmpb', []], ['ultimately', []], ['great', ['trip']], ['definitely', []], ['memorable', []], ['experience', []], ['medical', []]]
['tuesday', []]
['negative', []]
['reviews', []]
['share', ['i', 'opinion']]
['quick', []]
['personal', []]
['opinion', ['share']]
['perhaps', []]
['varies', ['it']]
['person', []]
['person', []]
['trip', ['great']]
['cmpb', []]
['ultimately', []]
['great', ['trip']]
['definitely', []]
['memorable', []]
['experience', []]
['medical', []]
Review Number :  23
[['medical', []], ['check', ['place']], ['place', ['check']], ['saf', []]]
['medical', []]
['check', ['place']]
['place', ['check']]
['saf', []]
Review Number :  24


 48%|███████████████████████████████████████▍                                          | 25/52 [00:12<00:07,  3.43it/s]

[['others', []], ['dirt', []]]
['others', []]
['dirt', []]
Review Number :  25


 50%|█████████████████████████████████████████                                         | 26/52 [00:12<00:08,  3.02it/s]

[['preenlistment', []], ['sessions', ['screening']], ['chargeinconvenient', []], ['inconvenientlocation', ['answer']]]
['preenlistment', []]
['sessions', ['screening']]
['chargeinconvenient', []]
['inconvenientlocation', ['answer']]
Review Number :  26


 52%|██████████████████████████████████████████▌                                       | 27/52 [00:12<00:07,  3.18it/s]

[['cookhouse', []], ['nsf', ['need']], ['meagre', []], ['pay', []]]
['cookhouse', []]
['nsf', ['need']]
['meagre', []]
['pay', []]
Review Number :  27


 54%|████████████████████████████████████████████▏                                     | 28/52 [00:13<00:09,  2.55it/s]

[['officerattitude', ['good']], ['good', ['officerattitude']], ['patience', ['understand']], ['servicei', []], ['dont', []], ['use', ['he', 'tone']], ['unfriendly', []], ['tone', ['use']], ['repeat', []], ['language', []]]
['officerattitude', ['good']]
['good', ['officerattitude']]
['patience', ['understand']]
['servicei', []]
['dont', []]
['use', ['he', 'tone']]
['unfriendly', []]
['tone', ['use']]
['repeat', []]
['language', []]
Review Number :  28


 58%|███████████████████████████████████████████████▎                                  | 30/52 [00:14<00:07,  2.78it/s]

[['idk', []], ['many', []], ['people', ['give']], ['negative', []], ['reviews', ['give']], ['medical', []], ['check', []], ['staff', ['friendly']], ['friendly', ['staff']], ['nsf', ['cool']], ['cool', ['nsf']], ['overall', []], ['good', []], ['experience', ['had']]]
['idk', []]
['many', []]
['people', ['give']]
['negative', []]
['reviews', ['give']]
['medical', []]
['check', []]
['staff', ['friendly']]
['friendly', ['staff']]
['nsf', ['cool']]
['cool', ['nsf']]
['overall', []]
['good', []]
['experience', ['had']]
Review Number :  29
[['gold', []], ['star', []], ['public', []], ['service', []]]
['gold', []]
['star', []]
['public', []]
['service', []]
Review Number :  30


 62%|██████████████████████████████████████████████████▍                               | 32/52 [00:14<00:06,  3.11it/s]

[['hrs', []], ['form', []], ['meeting', []], ['even', []], ['really', []], ['bored', []], ['cold', []], ['air', []], ['wifi', []]]
['hrs', []]
['form', []]
['meeting', []]
['even', []]
['really', []]
['bored', []]
['cold', []]
['air', []]
['wifi', []]
Review Number :  31
[['accessible', []]]
['accessible', []]
Review Number :  32


 65%|█████████████████████████████████████████████████████▌                            | 34/52 [00:15<00:04,  3.86it/s]

[['extremely', []], ['poor', []], ['rude', []], ['customerservice', []]]
['extremely', []]
['poor', []]
['rude', []]
['customerservice', []]
Review Number :  33
[['worst', []], ['day', []], ['life', []]]
['worst', []]
['day', []]
['life', []]
Review Number :  34


 67%|███████████████████████████████████████████████████████▏                          | 35/52 [00:15<00:04,  3.94it/s]

[['wooo', []], ['real', []], ['edgy', []], ['ziyuan', ['writer']], ['novel', []], ['writer', ['ziyuan', 'you']]]
['wooo', []]
['real', []]
['edgy', []]
['ziyuan', ['writer']]
['novel', []]
['writer', ['ziyuan', 'you']]
Review Number :  35


 71%|██████████████████████████████████████████████████████████▎                       | 37/52 [00:15<00:03,  4.32it/s]

[['highly', []], ['inaccessible', []], ['hard', []], ['get', []], ['mrt', []], ['stations', []]]
['highly', []]
['inaccessible', []]
['hard', []]
['get', []]
['mrt', []]
['stations', []]
Review Number :  36
[['bane', []], ['existence', []]]
['bane', []]
['existence', []]
Review Number :  37


 73%|███████████████████████████████████████████████████████████▉                      | 38/52 [00:16<00:04,  3.21it/s]

[['cookhouse', []], ['book', []], ['everyday', []], ['troublesome', []], ['thing', ['discussing']], ['whats', ['discussing']], ['sidenotecanteen', []], ['canteenb', ['bad']], ['bad', ['canteenb']], ['real', []], ['bad', ['canteenb']]]
['cookhouse', []]
['book', []]
['everyday', []]
['troublesome', []]
['thing', ['discussing']]
['whats', ['discussing']]
['sidenotecanteen', []]
['canteenb', ['bad']]
['bad', ['canteenb']]
['real', []]
['bad', ['canteenb']]
Review Number :  38


 75%|█████████████████████████████████████████████████████████████▌                    | 39/52 [00:16<00:03,  3.47it/s]

[['people', ['go']], ['even', []], ['placemiddle', []], ['nowhere', []]]
['people', ['go']]
['even', []]
['placemiddle', []]
['nowhere', []]
Review Number :  39


 79%|████████████████████████████████████████████████████████████████▋                 | 41/52 [00:17<00:02,  3.80it/s]

[['officertalk', []], ['money', ['own']], ['hard', []], ['middle', []], ['village', []], ['something', []]]
['officertalk', []]
['money', ['own']]
['hard', []]
['middle', []]
['village', []]
['something', []]
Review Number :  40
[['rude', []], ['staff', []]]
['rude', []]
['staff', []]
Review Number :  41


 83%|███████████████████████████████████████████████████████████████████▊              | 43/52 [00:17<00:02,  4.13it/s]

[['inaccessible', []], ['need', []], ['h', []], ['time', ['travel']], ['waste', ['more']], ['time', ['travel']]]
['inaccessible', []]
['need', []]
['h', []]
['time', ['travel']]
['waste', ['more']]
['time', ['travel']]
Review Number :  42
[['far', []], ['away', []], ['middle', []], ['nowhere', []]]
['far', []]
['away', []]
['middle', []]
['nowhere', []]
Review Number :  43


 87%|██████████████████████████████████████████████████████████████████████▉           | 45/52 [00:17<00:01,  4.43it/s]

[['bad', []], ['security', []], ['troopers', []], ['bad', []], ['attitude', ['have']], ['towards', []], ['public', []]]
['bad', []]
['security', []]
['troopers', []]
['bad', []]
['attitude', ['have']]
['towards', []]
['public', []]
Review Number :  44
[['middle', []], ['nowhere', []]]
['middle', []]
['nowhere', []]
Review Number :  45


 90%|██████████████████████████████████████████████████████████████████████████        | 47/52 [00:18<00:01,  4.90it/s]

[['place', []], ['well', []], ['people', ['kept']], ['unbelievably', []]]
['place', []]
['well', []]
['people', ['kept']]
['unbelievably', []]
Review Number :  46
[['bad', []], ['service', []]]
['bad', []]
['service', []]
Review Number :  47


 94%|█████████████████████████████████████████████████████████████████████████████▎    | 49/52 [00:18<00:00,  5.35it/s]

[['interestingly', []], ['enough', []], ['negative', []], ['reviews', []]]
['interestingly', []]
['enough', []]
['negative', []]
['reviews', []]
Review Number :  48
[['bad', []], ['service', []]]
['bad', []]
['service', []]
Review Number :  49


 98%|████████████████████████████████████████████████████████████████████████████████▍ | 51/52 [00:19<00:00,  2.67it/s]

[['sheat', []], ['dirty', []], ['pigs', []], ['step', []], ['minefields', []]]
['sheat', []]
['dirty', []]
['pigs', []]
['step', []]
['minefields', []]
Review Number :  50
[['gncpresent', []]]
['gncpresent', []]
Review Number :  51





In [4]:
fin = get_sentiment(a, b, nlp)

  0%|                                                                                           | 0/89 [00:00<?, ?it/s]

Calculating Sentiment for:  professional


  1%|▉                                                                                  | 1/89 [00:00<01:17,  1.13it/s]

Calculating Sentiment for:  people


  2%|█▊                                                                                 | 2/89 [00:05<04:35,  3.16s/it]

Calculating Sentiment for:  kind


  3%|██▊                                                                                | 3/89 [00:06<02:58,  2.07s/it]

Calculating Sentiment for:  reviews


  4%|███▋                                                                               | 4/89 [00:07<02:27,  1.74s/it]

Calculating Sentiment for:  place


  6%|████▋                                                                              | 5/89 [00:09<02:33,  1.82s/it]

Calculating Sentiment for:  clean


  7%|█████▌                                                                             | 6/89 [00:09<01:50,  1.33s/it]

Calculating Sentiment for:  good


  8%|██████▌                                                                            | 7/89 [00:11<01:41,  1.24s/it]

Calculating Sentiment for:  experience


  9%|███████▍                                                                           | 8/89 [00:11<01:24,  1.05s/it]

Calculating Sentiment for:  u


 10%|████████▍                                                                          | 9/89 [00:12<01:14,  1.08it/s]

Calculating Sentiment for:  complete


 11%|█████████▏                                                                        | 10/89 [00:12<00:57,  1.37it/s]

Calculating Sentiment for:  rest


 12%|██████████▏                                                                       | 11/89 [00:13<01:01,  1.26it/s]

Calculating Sentiment for:  stickerpass


 13%|███████████                                                                       | 12/89 [00:13<00:49,  1.57it/s]

Calculating Sentiment for:  mei


 15%|███████████▉                                                                      | 13/89 [00:14<00:42,  1.80it/s]

Calculating Sentiment for:  scan


 16%|████████████▉                                                                     | 14/89 [00:14<00:37,  2.01it/s]

Calculating Sentiment for:  dontbother


 17%|█████████████▊                                                                    | 15/89 [00:14<00:30,  2.44it/s]

Calculating Sentiment for:  end


 18%|██████████████▋                                                                   | 16/89 [00:15<00:35,  2.03it/s]

Calculating Sentiment for:  hours


 19%|███████████████▋                                                                  | 17/89 [00:15<00:31,  2.29it/s]

Calculating Sentiment for:  realise


 20%|████████████████▌                                                                 | 18/89 [00:16<00:36,  1.97it/s]

Calculating Sentiment for:  youre


 21%|█████████████████▌                                                                | 19/89 [00:16<00:35,  2.00it/s]

Calculating Sentiment for:  person


 22%|██████████████████▍                                                               | 20/89 [00:17<00:30,  2.27it/s]

Calculating Sentiment for:  fair


 24%|███████████████████▎                                                              | 21/89 [00:17<00:28,  2.42it/s]

Calculating Sentiment for:  choice


 25%|████████████████████▎                                                             | 22/89 [00:17<00:25,  2.66it/s]

Calculating Sentiment for:  waste


 26%|█████████████████████▏                                                            | 23/89 [00:19<00:43,  1.53it/s]

Calculating Sentiment for:  time


 27%|██████████████████████                                                            | 24/89 [00:20<01:03,  1.02it/s]

Calculating Sentiment for:  nsf


 28%|███████████████████████                                                           | 25/89 [00:21<00:56,  1.13it/s]

Calculating Sentiment for:  staff


 29%|███████████████████████▉                                                          | 26/89 [00:25<01:59,  1.90s/it]

Calculating Sentiment for:  bad


 31%|█████████████████████████▊                                                        | 28/89 [00:27<01:19,  1.30s/it]

Calculating Sentiment for:  helpful
Calculating Sentiment for:  initiative


 33%|██████████████████████████▋                                                       | 29/89 [00:27<00:59,  1.01it/s]

Calculating Sentiment for:  help


 34%|███████████████████████████▋                                                      | 30/89 [00:28<00:48,  1.21it/s]

Calculating Sentiment for:  guards


 35%|████████████████████████████▌                                                     | 31/89 [00:28<00:44,  1.30it/s]

Calculating Sentiment for:  job


 37%|██████████████████████████████▍                                                   | 33/89 [00:29<00:27,  2.02it/s]

Calculating Sentiment for:  stuff
Calculating Sentiment for:  suppose


 39%|████████████████████████████████▏                                                 | 35/89 [00:30<00:23,  2.27it/s]

Calculating Sentiment for:  woman
Calculating Sentiment for:  keeps


 40%|█████████████████████████████████▏                                                | 36/89 [00:30<00:19,  2.69it/s]

Calculating Sentiment for:  tone


 42%|██████████████████████████████████                                                | 37/89 [00:31<00:27,  1.92it/s]

Calculating Sentiment for:  talks


 43%|███████████████████████████████████                                               | 38/89 [00:31<00:22,  2.32it/s]

Calculating Sentiment for:  mask


 44%|███████████████████████████████████▉                                              | 39/89 [00:32<00:22,  2.22it/s]

Calculating Sentiment for:  expressions


 45%|████████████████████████████████████▊                                             | 40/89 [00:32<00:19,  2.49it/s]

Calculating Sentiment for:  rude


 46%|█████████████████████████████████████▊                                            | 41/89 [00:32<00:18,  2.59it/s]

Calculating Sentiment for:  impatient


 47%|██████████████████████████████████████▋                                           | 42/89 [00:33<00:17,  2.63it/s]

Calculating Sentiment for:  times


 48%|███████████████████████████████████████▌                                          | 43/89 [00:33<00:21,  2.19it/s]

Calculating Sentiment for:  cmpb


 51%|█████████████████████████████████████████▍                                        | 45/89 [00:34<00:18,  2.40it/s]

Calculating Sentiment for:  sent
Calculating Sentiment for:  son


 52%|██████████████████████████████████████████▍                                       | 46/89 [00:35<00:28,  1.51it/s]

Calculating Sentiment for:  guard


 53%|███████████████████████████████████████████▎                                      | 47/89 [00:36<00:23,  1.80it/s]

Calculating Sentiment for:  instructions


 55%|█████████████████████████████████████████████▏                                    | 49/89 [00:36<00:15,  2.64it/s]

Calculating Sentiment for:  drive
Calculating Sentiment for:  train


 56%|██████████████████████████████████████████████                                    | 50/89 [00:36<00:13,  2.81it/s]

Calculating Sentiment for:  army


 57%|██████████████████████████████████████████████▉                                   | 51/89 [00:37<00:13,  2.81it/s]

Calculating Sentiment for:  directions


 58%|███████████████████████████████████████████████▉                                  | 52/89 [00:37<00:12,  3.01it/s]

Calculating Sentiment for:  process


 60%|████████████████████████████████████████████████▊                                 | 53/89 [00:37<00:13,  2.75it/s]

Calculating Sentiment for:  take


 61%|█████████████████████████████████████████████████▊                                | 54/89 [00:38<00:12,  2.73it/s]

Calculating Sentiment for:  ok


 62%|██████████████████████████████████████████████████▋                               | 55/89 [00:39<00:17,  2.00it/s]

Calculating Sentiment for:  tio


 63%|███████████████████████████████████████████████████▌                              | 56/89 [00:39<00:18,  1.78it/s]

Calculating Sentiment for:  vocationmo


 64%|████████████████████████████████████████████████████▌                             | 57/89 [00:39<00:14,  2.17it/s]

Calculating Sentiment for:  probably


 65%|█████████████████████████████████████████████████████▍                            | 58/89 [00:40<00:12,  2.45it/s]

Calculating Sentiment for:  others


 66%|██████████████████████████████████████████████████████▎                           | 59/89 [00:40<00:12,  2.41it/s]

Calculating Sentiment for:  conditions


 67%|███████████████████████████████████████████████████████▎                          | 60/89 [00:41<00:12,  2.27it/s]

Calculating Sentiment for:  fare


 69%|████████████████████████████████████████████████████████▏                         | 61/89 [00:41<00:10,  2.69it/s]

Calculating Sentiment for:  specialistletter


 70%|█████████████████████████████████████████████████████████                         | 62/89 [00:41<00:08,  3.02it/s]

Calculating Sentiment for:  ask


 71%|██████████████████████████████████████████████████████████                        | 63/89 [00:44<00:29,  1.13s/it]

Calculating Sentiment for:  question


 72%|██████████████████████████████████████████████████████████▉                       | 64/89 [00:45<00:23,  1.06it/s]

Calculating Sentiment for:  ignore


 73%|███████████████████████████████████████████████████████████▉                      | 65/89 [00:45<00:17,  1.34it/s]

Calculating Sentiment for:  nice


 74%|████████████████████████████████████████████████████████████▊                     | 66/89 [00:46<00:17,  1.31it/s]

Calculating Sentiment for:  friendly


 75%|█████████████████████████████████████████████████████████████▋                    | 67/89 [00:46<00:14,  1.54it/s]

Calculating Sentiment for:  great


 76%|██████████████████████████████████████████████████████████████▋                   | 68/89 [00:46<00:11,  1.85it/s]

Calculating Sentiment for:  medic


 78%|███████████████████████████████████████████████████████████████▌                  | 69/89 [00:47<00:12,  1.56it/s]

Calculating Sentiment for:  check


 79%|████████████████████████████████████████████████████████████████▍                 | 70/89 [00:48<00:10,  1.79it/s]

Calculating Sentiment for:  patience


 81%|██████████████████████████████████████████████████████████████████▎               | 72/89 [00:49<00:07,  2.13it/s]

Calculating Sentiment for:  bully
Calculating Sentiment for:  taiji


 82%|███████████████████████████████████████████████████████████████████▎              | 73/89 [00:49<00:06,  2.53it/s]

Calculating Sentiment for:  share


 83%|████████████████████████████████████████████████████████████████████▏             | 74/89 [00:49<00:06,  2.18it/s]

Calculating Sentiment for:  opinion


 85%|██████████████████████████████████████████████████████████████████████            | 76/89 [00:50<00:04,  2.87it/s]

Calculating Sentiment for:  varies
Calculating Sentiment for:  trip


 87%|██████████████████████████████████████████████████████████████████████▉           | 77/89 [00:50<00:04,  2.82it/s]

Calculating Sentiment for:  sessions


 88%|███████████████████████████████████████████████████████████████████████▊          | 78/89 [00:51<00:04,  2.25it/s]

Calculating Sentiment for:  inconvenientlocation


 89%|████████████████████████████████████████████████████████████████████████▊         | 79/89 [00:51<00:04,  2.30it/s]

Calculating Sentiment for:  officerattitude


 90%|█████████████████████████████████████████████████████████████████████████▋        | 80/89 [00:52<00:03,  2.51it/s]

Calculating Sentiment for:  use


 91%|██████████████████████████████████████████████████████████████████████████▋       | 81/89 [00:52<00:03,  2.26it/s]

Calculating Sentiment for:  cool


 92%|███████████████████████████████████████████████████████████████████████████▌      | 82/89 [00:52<00:02,  2.63it/s]

Calculating Sentiment for:  ziyuan


 93%|████████████████████████████████████████████████████████████████████████████▍     | 83/89 [00:53<00:02,  2.51it/s]

Calculating Sentiment for:  writer


 94%|█████████████████████████████████████████████████████████████████████████████▍    | 84/89 [00:54<00:02,  2.04it/s]

Calculating Sentiment for:  thing


 96%|██████████████████████████████████████████████████████████████████████████████▎   | 85/89 [00:54<00:02,  1.81it/s]

Calculating Sentiment for:  whats


 97%|███████████████████████████████████████████████████████████████████████████████▏  | 86/89 [00:55<00:01,  1.66it/s]

Calculating Sentiment for:  canteenb


 98%|████████████████████████████████████████████████████████████████████████████████▏ | 87/89 [00:55<00:00,  2.04it/s]

Calculating Sentiment for:  money


 99%|█████████████████████████████████████████████████████████████████████████████████ | 88/89 [00:55<00:00,  2.46it/s]

Calculating Sentiment for:  attitude


100%|██████████████████████████████████████████████████████████████████████████████████| 89/89 [00:56<00:00,  1.58it/s]


In [5]:
fin

Unnamed: 0,Avg_sent,Descriptors,Freq
staff,0.884058,"professional ,keep ,keep ,rude ,impatient ,exp...",11
people,0.891892,"recommended ,suggest ,threatens ,threatens ,th...",10
bad,0.913043,"its ,canteenb ,canteenb",8
reviews,0.894737,"suggest ,what ,give",6
place,0.903226,"clean ,experience ,waste ,check",6
...,...,...,...
train,1.000000,army,1
army,1.000000,train,1
directions,1.000000,seen,1
process,1.000000,expect,1


In [6]:
def get_tfidf_features(df, content_str = "Content", min_ = 2, max_ = 0.5, ngramrange = (1,2)):
    review_list = df[content_str].to_list()
    tfidf = TfidfVectorizer(min_df = min_, max_df = max_, ngram_range = ngramrange);
    features = tfidf.fit_transform(review_list);
    q = pd.DataFrame(features.todense(), columns=tfidf.get_feature_names())
    return list(q.columns)

def refine_features(originaldf, sentimentdf):
    tfidf_output = get_tfidf_features(originaldf)
    sentimentdf = sentimentdf.reset_index()
    ft_extract = set(sentimentdf['index']);
    tfidf_extract = set(tfidf_output)
    
    intersecting_features = ft_extract.intersection(tfidf_extract)
    
    return_df = sentimentdf
    return_df = return_df.loc[return_df['index'].isin(list(intersecting_features))]
    print("Number of extracted features:")
    print("Initial = ", len(ft_extract), " TFIDF = ", len(intersecting_features), " Final after intersection = ", return_df.shape[0])
    return return_df

In [7]:
refined = refine_features(rdr, fin)
refined

Number of extracted features:
Initial =  89  TFIDF =  31  Final after intersection =  31


Unnamed: 0,index,Avg_sent,Descriptors,Freq
0,staff,0.884058,"professional ,keep ,keep ,rude ,impatient ,exp...",11
1,people,0.891892,"recommended ,suggest ,threatens ,threatens ,th...",10
2,bad,0.913043,"its ,canteenb ,canteenb",8
3,reviews,0.894737,"suggest ,what ,give",6
4,place,0.903226,"clean ,experience ,waste ,check",6
5,time,0.892857,"takes ,takes ,travel ,travel",5
6,rude,1.0,staff,4
7,experience,0.9,"place ,had",4
8,nsf,0.9,"need ,cool",4
9,professional,0.916667,"staff ,medic",3


# 2. Word Embedding + Clustering

Do some cleaning first. From the set of refined features, we removed stop words, then stemmed them.

In [8]:
import collections
from sklearn.cluster import KMeans
from tqdm import tqdm
from gensim.models import Word2Vec 
from sklearn.feature_extraction.text import CountVectorizer
import seaborn as sns
import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.cluster import AffinityPropagation
from sklearn.manifold import TSNE
from sklearn.preprocessing import normalize
import spacy
from nltk.stem import PorterStemmer

In [9]:
uncleaned_words = refined['index']
uncleaned_words.head()

0      staff
1     people
2        bad
3    reviews
4      place
Name: index, dtype: object

In [10]:
nlp = spacy.load("en_core_web_sm") # to run on command prompt: python -m spacy download en_core_web_sm
words = [item for item in uncleaned_words if item not in nlp.Defaults.stop_words]
print("Words removed were: ", set(uncleaned_words).difference(set(words)))
print("From", len(uncleaned_words), "to", len(words))
words_df = pd.DataFrame(words)
words_df.columns = ['word']
words_df

Words removed were:  {'take', 'others'}
From 31 to 29


Unnamed: 0,word
0,staff
1,people
2,bad
3,reviews
4,place
5,time
6,rude
7,experience
8,nsf
9,professional


In [32]:
stemmed_words = []
ps = PorterStemmer()

for w in words:
    rootWord = ps.stem(w)
    if rootWord not in stemmed_words:
        stemmed_words.append(rootWord)
        
print("Number of words left after stemming was", len(stemmed_words))

Number of words left after stemming was 29


## 2.1 spaCy's Pretained Vectors + Affinity Propagation

In [13]:
def vectorize(text):
    """Get the SpaCy vector corresponding to a text"""
    return nlp(text).vector

X = np.stack(vectorize(word) for word in words)
X_normalised = normalize(np.stack(vectorize(word) for word in words))

affprop = AffinityPropagation()
affprop.fit(X)

word_array = np.array(words)

  if (await self.run_code(code, result,  async_=asy)):


In [14]:
for cluster_id in np.unique(affprop.labels_):
    exemplar = word_array[affprop.cluster_centers_indices_[cluster_id]]
    cluster = np.unique(word_array[np.nonzero(affprop.labels_==cluster_id)])
    cluster_str = ", ".join(cluster)
    print(" - *%s:* %s" % (exemplar, cluster_str))

 - *place:* attitude, cmpb, experience, nsf, patience, person, place, question, staff, time, tone
 - *check:* ask, check, help, rest, waste
 - *guards:* guards, hours, people, reviews
 - *nice:* bad, complete, friendly, good, great, kind, nice, professional, rude


# Evaluation

In [23]:
from sklearn import metrics

labels_true_df = pd.read_csv("../../output/evaluation/cluster_manual.csv")
labels_true_df

Unnamed: 0,Cluster,Elements
0,1,staff
1,1,person
2,1,personal
3,1,guards
4,1,doctors
...,...,...
192,148,whole
193,149,process
194,150,take
195,151,august


In [24]:
labels_true_df.columns = ["true_cluster", "word"]
labels_true_df = pd.merge(labels_true_df, words_df, on = "word", how = "inner")
labels_true_df

Unnamed: 0,true_cluster,word
0,1,staff
1,1,person
2,1,guards
3,3,place
4,4,time
5,4,hours
6,5,rude
7,5,friendly
8,5,kind
9,6,good


In [25]:
set(words_df['word']).difference(set(labels_true_df['word']))

set()

In [26]:
print(affprop.labels_)
labels_pred_df = pd.DataFrame({'pred_cluster': affprop.labels_})
labels_pred_df
pd.merge(labels_true_df, labels_pred_df, left_index = True, right_index = True)

[0 2 3 2 0 0 3 0 0 3 0 1 3 3 1 1 0 2 3 3 0 3 0 0 2 1 1 3 0]


Unnamed: 0,true_cluster,word,pred_cluster
0,1,staff,0
1,1,person,2
2,1,guards,3
3,3,place,2
4,4,time,0
5,4,hours,0
6,5,rude,3
7,5,friendly,0
8,5,kind,0
9,6,good,3


In [30]:
labels_true = list(labels_true_df['true_cluster'])

## Rand Index

In [31]:
metrics.rand_score(labels_true, affprop.labels_)

0.7167487684729064

In [34]:
metrics.adjusted_rand_score(labels_true, affprop.labels_)

-0.020635683994229003

## Mutual Information based scores

In [33]:
metrics.adjusted_mutual_info_score(labels_true, affprop.labels_)

-0.0435904889302693

## Silhouette score

In [36]:
metrics.silhouette_score(X, affprop.labels_, metric='euclidean')

0.21163464

## Calinski-Harabasz Index

In [37]:
metrics.calinski_harabasz_score(X, affprop.labels_)

8.084159232372821

## Davies Bouldin Index

In [39]:
metrics.davies_bouldin_score(X, affprop.labels_)

1.4980370150395048