In [1]:
import pandas as pd

posts = pd.read_csv("../ScrapedOutput/nationalservicesg_posts.csv")
comments = pd.read_csv("../ScrapedOutput/nationalservicesg_comments.csv")

# Preprocessing

In [2]:
# Combine the two datasets into one for feature extraction
posts[posts.body == "[removed]"].body = ""
posts[posts.body == "[deleted]"].body = ""
posts["title1"] = posts["title"].str.ljust(len(posts["title"]) + 1, " ")
posts["Content"] = posts["title1"] + posts["body"]
posts = posts.filter(items = ["Content"])
posts

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


Unnamed: 0,Content
0,Too many ailments? Should I RSI or 'appease' m...
1,upcoming re BMT FFI question ...
2,Anyone here has visited S1 branch in Pasir Len...
3,PES for severely underweight ...
4,"PES E1, but I can't swim. ..."
...,...
95,Countdown time to ord period ...
96,Anyone here went to SPF OCT? ...
97,NA Exit Permit/ Deferment ...
98,Recruits having their first water parade ...


In [3]:
comments[comments.body == "[removed]"].body = ""
comments = comments.filter(items = ["body"]).rename(columns = {"body": "Content"})
comments

Unnamed: 0,Content
0,"Just keep on going to the mo. Take it from me,..."
1,All legit means just go and RSI ba. Unless u w...
2,"I mean, if you’re really worried about a bad i..."
3,"after reading ur post, I think u need to file ..."
4,It's not your fault that your superiors are ig...
...,...
799,"Incredible, I’m at 173, please teach me"
800,You can call yhe NS hotline and ask. But most ...
801,[https://www.cmpb.gov.sg/web/portal/cmpb/home/...
802,You can opt to come back every 3 months I thin...


In [4]:
textdata = posts.append(comments, ignore_index = True)
textdata

Unnamed: 0,Content
0,Too many ailments? Should I RSI or 'appease' m...
1,upcoming re BMT FFI question ...
2,Anyone here has visited S1 branch in Pasir Len...
3,PES for severely underweight ...
4,"PES E1, but I can't swim. ..."
...,...
899,"Incredible, I’m at 173, please teach me"
900,You can call yhe NS hotline and ask. But most ...
901,[https://www.cmpb.gov.sg/web/portal/cmpb/home/...
902,You can opt to come back every 3 months I thin...


# Feature Extraction & Sentiment Analysis

In [13]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer 
import stanza

In [12]:
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#stanza.download('en') # download English model

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.2.0.json: 128kB [00:00, 1.32MB/s]
2021-03-07 17:31:49 INFO: Downloading default packages for language: en (English)...
2021-03-07 17:31:50 INFO: File exists: C:\Users\TzeMin\stanza_resources\en\default.zip.
2021-03-07 17:31:54 INFO: Finished downloading models and saved to C:\Users\TzeMin\stanza_resources.


In [8]:
def feature_extraction(txt, nlp):
    try:
        txt = txt.lower()
    except:
        pass

    sentList = nltk.sent_tokenize(txt)

    retlist = []
    
    for line in sentList:
        txt_list = nltk.word_tokenize(line)
        taggedList = nltk.pos_tag(txt_list)
        
        newwordList = []
        flag = 0
        for i in range(0, len(taggedList)-1):
            if(taggedList[i][1] == "NN" and taggedList[i+1][1] == "NN"):
                newwordList.append(taggedList[i][0] + taggedList[i+1][0])
                flag = 1
            else:
                if(flag == 1):
                    flag = 0
                    continue
                newwordList.append(taggedList[i][0])
                if(i == len(taggedList)-2):
                    newwordList.append(taggedList[i+1][0])
        finaltxt = ' '.join(word for word in newwordList)
    
    
        stop_words = set(stopwords.words('english'))
        new_txt_list = nltk.word_tokenize(finaltxt)
        wordsList = [w for w in new_txt_list if not w in stop_words]
        taggedList = nltk.pos_tag(wordsList)
        
        doc = nlp(finaltxt)
        dep_node = []
        try:
            for dep_edge in doc.sentences[0].dependencies:
                dep_node.append([dep_edge[2].text, dep_edge[0].id, dep_edge[1]])
            for i in range(0, len(dep_node)):
                if (int(dep_node[i][1]) != 0):
                    dep_node[i][1] = newwordList[(int(dep_node[i][1]) - 1)]
        except:
            pass;
        
        #print(dep_node)
        
        featureList = []
        categories = []
        for i in taggedList:
            if(i[1]=='JJ' or i[1]=='NN' or i[1]=='JJR' or i[1]=='NNS' or i[1]=='RB'):
                featureList.append(list(i))
                categories.append(i[0])
        #print(featureList)
        #print(categories)
        
        
        fcluster = []
        for i in featureList:
            filist = []
            for j in dep_node:
                if((j[0]==i[0] or j[1]==i[0]) and (j[2] in ["nsubj", "acl:relcl", "obj", "dobj", "agent", "advmod", "amod", "neg", "prep_of", "acomp", "xcomp", "compound"])):
                    if(j[0]==i[0]):
                        filist.append(j[1])
                    else:
                        filist.append(j[0])
            fcluster.append([i[0], filist])
        print(fcluster) 
        
        # Remove all features with no sentiment word:
        
        retlist.append(fcluster)
    return retlist;

def do_extraction(df, nlp, content_str = "Content"):
    idx = 0;
    review_list = df[content_str].to_list()
    feat_count = dict()
    feat_sent = dict()
    #nlp = stanza.Pipeline('en')
    
    # Replace "" with nan's for removal
    df[content_str].replace('', np.nan, inplace=True)
    df.dropna(subset=[content_str], inplace=True)
    print(" Processing : " , df.shape[0], "rows of data")
    for review in review_list:
        print("Review Number : ", idx);
        idx += 1;
        if idx >= df.shape[0]:
            break;
        try:
            output = feature_extraction(review, nlp);
        except:
            pass;
        for sent in output:
            for pair in sent:
                print(pair)
                if pair[0] in feat_sent:
                    if pair[1] is not None:
                        flist = feat_sent[pair[0]]
                        if isinstance(pair[1], list):
                            for i in pair[1]:
                                flist.append(i)
                        else:
                            flist.append(pair[1])
                        feat_sent[pair[0]] = flist;
                else:
                    if pair[1] is not None:
                        flist = pair[1]
                    else:
                        flist = list()
                    feat_sent[pair[0]] = flist;
                
                if pair[0] in feat_count:
                    feat_count[pair[0]] = feat_count[pair[0]] + 1;
                else:
                    feat_count[pair[0]] = 1

    #print(feat_count);
    return feat_count, feat_sent;

def get_sentiment(a, b, nlp):

    sentiment_score = dict()

    # Delete features with no descriptors
    cob = b.copy()
    for feat in cob.keys():
        #print(cob[feat])
        if cob[feat] == []:
            del b[feat]

    # Run pre-built sentiment score and take avg of all descriptors
    for f in b.keys():
        #print(f);
        ssum = 0;
        for g in b[f]:
            try:
                doc = nlp(g);

                for i in doc.sentences:

                        #print(i.sentiment)
                        ssum += i.sentiment;
            except:
                pass;

        sentiment_score[f] = ssum / len(b[f])

        adf = pd.DataFrame.from_dict(a, orient='index', columns=['Freq'])
    adf.sort_values(by="Freq", ascending=False, inplace = True)



    avg_sent = pd.DataFrame.from_dict(sentiment_score, orient='index', columns=["Avg_sent"])

    final_sent = avg_sent.merge(adf, left_index=True, right_index=True)
    final_sent.sort_values(by="Freq", ascending=False, inplace=True)
    return final_sent;

In [9]:
rdr = textdata.head(50)
nlp = stanza.Pipeline('en')

2021-03-07 16:18:38 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| sentiment | sstplus   |
| ner       | ontonotes |

2021-03-07 16:18:38 INFO: Use device: cpu
2021-03-07 16:18:38 INFO: Loading: tokenize
2021-03-07 16:18:38 INFO: Loading: pos
2021-03-07 16:18:39 INFO: Loading: lemma
2021-03-07 16:18:39 INFO: Loading: depparse
2021-03-07 16:18:39 INFO: Loading: sentiment
2021-03-07 16:18:40 INFO: Loading: ner
2021-03-07 16:18:41 INFO: Done loading processors!


In [11]:
a, b = do_extraction(rdr, nlp)
final_sent = get_sentiment(a, b, nlp)
final_sent

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


 Processing :  44 rows of data
Review Number :  0
[['many', ['too', 'ailments']], ['ailments', ['many']]]
[['rsi', ['i']], ['superiors', ["'"]]]
[['currently', ['ma']], ['periodic', ['ma']], ['leg', ['injury']], ['injury', ['leg']], ['bmt', []]]
[['weeks', []], ['ago', ['rsi-ed']], ['rsi-ed', ['ago', 'i']], ['sharp', ['tinglingpain']], ['tinglingpain', ['sharp']], ['elbows', ['diagnosed']], ['nerveimpingement', []], ['referred', ['me']], ['specialist', []]]
[['weeks', []], ['lower', ['developing']], ['back', ['ache']], ['ache', ['back', 'developing']], ['well', []], ['record', []], ['legitimate', ['these']]]
[['mo', []], ['specialists', []], ['back', ['ache']], ['ache', ['less', 'back']]]
[['rsi', ['i', 'again']], ["'too", []], ['much', []]]
[['superiors', []], ['already', ['seeing']], ['glasses', ['tinted']], ['sickly', []]]
['many', ['too', 'ailments']]
['ailments', ['many']]
['rsi', ['i']]
['superiors', ["'"]]
['currently', ['ma']]
['periodic', ['ma']]
['leg', ['injury']]
['injury',

[['kranji', ['camp']], ['camp', ['kranji']]]
['extra', ['baggage']]
['baggage', ['extra']]
['better', ['more', 'the']]
['extra', ['things']]
['recommended', ['things']]
['things', ['extra', 'recommended', 'pack']]
['pack', ['i', 'things']]
['enlisting', []]
['kranji', ['camp']]
['camp', ['kranji']]
Review Number :  11
[['miss', ['i', 'boyfriend']], ['boyfriend', ['miss']], ['far', ['so', 'surviving']], ['[', []], [']', []]]
['miss', ['i', 'boyfriend']]
['boyfriend', ['miss']]
['far', ['so', 'surviving']]
['[', []]
[']', []]
Review Number :  12
[['cantconvert', ['convertlicense']], ['convertlicense', ['cantconvert']], ['anymore', ['way']], ['way', ['anymore', 'fastest']], ['get', ['license']], ['civilian', ['license']], ['license', ['civilian', 'get']]]
[['way', ['cheapest', 'whats']], ['reach', ['test']], ['tp', ['test']], ['test', ['tp', 'reach']], ['pass', []]]
[['armylicense', ['got']], ['stuff', []], ['concept', ['fresh']], ['fresh', ['concept']], ['head', []], ['im', []], ['theres

[['specialist', ['memo']], ['memo', ['specialist', 'change']], ['advise', ['change']], ['change', ['advise', 'memo']], ['camp', []], ['excusestay-in', []]]
[['[', []], ['serious', ['replies']], ['replies', ['serious']], [']', []], ['little', ['background']], ['background', ['little']], ['diagnosed', []], ['mental', ['healthcondition']], ['healthcondition', ['mental']], ['likely', ['downpes']], ['soon', ['downpes']]]
[['insomnia', []], ['really', ['bad']], ['bad', ['it', 'really']], ['especially', ['camp']], ['camp', ['especially']]]
[['psych', []], ['excusestay', ['get']]]
[['however', ['stay']], ['stay', ['however', 'i']], ['side', ['other']], ['island', []], ['camp', []], ['get', []], ['camp', []], ['public', ['transportation']], ['transportation', ['public']]]
[['manage', ['i', 'excuse']], ['excuse', ['manage', 'stay']], ['everyday', ['spend']], ['travelling', ['just']], ['confident', ['i', 'not']], ['get', ['i', 'rest']], ['enough', ['rest']], ['rest', ['enough', 'get']], ['formati

[['however', ['persistent']], ['knee', ['injury']], ['injury', ['knee', 'persistent']], ['still', ['persistent']], ['persistent', ['however', 'injury', 'still']], ['report', ['like', 'sick', 'outside']], ['sick', ['report']], ['rso', []]]
[['book', ['have', 'in']], ['sunday', []], ['polyclinics', []], ['’', []], ['sure', ['i', 'not']], ['gp', ['clinics']], ['clinics', ['gp']], ['medical', ['practictioners']], ['practictioners', ['medical', 'certified']], ['authority', ['has']], ['mc', ['give']], ['stay', ['give']], ['home', []], ['recover', ['me']]]
[['commanders', ['helpful']], ['govnt', ['hospitals/polyclinics']], ['hospitals/polyclinics', ['govnt']], ['general', ['rso']], ['guide', []], ['rso', ['general']]]
[['’', []], ['aware', ['i']], ['home', []], ['update', ['commanders']], ['commanders', ['update']], ['movements', []]]
[['anyone', ['advice']], ['advice', ['anyone', 'me']], ['practictioners', ['certified']], ['gp', []], ['consult', []], ['rso', []], ['case', ['isnt']], ['isnt',

Unnamed: 0,Avg_sent,Freq
pes,1.166667,13
get,1.150000,12
camp,1.000000,10
anyone,1.000000,8
really,0.666667,6
...,...,...
assaultpack,1.000000,1
ocs,1.000000,1
dump-in,1.000000,1
time,1.000000,1
