In [2]:
from pprint import pprint

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import conlltags2tree, tree2conlltags

# NLTK data needed by the cells below. Without the first two, word_tokenize /
# pos_tag raise LookupError on a machine that has never fetched them.
# NOTE(review): resource ids are those of the classic NLTK 3.x data index;
# recent NLTK releases look for `punkt_tab` / `averaged_perceptron_tagger_eng`
# instead -- confirm against the installed version.
nltk.download('punkt')                       # sentence/word tokenizer models (word_tokenize)
nltk.download('averaged_perceptron_tagger')  # POS tagger model (pos_tag)
nltk.download('maxent_ne_chunker')           # named-entity chunker model (ne_chunk)
nltk.download('words')                       # word list used by the NE chunker

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\TzeMin\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\TzeMin\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


## NLTK - sample

In [3]:
def preprocess(sent):
    """Tokenise a piece of text and attach a part-of-speech tag to each token.

    Parameters
    ----------
    sent : str
        Raw text to process.

    Returns
    -------
    list
        ``(token, tag)`` pairs, exactly as returned by ``nltk.pos_tag`` for the
        tokens that ``nltk.word_tokenize`` produced.
    """
    tokens = nltk.word_tokenize(sent)
    return nltk.pos_tag(tokens)

# Sample text used as the running example in the following cells.
ex = (
    "4 weeks bmt, 5 weeks vocational training. "
    "Just relax. "
    "BMT will probably be the best part of your NS journey as a Pes C. "
    "After you're in your unit you'll miss kranji camp and the 4 hours "
    "per day you spend ironing bed and talking cock."
)
# Tokenise and POS-tag the sample; the bare name on the last line displays it.
sent = preprocess(ex)
sent

[('4', 'CD'),
 ('weeks', 'NNS'),
 ('bmt', 'NN'),
 (',', ','),
 ('5', 'CD'),
 ('weeks', 'NNS'),
 ('vocational', 'JJ'),
 ('training', 'NN'),
 ('.', '.'),
 ('Just', 'NNP'),
 ('relax', 'NN'),
 ('.', '.'),
 ('BMT', 'NNP'),
 ('will', 'MD'),
 ('probably', 'RB'),
 ('be', 'VB'),
 ('the', 'DT'),
 ('best', 'JJS'),
 ('part', 'NN'),
 ('of', 'IN'),
 ('your', 'PRP$'),
 ('NS', 'NNP'),
 ('journey', 'NN'),
 ('as', 'IN'),
 ('a', 'DT'),
 ('Pes', 'NNP'),
 ('C.', 'NNP'),
 ('After', 'IN'),
 ('you', 'PRP'),
 ("'re", 'VBP'),
 ('in', 'IN'),
 ('your', 'PRP$'),
 ('unit', 'NN'),
 ('you', 'PRP'),
 ("'ll", 'MD'),
 ('miss', 'VB'),
 ('kranji', 'FW'),
 ('camp', 'NN'),
 ('and', 'CC'),
 ('the', 'DT'),
 ('4', 'CD'),
 ('hours', 'NNS'),
 ('per', 'IN'),
 ('day', 'NN'),
 ('you', 'PRP'),
 ('spend', 'VBP'),
 ('ironing', 'VBG'),
 ('bed', 'NN'),
 ('and', 'CC'),
 ('talking', 'VBG'),
 ('cock', 'NN'),
 ('.', '.')]

In [5]:
# Chunking based on a self-defined chunk grammar: an NP is an optional
# determiner (DT), then any number of adjectives (JJ/JJR/JJS), then one token
# tagged NN. Only the NN tag is matched, so tokens tagged NNS or NNP
# (e.g. weeks/NNS, BMT/NNP) are left outside the NP chunks.
nounphrase = 'NP: {<DT>?<JJ|JJR|JJS>*<NN>}'
cp = nltk.RegexpParser(nounphrase)  # build a chunk parser from the grammar
cs = cp.parse(sent)  # `sent` is the POS-tagged token list returned by preprocess()
print(cs)

(S
  4/CD
  weeks/NNS
  (NP bmt/NN)
  ,/,
  5/CD
  weeks/NNS
  (NP vocational/JJ training/NN)
  ./.
  Just/NNP
  (NP relax/NN)
  ./.
  BMT/NNP
  will/MD
  probably/RB
  be/VB
  (NP the/DT best/JJS part/NN)
  of/IN
  your/PRP$
  NS/NNP
  (NP journey/NN)
  as/IN
  a/DT
  Pes/NNP
  C./NNP
  After/IN
  you/PRP
  're/VBP
  in/IN
  your/PRP$
  (NP unit/NN)
  you/PRP
  'll/MD
  miss/VB
  kranji/FW
  (NP camp/NN)
  and/CC
  the/DT
  4/CD
  hours/NNS
  per/IN
  (NP day/NN)
  you/PRP
  spend/VBP
  ironing/VBG
  (NP bed/NN)
  and/CC
  talking/VBG
  (NP cock/NN)
  ./.)


In [7]:
# Inside-outside-beginning (IOB) tagging: flatten the chunk tree `cs` into
# (word, POS tag, chunk tag) triples, where the chunk tag is B-NP for the first
# token of an NP chunk, I-NP for the following tokens, and O outside any chunk.
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

[('4', 'CD', 'O'),
 ('weeks', 'NNS', 'O'),
 ('bmt', 'NN', 'B-NP'),
 (',', ',', 'O'),
 ('5', 'CD', 'O'),
 ('weeks', 'NNS', 'O'),
 ('vocational', 'JJ', 'B-NP'),
 ('training', 'NN', 'I-NP'),
 ('.', '.', 'O'),
 ('Just', 'NNP', 'O'),
 ('relax', 'NN', 'B-NP'),
 ('.', '.', 'O'),
 ('BMT', 'NNP', 'O'),
 ('will', 'MD', 'O'),
 ('probably', 'RB', 'O'),
 ('be', 'VB', 'O'),
 ('the', 'DT', 'B-NP'),
 ('best', 'JJS', 'I-NP'),
 ('part', 'NN', 'I-NP'),
 ('of', 'IN', 'O'),
 ('your', 'PRP$', 'O'),
 ('NS', 'NNP', 'O'),
 ('journey', 'NN', 'B-NP'),
 ('as', 'IN', 'O'),
 ('a', 'DT', 'O'),
 ('Pes', 'NNP', 'O'),
 ('C.', 'NNP', 'O'),
 ('After', 'IN', 'O'),
 ('you', 'PRP', 'O'),
 ("'re", 'VBP', 'O'),
 ('in', 'IN', 'O'),
 ('your', 'PRP$', 'O'),
 ('unit', 'NN', 'B-NP'),
 ('you', 'PRP', 'O'),
 ("'ll", 'MD', 'O'),
 ('miss', 'VB', 'O'),
 ('kranji', 'FW', 'O'),
 ('camp', 'NN', 'B-NP'),
 ('and', 'CC', 'O'),
 ('the', 'DT', 'O'),
 ('4', 'CD', 'O'),
 ('hours', 'NNS', 'O'),
 ('per', 'IN', 'O'),
 ('day', 'NN', 'B-NP'),
 ('you'

In [14]:
# Named-entity chunking of the sample text: tokenise and POS-tag it via
# preprocess(), then let NLTK's NE chunker group tokens into entity subtrees.
ne_tree = nltk.ne_chunk(preprocess(ex))
print(ne_tree)

(S
  4/CD
  weeks/NNS
  bmt/NN
  ,/,
  5/CD
  weeks/NNS
  vocational/JJ
  training/NN
  ./.
  Just/NNP
  relax/NN
  ./.
  (ORGANIZATION BMT/NNP)
  will/MD
  probably/RB
  be/VB
  the/DT
  best/JJS
  part/NN
  of/IN
  your/PRP$
  NS/NNP
  journey/NN
  as/IN
  a/DT
  (ORGANIZATION Pes/NNP)
  C./NNP
  After/IN
  you/PRP
  're/VBP
  in/IN
  your/PRP$
  unit/NN
  you/PRP
  'll/MD
  miss/VB
  kranji/FW
  camp/NN
  and/CC
  the/DT
  4/CD
  hours/NNS
  per/IN
  day/NN
  you/PRP
  spend/VBP
  ironing/VBG
  bed/NN
  and/CC
  talking/VBG
  cock/NN
  ./.)


## spaCy - sample

In [1]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

In [16]:
ex = "4 weeks bmt, 5 weeks vocational training. Just relax. BMT will probably be the best part of your NS journey as a Pes C. After you're in your unit you'll miss kranji camp and the 4 hours per day you spend ironing bed and talking cock."
doc = nlp(ex)

# Entity-level tagging: one (text, label) pair per entity span found in `doc`.
pprint([(ent.text, ent.label_) for ent in doc.ents])

[('4 weeks', 'DATE'),
 ('5 weeks', 'DATE'),
 ('BMT', 'ORG'),
 ('the 4 hours', 'TIME')]


In [17]:
# Token-level IOB tagging (not BILUO): `ent_iob_` is 'B' for the first token of
# an entity, 'I' for the tokens that continue it and 'O' for tokens outside any
# entity -- there are no separate L/U codes. `ent_type_` is the entity label,
# or '' when the token is not part of an entity.
pprint([(token, token.ent_iob_, token.ent_type_) for token in doc])

[(4, 'B', 'DATE'),
 (weeks, 'I', 'DATE'),
 (bmt, 'O', ''),
 (,, 'O', ''),
 (5, 'B', 'DATE'),
 (weeks, 'I', 'DATE'),
 (vocational, 'O', ''),
 (training, 'O', ''),
 (., 'O', ''),
 (Just, 'O', ''),
 (relax, 'O', ''),
 (., 'O', ''),
 (BMT, 'B', 'ORG'),
 (will, 'O', ''),
 (probably, 'O', ''),
 (be, 'O', ''),
 (the, 'O', ''),
 (best, 'O', ''),
 (part, 'O', ''),
 (of, 'O', ''),
 (your, 'O', ''),
 (NS, 'O', ''),
 (journey, 'O', ''),
 (as, 'O', ''),
 (a, 'O', ''),
 (Pes, 'O', ''),
 (C., 'O', ''),
 (After, 'O', ''),
 (you, 'O', ''),
 ('re, 'O', ''),
 (in, 'O', ''),
 (your, 'O', ''),
 (unit, 'O', ''),
 (you, 'O', ''),
 ('ll, 'O', ''),
 (miss, 'O', ''),
 (kranji, 'O', ''),
 (camp, 'O', ''),
 (and, 'O', ''),
 (the, 'B', 'TIME'),
 (4, 'I', 'TIME'),
 (hours, 'I', 'TIME'),
 (per, 'O', ''),
 (day, 'O', ''),
 (you, 'O', ''),
 (spend, 'O', ''),
 (ironing, 'O', ''),
 (bed, 'O', ''),
 (and, 'O', ''),
 (talking, 'O', ''),
 (cock, 'O', ''),
 (., 'O', '')]


## spaCy - full corpus

In [41]:
# Read the scraped corpus into one string and run the spaCy pipeline over it.
# Newlines are deleted outright (not replaced by a space), so the last token of
# each line is fused with the first token of the next line before parsing.
with open('../../output/scraped-ns/corpus.txt', 'r', encoding='utf-8') as file:
    corpus = file.read().replace('\n', '')

posts = nlp(corpus)
len(posts.ents)  # number of entity spans found in the newline-stripped text

7334

In [42]:
# Read the scraped corpus with its line breaks left intact and parse it with
# spaCy; `posts` is (re)bound to this parse for the cells that follow.
with open('../../output/scraped-ns/corpus.txt', 'r', encoding='utf-8') as file:
    corpus2 = file.read()

posts = nlp(corpus2)
len(posts.ents)  # number of entity spans found in the unmodified text

7491

In [23]:
# How often each entity label occurs among the entities of the parsed corpus.
labels = [ent.label_ for ent in posts.ents]
Counter(labels)

Counter({'DATE': 1650,
         'TIME': 120,
         'ORG': 2774,
         'PERSON': 317,
         'CARDINAL': 1424,
         'PERCENT': 21,
         'MONEY': 50,
         'FAC': 45,
         'ORDINAL': 439,
         'GPE': 276,
         'NORP': 52,
         'WORK_OF_ART': 28,
         'QUANTITY': 61,
         'EVENT': 13,
         'LOC': 26,
         'PRODUCT': 37,
         'LANGUAGE': 1})

In [24]:
# The three entity texts that occur most often in the parsed corpus.
items = [ent.text for ent in posts.ents]
Counter(items).most_common(3)

[('IPPT', 875), ('RT', 363), ('10', 192)]

In [26]:
# Render the sample text inline with its recognised entities highlighted.
displacy.render(nlp(ex), style='ent', jupyter=True)

In [2]:
# Entity rendering for a single hand-picked review sentence.
displacy.render(
    nlp("staff at medical screening station 5 for height and weight is extremely rude and unfriendly"),
    style='ent',
    jupyter=True,
)

In [3]:
# Entity rendering for a second hand-picked review snippet.
displacy.render(
    nlp("Very kind people inside CMPD medical check up. Was constantly greeted with smile and patience."),
    style='ent',
    jupyter=True,
)



In [31]:
# Draw the dependency parse of the sample text; the `distance` setting is
# handed to displaCy through its `options` dict.
displacy.render(
    nlp(ex),
    style='dep',
    jupyter=True,
    options={'distance': 120},
)

In [32]:
# Lemmatise the sample: list (surface form, coarse POS, lemma) for every token
# that is neither a stop word nor punctuation.
[
    (tok.orth_, tok.pos_, tok.lemma_)
    for tok in nlp(ex)
    if not (tok.is_stop or tok.pos_ == 'PUNCT')
]

[('4', 'NUM', '4'),
 ('weeks', 'NOUN', 'week'),
 ('bmt', 'NOUN', 'bmt'),
 ('5', 'NUM', '5'),
 ('weeks', 'NOUN', 'week'),
 ('vocational', 'ADJ', 'vocational'),
 ('training', 'NOUN', 'training'),
 ('relax', 'VERB', 'relax'),
 ('BMT', 'PROPN', 'BMT'),
 ('probably', 'ADV', 'probably'),
 ('best', 'ADJ', 'good'),
 ('NS', 'PROPN', 'NS'),
 ('journey', 'NOUN', 'journey'),
 ('Pes', 'PROPN', 'Pes'),
 ('C.', 'PROPN', 'C.'),
 ('unit', 'NOUN', 'unit'),
 ('miss', 'VERB', 'miss'),
 ('kranji', 'NOUN', 'kranji'),
 ('camp', 'NOUN', 'camp'),
 ('4', 'NUM', '4'),
 ('hours', 'NOUN', 'hour'),
 ('day', 'NOUN', 'day'),
 ('spend', 'VERB', 'spend'),
 ('ironing', 'VERB', 'iron'),
 ('bed', 'NOUN', 'bed'),
 ('talking', 'VERB', 'talk'),
 ('cock', 'NOUN', 'cock')]

In [33]:
# Map each entity's text to its label; if the same text occurs more than once,
# the label of its last occurrence is the one kept.
{str(ent): ent.label_ for ent in nlp(ex).ents}

{'4 weeks': 'DATE', '5 weeks': 'DATE', 'BMT': 'ORG', 'the 4 hours': 'TIME'}

In [46]:
# Materialise the sentence spans of the parsed corpus, then concatenate the
# string form of sentences 1..50 (index 0 is skipped) with no separator.
post = list(posts.sents)
fordisplay = ''.join(map(str, post[1:51]))
print(fordisplay)

Recommended⭐⭐⭐⭐⭐
Very patient kind respectful.. very smooth throughout the Medical Check-up."
"Unlike what the other reviews may suggest, people here were adequately friendly(like anywhere else in sg).The place is really clean and efficient"
The place overall is fine.Good experience.Just a tip if any of u going for medical checkup.DONT GOLATE OR ELSE U have to come back other day to complete the rest.
"I went there for my NS Checkup today 29 August 8am...
Here is what I went through:

Gate Entrance: Security Check & Counter,
Take a Sticker Pass and just WALK THROUGH the gate...Don't be like me...I stood there thinking I can scan the …"
"Don't bother showing up at the timing assigned to you, you'll end waiting for more than 2 hours only to realise that you're the last person in line.Doctors don't really care which is fair because they don't really have a choice but to be there.This place is a complete waste of space and time."
just here to read through the NSF reviews LOL
Takes a extrem

In [47]:
# Re-parse the concatenated excerpt and render its entities inline.
displacy.render(nlp(fordisplay), style='ent', jupyter=True)