In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [3]:
# Sample job-requirement text; it is the shared input for the NLTK and spaCy cells in this notebook.
# NOTE(review): the two sentences are separated by a double space, which shows up as a
# standalone whitespace token in the recorded spaCy per-token output — confirm this is intended.
ex = "Bachelor's Degree from an accredited college or university in Engineering, Computer Science, Information Systems, Business, or other related Discipline.  Master's degree preferred."

Using NLTK Word Tokenization and POS Tagging

In [3]:
def preprocess(sent):
    """Tokenize a piece of text and tag every token with its part of speech.

    Args:
        sent: Raw text to process.

    Returns:
        The result of ``nltk.pos_tag`` applied to the word tokens of ``sent``
        (a list of ``(token, tag)`` pairs).
    """
    tokens = nltk.word_tokenize(sent)
    return nltk.pos_tag(tokens)

In [4]:
# Tokenize and POS-tag the sample text. The tagged tokens in `sent` are reused as the
# input to the RegexpParser chunking cell.
sent = preprocess(ex)
sent

[('Bachelor', 'NNP'),
 ("'s", 'POS'),
 ('Degree', 'NNP'),
 ('from', 'IN'),
 ('an', 'DT'),
 ('accredited', 'JJ'),
 ('college', 'NN'),
 ('or', 'CC'),
 ('university', 'NN'),
 ('in', 'IN'),
 ('Engineering', 'NNP'),
 (',', ','),
 ('Computer', 'NNP'),
 ('Science', 'NNP'),
 (',', ','),
 ('Information', 'NNP'),
 ('Systems', 'NNP'),
 (',', ','),
 ('Business', 'NNP'),
 (',', ','),
 ('or', 'CC'),
 ('other', 'JJ'),
 ('related', 'JJ'),
 ('Discipline', 'NNP'),
 ('.', '.'),
 ('Master', 'NNP'),
 ("'s", 'POS'),
 ('degree', 'NN'),
 ('preferred', 'VBD'),
 ('.', '.')]

Using NLTK RegexpParser

In [5]:
# Chunk grammar: an NP is an optional determiner (DT), followed by any number of
# adjectives (JJ), followed by a token tagged NN.
# Tokens tagged NNP are not covered by this rule, so in the printed tree they remain
# outside any NP chunk.
pattern = 'NP: {<DT>?<JJ>*<NN>}'
cp = nltk.RegexpParser(pattern)
# `sent` is the POS-tagged token list produced by preprocess(ex).
cs = cp.parse(sent)
print(cs)

(S
  Bachelor/NNP
  's/POS
  Degree/NNP
  from/IN
  (NP an/DT accredited/JJ college/NN)
  or/CC
  (NP university/NN)
  in/IN
  Engineering/NNP
  ,/,
  Computer/NNP
  Science/NNP
  ,/,
  Information/NNP
  Systems/NNP
  ,/,
  Business/NNP
  ,/,
  or/CC
  other/JJ
  related/JJ
  Discipline/NNP
  ./.
  Master/NNP
  's/POS
  (NP degree/NN)
  preferred/VBD
  ./.)


Using NLTK ne_chunk

In [7]:
# Named-entity chunking over the POS-tagged tokens of the sample text.
# Reuses the notebook's `preprocess` helper (tokenize + POS-tag) instead of repeating
# the same pipeline inline, so every NLTK cell shares one preprocessing path.
ne_tree = nltk.ne_chunk(preprocess(ex))
print(ne_tree)

(S
  (GPE Bachelor/NNP)
  's/POS
  Degree/NNP
  from/IN
  an/DT
  accredited/JJ
  college/NN
  or/CC
  university/NN
  in/IN
  (GPE Engineering/NNP)
  ,/,
  (ORGANIZATION Computer/NNP Science/NNP)
  ,/,
  (ORGANIZATION Information/NNP Systems/NNP)
  ,/,
  (GPE Business/NNP)
  ,/,
  or/CC
  other/JJ
  related/JJ
  Discipline/NNP
  ./.
  (PERSON Master/NNP)
  's/POS
  degree/NN
  preferred/VBD
  ./.)


Using spaCy

In [11]:
import spacy
from spacy import displacy
from collections import Counter
# Load the spaCy pipeline used by the entity-extraction cells.
# NOTE(review): "en_core_sci_lg" appears to be a scispaCy (scientific/biomedical) model that
# has to be installed separately — confirm it is the intended model for job-description text.
# Every entity in the recorded output carries only the generic label 'ENTITY'.
nlp = spacy.load("en_core_sci_lg")

In [12]:
# Run the spaCy pipeline on the sample text, then list every detected entity span
# together with its label.
doc = nlp(ex)
print([(ent.text, ent.label_) for ent in doc.ents])

[("Bachelor's", 'ENTITY'), ('Degree', 'ENTITY'), ('accredited', 'ENTITY'), ('college', 'ENTITY'), ('university', 'ENTITY'), ('Engineering', 'ENTITY'), ('Computer Science', 'ENTITY'), ('Information Systems', 'ENTITY'), ('Business', 'ENTITY'), ('Discipline', 'ENTITY'), ("Master's", 'ENTITY'), ('degree', 'ENTITY')]


In [13]:
# Per-token view: the token itself, its IOB entity-boundary code, and its entity type
# (empty string for tokens outside any entity).
print([(token, token.ent_iob_, token.ent_type_) for token in doc])

[(Bachelor, 'B', 'ENTITY'), ('s, 'I', 'ENTITY'), (Degree, 'B', 'ENTITY'), (from, 'O', ''), (an, 'O', ''), (accredited, 'B', 'ENTITY'), (college, 'B', 'ENTITY'), (or, 'O', ''), (university, 'B', 'ENTITY'), (in, 'O', ''), (Engineering, 'B', 'ENTITY'), (,, 'O', ''), (Computer, 'B', 'ENTITY'), (Science, 'I', 'ENTITY'), (,, 'O', ''), (Information, 'B', 'ENTITY'), (Systems, 'I', 'ENTITY'), (,, 'O', ''), (Business, 'B', 'ENTITY'), (,, 'O', ''), (or, 'O', ''), (other, 'O', ''), (related, 'O', ''), (Discipline, 'B', 'ENTITY'), (., 'O', ''), ( , 'O', ''), (Master, 'B', 'ENTITY'), ('s, 'I', 'ENTITY'), (degree, 'B', 'ENTITY'), (preferred, 'O', ''), (., 'O', '')]
