In [36]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from nltk.stem.snowball import SnowballStemmer
import pandas as pd

In [37]:
df = pd.read_csv('train.csv',sep=';')

In [38]:
stemmer = SnowballStemmer("english")

In [39]:
df.head()

Unnamed: 0,Word,POS,Tag
0,President,NNP,B-per
1,Karzai,NNP,I-per
2,thanked,VBD,O
3,his,PRP$,O
4,allies,NNS,O


In [40]:
def preprocess(sent):
    stemmed = []
    for word in sent:
        stemmed.append(stemmer.stem(word))
    sent = nltk.pos_tag(stemmed)
    return sent

In [41]:
sent = preprocess(df["Word"].values)
sent

[('presid', 'NN'),
 ('karzai', 'NN'),
 ('thank', 'VBD'),
 ('his', 'PRP$'),
 ('alli', 'NN'),
 ('for', 'IN'),
 ('their', 'PRP$'),
 ('help', 'NN'),
 ('in', 'IN'),
 ('battl', 'JJ'),
 ('terror', 'NN'),
 ('.', '.'),
 ('the', 'DT'),
 ('command', 'NN'),
 ('of', 'IN'),
 ('nato', 'NN'),
 ("'s", 'POS'),
 ('afghan', 'NN'),
 ('forc', 'NN'),
 (',', ','),
 ('british', 'JJ'),
 ('general', 'JJ'),
 ('david', 'NN'),
 ('richard', 'NN'),
 (',', ','),
 ('said', 'VBD'),
 ('the', 'DT'),
 ('uniti', 'NN'),
 ('of', 'IN'),
 ('command', 'NN'),
 ('the', 'DT'),
 ('transfer', 'NN'),
 ('brought', 'NN'),
 ('will', 'MD'),
 ('enhanc', 'VB'),
 ('the', 'DT'),
 ('effect', 'NN'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('overal', 'JJ'),
 ('oper', 'NN'),
 ('.', '.'),
 ('refer', 'NN'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('fight', 'NN'),
 ('in', 'IN'),
 ('southern', 'JJ'),
 ('afghanistan', 'JJ'),
 ('sinc', 'NN'),
 ('nato', 'NNS'),
 ('took', 'VBD'),
 ('command', 'NN'),
 ('there', 'RB'),
 ('in', 'IN'),
 ('juli', 'NN'),
 (',', ','),
 ('richa

In [42]:
grammar = r"""
  NP: {<DT|JJ|NN.*>+}          # Chunk sequences of DT, JJ, NN
  PP: {<IN><NP>}               # Chunk prepositions followed by NP
  VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
  CLAUSE: {<NP><VP>}           # Chunk NP, VP
  """

In [43]:
cp = nltk.RegexpParser(grammar)
cs = cp.parse(sent)
print(cs)

(S
  (NP presid/NN karzai/NN)
  thank/VBD
  his/PRP$
  (NP alli/NN)
  for/IN
  their/PRP$
  (NP help/NN)
  (PP in/IN (NP battl/JJ terror/NN))
  ./.
  (NP the/DT command/NN)
  (PP of/IN (NP nato/NN))
  's/POS
  (NP afghan/NN forc/NN)
  ,/,
  (NP british/JJ general/JJ david/NN richard/NN)
  ,/,
  said/VBD
  (NP the/DT uniti/NN)
  (PP of/IN (NP command/NN the/DT transfer/NN brought/NN))
  will/MD
  enhanc/VB
  (NP the/DT effect/NN)
  (PP of/IN (NP the/DT overal/JJ oper/NN))
  ./.
  (NP refer/NN)
  to/TO
  (NP the/DT fight/NN)
  (PP in/IN (NP southern/JJ afghanistan/JJ sinc/NN nato/NNS))
  took/VBD
  (NP command/NN)
  there/RB
  (PP in/IN (NP juli/NN))
  ,/,
  (NP richard/NN)
  said/VBD
  (NP the/DT nato/NN forc/NN)
  has/VBZ
  shown/VBN
  it/PRP
  resolv/VBZ
  to/TO
  meet/VB
  (NP the/DT challeng/NN)
  of/IN
  it/PRP
  (NP expand/DT mission/NN)
  ./.
  (NP richard/NN)
  was/VBD
  promot/VBN
  to/TO
  (NP four-star/JJ general/JJ hour/NN)
  (PP befor/IN (NP the/DT command/NN chang/NN))
  .

In [44]:
stemmed,postags  = zip(*sent)
df["Word"] = stemmed

In [45]:
df["Tag"] = df["Tag"].replace(
    {"B-per":"per",
     "I-per":"per",
          "B-event":"event",
          "I-event":"event",
          "B-geo":"geo",
          "I-geo":"geo",
          "B-gpe":"gpe",
          "I-gpe":"gpe",
          "B-obj":"obj",
          "I-obj":"obj",
          "B-org":"org",
          "I-org":"org",
          "B-time":"time",
          "I-time":"time"
    }
)

In [46]:
df.head()

Unnamed: 0,Word,POS,Tag
0,presid,NNP,per
1,karzai,NNP,per
2,thank,VBD,O
3,his,PRP$,O
4,alli,NNS,O


In [47]:
df = df[df.Tag != "O"]

In [48]:
df = df[(df.POS == "O") | (df.POS == "NN") | (df.POS == "NNP") | (df.POS == "NNS") | (df.POS == "NNPS") |
        (df.POS == "CD") |
        (df.POS == "JJ") | (df.POS == "JJR") | (df.POS == "JJS")]

In [58]:
df.head()

Unnamed: 0,Word,POS,Tag
0,presid,NNP,per
1,karzai,NNP,per
15,nato,NNP,org
17,afghan,JJ,gpe
20,british,NNP,org


In [140]:
df_probab = df.groupby("Word").Tag.value_counts()

In [171]:
for index, value in df_probab.items():
    if index[0] == "nato":
        print(index, value)

('nato', 'org') 87
('nato', 'geo') 1


In [None]:
for s in cs.subtrees(lambda t: t.label() == "NP"):
    for word, pos in s.leaves():
        print(df_probab[])
        