In [1]:
import json
from collections import Counter
from pprint import pprint

import nltk
import pandas as pd
import requests
import spacy
from nltk.chunk import conlltags2tree, tree2conlltags
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from spacy import displacy

In [2]:
# Read the word-problem JSON file into a pandas Series (typ='series').
df = pd.read_json('word_problem_challenge_data.json', typ='series')

In [3]:
# Wrap the loaded Series in a single-column DataFrame named 'problem'.
# The isinstance guard keeps this cell re-runnable: after the first run `df`
# is already a DataFrame, which has no `to_frame` method, so executing the
# unguarded conversion a second time raised AttributeError.
if isinstance(df, pd.Series):
    df = df.to_frame('problem')
df.head()

Unnamed: 0,problem
0,Bryan took a look at his books as well . If Br...
1,"For the fifth grade play , the chairs have bee..."
2,There are 41 short trees and 44 tall trees cur...
3,Conner has 25000 dollars in his bank account ....
4,There are 34 dogwood trees currently in the pa...


In [46]:
# Use the word problem stored under index label 10 as the working example.
sentence = df.loc[10, 'problem']
sentence

'Enrique puts 12 % of his monthly paycheck in an IRA . If he invests 72 dollars in his IRA , how much was his paycheck ?'

In [47]:
def preprocess(sent):
    """Tokenize a raw sentence and tag every token with its part of speech.

    Args:
        sent: The text to process, as a single string.

    Returns:
        The result of ``nltk.pos_tag`` applied to the ``nltk.word_tokenize``
        tokens of ``sent`` (a list of ``(token, tag)`` pairs).
    """
    tokens = nltk.word_tokenize(sent)
    return nltk.pos_tag(tokens)

In [48]:
# POS-tag the example problem; `sent` is the list of (token, tag) pairs that
# the chunk parser below consumes.
sent = preprocess(sentence)
sent

[('Enrique', 'NNP'),
 ('puts', 'VBZ'),
 ('12', 'CD'),
 ('%', 'NN'),
 ('of', 'IN'),
 ('his', 'PRP$'),
 ('monthly', 'JJ'),
 ('paycheck', 'NN'),
 ('in', 'IN'),
 ('an', 'DT'),
 ('IRA', 'NNP'),
 ('.', '.'),
 ('If', 'IN'),
 ('he', 'PRP'),
 ('invests', 'VBZ'),
 ('72', 'CD'),
 ('dollars', 'NNS'),
 ('in', 'IN'),
 ('his', 'PRP$'),
 ('IRA', 'NNP'),
 (',', ','),
 ('how', 'WRB'),
 ('much', 'JJ'),
 ('was', 'VBD'),
 ('his', 'PRP$'),
 ('paycheck', 'NN'),
 ('?', '.')]

In [49]:
## Noun-phrase chunk rule: an optional determiner (<DT>?), any number of
## adjectives (<JJ>*), then one noun tagged NN (<NN>).
## Only the NN tag is listed, so tokens tagged NNS or NNP are left outside
## the NP chunks.
pattern = 'NP: {<DT>?<JJ>*<NN>}'

In [50]:
## Chunk Parser: build a regexp chunker from `pattern`, run it over the
## POS-tagged tokens, and print the resulting tree (NP subtrees are the chunks).
chunkParser = nltk.RegexpParser(pattern)
cs = chunkParser.parse(sent)
print(cs)

(S
  Enrique/NNP
  puts/VBZ
  12/CD
  (NP %/NN)
  of/IN
  his/PRP$
  (NP monthly/JJ paycheck/NN)
  in/IN
  an/DT
  IRA/NNP
  ./.
  If/IN
  he/PRP
  invests/VBZ
  72/CD
  dollars/NNS
  in/IN
  his/PRP$
  IRA/NNP
  ,/,
  how/WRB
  much/JJ
  was/VBD
  his/PRP$
  (NP paycheck/NN)
  ?/.)


In [51]:
# Flatten the chunk tree into (token, POS tag, IOB chunk tag) triples.
# `tree2conlltags` and `pprint` are imported in the notebook's first cell so
# that every later cell can rely on them regardless of which cells have run.
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

[('Enrique', 'NNP', 'O'),
 ('puts', 'VBZ', 'O'),
 ('12', 'CD', 'O'),
 ('%', 'NN', 'B-NP'),
 ('of', 'IN', 'O'),
 ('his', 'PRP$', 'O'),
 ('monthly', 'JJ', 'B-NP'),
 ('paycheck', 'NN', 'I-NP'),
 ('in', 'IN', 'O'),
 ('an', 'DT', 'O'),
 ('IRA', 'NNP', 'O'),
 ('.', '.', 'O'),
 ('If', 'IN', 'O'),
 ('he', 'PRP', 'O'),
 ('invests', 'VBZ', 'O'),
 ('72', 'CD', 'O'),
 ('dollars', 'NNS', 'O'),
 ('in', 'IN', 'O'),
 ('his', 'PRP$', 'O'),
 ('IRA', 'NNP', 'O'),
 (',', ',', 'O'),
 ('how', 'WRB', 'O'),
 ('much', 'JJ', 'O'),
 ('was', 'VBD', 'O'),
 ('his', 'PRP$', 'O'),
 ('paycheck', 'NN', 'B-NP'),
 ('?', '.', 'O')]


# spaCy

In [52]:
# Load the spaCy pipeline named 'en_core_web_sm'.
# NOTE: the model package has to be installed in the kernel's environment.
nlp = spacy.load('en_core_web_sm')

In [53]:
# Run the pipeline on the example problem and list each recognised entity
# as a (text, label) pair.
doc = nlp(sentence)
pprint([(ent.text, ent.label_) for ent in doc.ents])

[('12 %', 'PERCENT'), ('monthly', 'DATE'), ('72 dollars', 'MONEY')]


In [60]:
# Visualise the recognised entities of `doc` using displaCy's 'ent' style.
displacy.render(doc, style='ent')

In [55]:
# One row per token: [token, IOB entity position, entity type].
l = [[token, token.ent_iob_, token.ent_type_] for token in doc]

# Map each token object to its [IOB position, entity type] pair.
itemDict = {row[0]: row[1:] for row in l}
itemDict

{Enrique: ['O', ''],
 puts: ['O', ''],
 12: ['B', 'PERCENT'],
 %: ['I', 'PERCENT'],
 of: ['O', ''],
 his: ['O', ''],
 monthly: ['B', 'DATE'],
 paycheck: ['O', ''],
 in: ['O', ''],
 an: ['O', ''],
 IRA: ['O', ''],
 .: ['O', ''],
 If: ['O', ''],
 he: ['O', ''],
 invests: ['O', ''],
 72: ['B', 'MONEY'],
 dollars: ['I', 'MONEY'],
 in: ['O', ''],
 his: ['O', ''],
 IRA: ['O', ''],
 ,: ['O', ''],
 how: ['O', ''],
 much: ['O', ''],
 was: ['O', ''],
 his: ['O', ''],
 paycheck: ['O', ''],
 ?: ['O', '']}

In [56]:
# Disabled experiment: the whole cell body is a string literal, so nothing
# below is executed; the cell only displays the string itself.
# NOTE(review): before re-enabling, check two things in the quoted code:
#   * it reads `X.ent_type`, whereas the cell that builds `itemDict` reads
#     `X.ent_type_` -- confirm which attribute is intended;
#   * the `df2 = ...` assignment is commented out while `df2` is still
#     referenced on the last line.
'''X_list = []
X_ent_iob_list = []
X_ent_type_list = []

for X in doc:
    X_list.append(X)
    X_ent_iob_list.append(X.ent_iob_)
    X_ent_type_list.append(X.ent_type)

#df2 = pd.DataFrame(list(zip(X_list, X_ent_iob_list, X_ent_type_list)), columns = ('X', 'ent_iob', 'ent_type'))
df2'''

"X_list = []\nX_ent_iob_list = []\nX_ent_type_list = []\n\nfor X in doc:\n    X_list.append(X)\n    X_ent_iob_list.append(X.ent_iob_)\n    X_ent_type_list.append(X.ent_type)\n\n#df2 = pd.DataFrame(list(zip(X_list, X_ent_iob_list, X_ent_type_list)), columns = ('X', 'ent_iob', 'ent_type'))\ndf2"

In [57]:
# Number of entity spans found in the example sentence.
len(doc.ents)

3

In [59]:
# Entity labels and their surface texts, index-aligned:
# labels[i] is the label of items[i].
labels = [x.label_ for x in doc.ents]
items = [x.text for x in doc.ents]
print(labels, items)

# Entity labels whose text is kept as a coefficient of the equation.
NUMERIC_LABELS = ('CARDINAL', 'PERCENT', 'MONEY')

# Keep only the quantity-bearing entities, in document order.
coeff = [item for label, item in zip(labels, items) if label in NUMERIC_LABELS]

var = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6']
# zip() stops at the shorter input, so with a fixed list of six names the
# seventh and later coefficients were silently dropped. Extend the names on
# demand; the range is empty when six names are already enough.
var += ['x{}'.format(n) for n in range(len(var) + 1, len(coeff) + 1)]

# Variable name -> coefficient text, e.g. {'x1': '12 %', 'x2': '72 dollars'}.
a_dict = dict(zip(var, coeff))
a_dict

['PERCENT', 'DATE', 'MONEY'] ['12 %', 'monthly', '72 dollars']


{'x1': '12 %', 'x2': '72 dollars'}

In [27]:
# Tally how often each entity text occurs and keep the three most frequent
# as (text, count) pairs.
items = [ent.text for ent in doc.ents]
itemsCount = Counter(items).most_common(3)
itemsCount

[('fifth', 1), ('27', 1), ('16', 1)]

In [26]:
# Draw the dependency parse of the problem stored under index label 20.
displacy.render(
    nlp(str(df['problem'][20])),
    style='dep',
    jupyter=True,
    options={'distance': 120},
)

In [63]:
# Send one word problem to the hosted solver service as a JSON payload.
# `requests` is imported in the notebook's first cell with the other imports.
# NOTE(review): the response recorded in this notebook is 405, so the root
# URL may not accept POST -- confirm the correct endpoint with the service.
res = requests.post(
    'https://vishalwordproblem.herokuapp.com/',
    # The problem text previously began with a stray "v" ("vIsabel").
    json={"rawtext": "Isabel baked 3 brownies , but needed 5 total for her party . If she used 5 cups of flour on each one , how much cups of flour does she still need ?"},
    # requests has no default timeout; without one an unresponsive host
    # would block this cell indefinitely.
    timeout=10,
)

SyntaxError: invalid syntax (<ipython-input-63-28dd4bb841cf>, line 2)

In [62]:
# Display the response object held in `res`.
res

<Response [405]>