In [1]:
import pandas as pd
import re
import nltk
import random
from nltk.tag import PerceptronTagger

In [2]:
# Supersense categories for which a tagged noun is kept (get_mwes_and_ssts
# checks the supersense column of every row against this list).
relevantTags=[
    'ACT','ANIMAL','ARTIFACT','ATTRIBUTE','BODY','COGNITION','COMMUNICATION',
    'EVENT','FEELING','FOOD','GROUP','LOCATION','NATURAL OBJECT','PERSON',
    'PHENOMENON','POSSESSION','PROCESS','QUANTITY','RELATION','STATE',
    'SUBSTANCE','TIME',
]
# Every supersense combined with each noun POS prefix, grouped by prefix
# (all NN_*, then NNS_*, NNP_*, NNPS_*) -- the POS_SUPERSENSE format used by
# the TAG column of the question templates.
adjustedTags=[prefix+tag for prefix in ('NN_','NNS_','NNP_','NNPS_') for tag in relevantTags]

In [3]:
# Load the question templates. make_questions reads two columns from this
# frame: TAG (a POS_SUPERSENSE placeholder such as NNP_PERSON) and Q (a
# question template containing that placeholder).
df=pd.read_csv('question_templates.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Q,TAG
0,0,What are NNS_ARTIFACT called ?,NNS_ARTIFACT
1,1,When did NNP_PERSON live ?,NNP_PERSON
2,2,How old is NNP_GROUP ?,NNP_GROUP
3,5,What does NNP_ARTIFACT mean ?,NNP_ARTIFACT
4,6,What does NNP_GROUP stand for ?,NNP_GROUP


In [4]:
def tokenize_and_tag(inpt):
    """POS-tag the text `inpt` and write the result to the file 'txt'.

    The text is split on every '?', '!' and '.' (so abbreviations such as
    'U.S.' are split as well). Each piece is tokenized with
    nltk.word_tokenize and tagged with a PerceptronTagger. The output file
    holds one 'token<TAB>tag' pair per line, with a blank line after every
    piece.

    Parameters
    ----------
    inpt : str
        Raw text to tag.

    Returns
    -------
    None
        The result is written to 'txt' in the current working directory,
        overwriting any existing file of that name.
    """
    tagger=PerceptronTagger()
    lines=re.split('[?!.]',inpt)
    # `with` guarantees the file is closed even if tokenizing or tagging raises
    with open('txt','w') as txt:
        for line in lines:
            tagged=tagger.tag(nltk.word_tokenize(line))
            for token,pos in tagged:
                txt.write(token+'\t'+pos+'\n')
            txt.write('\n')    #blank line separates the pieces

In [5]:
def get_mwes_and_ssts(file):    #input is a 'SUPERSENSE TAGGED' file
    """Extract supersense-tagged (multiword) nouns from a tagged file.

    The file is read as blank-line-separated sentences with one
    tab-separated row per token. The columns used here are: 1 = word,
    3 = POS tag, 6 = multiword-expression mark, 7 = supersense label
    (presumably the output format of the supersense tagger -- confirm
    against sst.sh).

    A row whose column 6 is '_' is treated as a continuation: its word is
    appended (space-separated) to the word of the preceding row, so the
    first row of an expression ends up holding the whole expression.

    Parameters
    ----------
    file : str
        Path of the tagged file.

    Returns
    -------
    list of dict
        One dict per sentence that has at least one row whose supersense is
        in `relevantTags`, mapping the (multi)word to 'POS_SUPERSENSE'.
        Sentences without such a row are omitted.
    """
    with open(file,'r') as f:
        text=f.read()
    sentlist=[]
    for sent in text.split('\n\n'):
        rows=[line.split('\t') for line in sent.split('\n')]
        rows=[row for row in rows if row!=['']]    #drop empty lines
        if rows:    #drop empty sentences
            sentlist.append(rows)
    qs=[]
    for s in sentlist:
        q={}
        #get Multiword Expressions: walk from the last row to the second so
        #that chained continuations accumulate onto the expression's first row.
        #Index 0 is skipped because it has no preceding row to merge into;
        #rows too short to carry a mark or a word are ignored instead of
        #raising IndexError.
        for j in range(len(s)-1,0,-1):
            row=s[j]
            prev=s[j-1]
            if len(row)>6 and row[6]=='_' and len(prev)>1:
                prev[1]=prev[1]+' '+row[1]
        for word in s:    #get tags of (Multiword) nouns
            #rows without a supersense column are skipped
            if len(word)>7 and word[7] in relevantTags:
                q[word[1]]=word[3]+'_'+word[7]    #add (Multi)word and POS+tag to dict
        if q:
            qs.append(q)
    return qs

In [6]:
def make_questions(qs,df):    #input: SST dictionary and template df
    """Fill question templates with the tagged nouns of each sentence.

    Parameters
    ----------
    qs : list of dict
        One dict per sentence, mapping a (multi)word to its POS_SUPERSENSE tag.
    df : pandas.DataFrame
        Templates: column TAG holds the placeholder tag, column Q the
        question text containing that placeholder.

    Returns
    -------
    list of str
        For every word (in sentence and dict order), each template whose TAG
        equals the word's tag, with the tag replaced by the word.

    Each sentence dict is printed before it is processed.
    """
    generated=[]
    for tagged_nouns in qs:
        print(tagged_nouns)
        for noun,tag in tagged_nouns.items():
            templates=df.loc[df.TAG==tag,'Q']
            #substitute the word for the placeholder tag in each template
            generated.extend(template.replace(tag,noun) for template in templates)
    return generated

In [7]:
def questionize(inpt):
    """Generate questions from the raw text `inpt`.

    Steps, in order:
    1. tokenize_and_tag writes the POS-tagged tokens of `inpt` to the file 'txt'.
    2. The external script ./sst.sh is run on 'txt' through an IPython shell
       escape (presumably it produces 'txt.pred.tags' -- confirm against the
       script).
    3. get_mwes_and_ssts parses 'txt.pred.tags' into per-sentence word->tag dicts.
    4. make_questions fills the templates of the notebook-level `df`.

    Returns the list of question strings produced by make_questions.
    """
    tokenize_and_tag(inpt)
    ! ./sst.sh txt    #shell command: run Supersense Tagger on POS-tagged file
    qs=get_mwes_and_ssts('txt.pred.tags')
    questions=make_questions(qs,df)
    return questions

In [8]:
# Sample input passages (plain-text strings) to feed to questionize.
sza='Born Solána Rowe on November 8, 1990, R&B artist SZA garnered industry buzz through independently released work, before teaming with Top Dawg Entertainment for her third EP, Z. She hit the big time in 2017 with her first full-length album, Ctrl, which spawned the singles "Love Galore," "The Weekend" and "Broken Clocks," en route to platinum certification. The Grammy-nominated singer and songwriter has also collaborated with such popular acts as Rihanna, Maroon 5 and Kendrick Lamar'
tonya='In 1991, Tonya Harding won her first national skating title and became the first American woman to complete a triple axel in competition. In 1994, she earned notoriety when ex-husband Jeff Gillooly hired a hitman to assault skater Nancy Kerrigan at the Olympic trials. Harding pleaded guilty to hindering the investigation into Kerrigan\'s attack, and was subsequently banned from competing in the U.S. for life. Since then, she has made guest appearances on shows like Rosanne, Larry King Live, The Weakest Link and Celebrity Boxing. Harding returned to the public eye with the big screen release of I, Tonya in late 2017, and was named to compete on Dancing with the Stars the following spring.'
mary='Mary Magdalene was a figure in the Bible\'s New Testament who was one of Jesus\'s most loyal followers and is said to have been the first to witness his resurrection. While the Western Christian Church portrayed her as a repentant sinner for centuries, newer research has disputed this interpretation, and the discovery of the Gnostic Gospels, including the Gospel of Mary, describes Mary as a reflective, wise spiritualist favored by Jesus.'
baobab='Baobab is the common name of a genus of trees (Adansonia). There are nine species. Six species live in the drier parts of Madagascar, two in mainland Africa, one in Australia and three in India, Ranchi. The baobab is the national tree of Madagascar. Other common names include \'boab\', \'boaboa\', \'bottle tree\', \'the tree of life\', \'upside-down tree\', and \'monkey bread tree\'. The trees reach heights of 5 to 30 metres (16 to 98 ft) and trunk diameters of 7 to 11 metres (23 to 36 ft). Its trunk can hold up to 120,000 litres of water. For most of the year, the tree is leafless, and looks very much like it has its roots sticking up in the air. Baobabs are one of the largest and most important trees in all of where they grow, as they are able to provide shelter and wood. The leaves of the tree are used for making soup and it has some medicinal purposes in some regions of Africa '

In [9]:
questions=questionize(sza)    #run the whole pipeline on the SZA passage; print(questions) shows every generated question

loading WordNet supersense lexicon... done: 147306 entries
loading lexicon semcor_mwes 12744 entries
loading lexicon wordnet_mwes 63897 entries
loading lexicon said 0 entries
loading lexicon phrases_dot_net 6245 entries
loading lexicon wikimwe 321207 entries
loading lexicon enwikt 62819 entries
loading word clusters...
done.
loading model from sst.model ...
done.
B-GROUP Ī Ī O O-TIME O O O O B-GROUP Ī Ī Ī O-PERSON O-social B-RELATION Ī O O O-change O-ACT O O O-social O B-GROUP Ī Ī O O O O-GROUP O O-EVENT
decoding time: 0.510606050491
30.1558208466
{'Born Solána Rowe': 'NNP_GROUP', 'November': 'NNP_TIME', 'R & B artist': 'NNP_GROUP', 'SZA': 'NNP_PERSON', 'industry buzz': 'NN_RELATION', 'work': 'NN_ACT', 'Top Dawg Entertainment': 'NNP_GROUP', 'EP': 'NNP_GROUP', 'Z': 'NNP_EVENT'}
{'album': 'NN_COMMUNICATION', 'Ctrl': 'NNP_PERSON', 'singles': 'NNS_ACT', 'Love Galore': 'NNP_FOOD', 'Weekend': 'NNP_TIME', 'Broken Clocks': 'NNP_GROUP'}
{'singer': 'NN_PERSON', 'songwriter': 'NN_GROUP', 'acts': 

In [13]:
# Draw a random valid index into the generated questions (randint is
# inclusive on both ends, hence len(questions)-1).
i=random.randint(0,len(questions)-1)
questions[i]    #re-run this cell to see another random question

"when is Kendrick Lamar 's Birthday ?"