# SVO

* Subject-verb-object (SVO) triples are an example of a simple relation we could try to gather
* A simple dep_search query for them would be something like `VERB >obj (NOUN|PROPN) >nsubj (NOUN|PROPN)`

In [5]:
# A simple way you can query dep_search API programmatically
import requests

dep_search_api="http://edu.turkunlp.org/dep_search_webapi" #go here in the browser, you get help
query="VERB >obj (NOUN|PROPN) >nsubj (NOUN|PROPN)"
#NOTE(review): the meaning of db/retmax/context is defined by the web API, see its help page
parameters={"search":query,
           "db":"NEWS_EN_10M",
           "retmax":2,
           "context":0}
#a timeout keeps the cell from hanging forever if the server never answers
r=requests.get(dep_search_api,params=parameters,timeout=60)
r.raise_for_status() #fail loudly on an HTTP error instead of treating an error page as CoNLL-U
conllu=r.text

# db-name: /home/ginter/dep_search_py2/en_news/trees_00000.db
# graph id: 4
# db-name: /home/ginter/dep_search_py2/en_news/trees_00000.db
# graph id: 5
# graph id: 4
# visual-style	9	bgColor:lightgreen
# hittoken:	9	detained	detain	VERB	VBD	Mood=Ind|Tense=Past|VerbForm=Fin	3	ccomp	_	_
# sent_id = 5
# text = The church said in March that North Korea detained Lim during one of his regular humanitarian missions there.
1	The	the	DET	DT	Definite=Def|PronType=Art	2	det	_	_
2	church	church	NOUN	NN	Number=Sing	3	nsubj	_	_
3	said	say	VERB	VBD	Mood=Ind|Tense=Past|VerbForm=Fin	0	root	_	_
4	in	in	ADP	IN	_	5	case	_	_
5	March	March	PROPN	NNP	Number=Sing	3	obl	_	_
6	that	that	SCONJ	WDT	PronType=Rel	9	mark	_	_
7	North	North	PROPN	NNP	Number=Sing	8	compound	_	_
8	Korea	Korea	PROPN	NNP	Number=Sing	9	nsubj	_	_
9	detained	detain	VERB	VBD	Mood=Ind|Tense=Past|VerbForm=Fin	3	ccomp	_	_
10	Lim	Lim	PROPN	NNP	Number=Sing	9	obj	_	_
11	during	during	ADP	IN	_	12	case	_	_
12	one	one	NUM	CD	NumType=Card	9	obl	_	_
13	

* We have a way to get S-V-O hits and the verb is marked with a "hittoken" line
* Our tasks will be as follows:
  1. Gather all verbs and their subject and object arguments
  2. Expand the subjects and objects to whole (reasonable) subtrees

In [11]:
import gzip
ID,FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC=range(10) #column names

def read_conllu(inp):
    """Read CoNLL-U formatted text one sentence at a time.

    inp: any iterable of text lines (an open file, a list of strings, ...)

    Yields (comments, tree) pairs. comments is the list of stripped lines
    starting with "#", tree is the list of token rows, each row being the
    list of tab-separated columns of one token line.

    Blank lines separate sentences. Runs of blank lines, a trailing blank
    line, empty input, or a comment block not followed by any token line
    never produce an empty tree.
    """
    current_comments=[]
    current_tree=[]
    for line in inp:
        line=line.strip()
        if not line: #empty line -> sentence boundary
            if current_tree: #only hand out sentences which actually have tokens
                yield current_comments, current_tree
            current_comments=[]
            current_tree=[]
        elif line.startswith("#"):
            current_comments.append(line) #this is a comment
        else:
            current_tree.append(line.split("\t"))
    if current_tree: #the last sentence need not be followed by an empty line
        yield current_comments, current_tree

def get_vso(comments,tree):
    """Find (verb, subject, object) token IDs for every hit in one sentence.

    comments: comment lines of the sentence; a line starting with "# hittoken:"
              carries, tab-separated, the token row of one matched verb
    tree:     list of token rows (lists of CoNLL-U columns)

    Returns a list like [("9","8","10")] with the ID strings of the verb, its
    nsubj dependent and its obj dependent. A hit which does not have exactly
    one nsubj and exactly one obj is skipped.
    """
    results=[] #tuples like (verb_idx,subj_idx,obj_idx)
    for comment in comments:
        if not comment.startswith("# hittoken:"):
            continue
        #comment is a line like: # hittoken:	9	detained	detain	VERB ...
        columns=comment.split("\t")[1:] #split on tab, drop the first column (# hittoken:)
        verb_idx=columns[ID] #this is now something like "9"
        #now go look for the subjects and objects governed by this verb
        subjects=[row for row in tree if row[HEAD]==verb_idx and row[DEPREL]=="nsubj"]
        objects=[row for row in tree if row[HEAD]==verb_idx and row[DEPREL]=="obj"]
        #none at all would crash the [0] lookups below, several would be ambiguous
        if len(subjects)!=1 or len(objects)!=1:
            continue
        results.append((verb_idx,subjects[0][ID],objects[0][ID]))
    return results

def get_strings(tree,vso):
    """Translate a (verb,subj,obj) tuple of token ID strings into a tuple of lemmas.

    Token IDs are 1-based, so the row of token i is looked up at tree[i-1].
    """
    verb_id,subj_id,obj_id=vso
    return tuple(tree[int(token_id)-1][LEMMA] for token_id in (verb_id,subj_id,obj_id))
        
with gzip.open("/course_data/textmine/parsed-data/english-svo.conllu.gz","rt",encoding="utf-8") as conllu_file:
    #pairing with range(21) stops the loop after the first 21 sentences (numbers 0..20)
    for _,(comments,tree) in zip(range(21),read_conllu(conllu_file)):
        #one sentence with its s-v-o hit(s); print the lemma triple of each hit
        for triple in get_vso(comments,tree):
            print(get_strings(tree,triple))
#seems to work, to an extent, let's try to improve upon this

('detain', 'Korea', 'Lim')
('leave', 'result', 'United')
('fetch', 'stamp', 'price')
('have', 'state', 'limit')
('launch', 'Goldenvoice', 'website')
('have', 'Yemen', 'case')
('set', 'Jeffrey', 'week')
('defeat', 'Gov.', 'attorney')
('celebrate', 'Caroline', 'thing')
('change', 'Riyadh', 'stance')
('limit', 'Iran', 'production')
('make', 'bill', 'procurement')
('visit', 'minister', 'constituency')
('mobilize', 'law', 'opposition')
('step', 'Britain', 'training')
('encourage', 'finding', 'people')
('involve', 'plan', 'power')
('sign', 'Carla', 'deal')
('rearrange', 'Britney', 'furniture')
('need', 'girl', 'phone')
('do', 'shell', 'job')
('arrest', 'police', 'dozen')
('lash', 'weather', 'Britain')


In [16]:
def get_children(tree):
    """Build the child lists of every token in a tree.

    Returns a list with one entry per row of tree; entry i is a list of
    (deprel, child_index) pairs for the tokens whose HEAD is token i.
    All indices are 0-based, i.e. the CoNLL-U ID minus one.
    """
    n_tokens=len(tree)
    children=[[] for _ in range(n_tokens)]
    for token in tree:
        parent=int(token[HEAD])-1
        if parent<0: #HEAD 0 gives -1: this token has no parent row to attach to
            continue
        #e.g. ("nsubj",5) means the row at index 5 is a child, attached with deprel nsubj
        children[parent].append((token[DEPREL],int(token[ID])-1))
    return children

def subtree(ofnode,tree_children,gathered_so_far,only_relations=set(("compound","flat"))):
    """Recursively collect the descendants of ofnode reachable through selected relations.

    ofnode:          0-based index of the node to start from
    tree_children:   per-node lists of (deprel, child_index) pairs
    gathered_so_far: list to which the indices of the found nodes are appended
    only_relations:  only edges with one of these deprels are followed, at every depth

    Returns nothing, the result is accumulated in gathered_so_far.
    """
    for rel,child in tree_children[ofnode]:
        if rel not in only_relations:
            continue #this kid ain't interesting
        gathered_so_far.append(child)
        #hand the filter down, otherwise deeper levels silently fall back to the default relations
        subtree(child,tree_children,gathered_so_far,only_relations)

def expand(node,tree_children,tree):
    """Expand a node into the space-joined lemmas of itself and its gathered subtree.

    node is a 0-based row index into tree. The nodes collected by subtree()
    are put into sentence order before their lemmas are joined.
    """
    members=[node] #the word itself always belongs to the phrase
    subtree(node,tree_children,members) #recursively adds the descendants
    members.sort() #back into sentence order
    lemmas=[tree[idx][LEMMA] for idx in members]
    return " ".join(lemmas)

def get_strings(tree,tree_children,vso):
    """Turn a (verb,subj,obj) tuple of token ID strings into a tuple of strings.

    The verb is given as its lemma; subject and object are each given as
    whatever expand() returns for that token. IDs are 1-based, rows 0-based.
    """
    verb_id,subj_id,obj_id=vso
    verb_lemma=tree[int(verb_id)-1][LEMMA]
    subj_phrase=expand(int(subj_id)-1,tree_children,tree)
    obj_phrase=expand(int(obj_id)-1,tree_children,tree)
    return (verb_lemma,subj_phrase,obj_phrase)

with gzip.open("/course_data/textmine/parsed-data/english-svo.conllu.gz","rt",encoding="utf-8") as f:
    for counter,(comments, tree) in enumerate(read_conllu(f)):
        #now we have a single sentence with s-v-o hit in it
        vsos=get_vso(comments,tree)
        if vsos: #empty when no hit of the sentence was usable
            tree_children=get_children(tree) #only needed when there is something to expand
            for vso in vsos:
                print(get_strings(tree,tree_children,vso))
        #checked for every sentence, also the skipped ones, so that the loop
        #cannot run past sentence 20 and through the whole file
        if counter==20:
            break

('detain', 'North Korea', 'Lim')
('leave', 'result', 'United')
('fetch', 'stamp', 'auction price')
('have', 'state', 'limit')
('launch', 'Goldenvoice', 'website')
('have', 'Yemen', 'case')
('set', 'Massachusetts Superior Court judge Jeffrey Locke', 'week')
('defeat', 'Republican Gov. Mary Fallin', 'Oklahoma City criminal defense attorney Chad Moody')
('celebrate', 'Caroline Wozniacki', 'thing')
('change', 'Riyadh', 'stance')
('limit', 'Iran', 'production')
('make', 'bill', 'procurement')
('visit', 'minister', 'constituency')
('mobilize', 'law', 'opposition')
('step', 'Britain', 'training')
('encourage', 'finding', 'people')
('involve', 'plan', 'power')
('sign', 'Carla Borrego', 'deal')
('rearrange', 'Britney Spears', 'hotel room furniture')
('need', 'girl', 'phone')
('do', 'shell', 'job')
('arrest', 'police', 'dozen')
('lash', 'weather', 'Britain')


In [19]:
#write one tab-separated verb, subject, object line per usable hit into triples.tsv
with gzip.open("/course_data/textmine/parsed-data/english-svo.conllu.gz","rt",encoding="utf-8") as f,\
    open("triples.tsv","wt",encoding="utf-8") as out:
    for counter,(comments, tree) in enumerate(read_conllu(f)):
        #now we have a single sentence with s-v-o hit in it
        vsos=get_vso(comments,tree)
        if vsos: #empty when no hit of the sentence was usable
            tree_children=get_children(tree) #only needed when there is something to expand
            for vso in vsos:
                print("\t".join(get_strings(tree,tree_children,vso)),file=out)
        #progress is reported for every 1000th sentence, also when that sentence was skipped
        if counter%1000==0:
            print("Processed",counter,end="\r")

Processed 2378000 1396000 1658000