In [67]:
import stanfordnlp
from stanfordnlp.server import CoreNLPClient

sent = "the key or locks are missing"

In [68]:
def dep_parse(asent):
    return asent.sentence[0].basicDependencies

def pos_tag(asent):
    return asent.sentence[0].token

with CoreNLPClient(annotators=["pos", "depparse"], timeout=30000) as client:
    asent = client.annotate(sent)
    dep = dep_parse(asent)
    tokens = pos_tag(asent)
    


Starting server with command: java -Xmx5G -cp /home/sarehalli/stanfordnlp_resources/stanford-corenlp-full-2018-10-05/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 30000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-cdb444d7643b444c.props -preload pos,depparse


In [None]:
NP_S = ["NN", "NP"]
NP_P = ["NNS", "NPS"]

V_S = ["VHZ", "VVZ", "VBZ"]
V_ = ["VHP", "VBP"]
V_P = ["VVP"]

def get_num(tag, word):
    if tag in NP_S + NP_P:
        if tag in NP_S:
            return "S"
        else:
            return "P"
    if tag in V_S:
        return "S"
    if tag in V_P:
        return "P"
    if tag in V_:
        if word in ["have", "are", "were"]:
            return "P"
        else:
            return "S"
    if tag == "PRP":
        if word in ["he", "she". "it"]:
            return "S"
    return None

In [118]:
def get_aggr(dep_graph, tokens):
    idxs = []
    agr_rels = []
    for edge in dep_graph.edge:
        if edge.dep == "nsubj":
            idxs.append((edge.target-1, edge.source-1))
    for i,idx in enumerate(idxs):
        agr_rel = {}
        agr_rel["relcl"] = False
        agr_rel["pp"] = False
        agr_rel["depp"] = False
        agr_rel["dj"] = False
        agr_rel["dj_conj"] = []
        agr_rel["agree"] = False
        agr_rel["cor_agree"] = False
        agr_rel["vplural"] = False
        agr_rel["vsingular"] = False
        for edge in dep_graph.edge:
            # Handle UD copulas - we want the copula, not the adjective!
            if ((edge.dep == "cop") or (edge.dep == "aux")) and (edge.source-1 == idx[1]):
                idxs[i] = (idxs[i][0], edge.target-1)
            # relative clause
            if (edge.dep == "acl:relcl") and (edge.source-1 == idx[0]):
                agr_rel["relcl"] = True
            # PP
            if (edge.dep == "nmod") and (edge.source-1 == idx[0]):
                agr_rel["pp"] = True
                for edge2 in dep_graph.edge:
                    if (edge2.dep == "nmod") and (edge2.source == edge.target):
                        agr_rel["depp"] = True
            if (edge.dep == "cc") and (edge.source-1 == idx[0]):
                if (tokens[edge.target-1].value == "or"):
                    agr_rel["dj"] = True
                    agr_rel["dj_conj"] = 
        a = idx2pos(idxs[i], tokens)
        b = idx2word(idxs[i], tokens)
        x = pos2num(a, b)
        #print(a,b, x)
        if ((x[0] is not None) and (x[1] is not None)):
            agr_rel["agree"] = True
            if (x[0] == x[1]):
                agr_rel["cor_agree"] = True
            
        if (x[1] == "P"):
            agr_rel["vplural"] = True
        if (x[1] == "S"):
            agr_rel["vsingular"] = True
        agr_rels.append(agr_rel)
    
    return idxs, agr_rels

def idx2pos(pair, tokens):
    return tuple([tokens[i].pos for i in pair])

def idx2word(pair, tokens):
    return tuple([tokens[i].value for i in pair])

def pos2num(pos, tokens):
    return tuple([get_num(tag, word) for tag, word in zip(pos, tokens)])

idxs, agr_rels = get_aggr(dep, tokens)
print(idxs)
print(agr_rels)
print(dep)

[(1, 4)]
[{'relcl': False, 'pp': False, 'depp': False, 'dj': True, 'agree': False, 'cor_agree': False, 'vplural': True, 'vsingular': False}]
node {
  sentenceIndex: 0
  index: 1
}
node {
  sentenceIndex: 0
  index: 2
}
node {
  sentenceIndex: 0
  index: 3
}
node {
  sentenceIndex: 0
  index: 4
}
node {
  sentenceIndex: 0
  index: 5
}
node {
  sentenceIndex: 0
  index: 6
}
edge {
  source: 2
  target: 1
  dep: "det"
  isExtra: false
  sourceCopy: 0
  targetCopy: 0
  language: UniversalEnglish
}
edge {
  source: 2
  target: 3
  dep: "cc"
  isExtra: false
  sourceCopy: 0
  targetCopy: 0
  language: UniversalEnglish
}
edge {
  source: 2
  target: 4
  dep: "conj"
  isExtra: false
  sourceCopy: 0
  targetCopy: 0
  language: UniversalEnglish
}
edge {
  source: 6
  target: 2
  dep: "nsubj"
  isExtra: false
  sourceCopy: 0
  targetCopy: 0
  language: UniversalEnglish
}
edge {
  source: 6
  target: 5
  dep: "aux"
  isExtra: false
  sourceCopy: 0
  targetCopy: 0
  language: UniversalEnglish
}
roo

In [119]:
def count_aggrs(sents):
    aggr_nums = []
    with CoreNLPClient(annotators=["pos", "depparse"], timeout=30000) as client:
        for sent in sents:
            asent = client.annotate(sent)
            dep = dep_parse(asent)
            tokens = pos_tag(asent)
            _, agr_rels = get_aggr(dep, tokens)
            aggr_nums += agr_rels
     
    totals = {}
    for key in aggr_nums[0].keys():
        totals[key] = sum([x[key] for x in aggr_nums if x[key] is not None])
    totals["dj_pl"] = sum((x["dj"] and x["vplural"]) for x in aggr_nums if x["agree"] is not None)
    totals["dj_sg"] = sum((x["dj"] and x["vsingular"]) for x in aggr_nums if x["agree"] is not None)
    totals["total_subj-verb"] = len(aggr_nums)
    totals["total_sent"] = len(sents)
    print(totals)
    return totals


In [120]:
ptb_train = []
with open("../train.txt", encoding="ISO 8859-1") as ptb_f:
    for line in ptb_f:
        ptb_train.append(" ".join([x.split(")")[0] for x in line.split() if "(" not in x]))
        
ptb = count_aggrs(ptb_train)
    


Starting server with command: java -Xmx5G -cp /home/sarehalli/stanfordnlp_resources/stanford-corenlp-full-2018-10-05/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 30000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-6b544a04200c463d.props -preload pos,depparse
{'relcl': 1427, 'pp': 7519, 'depp': 1027, 'dj': 97, 'agree': 18991, 'cor_agree': 14518, 'vplural': 4218, 'vsingular': 25446, 'dj_pl': 9, 'dj_sg': 29, 'total_subj-verb': 64694, 'total_sent': 42068}


In [122]:
wiki_train = []
with open("../colorlessgreenRNNs/data/lm/English/train.txt", encoding="ISO 8859-1") as ptb_f:
    for line in ptb_f:
        wiki_train.append(line)
        
wiki = count_aggrs(wiki_train[:500000])

Starting server with command: java -Xmx5G -cp /home/sarehalli/stanfordnlp_resources/stanford-corenlp-full-2018-10-05/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 30000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-c30662b2d9d74888.props -preload pos,depparse
{'relcl': 8963, 'pp': 76708, 'depp': 10091, 'dj': 1746, 'agree': 134362, 'cor_agree': 107417, 'vplural': 34695, 'vsingular': 209496, 'dj_pl': 243, 'dj_sg': 679, 'total_subj-verb': 658173, 'total_sent': 500000}


In [98]:
x = {'relcl': 1427, 'pp': 7519, 'depp': 1027, 'dj': 96, 'agree': 17421, 'cor_agree': 11130, 'total_subj-verb': 64694, 'total_sent': 42068}

for key, value in ptb.items():
    print(key, value/ptb["total_sent"])

relcl 0.03392127032423695
pp 0.1787344299705239
depp 0.024412855377008654
dj 0.002282019587334791
agree 0.4141152419891604
cor_agree 0.26457164590662735
total_subj-verb 1.5378434914899686
total_sent 1.0


In [124]:
x = {'relcl': 930, 'pp': 7593, 'depp': 1018, 'dj': 158, 'agree': 12780, 'cor_agree': 8709, 'total_subj-verb': 65726, 'total_sent': 50000}

for key, value in wiki.items():
    print(key, value, value/wiki["total_sent"])

relcl 8963 0.017926
pp 76708 0.153416
depp 10091 0.020182
dj 1746 0.003492
agree 134362 0.268724
cor_agree 107417 0.214834
vplural 34695 0.06939
vsingular 209496 0.418992
dj_pl 243 0.000486
dj_sg 679 0.001358
total_subj-verb 658173 1.316346
total_sent 500000 1.0
