In [1]:
%run model_functions.ipynb

In [2]:
all_words = pd.read_csv('data/raw/words.csv', encoding='ISO-8859-1')
display(len(all_words))
all_words.head()

204834

Unnamed: 0,word_id,word,lemma,word_type,function,seg_type,sentence_id,text_id
0,0,Latest,late,AJS,,,0,a1e-fragment01
1,1,corporate,corporate,AJ0,,,0,a1e-fragment01
2,2,unbundler,unbundler,NN1,,,0,a1e-fragment01
3,3,reveals,reveal,VVZ,mrw,met,0,a1e-fragment01
4,4,laid-back,laid-back,AJ0,,,0,a1e-fragment01


In [3]:
all_words = all_words[all_words['lemma'].apply(type) == str]
display(len(all_words))

204829

In [4]:
def parse_all_deps(words, download_models=False):
    if download_models:
        stanfordnlp.download('en')   # This downloads the English models for the neural pipeline

    # This sets up a default neural pipeline in English
    nlp_pipeline = stanfordnlp.Pipeline(
        processors='tokenize,pos,depparse',
        tokenize_pretokenized=True
    ) 

    with open('data/all_dependencies.csv', 'w', encoding='utf-8') as f:
        csv_writer = csv.writer(f)
        for sentence_id, group in words.groupby('sentence_id'):
            sentence = [list(group['lemma'])]
            
            if not sentence_id % 100:
                print('Finding dependencies for sentence {}'.format(sentence_id))
                
            for dep in parse_dependencies(sentence, sentence_id, nlp_pipeline):
                csv_writer.writerow(dep)
        
    print('Done!')
    
def parse_dependencies(real_lemmas, sentence_id, nlp_pipeline):
    doc = nlp_pipeline(real_lemmas)
    
    dependencies = []
    word_index_offset = 0
    assert len(doc.sentences) == 1
    for sentence in doc.sentences:ea
        for dep in sentence.dependencies:
            governor, g_i = dep[0].text, int(dep[0].index) - 1 + word_index_offset
            
            if g_i == -1:  # Is the ROOT placeholder.
                continue
            dependent, d_i = dep[2].text, int(dep[2].index) - 1 + word_index_offset
            
            dependencies.append((sentence_id, governor, g_i, dependent, d_i, dep[1]))
        word_index_offset += len(sentence.words) - 1
        
    return dependencies

The next cell saves depencies to data/all_deps.csv

```python
import warnings
warnings.filterwarnings(action='once')

parse_all_deps(all_words)
```

------------------

Code for parsing entire dataset at once. However, single "words" containing whitespaces will be wrongly delimited as separate words.

In [5]:
# def parse_all_deps(words, download_models=False):
#     if download_models:
#         stanfordnlp.download('en')   # This downloads the English models for the neural pipeline

#     # This sets up a default neural pipeline in English
#     nlp_pipeline = stanfordnlp.Pipeline(
#         processors='tokenize,lemma,pos,depparse',
#         tokenize_pretokenized=True
#     ) 

#     with open('data/all_dependencies.csv', 'w', encoding='utf-8') as f:
#         csv_writer = csv.writer(f)
#         sentence_lists = list(sentences.groupby('sentence_id')['sentence'].apply(lambda x: x.str.split(' ')))
#         doc = nlp_pipeline(sentence_lists)
#         assert len(doc.sentences) == len(sentences)
        
#         for sentence_id, sentence in enumerate(doc.sentences):
#             if not sentence_id % 100:
#                 print('Finding dependencies for sentence {}'.format(sentence_id))
#             for dep in sentence.dependencies:
#                 governor, g_i = dep[0].text, int(dep[0].index) - 1
#                 dependent, d_i = dep[2].text, int(dep[2].index) - 1
                
#                 if g_i == -1:  # Is the ROOT placeholder.
#                     continue
                
#                 csv_writer.writerow((sentence_id, governor, g_i, dependent, d_i))
        
#     print('Done!')

# parse_all_deps(sentences)

------------------

In [6]:
all_deps = pd.read_csv('data/all_dependencies.csv',
             names=['governor', 'g_id', 'dependent', 'd_id', 'dep'])

len(all_deps.groupby(all_deps.index))

14450

In [7]:
display(len(all_deps))
all_deps.head()

188685

Unnamed: 0,governor,g_id,dependent,d_id,dep
0,unbundler,2,late,0,amod
0,unbundler,2,corporate,1,amod
0,reveal,3,unbundler,2,nsubj
0,approach,5,laid-back,4,amod
0,reveal,3,approach,5,obj


In [8]:
all_deps['dep'].nunique()

43

In [9]:
sentences = pd.read_csv('data/raw/sentences.csv', encoding='ISO-8859-1')
sentences.iloc[6725]

sentence_id     6725                                                                                                                                                                                                                                                                                                              
paragraph_id    2031                                                                                                                                                                                                                                                                                                              
sentence        It was within this kind of environment that chivalry flourished, that young men learned to identify, their vassalage towards their lord as a symbol of their honour, that they debated the relative merits of Roland's and Oliver's way of displaying loyalty; and that they learned the finer points of jousting.
Name: 6725, dtype: object

In [10]:
all_deps[all_deps['governor']=='roland']

Unnamed: 0,governor,g_id,dependent,d_id,dep
0,roland,6,franklin,7,flat
2,roland,3,franklin,4,flat
2,roland,3,unbundler,7,appos
6725,roland,34,of,33,case
6725,roland,34,oliver,36,conj
6725,roland,34,way,37,conj


In [11]:
dep_words = all_deps[['g_id', 'd_id', 'dep']].rename(
    columns={
            'g_id': 'word_offset',
            'd_id': 'dep_word_offset',
            'dep': 'dep'
    }).append(
    all_deps[['d_id', 'g_id', 'dep']].rename(
        columns={
            'd_id': 'word_offset',
            'g_id': 'dep_word_offset',
            'dep': 'dep'
    }),
    sort=True
).reset_index().rename(columns={'index': 'sentence_id'})

dep_words.sample(10)

Unnamed: 0,sentence_id,dep,dep_word_offset,word_offset
84381,4381,det,12,13
185614,15409,advmod,2,1
103263,5742,nmod,27,24
94241,5031,amod,12,13
191913,159,cc,22,19
172072,12761,amod,14,15
284085,5169,case,4,2
352335,11204,mark,12,8
137616,7779,cop,18,19
67275,3625,appos,33,30


In [12]:
dep_words['dep_with_offset'] = pd.Series(zip(dep_words['dep'], dep_words['dep_word_offset']))
display(len(dep_words))
dep_words.head()

377370

Unnamed: 0,sentence_id,dep,dep_word_offset,word_offset,dep_with_offset
0,0,amod,0,2,"(amod, 0)"
1,0,amod,1,2,"(amod, 1)"
2,0,nsubj,2,3,"(nsubj, 2)"
3,0,amod,4,5,"(amod, 4)"
4,0,obj,5,3,"(obj, 5)"


In [13]:
dep_word_lists = pd.DataFrame(dep_words.groupby(['sentence_id', 'word_offset'])['dep_with_offset'].apply(list)).reset_index()
dep_word_lists

Unnamed: 0,sentence_id,word_offset,dep_with_offset
0,0,0,"[(amod, 2)]"
1,0,1,"[(amod, 2)]"
2,0,2,"[(amod, 0), (amod, 1), (nsubj, 3)]"
3,0,3,"[(nsubj, 2), (obj, 5)]"
4,0,4,"[(amod, 5)]"
...,...,...,...
203130,16182,4,"[(cop, 5)]"
203131,16182,5,"[(mark, 2), (nsubj, 3), (cop, 4), (advcl, 8)]"
203132,16182,6,"[(nsubj, 8)]"
203133,16182,7,"[(cop, 8)]"


In [16]:
dep_word_lists

Unnamed: 0,sentence_id,word_offset,dep_with_offset
0,0,0,"[(amod, 2)]"
1,0,1,"[(amod, 2)]"
2,0,2,"[(amod, 0), (amod, 1), (nsubj, 3)]"
3,0,3,"[(nsubj, 2), (obj, 5)]"
4,0,4,"[(amod, 5)]"
...,...,...,...
203130,16182,4,"[(cop, 5)]"
203131,16182,5,"[(mark, 2), (nsubj, 3), (cop, 4), (advcl, 8)]"
203132,16182,6,"[(nsubj, 8)]"
203133,16182,7,"[(cop, 8)]"


Get the VUAMC dataset and match back to it with dependencies.

In [17]:
words = pd.read_csv('data/raw/words.csv', encoding='ISO--8859-1', na_filter=False)
display(len(words))
words.head()

204834

Unnamed: 0,word_id,word,lemma,word_type,function,seg_type,sentence_id,text_id
0,0,Latest,late,AJS,,,0,a1e-fragment01
1,1,corporate,corporate,AJ0,,,0,a1e-fragment01
2,2,unbundler,unbundler,NN1,,,0,a1e-fragment01
3,3,reveals,reveal,VVZ,mrw,met,0,a1e-fragment01
4,4,laid-back,laid-back,AJ0,,,0,a1e-fragment01


Add position in sentence for each word using cumcount().

In [18]:
words['offset'] = words.groupby('sentence_id').cumcount()
display(len(words))
words.sample(10)

204834

Unnamed: 0,word_id,word,lemma,word_type,function,seg_type,sentence_id,text_id,offset
110894,110894,not,not,XX0,,,5880,cdb-fragment02,14
13449,13449,dockers,docker,NN2,,,660,a1u-fragment04,3
162611,162611,bad,bad,AJ0,,,9606,kb7-fragment45,2
114709,114709,but,but,CJC,,,6116,cdb-fragment04,9
114033,114033,news,news,NN1,,,6068,cdb-fragment04,27
11630,11630,over,over,AV0,mrw,met,559,a1p-fragment03,37
71077,71077,to,to,PRP,mrw,met,3630,as6-fragment02,15
122743,122743,Services,service,NN2,,,6490,crs-fragment01,2
34625,34625,an,an,AT0,,,1752,a8u-fragment14,16
37263,37263,also,also,AV0,,,1894,a9j-fragment01,3


After matching sentence ID, match word offset to dependency word offset.

In [19]:
words_with_deps = (
    words.merge(dep_word_lists,
                how='left',
                left_on=['sentence_id', 'offset'],
                right_on=['sentence_id', 'word_offset']
               )
    .drop(columns=['word_type','function', 'seg_type',
                   'text_id', 'offset'
                  ])
    .fillna(-1)
)
display(len(words_with_deps))
words_with_deps.head()

204834

Unnamed: 0,word_id,word,lemma,sentence_id,word_offset,dep_with_offset
0,0,Latest,late,0,0.0,"[(amod, 2)]"
1,1,corporate,corporate,0,1.0,"[(amod, 2)]"
2,2,unbundler,unbundler,0,2.0,"[(amod, 0), (amod, 1), (nsubj, 3)]"
3,3,reveals,reveal,0,3.0,"[(nsubj, 2), (obj, 5)]"
4,4,laid-back,laid-back,0,4.0,"[(amod, 5)]"


In [20]:
words_with_deps

Unnamed: 0,word_id,word,lemma,sentence_id,word_offset,dep_with_offset
0,0,Latest,late,0,0.0,"[(amod, 2)]"
1,1,corporate,corporate,0,1.0,"[(amod, 2)]"
2,2,unbundler,unbundler,0,2.0,"[(amod, 0), (amod, 1), (nsubj, 3)]"
3,3,reveals,reveal,0,3.0,"[(nsubj, 2), (obj, 5)]"
4,4,laid-back,laid-back,0,4.0,"[(amod, 5)]"
...,...,...,...,...,...,...
204829,204829,'re,be,16182,4.0,"[(cop, 5)]"
204830,204830,here,here,16182,5.0,"[(mark, 2), (nsubj, 3), (cop, 4), (advcl, 8)]"
204831,204831,that,that,16182,6.0,"[(nsubj, 8)]"
204832,204832,'s,be,16182,7.0,"[(cop, 8)]"


In [44]:
import copy

dep_dicts = []
dep_dict_empty = {dep:[] for dep in all_deps['dep'].drop_duplicates()}

for row in words_with_deps.itertuples():
    dep_dict = copy.deepcopy(dep_dict_empty)
    
    if row.dep_with_offset != -1:
        for dep, dep_word_offset in row.dep_with_offset:  # tuples
            target_row = words_with_deps[
                (words_with_deps['sentence_id']==row.sentence_id)
                & (words_with_deps['word_offset']==dep_word_offset)
            ]
            word_id = target_row.iloc[0]['word_id']
            dep_dict[dep].append(word_id)
    
    dep_dict['word_id'] = row.word_id
    dep_dicts.append(dep_dict)
    
    if row.word_id % 1000 == 0:
        print(row.word_id)
        print(dep_dict)
    
print(len(dep_dicts))

0
{'amod': [2], 'nsubj': [], 'obj': [], 'appos': [], 'flat': [], 'aux': [], 'acl:relcl': [], 'det': [], 'nummod': [], 'compound': [], 'case': [], 'nmod': [], 'mark': [], 'acl': [], 'obl': [], 'ccomp': [], 'nmod:poss': [], 'cc': [], 'conj': [], 'punct': [], 'advmod': [], 'nsubj:pass': [], 'aux:pass': [], 'cop': [], 'parataxis': [], 'advcl': [], 'compound:prt': [], 'expl': [], 'obl:tmod': [], 'det:predet': [], 'csubj': [], 'xcomp': [], 'iobj': [], 'discourse': [], 'list': [], 'vocative': [], 'nmod:tmod': [], 'obl:npmod': [], 'nmod:npmod': [], 'cc:preconj': [], 'fixed': [], 'goeswith': [], 'reparandum': [], 'word_id': 0}
1000
{'amod': [998], 'nsubj': [], 'obj': [], 'appos': [], 'flat': [], 'aux': [], 'acl:relcl': [], 'det': [997], 'nummod': [], 'compound': [999], 'case': [996], 'nmod': [995], 'mark': [], 'acl': [], 'obl': [], 'ccomp': [], 'nmod:poss': [], 'cc': [], 'conj': [], 'punct': [], 'advmod': [], 'nsubj:pass': [], 'aux:pass': [], 'cop': [], 'parataxis': [], 'advcl': [], 'compound:p

13000
{'amod': [], 'nsubj': [], 'obj': [], 'appos': [], 'flat': [], 'aux': [], 'acl:relcl': [], 'det': [], 'nummod': [], 'compound': [], 'case': [], 'nmod': [], 'mark': [], 'acl': [], 'obl': [], 'ccomp': [], 'nmod:poss': [], 'cc': [12997], 'conj': [12996], 'punct': [], 'advmod': [12998, 12999], 'nsubj:pass': [], 'aux:pass': [], 'cop': [], 'parataxis': [], 'advcl': [13005], 'compound:prt': [], 'expl': [], 'obl:tmod': [], 'det:predet': [], 'csubj': [], 'xcomp': [], 'iobj': [], 'discourse': [], 'list': [], 'vocative': [], 'nmod:tmod': [], 'obl:npmod': [], 'nmod:npmod': [], 'cc:preconj': [], 'fixed': [], 'goeswith': [], 'reparandum': [], 'word_id': 13000}
14000
{'amod': [], 'nsubj': [13997], 'obj': [14004], 'appos': [], 'flat': [], 'aux': [13998], 'acl:relcl': [13996], 'det': [], 'nummod': [], 'compound': [], 'case': [], 'nmod': [], 'mark': [], 'acl': [], 'obl': [], 'ccomp': [], 'nmod:poss': [], 'cc': [], 'conj': [], 'punct': [], 'advmod': [13999], 'nsubj:pass': [], 'aux:pass': [], 'cop': 

26000
{'amod': [], 'nsubj': [], 'obj': [], 'appos': [], 'flat': [], 'aux': [], 'acl:relcl': [], 'det': [], 'nummod': [], 'compound': [], 'case': [26002], 'nmod': [], 'mark': [], 'acl': [], 'obl': [], 'ccomp': [], 'nmod:poss': [], 'cc': [], 'conj': [], 'punct': [], 'advmod': [], 'nsubj:pass': [], 'aux:pass': [], 'cop': [], 'parataxis': [], 'advcl': [], 'compound:prt': [], 'expl': [], 'obl:tmod': [], 'det:predet': [], 'csubj': [], 'xcomp': [], 'iobj': [], 'discourse': [], 'list': [], 'vocative': [], 'nmod:tmod': [], 'obl:npmod': [], 'nmod:npmod': [], 'cc:preconj': [], 'fixed': [], 'goeswith': [], 'reparandum': [], 'word_id': 26000}
27000
{'amod': [], 'nsubj': [], 'obj': [], 'appos': [], 'flat': [], 'aux': [], 'acl:relcl': [], 'det': [27004], 'nummod': [], 'compound': [], 'case': [], 'nmod': [], 'mark': [], 'acl': [], 'obl': [], 'ccomp': [], 'nmod:poss': [], 'cc': [], 'conj': [], 'punct': [], 'advmod': [], 'nsubj:pass': [], 'aux:pass': [], 'cop': [], 'parataxis': [], 'advcl': [], 'compoun

39000
{'amod': [], 'nsubj': [39001], 'obj': [], 'appos': [], 'flat': [], 'aux': [], 'acl:relcl': [], 'det': [], 'nummod': [], 'compound': [], 'case': [], 'nmod': [], 'mark': [], 'acl': [], 'obl': [], 'ccomp': [], 'nmod:poss': [], 'cc': [], 'conj': [], 'punct': [], 'advmod': [], 'nsubj:pass': [], 'aux:pass': [], 'cop': [], 'parataxis': [], 'advcl': [], 'compound:prt': [], 'expl': [], 'obl:tmod': [], 'det:predet': [], 'csubj': [], 'xcomp': [], 'iobj': [], 'discourse': [], 'list': [], 'vocative': [], 'nmod:tmod': [], 'obl:npmod': [], 'nmod:npmod': [], 'cc:preconj': [], 'fixed': [], 'goeswith': [], 'reparandum': [], 'word_id': 39000}
40000
{'amod': [], 'nsubj': [], 'obj': [], 'appos': [], 'flat': [], 'aux': [], 'acl:relcl': [], 'det': [], 'nummod': [], 'compound': [], 'case': [], 'nmod': [], 'mark': [], 'acl': [], 'obl': [], 'ccomp': [], 'nmod:poss': [], 'cc': [], 'conj': [], 'punct': [], 'advmod': [], 'nsubj:pass': [], 'aux:pass': [], 'cop': [], 'parataxis': [], 'advcl': [], 'compound:prt

52000
{'amod': [], 'nsubj': [52002], 'obj': [], 'appos': [], 'flat': [], 'aux': [], 'acl:relcl': [], 'det': [], 'nummod': [], 'compound': [], 'case': [], 'nmod': [], 'mark': [], 'acl': [], 'obl': [], 'ccomp': [], 'nmod:poss': [], 'cc': [], 'conj': [], 'punct': [], 'advmod': [], 'nsubj:pass': [], 'aux:pass': [], 'cop': [], 'parataxis': [], 'advcl': [], 'compound:prt': [], 'expl': [], 'obl:tmod': [], 'det:predet': [], 'csubj': [], 'xcomp': [], 'iobj': [], 'discourse': [], 'list': [], 'vocative': [], 'nmod:tmod': [], 'obl:npmod': [], 'nmod:npmod': [], 'cc:preconj': [], 'fixed': [], 'goeswith': [], 'reparandum': [], 'word_id': 52000}
53000
{'amod': [], 'nsubj': [], 'obj': [], 'appos': [], 'flat': [], 'aux': [], 'acl:relcl': [], 'det': [], 'nummod': [], 'compound': [], 'case': [53002], 'nmod': [], 'mark': [], 'acl': [], 'obl': [], 'ccomp': [], 'nmod:poss': [], 'cc': [], 'conj': [], 'punct': [], 'advmod': [], 'nsubj:pass': [], 'aux:pass': [], 'cop': [], 'parataxis': [], 'advcl': [], 'compoun

65000
{'amod': [], 'nsubj': [], 'obj': [], 'appos': [], 'flat': [], 'aux': [], 'acl:relcl': [], 'det': [], 'nummod': [], 'compound': [], 'case': [], 'nmod': [], 'mark': [], 'acl': [], 'obl': [], 'ccomp': [], 'nmod:poss': [], 'cc': [], 'conj': [], 'punct': [], 'advmod': [64999], 'nsubj:pass': [], 'aux:pass': [], 'cop': [], 'parataxis': [], 'advcl': [], 'compound:prt': [], 'expl': [], 'obl:tmod': [], 'det:predet': [], 'csubj': [], 'xcomp': [], 'iobj': [], 'discourse': [], 'list': [], 'vocative': [], 'nmod:tmod': [], 'obl:npmod': [], 'nmod:npmod': [], 'cc:preconj': [], 'fixed': [], 'goeswith': [], 'reparandum': [], 'word_id': 65000}
66000
{'amod': [], 'nsubj': [], 'obj': [], 'appos': [], 'flat': [], 'aux': [], 'acl:relcl': [], 'det': [], 'nummod': [65999], 'compound': [], 'case': [], 'nmod': [], 'mark': [], 'acl': [], 'obl': [], 'ccomp': [], 'nmod:poss': [], 'cc': [], 'conj': [], 'punct': [], 'advmod': [], 'nsubj:pass': [], 'aux:pass': [], 'cop': [], 'parataxis': [], 'advcl': [], 'compoun

78000
{'amod': [], 'nsubj': [], 'obj': [], 'appos': [], 'flat': [], 'aux': [], 'acl:relcl': [], 'det': [], 'nummod': [], 'compound': [], 'case': [], 'nmod': [], 'mark': [], 'acl': [], 'obl': [], 'ccomp': [], 'nmod:poss': [], 'cc': [], 'conj': [], 'punct': [], 'advmod': [], 'nsubj:pass': [], 'aux:pass': [], 'cop': [78003], 'parataxis': [], 'advcl': [], 'compound:prt': [], 'expl': [], 'obl:tmod': [], 'det:predet': [], 'csubj': [], 'xcomp': [], 'iobj': [], 'discourse': [], 'list': [], 'vocative': [], 'nmod:tmod': [], 'obl:npmod': [], 'nmod:npmod': [], 'cc:preconj': [], 'fixed': [], 'goeswith': [], 'reparandum': [], 'word_id': 78000}
79000
{'amod': [], 'nsubj': [], 'obj': [], 'appos': [], 'flat': [], 'aux': [], 'acl:relcl': [], 'det': [79001], 'nummod': [], 'compound': [], 'case': [], 'nmod': [], 'mark': [], 'acl': [], 'obl': [], 'ccomp': [], 'nmod:poss': [], 'cc': [], 'conj': [], 'punct': [], 'advmod': [], 'nsubj:pass': [], 'aux:pass': [], 'cop': [], 'parataxis': [], 'advcl': [], 'compoun

91000
{'amod': [], 'nsubj': [], 'obj': [], 'appos': [], 'flat': [], 'aux': [], 'acl:relcl': [], 'det': [], 'nummod': [], 'compound': [], 'case': [91003], 'nmod': [], 'mark': [], 'acl': [], 'obl': [], 'ccomp': [], 'nmod:poss': [], 'cc': [], 'conj': [], 'punct': [], 'advmod': [], 'nsubj:pass': [], 'aux:pass': [], 'cop': [], 'parataxis': [], 'advcl': [], 'compound:prt': [], 'expl': [], 'obl:tmod': [], 'det:predet': [], 'csubj': [], 'xcomp': [], 'iobj': [], 'discourse': [], 'list': [], 'vocative': [], 'nmod:tmod': [], 'obl:npmod': [], 'nmod:npmod': [], 'cc:preconj': [], 'fixed': [], 'goeswith': [], 'reparandum': [], 'word_id': 91000}
92000
{'amod': [92001], 'nsubj': [], 'obj': [], 'appos': [], 'flat': [], 'aux': [], 'acl:relcl': [], 'det': [], 'nummod': [], 'compound': [], 'case': [], 'nmod': [], 'mark': [], 'acl': [], 'obl': [], 'ccomp': [], 'nmod:poss': [], 'cc': [], 'conj': [], 'punct': [], 'advmod': [], 'nsubj:pass': [], 'aux:pass': [], 'cop': [], 'parataxis': [], 'advcl': [], 'compoun

104000
{'amod': [], 'nsubj': [], 'obj': [], 'appos': [], 'flat': [], 'aux': [], 'acl:relcl': [], 'det': [], 'nummod': [], 'compound': [], 'case': [], 'nmod': [], 'mark': [104001], 'acl': [], 'obl': [], 'ccomp': [], 'nmod:poss': [], 'cc': [], 'conj': [], 'punct': [], 'advmod': [], 'nsubj:pass': [], 'aux:pass': [], 'cop': [], 'parataxis': [], 'advcl': [], 'compound:prt': [], 'expl': [], 'obl:tmod': [], 'det:predet': [], 'csubj': [], 'xcomp': [], 'iobj': [], 'discourse': [], 'list': [], 'vocative': [], 'nmod:tmod': [], 'obl:npmod': [], 'nmod:npmod': [], 'cc:preconj': [], 'fixed': [], 'goeswith': [], 'reparandum': [], 'word_id': 104000}
105000
{'amod': [], 'nsubj': [], 'obj': [], 'appos': [], 'flat': [], 'aux': [], 'acl:relcl': [], 'det': [], 'nummod': [], 'compound': [], 'case': [], 'nmod': [], 'mark': [], 'acl': [], 'obl': [], 'ccomp': [105002], 'nmod:poss': [], 'cc': [], 'conj': [], 'punct': [], 'advmod': [], 'nsubj:pass': [], 'aux:pass': [], 'cop': [], 'parataxis': [], 'advcl': [], 'co

117000
{'amod': [], 'nsubj': [116998], 'obj': [], 'appos': [], 'flat': [], 'aux': [116999], 'acl:relcl': [], 'det': [], 'nummod': [], 'compound': [], 'case': [], 'nmod': [], 'mark': [116997], 'acl': [], 'obl': [], 'ccomp': [117006], 'nmod:poss': [], 'cc': [], 'conj': [], 'punct': [], 'advmod': [], 'nsubj:pass': [], 'aux:pass': [], 'cop': [], 'parataxis': [], 'advcl': [116996], 'compound:prt': [], 'expl': [], 'obl:tmod': [], 'det:predet': [], 'csubj': [], 'xcomp': [], 'iobj': [], 'discourse': [], 'list': [], 'vocative': [], 'nmod:tmod': [], 'obl:npmod': [], 'nmod:npmod': [], 'cc:preconj': [], 'fixed': [], 'goeswith': [], 'reparandum': [], 'word_id': 117000}
118000
{'amod': [], 'nsubj': [], 'obj': [], 'appos': [], 'flat': [], 'aux': [], 'acl:relcl': [], 'det': [], 'nummod': [], 'compound': [], 'case': [], 'nmod': [], 'mark': [], 'acl': [], 'obl': [], 'ccomp': [], 'nmod:poss': [], 'cc': [118003], 'conj': [], 'punct': [], 'advmod': [], 'nsubj:pass': [], 'aux:pass': [], 'cop': [], 'parataxi

130000
{'amod': [130001], 'nsubj': [], 'obj': [], 'appos': [], 'flat': [], 'aux': [], 'acl:relcl': [], 'det': [], 'nummod': [], 'compound': [], 'case': [], 'nmod': [], 'mark': [], 'acl': [], 'obl': [], 'ccomp': [], 'nmod:poss': [], 'cc': [], 'conj': [], 'punct': [], 'advmod': [], 'nsubj:pass': [], 'aux:pass': [], 'cop': [], 'parataxis': [], 'advcl': [], 'compound:prt': [], 'expl': [], 'obl:tmod': [], 'det:predet': [], 'csubj': [], 'xcomp': [], 'iobj': [], 'discourse': [], 'list': [], 'vocative': [], 'nmod:tmod': [], 'obl:npmod': [], 'nmod:npmod': [], 'cc:preconj': [], 'fixed': [], 'goeswith': [], 'reparandum': [], 'word_id': 130000}
131000
{'amod': [], 'nsubj': [131002], 'obj': [], 'appos': [], 'flat': [], 'aux': [], 'acl:relcl': [], 'det': [], 'nummod': [], 'compound': [], 'case': [], 'nmod': [], 'mark': [], 'acl': [], 'obl': [], 'ccomp': [], 'nmod:poss': [], 'cc': [], 'conj': [], 'punct': [], 'advmod': [], 'nsubj:pass': [], 'aux:pass': [], 'cop': [], 'parataxis': [], 'advcl': [], 'co

143000
{'amod': [], 'nsubj': [143002], 'obj': [], 'appos': [], 'flat': [], 'aux': [], 'acl:relcl': [], 'det': [], 'nummod': [], 'compound': [142999], 'case': [], 'nmod': [], 'mark': [], 'acl': [], 'obl': [], 'ccomp': [], 'nmod:poss': [], 'cc': [], 'conj': [], 'punct': [], 'advmod': [], 'nsubj:pass': [], 'aux:pass': [], 'cop': [], 'parataxis': [], 'advcl': [], 'compound:prt': [], 'expl': [], 'obl:tmod': [], 'det:predet': [], 'csubj': [], 'xcomp': [], 'iobj': [], 'discourse': [], 'list': [], 'vocative': [], 'nmod:tmod': [], 'obl:npmod': [], 'nmod:npmod': [], 'cc:preconj': [], 'fixed': [], 'goeswith': [], 'reparandum': [], 'word_id': 143000}
144000
{'amod': [], 'nsubj': [143999], 'obj': [144002], 'appos': [], 'flat': [], 'aux': [], 'acl:relcl': [], 'det': [], 'nummod': [], 'compound': [], 'case': [], 'nmod': [], 'mark': [143997], 'acl': [], 'obl': [], 'ccomp': [144006], 'nmod:poss': [], 'cc': [], 'conj': [], 'punct': [], 'advmod': [], 'nsubj:pass': [], 'aux:pass': [], 'cop': [], 'parataxi

156000
{'amod': [], 'nsubj': [], 'obj': [], 'appos': [], 'flat': [], 'aux': [], 'acl:relcl': [], 'det': [], 'nummod': [], 'compound': [], 'case': [], 'nmod': [], 'mark': [], 'acl': [], 'obl': [], 'ccomp': [], 'nmod:poss': [], 'cc': [], 'conj': [], 'punct': [], 'advmod': [156001], 'nsubj:pass': [], 'aux:pass': [], 'cop': [], 'parataxis': [], 'advcl': [], 'compound:prt': [], 'expl': [], 'obl:tmod': [], 'det:predet': [], 'csubj': [], 'xcomp': [], 'iobj': [], 'discourse': [], 'list': [], 'vocative': [], 'nmod:tmod': [], 'obl:npmod': [], 'nmod:npmod': [], 'cc:preconj': [], 'fixed': [], 'goeswith': [], 'reparandum': [], 'word_id': 156000}
157000
{'amod': [], 'nsubj': [156997], 'obj': [], 'appos': [], 'flat': [], 'aux': [], 'acl:relcl': [], 'det': [], 'nummod': [], 'compound': [], 'case': [], 'nmod': [], 'mark': [], 'acl': [], 'obl': [], 'ccomp': [], 'nmod:poss': [], 'cc': [], 'conj': [], 'punct': [], 'advmod': [156999], 'nsubj:pass': [], 'aux:pass': [], 'cop': [156998], 'parataxis': [], 'adv

169000
{'amod': [], 'nsubj': [], 'obj': [], 'appos': [], 'flat': [], 'aux': [], 'acl:relcl': [], 'det': [], 'nummod': [], 'compound': [], 'case': [], 'nmod': [], 'mark': [], 'acl': [], 'obl': [], 'ccomp': [], 'nmod:poss': [], 'cc': [], 'conj': [], 'punct': [], 'advmod': [], 'nsubj:pass': [], 'aux:pass': [], 'cop': [], 'parataxis': [], 'advcl': [], 'compound:prt': [], 'expl': [], 'obl:tmod': [], 'det:predet': [], 'csubj': [], 'xcomp': [], 'iobj': [], 'discourse': [], 'list': [], 'vocative': [], 'nmod:tmod': [], 'obl:npmod': [], 'nmod:npmod': [], 'cc:preconj': [], 'fixed': [], 'goeswith': [], 'reparandum': [], 'word_id': 169000}
170000
{'amod': [], 'nsubj': [], 'obj': [], 'appos': [], 'flat': [], 'aux': [], 'acl:relcl': [], 'det': [170001], 'nummod': [], 'compound': [], 'case': [], 'nmod': [], 'mark': [], 'acl': [], 'obl': [], 'ccomp': [], 'nmod:poss': [], 'cc': [], 'conj': [], 'punct': [], 'advmod': [], 'nsubj:pass': [], 'aux:pass': [], 'cop': [], 'parataxis': [], 'advcl': [], 'compound

182000
{'amod': [], 'nsubj': [], 'obj': [], 'appos': [], 'flat': [], 'aux': [], 'acl:relcl': [], 'det': [], 'nummod': [], 'compound': [], 'case': [], 'nmod': [], 'mark': [], 'acl': [], 'obl': [], 'ccomp': [], 'nmod:poss': [], 'cc': [], 'conj': [], 'punct': [], 'advmod': [], 'nsubj:pass': [], 'aux:pass': [], 'cop': [182001], 'parataxis': [], 'advcl': [], 'compound:prt': [], 'expl': [], 'obl:tmod': [], 'det:predet': [], 'csubj': [], 'xcomp': [], 'iobj': [], 'discourse': [], 'list': [], 'vocative': [], 'nmod:tmod': [], 'obl:npmod': [], 'nmod:npmod': [], 'cc:preconj': [], 'fixed': [], 'goeswith': [], 'reparandum': [], 'word_id': 182000}
183000
{'amod': [], 'nsubj': [], 'obj': [], 'appos': [], 'flat': [], 'aux': [], 'acl:relcl': [], 'det': [], 'nummod': [], 'compound': [], 'case': [], 'nmod': [], 'mark': [], 'acl': [], 'obl': [], 'ccomp': [], 'nmod:poss': [], 'cc': [], 'conj': [], 'punct': [], 'advmod': [], 'nsubj:pass': [], 'aux:pass': [], 'cop': [], 'parataxis': [], 'advcl': [], 'compound

195000
{'amod': [], 'nsubj': [195003], 'obj': [], 'appos': [], 'flat': [], 'aux': [], 'acl:relcl': [], 'det': [], 'nummod': [194996], 'compound': [194999], 'case': [], 'nmod': [], 'mark': [], 'acl': [], 'obl': [], 'ccomp': [], 'nmod:poss': [], 'cc': [], 'conj': [], 'punct': [], 'advmod': [], 'nsubj:pass': [], 'aux:pass': [], 'cop': [], 'parataxis': [], 'advcl': [], 'compound:prt': [], 'expl': [], 'obl:tmod': [], 'det:predet': [], 'csubj': [], 'xcomp': [], 'iobj': [], 'discourse': [], 'list': [], 'vocative': [], 'nmod:tmod': [], 'obl:npmod': [], 'nmod:npmod': [], 'cc:preconj': [], 'fixed': [], 'goeswith': [], 'reparandum': [], 'word_id': 195000}
196000
{'amod': [], 'nsubj': [], 'obj': [], 'appos': [], 'flat': [], 'aux': [], 'acl:relcl': [], 'det': [], 'nummod': [], 'compound': [], 'case': [], 'nmod': [], 'mark': [], 'acl': [], 'obl': [], 'ccomp': [], 'nmod:poss': [], 'cc': [195999], 'conj': [195998], 'punct': [], 'advmod': [], 'nsubj:pass': [], 'aux:pass': [], 'cop': [], 'parataxis': []

In [46]:
deps_expanded = pd.DataFrame(dep_dicts)
display(len(deps_expanded))
deps_expanded.head()

204834

Unnamed: 0,amod,nsubj,obj,appos,flat,aux,acl:relcl,det,nummod,compound,...,list,vocative,nmod:tmod,obl:npmod,nmod:npmod,cc:preconj,fixed,goeswith,reparandum,word_id
0,[2],[],[],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],0
1,[2],[],[],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],1
2,"[0, 1]",[3],[],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],2
3,[],[2],[5],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],3
4,[5],[],[],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],4


# U#3U

In [69]:
words_with_deps = (
    words.merge(dep_words_expanded,
                how='left',
                left_on=['sentence_id', 'offset'],
                right_on=['sentence_id', 'word_offset']
               )
    .drop(columns=['word_type','function', 'seg_type',
                   'text_id', 'offset', 'dep_with_offset'
                  ])
    .fillna(-1)
)
display(len(words_with_deps))
words_with_deps.head()

204834

Unnamed: 0,word_id,word,lemma,sentence_id,word_offset,amod,nsubj,obj,appos,flat,...,discourse,list,vocative,nmod:tmod,obl:npmod,nmod:npmod,cc:preconj,fixed,goeswith,reparandum
0,0,Latest,late,0,0.0,[2],[],[],[],[],...,[],[],[],[],[],[],[],[],[],[]
1,1,corporate,corporate,0,1.0,[2],[],[],[],[],...,[],[],[],[],[],[],[],[],[],[]
2,2,unbundler,unbundler,0,2.0,"[0, 1]",[3],[],[],[],...,[],[],[],[],[],[],[],[],[],[]
3,3,reveals,reveal,0,3.0,[],[2],[5],[],[],...,[],[],[],[],[],[],[],[],[],[]
4,4,laid-back,laid-back,0,4.0,[5],[],[],[],[],...,[],[],[],[],[],[],[],[],[],[]


In [47]:
deps_expanded[deps_expanded['amod'].isnull()]

Unnamed: 0,amod,nsubj,obj,appos,flat,aux,acl:relcl,det,nummod,compound,...,list,vocative,nmod:tmod,obl:npmod,nmod:npmod,cc:preconj,fixed,goeswith,reparandum,word_id


In [48]:
deps_expanded.to_csv('data/words_with_deps.csv', index=False)

In [49]:
pd.read_csv('data/words_with_deps.csv')

Unnamed: 0,amod,nsubj,obj,appos,flat,aux,acl:relcl,det,nummod,compound,...,list,vocative,nmod:tmod,obl:npmod,nmod:npmod,cc:preconj,fixed,goeswith,reparandum,word_id
0,[2],[],[],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],0
1,[2],[],[],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],1
2,"[0, 1]",[3],[],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],2
3,[],[2],[5],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],3
4,[5],[],[],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204829,[],[],[],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],204829
204830,[],[204828],[],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],204830
204831,[],[204833],[],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],204831
204832,[],[],[],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],204832
