In [1]:
%run model_functions.ipynb

In [2]:
import stanfordnlp

In [387]:
# Load the full word-level table (read as ISO-8859-1) and show its row count
# followed by the first few rows.
raw_words_path = 'data/raw/words.csv'
all_words = pd.read_csv(raw_words_path, encoding='ISO-8859-1')
display(all_words.shape[0])
all_words.head()

204834

Unnamed: 0,word_id,word,lemma,word_type,function,seg_type,sentence_id,text_id
0,0,Latest,late,AJS,,,0,a1e-fragment01
1,1,corporate,corporate,AJ0,,,0,a1e-fragment01
2,2,unbundler,unbundler,NN1,,,0,a1e-fragment01
3,3,reveals,reveal,VVZ,mrw,met,0,a1e-fragment01
4,4,laid-back,laid-back,AJ0,,,0,a1e-fragment01


In [388]:
# Keep only the rows whose lemma is a real Python string.
# NOTE(review): the dropped rows are presumably lemmas that pandas parsed as
# missing values (float NaN) -- confirm that no genuine lemma such as "null" or
# "nan" was converted by read_csv's default NA handling.
lemma_is_str = [type(lemma) == str for lemma in all_words['lemma']]
all_words = all_words[lemma_is_str]
display(all_words.shape[0])

204829

In [348]:
def parse_all_deps(words, download_models=False):
    """Parse every sentence in ``words`` and write its dependencies to CSV.

    The rows of ``words`` are grouped by ``sentence_id``; the ``lemma`` values
    of each group are handed to the parser as one pre-tokenised sentence.
    Every dependency returned by ``parse_dependencies`` is written as one row
    ``(sentence_id, governor, g_id, dependent, d_id)`` to
    ``data/all_dependencies.csv`` (no header row; an existing file is
    overwritten).

    Args:
        words: DataFrame with at least the columns ``sentence_id`` and
            ``lemma``.
        download_models: When true, download the English stanfordnlp models
            before building the pipeline.
    """
    if download_models:
        stanfordnlp.download('en')   # This downloads the English models for the neural pipeline

    # This sets up a default neural pipeline in English
    nlp_pipeline = stanfordnlp.Pipeline(
        processors='tokenize,pos,depparse',
        tokenize_pretokenized=True
    )

    # newline='' lets the csv module control line endings itself; without it,
    # text-mode newline translation on Windows turns every row terminator into
    # '\r\r\n', i.e. a spurious blank line after each record.
    with open('data/all_dependencies.csv', 'w', encoding='utf-8', newline='') as f:
        csv_writer = csv.writer(f)
        for sentence_id, group in words.groupby('sentence_id'):
            # The pipeline expects a list of sentences, each a list of tokens.
            sentence = [list(group['lemma'])]
            if not sentence_id % 100:
                print('Finding dependencies for sentence {}'.format(sentence_id))
            csv_writer.writerows(
                parse_dependencies(sentence, sentence_id, nlp_pipeline)
            )

    print('Done!')
    
def parse_dependencies(real_lemmas, sentence_id, nlp_pipeline):
    """Run the pipeline on one pre-tokenised sentence and list its dependencies.

    Args:
        real_lemmas: Input passed unchanged to ``nlp_pipeline``; the caller in
            this notebook passes a list holding a single list of lemma strings.
        sentence_id: Identifier copied into the first field of every result.
        nlp_pipeline: Callable returning a document whose ``sentences`` each
            expose ``dependencies`` as ``(governor, relation, dependent)``
            entries with ``text`` and 1-based ``index`` attributes.

    Returns:
        A list of ``(sentence_id, governor_text, governor_index,
        dependent_text, dependent_index)`` tuples with 0-based word indices.
        The dependency that attaches the sentence head to the artificial ROOT
        node is left out.

    Raises:
        AssertionError: If the pipeline does not return exactly one sentence.
    """
    doc = nlp_pipeline(real_lemmas)

    # Raised explicitly (rather than via ``assert``) so the check also holds
    # when Python runs with -O; the exception type is unchanged for callers.
    if len(doc.sentences) != 1:
        raise AssertionError(
            'expected exactly one parsed sentence for sentence_id {}, got {}'.format(
                sentence_id, len(doc.sentences)
            )
        )
    sentence = doc.sentences[0]

    dependencies = []
    for dep in sentence.dependencies:
        governor_word, dependent_word = dep[0], dep[2]
        # Parser indices are 1-based; index 0 is reserved for ROOT.
        g_i = int(governor_word.index) - 1
        if g_i == -1:  # Is the ROOT placeholder.
            continue
        d_i = int(dependent_word.index) - 1
        dependencies.append(
            (sentence_id, governor_word.text, g_i, dependent_word.text, d_i)
        )

    return dependencies

The next cell saves the dependencies to data/all_dependencies.csv

In [349]:
# NOTE(review): this cell is commented out, presumably so that re-running the
# notebook does not repeat the slow parse; uncomment it to regenerate
# data/all_dependencies.csv.
# NOTE(review): `words` is only assigned further down in this notebook, while
# the log kept below starts at sentence 0 -- presumably the full corpus
# (`all_words`) was the argument. Confirm before re-enabling.
# %%time
# import warnings
# warnings.filterwarnings(action='once')

# parse_all_deps(words)

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': 'C:\\Users\\ezutp\\stanfordnlp_resources\\en_ewt_models\\en_ewt_tokenizer.pt', 'pretokenized': True, 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': 'C:\\Users\\ezutp\\stanfordnlp_resources\\en_ewt_models\\en_ewt_tagger.pt', 'pretrain_path': 'C:\\Users\\ezutp\\stanfordnlp_resources\\en_ewt_models\\en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: depparse
With settings: 
{'model_path': 'C:\\Users\\ezutp\\stanfordnlp_resources\\en_ewt_models\\en_ewt_parser.pt', 'pretrain_path': 'C:\\Users\\ezutp\\stanfordnlp_resources\\en_ewt_models\\en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Done loading processors!
---
Finding dependencies for sentence 0




Finding dependencies for sentence 100
Finding dependencies for sentence 200
Finding dependencies for sentence 300
Finding dependencies for sentence 400
Finding dependencies for sentence 500
Finding dependencies for sentence 600
Finding dependencies for sentence 700
Finding dependencies for sentence 800
Finding dependencies for sentence 900
Finding dependencies for sentence 1000
Finding dependencies for sentence 1100
Finding dependencies for sentence 1200
Finding dependencies for sentence 1300
Finding dependencies for sentence 1500
Finding dependencies for sentence 1600
Finding dependencies for sentence 1700
Finding dependencies for sentence 1800
Finding dependencies for sentence 1900
Finding dependencies for sentence 2000
Finding dependencies for sentence 2100
Finding dependencies for sentence 2200
Finding dependencies for sentence 2300
Finding dependencies for sentence 2400
Finding dependencies for sentence 2500
Finding dependencies for sentence 2600
Finding dependencies for sentence 

------------------

Code for parsing the entire dataset in a single pipeline call. However, any single "word" that contains whitespace will be wrongly split into separate words.

In [350]:
# NOTE(review): disabled batch variant, kept for reference only. As written it
# reads the global `sentences` instead of its `words` parameter, splits the raw
# sentence text on single spaces, and requests the extra `lemma` processor.
# def parse_all_deps(words, download_models=False):
#     if download_models:
#         stanfordnlp.download('en')   # This downloads the English models for the neural pipeline

#     # This sets up a default neural pipeline in English
#     nlp_pipeline = stanfordnlp.Pipeline(
#         processors='tokenize,lemma,pos,depparse',
#         tokenize_pretokenized=True
#     ) 

#     with open('data/all_dependencies.csv', 'w', encoding='utf-8') as f:
#         csv_writer = csv.writer(f)
#         sentence_lists = list(sentences.groupby('sentence_id')['sentence'].apply(lambda x: x.str.split(' ')))
#         doc = nlp_pipeline(sentence_lists)
#         assert len(doc.sentences) == len(sentences)
        
#         for sentence_id, sentence in enumerate(doc.sentences):
#             if not sentence_id % 100:
#                 print('Finding dependencies for sentence {}'.format(sentence_id))
#             for dep in sentence.dependencies:
#                 governor, g_i = dep[0].text, int(dep[0].index) - 1
#                 dependent, d_i = dep[2].text, int(dep[2].index) - 1
                
#                 if g_i == -1:  # Is the ROOT placeholder.
#                     continue
                
#                 csv_writer.writerow((sentence_id, governor, g_i, dependent, d_i))
        
#     print('Done!')

# parse_all_deps(sentences)

------------------

In [351]:
# The file has no header row and five columns per record: the sentence id
# followed by governor, g_id, dependent, d_id. The sentence id becomes the
# index; it is deliberately left unnamed because a later cell relies on
# reset_index() producing a column called 'index'.
all_deps = pd.read_csv(
    'data/all_dependencies.csv',
    header=None,
    names=['sentence_id', 'governor', 'g_id', 'dependent', 'd_id'],
    index_col='sentence_id',
).rename_axis(None)

# Number of distinct sentence ids that have at least one dependency row.
all_deps.index.nunique()

14450

In [364]:
# Load the raw sentence table (read as ISO-8859-1) and show the record at row
# position 6725.
sentences_path = 'data/raw/sentences.csv'
sentences = pd.read_csv(sentences_path, encoding='ISO-8859-1')
sentences.iloc[6725]

sentence_id     6725                                                                                                                                                                                                                                                                                                              
paragraph_id    2031                                                                                                                                                                                                                                                                                                              
sentence        It was within this kind of environment that chivalry flourished, that young men learned to identify, their vassalage towards their lord as a symbol of their honour, that they debated the relative merits of Roland's and Oliver's way of displaying loyalty; and that they learned the finer points of jousting.
Name: 6725, dtype: object

In [353]:
# Display the dependency table; the index holds the sentence id of each row.
all_deps

Unnamed: 0,governor,g_id,dependent,d_id
0,unbundler,2,late,0
0,unbundler,2,corporate,1
0,reveal,3,unbundler,2
0,approach,5,laid-back,4
0,reveal,3,approach,5
0,approach,5,roland,6
0,roland,6,franklin,7
0,lead,10,who,8
0,lead,10,be,9
0,approach,5,lead,10


In [447]:
# Spot-check: every dependency whose governor is the lemma 'roland'.
governed_by_roland = all_deps['governor'].eq('roland')
all_deps.loc[governed_by_roland]

Unnamed: 0,governor,g_id,dependent,d_id
0,roland,6,franklin,7
2,roland,3,franklin,4
2,roland,3,unbundler,7
6725,roland,34,of,33
6725,roland,34,oliver,36
6725,roland,34,way,37


In [458]:
# Build a symmetric (word -> neighbouring word) table: every dependency is
# listed once from the governor's side and once from the dependent's side.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
dep_index_pairs = all_deps[['g_id', 'd_id']]
from_governor = dep_index_pairs.rename(
    columns={'g_id': 'word_offset', 'd_id': 'dep_word'})
from_dependent = dep_index_pairs.rename(
    columns={'d_id': 'word_offset', 'g_id': 'dep_word'})

dep_words = (
    pd.concat([from_governor, from_dependent])
    .rename_axis('sentence_id')   # the index of all_deps is the sentence id
    .reset_index()
    # Pin the column order explicitly instead of relying on sort=True.
    [['sentence_id', 'dep_word', 'word_offset']]
)
dep_words.sample(10)

Unnamed: 0,sentence_id,dep_word,word_offset
203106,755,3,1
260171,3810,4,3
283322,5082,6,7
66888,3610,0,2
201842,677,45,43
189431,40,13,17
351360,11043,20,21
348379,10493,3,6
215203,1411,5,7
263902,3958,9,13


In [461]:
# For each (sentence, word position) collect the positions of all words it is
# linked to by a dependency, as a Python list.
linked_positions = dep_words.groupby(['sentence_id', 'word_offset'])['dep_word'].apply(list)
dep_word_lists = linked_positions.reset_index()
dep_word_lists

Unnamed: 0,sentence_id,word_offset,dep_word
0,0,0,[2]
1,0,1,[2]
2,0,2,"[0, 1, 3]"
3,0,3,"[2, 5]"
4,0,4,[5]
5,0,5,"[4, 6, 10, 3]"
6,0,6,"[7, 5]"
7,0,7,[6]
8,0,8,[10]
9,0,9,[10]


In [462]:
# Load the word table of the academic training split and show its size and
# first rows.
train_words_path = 'data/train/academic/words.csv'
words = pd.read_csv(train_words_path)
display(words.shape[0])
words.head()

48964

Unnamed: 0,word_id,word,lemma,word_type,function,seg_type,sentence_id,text_id,text_tag,genre
0,25070,HER,she,DPS,,,1267,a6u-fragment02,a6u,academic
1,25071,DRESS,dress,NN1,,,1267,a6u-fragment02,a6u,academic
2,25072,HANGS,hang,VVZ,,,1267,a6u-fragment02,a6u,academic
3,25073,HERE',here',NP0,,,1267,a6u-fragment02,a6u,academic
4,25074,DE-FROCKING,de-frock,VVG,mrw,met,1267,a6u-fragment02,a6u,academic


In [466]:
# 0-based position of each word within its sentence, in row order.
# NOTE(review): the parser indices (g_id / d_id) were presumably produced from
# lemma lists with non-string lemmas removed, so these positions only line up
# with them if no row of `words` has a missing lemma -- confirm.
position_in_sentence = words.groupby('sentence_id').cumcount()
words['offset'] = position_in_sentence
display(words.shape[0])
words.sample(10)

48964

Unnamed: 0,word_id,word,lemma,word_type,function,seg_type,sentence_id,text_id,text_tag,genre,offset
5976,48853,be,be,VBI,,,2630,acj-fragment01,acj,academic,8
4479,47356,jurisdictions,jurisdiction,NN2,,,2574,acj-fragment01,acj,academic,6
30532,88381,within,within,PRP,,,4366,b1g-fragment02,b1g,academic,15
48878,137707,party,party,NN1,,,7098,ew1-fragment01,ew1,academic,30
38557,118252,lists,list,NN2,,,6286,clp-fragment01,clp,academic,8
40042,128871,of,of,PRF,,,6728,ea7-fragment03,ea7,academic,12
38083,117778,making,make,VVG,mrw,met,6263,clp-fragment01,clp,academic,15
37775,117470,himself,himself,PNX,,,6248,clp-fragment01,clp,academic,13
27504,85353,the,the,AT0,,,4239,b1g-fragment02,b1g,academic,15
8326,60760,Probation,probation,NN1,,,3196,alp-fragment01,alp,academic,0


In [473]:
# Peek at the two key columns used for the merge with `words`.
dep_word_lists.loc[:, ['sentence_id', 'word_offset']]

Unnamed: 0,sentence_id,word_offset
0,0,0
1,0,1
2,0,2
3,0,3
4,0,4
5,0,5
6,0,6
7,0,7
8,0,8
9,0,9


In [478]:
# Attach to every word the list of positions it is linked to. The join is an
# inner join (the pandas default), so words without any dependency row drop
# out of the result.
words_joined = words.merge(
    dep_word_lists,
    how='inner',
    left_on=['sentence_id', 'offset'],
    right_on=['sentence_id', 'word_offset'],
)
words_with_deps = words_joined[['word_id', 'sentence_id', 'offset', 'word', 'dep_word']]
display(words_with_deps.shape[0])
words_with_deps.head()

48890

Unnamed: 0,word_id,sentence_id,offset,word,dep_word
0,25070,1267,0,HER,[1]
1,25071,1267,1,DRESS,"[0, 2]"
2,25072,1267,2,HANGS,"[3, 7, 1]"
3,25073,1267,3,HERE',[2]
4,25074,1267,4,DE-FROCKING,[7]


In [484]:
# Persist the result without the DataFrame index.
words_with_deps_path = 'data/words_with_deps.csv'
words_with_deps.to_csv(words_with_deps_path, index=False)