In [None]:
!pip install stanza

Collecting stanza
[?25l  Downloading https://files.pythonhosted.org/packages/2d/f3/cd7eaacabcec195a1c6c07b08cf1587b9f3f8754feba5c87d28867d75671/stanza-1.2.1-py3-none-any.whl (334kB)
[K     |█                               | 10kB 14.3MB/s eta 0:00:01[K     |██                              | 20kB 21.4MB/s eta 0:00:01[K     |███                             | 30kB 26.6MB/s eta 0:00:01[K     |████                            | 40kB 27.2MB/s eta 0:00:01[K     |█████                           | 51kB 29.7MB/s eta 0:00:01[K     |█████▉                          | 61kB 28.2MB/s eta 0:00:01[K     |██████▉                         | 71kB 23.5MB/s eta 0:00:01[K     |███████▉                        | 81kB 24.5MB/s eta 0:00:01[K     |████████▉                       | 92kB 26.1MB/s eta 0:00:01[K     |█████████▉                      | 102kB 26.7MB/s eta 0:00:01[K     |██████████▊                     | 112kB 26.7MB/s eta 0:00:01[K     |███████████▊                    | 122kB 26.7MB

In [None]:
import stanza
import pandas as pd
import os
import pickle
import re

In [None]:
# Fetch the default Stanza model package for English.
stanza.download('en')

HBox(children=(FloatProgress(value=0.0, description='Downloading https://raw.githubusercontent.com/stanfordnlp…

2021-06-10 08:18:25 INFO: Downloading default packages for language: en (English)...





HBox(children=(FloatProgress(value=0.0, description='Downloading http://nlp.stanford.edu/software/stanza/1.2.1…




2021-06-10 08:29:57 INFO: Finished downloading models and saved to /root/stanza_resources.


In [None]:
# Build the English Stanza pipeline that the parsing loop calls on each sentence.
# NOTE(review): 'mwt' is requested, but the load log of the recorded run lists only
# tokenize, pos, lemma and depparse — presumably no English MWT model is available
# in this Stanza version; confirm.
nlp = stanza.Pipeline(lang='en', processors='tokenize, pos, mwt, lemma, depparse')

2021-06-10 08:29:57 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |
| depparse  | combined |

2021-06-10 08:29:57 INFO: Use device: gpu
2021-06-10 08:29:57 INFO: Loading: tokenize
2021-06-10 08:30:08 INFO: Loading: pos
2021-06-10 08:30:08 INFO: Loading: lemma
2021-06-10 08:30:08 INFO: Loading: depparse
2021-06-10 08:30:08 INFO: Done loading processors!


In [None]:
# Load the unparsed sentences from the pickle file.
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so only
# load a file that comes from a trusted source.
with open('./data/misc/bnc_sentences_unparsed.pkl', 'rb') as f:
    corpus_sentences = pickle.load(f)

In [None]:
# Number of sentences loaded from the pickle, before any filtering.
len(corpus_sentences)

265582

In [None]:
# Limit the sentence size to between 4 and 25 words (inclusive), for better results.
MIN_WORDS = 4
MAX_WORDS = 25
# A "word" is a maximal run of word characters; compile the pattern once.
WORD_PATTERN = re.compile(r'\w+')

# Each sentence is scanned a single time and its word count is checked
# against both bounds with one chained comparison.
corpus_sentences_clean = [
    sen for sen in corpus_sentences
    if MIN_WORDS <= len(WORD_PATTERN.findall(sen)) <= MAX_WORDS
]

In [None]:
# Number of sentences that remain after the length filter.
len(corpus_sentences_clean)

197209

## We loop over the corpus twice. In the first loop, we obtain the full parsing information; in the second loop, we obtain only the tokenized sentences. We do this because we want to retrieve the parsing information from the lowercased text, since inconsistencies in the lemma forms may otherwise occur. Nevertheless, we loop a second time to recover the tokens in their original case, because the final sentence generation is based on those tokens, and with the lowercased ones important information might be lost (e.g. the capitalization of proper names).

In [None]:
# obtain the parsing information
# Each *_sentence_list gets one entry per corpus sentence; every entry is a flat
# list holding one item per word of that sentence.
upos_sentence_list = []
xpos_sentence_list = []
lemma_sentence_list = []
dependency_sentence_list = []
features_sentence_list = []
id_sentence_list = []
head_sentence_list = []

for raw_sentence in corpus_sentences_clean:
    # The parse is taken from the lowercased text.
    parsed_doc = nlp(raw_sentence.lower())

    upos_list = []
    xpos_list = []
    lemma_list = []
    dependency_list = []
    features_list = []
    id_list = []
    head_list = []

    # Distinct loop names keep the raw string and the parsed sentence object
    # apart instead of rebinding one variable for both.
    # NOTE(review): if the pipeline splits the input into several sentences, their
    # words are concatenated here, so ids/heads presumably restart at each
    # boundary — confirm this is acceptable downstream.
    for parsed in parsed_doc.sentences:
        for word in parsed.words:

            # upos
            upos_list.append(word.upos)
            # xpos
            xpos_list.append(word.xpos)
            # lemma
            lemma_list.append(word.lemma)

            # dependencies: head indices are 1-based within the parsed sentence;
            # a head of 0 (or less) is recorded as the artificial "root".
            if word.head > 0:
                head_word = parsed.words[word.head - 1].text
            else:
                head_word = "root"
            dependency_list.append((head_word, word.deprel))

            # features: '_' stands in for a missing/empty feature string
            features_list.append(word.feats if word.feats else '_')

            # id
            id_list.append(word.id)
            # head
            head_list.append(word.head)

    upos_sentence_list.append(upos_list)
    xpos_sentence_list.append(xpos_list)
    lemma_sentence_list.append(lemma_list)
    dependency_sentence_list.append(dependency_list)
    features_sentence_list.append(features_list)
    id_sentence_list.append(id_list)
    head_sentence_list.append(head_list)

In [None]:
# obtain only the tokens
# Only the surface forms (word.text) are needed in this pass, so a pipeline
# limited to the segmentation processors is used; tagging, lemmatisation and
# dependency parsing are not run again over the whole corpus. The processor
# names mirror the segmentation part of the full pipeline ('tokenize, mwt').
tokenizer_nlp = stanza.Pipeline(lang='en', processors='tokenize, mwt')

tokens_sentence_list = []

for raw_sentence in corpus_sentences_clean:
    # Original casing is kept here (no .lower()), e.g. for proper names.
    tokenized_doc = tokenizer_nlp(raw_sentence)

    # tokens: flatten the words of every parsed sentence into one list
    tokens_list = [
        word.text
        for parsed in tokenized_doc.sentences
        for word in parsed.words
    ]

    tokens_sentence_list.append(tokens_list)

In [None]:
# final dataset
# Single ordered list of (column name, per-sentence values); both the dict and
# the DataFrame column order are derived from it.
column_values = [
    ('Sentence', corpus_sentences_clean),
    ('Tokens', tokens_sentence_list),
    ('Lemma', lemma_sentence_list),
    ('Upos', upos_sentence_list),
    ('Xpos', xpos_sentence_list),
    ('Dependency', dependency_sentence_list),
    ('Features', features_sentence_list),
    ('id', id_sentence_list),
    ('Head', head_sentence_list),
]

dataset = dict(column_values)

df = pd.DataFrame(dataset, columns=[name for name, _ in column_values])

In [None]:
# Display the assembled frame as the cell output.
df

Unnamed: 0,Sentence,Tokens,Lemma,Upos,Xpos,Dependency,Features,id,Head
0,The interaction of long chain molecules with l...,"[The, interaction, of, long, chain, molecules,...","[the, interaction, of, long, chain, molecule, ...","[DET, NOUN, ADP, ADJ, NOUN, NOUN, ADP, NOUN, A...","[DT, NN, IN, JJ, NN, NNS, IN, NNS, VBZ, IN, JJ...","[(interaction, det), (interest, nsubj), (molec...","[Definite=Def|PronType=Art, Number=Sing, _, De...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[2, 12, 6, 6, 6, 2, 8, 6, 12, 12, 12, 0, 19, 1..."
1,When an amorphous polymer is mixed with a suit...,"[When, an, amorphous, polymer, is, mixed, with...","[when, a, amorphous, polymer, be, mix, with, a...","[SCONJ, DET, ADJ, NOUN, AUX, VERB, ADP, DET, A...","[WRB, DT, JJ, NN, VBZ, VBN, IN, DT, JJ, NN, ,,...","[(mixed, mark), (polymer, det), (polymer, amod...","[PronType=Int, Definite=Ind|PronType=Art, Degr...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[6, 4, 4, 6, 6, 13, 10, 10, 10, 6, 13, 13, 0, ..."
2,"In a ' poor ' solvent, the interactions are fe...","[In, a, ', poor, ', solvent, ,, the, interacti...","[in, a, ', poor, ', solvent, ,, the, interacti...","[ADP, DET, PUNCT, ADJ, PUNCT, NOUN, PUNCT, DET...","[IN, DT, ``, JJ, '', NN, ,, DT, NNS, VBP, JJR,...","[(solvent, case), (solvent, det), (solvent, pu...","[_, Definite=Ind|PronType=Art, _, Degree=Pos, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[6, 6, 6, 6, 6, 11, 11, 9, 11, 11, 0, 18, 14, ..."
3,The fundamental thermodynamic equation used to...,"[The, fundamental, thermodynamic, equation, us...","[the, fundamental, thermodynamic, equation, us...","[DET, ADJ, ADJ, NOUN, VERB, PART, VERB, DET, N...","[DT, JJ, JJ, NN, VBN, TO, VB, DT, NNS, VBZ, DT...","[(equation, det), (equation, amod), (equation,...","[Definite=Def|PronType=Art, Degree=Pos, Degree...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[4, 4, 4, 10, 4, 7, 5, 9, 7, 0, 16, 16, 16, 15..."
4,This is valid only for components of comparabl...,"[This, is, valid, only, for, components, of, c...","[this, be, valid, only, for, component, of, co...","[PRON, AUX, ADJ, ADV, ADP, NOUN, ADP, ADJ, NOU...","[DT, VBZ, JJ, RB, IN, NNS, IN, JJ, NN, ,, CC, ...","[(valid, nsubj), (valid, cop), (root, root), (...","[Number=Sing|PronType=Dem, Mood=Ind|Number=Sin...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[3, 3, 0, 6, 6, 3, 9, 9, 6, 24, 24, 24, 15, 15..."
...,...,...,...,...,...,...,...,...,...
197202,How much longer will everyone ignore this phen...,"[How, much, longer, will, everyone, ignore, th...","[how, much, long, will, everyone, ignore, this...","[ADV, ADV, ADV, AUX, PRON, VERB, DET, NOUN, PU...","[WRB, RB, RBR, MD, NN, VB, DT, NN, .]","[(much, advmod), (longer, advmod), (ignore, ad...","[PronType=Int, Degree=Pos, Degree=Cmp, VerbFor...","[1, 2, 3, 4, 5, 6, 7, 8, 9]","[2, 3, 6, 6, 6, 0, 8, 6, 6]"
197203,Is this Dunfermline 's official line?,"[Is, this, Dunfermline, 's, official, line, ?]","[be, this, dunfermline, 's, official, line, ?]","[AUX, DET, PROPN, PART, ADJ, NOUN, PUNCT]","[VBZ, DT, NNP, POS, JJ, NN, .]","[(line, cop), (dunfermline, det), (line, nmod:...",[Mood=Ind|Number=Sing|Person=3|Tense=Pres|Verb...,"[1, 2, 3, 4, 5, 6, 7]","[6, 3, 6, 3, 6, 0, 6]"
197204,"If so, it shows the town suffering more than i...","[If, so, ,, it, shows, the, town, suffering, m...","[if, so, ,, it, show, the, town, suffer, more,...","[SCONJ, ADV, PUNCT, PRON, VERB, DET, NOUN, VER...","[IN, RB, ,, PRP, VBZ, DT, NN, VBG, JJR, IN, PR...","[(so, mark), (shows, advcl), (shows, punct), (...","[_, _, _, Case=Nom|Gender=Neut|Number=Sing|Per...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[2, 5, 5, 5, 0, 7, 5, 7, 8, 13, 13, 13, 8, 17,..."
197205,I doubt if many Scottish historians would take...,"[I, doubt, if, many, Scottish, historians, wou...","[I, doubt, if, many, scottish, historian, woul...","[PRON, VERB, SCONJ, ADJ, ADJ, NOUN, AUX, VERB,...","[PRP, VBP, IN, JJ, JJ, NNS, MD, VB, DT, NN, RB...","[(doubt, nsubj), (root, root), (take, mark), (...","[Case=Nom|Number=Sing|Person=1|PronType=Prs, M...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]","[2, 0, 8, 6, 6, 8, 8, 2, 10, 8, 8, 2]"


In [None]:
# Write the DataFrame to disk as a pickle file.
df.to_pickle("./data/misc/bnc_sentences_parsed.pkl")