## Parse the simple wikipedia sentences

In [None]:
!pip install stanza

Collecting stanza
[?25l  Downloading https://files.pythonhosted.org/packages/2d/f3/cd7eaacabcec195a1c6c07b08cf1587b9f3f8754feba5c87d28867d75671/stanza-1.2.1-py3-none-any.whl (334kB)
[K     |█                               | 10kB 14.0MB/s eta 0:00:01[K     |██                              | 20kB 21.2MB/s eta 0:00:01[K     |███                             | 30kB 21.3MB/s eta 0:00:01[K     |████                            | 40kB 17.2MB/s eta 0:00:01[K     |█████                           | 51kB 9.4MB/s eta 0:00:01[K     |█████▉                          | 61kB 8.4MB/s eta 0:00:01[K     |██████▉                         | 71kB 9.4MB/s eta 0:00:01[K     |███████▉                        | 81kB 10.4MB/s eta 0:00:01[K     |████████▉                       | 92kB 11.0MB/s eta 0:00:01[K     |█████████▉                      | 102kB 9.1MB/s eta 0:00:01[K     |██████████▊                     | 112kB 9.1MB/s eta 0:00:01[K     |███████████▊                    | 122kB 9.1MB/s eta

In [None]:
import stanza
import pandas as pd
import os
import pickle
import re

In [None]:
# Fetch the default English stanza models; in the recorded run they were
# saved to /root/stanza_resources.
stanza.download('en')

HBox(children=(FloatProgress(value=0.0, description='Downloading https://raw.githubusercontent.com/stanfordnlp…

2021-06-10 08:10:35 INFO: Downloading default packages for language: en (English)...





HBox(children=(FloatProgress(value=0.0, description='Downloading http://nlp.stanford.edu/software/stanza/1.2.1…




2021-06-10 08:15:12 INFO: Finished downloading models and saved to /root/stanza_resources.


In [None]:
# Build the English pipeline once; it is reused for every sentence in the cells below.
# NOTE(review): 'mwt' is requested, but the recorded load log lists only
# tokenize, pos, lemma and depparse — confirm whether it is actually needed.
nlp = stanza.Pipeline(
    lang='en',
    processors='tokenize, pos, mwt, lemma, depparse',
)

2021-06-10 08:15:12 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |
| depparse  | combined |

2021-06-10 08:15:12 INFO: Use device: gpu
2021-06-10 08:15:12 INFO: Loading: tokenize
2021-06-10 08:15:23 INFO: Loading: pos
2021-06-10 08:15:23 INFO: Loading: lemma
2021-06-10 08:15:24 INFO: Loading: depparse
2021-06-10 08:15:24 INFO: Done loading processors!


In [None]:
# Load the unparsed sentences produced by an earlier step.
# NOTE: pickle.load can execute arbitrary code — only load files from a trusted source.
unparsed_path = './data/misc/wikipedia_sentences_unparsed.pkl'
with open(unparsed_path, 'rb') as pickle_file:
    corpus_sentences = pickle.load(pickle_file)

In [None]:
# Number of entries loaded into corpus_sentences (before any filtering).
len(corpus_sentences)

252083

In [None]:
# limit the sentence size between 4 and 25 words, for better results
MIN_WORDS = 4
MAX_WORDS = 25
# A "word" is a maximal run of \w characters, so punctuation is not counted.
WORD_PATTERN = re.compile(r'\w+')

# Count the words of each sentence with a single regex scan and keep only the
# sentences whose count lies in the inclusive range [MIN_WORDS, MAX_WORDS].
corpus_sentences_clean = [
    sen
    for sen in corpus_sentences
    if MIN_WORDS <= len(WORD_PATTERN.findall(sen)) <= MAX_WORDS
]

In [None]:
# Number of entries that passed the word-count filter.
len(corpus_sentences_clean)

216185

## We loop twice. In the first loop, we obtain the full parsing information; in the second loop, we obtain only the tokenized sentences. We do this because we want to retrieve the parsing information from the lowercased text, since inconsistencies in the lemma form may otherwise exist. Nevertheless, we loop a second time over the original text to re-obtain the tokenized form, since the final sentence generation will be based on those tokens, and with the lowercased ones important information might be lost (e.g. the capitalization of proper names).

In [None]:
# obtain the parsing information
# Each *_sentence_list collects one inner list per entry of corpus_sentences_clean,
# so all seven lists share the same outer index.
upos_sentence_list = []
xpos_sentence_list = []
lemma_sentence_list = []
dependency_sentence_list = []
features_sentence_list = []
id_sentence_list = []
head_sentence_list = []

for raw_sentence in corpus_sentences_clean:
    # The annotations are taken from the lowercased text.
    parsed_doc = nlp(raw_sentence.lower())

    upos_list = []
    xpos_list = []
    lemma_list = []
    dependency_list = []
    features_list = []
    id_list = []
    head_list = []

    # A parsed document exposes a list of sentences; the words of all of them are
    # appended to the same per-input lists.
    # NOTE(review): word.id / word.head appear to be relative to each parsed sentence,
    # so they would restart if the parser splits one input into several sentences — confirm.
    for parsed_sentence in parsed_doc.sentences:
        for word in parsed_sentence.words:

            # upos
            upos_list.append(word.upos)
            # xpos
            xpos_list.append(word.xpos)
            # lemma
            lemma_list.append(word.lemma)

            # dependencies: word.head is used as a 1-based index into the words of
            # the same parsed sentence; a head of 0 is recorded as "root"
            if word.head > 0:
                head_word = parsed_sentence.words[word.head - 1].text
            else:
                head_word = "root"
            dependency_list.append((head_word, word.deprel))

            # features ('_' is stored when the word has none)
            if word.feats:
                features_list.append(word.feats)
            else:
                features_list.append('_')

            # id
            id_list.append(word.id)
            # head
            head_list.append(word.head)

    upos_sentence_list.append(upos_list)
    xpos_sentence_list.append(xpos_list)
    lemma_sentence_list.append(lemma_list)
    dependency_sentence_list.append(dependency_list)
    features_sentence_list.append(features_list)
    id_sentence_list.append(id_list)
    head_sentence_list.append(head_list)

In [None]:
# obtain only the tokens
# Second pass, this time over the original (not lowercased) text: only the
# surface form of every word is kept.
tokens_sentence_list = []

for raw_sentence in corpus_sentences_clean:
    parsed_doc = nlp(raw_sentence)
    # Flatten the words of every parsed sentence of the document into one token list.
    tokens_sentence_list.append(
        [word.text for parsed in parsed_doc.sentences for word in parsed.words]
    )

In [None]:
# final dataset
# Column names in display order, paired with the per-sentence lists built above.
column_order = ['Sentence', 'Tokens', 'Lemma', 'Upos', 'Xpos', 'Dependency', 'Features', 'id', 'Head']
column_values = [
    corpus_sentences_clean,
    tokens_sentence_list,
    lemma_sentence_list,
    upos_sentence_list,
    xpos_sentence_list,
    dependency_sentence_list,
    features_sentence_list,
    id_sentence_list,
    head_sentence_list,
]

dataset = dict(zip(column_order, column_values))

# One row per sentence; every column holds either the raw string or a per-word list.
df = pd.DataFrame(dataset, columns=column_order)

In [None]:
# Display the assembled DataFrame (the notebook renders a truncated head/tail view).
df

Unnamed: 0,Sentence,Tokens,Lemma,Upos,Xpos,Dependency,Features,id,Head
0,Reuven Rivlin has been the President since Jul...,"[Reuven, Rivlin, has, been, the, President, si...","[reuven, rivlin, have, be, the, president, sin...","[PROPN, PROPN, AUX, AUX, DET, NOUN, ADP, PROPN...","[NNP, NNP, VBZ, VBN, DT, NN, IN, NNP, CD, .]","[(president, nsubj), (reuven, flat), (presiden...","[Number=Sing, Number=Sing, Mood=Ind|Number=Sin...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]","[6, 1, 6, 6, 6, 0, 8, 6, 8, 6]"
1,The volcanic soil of the islands proved to be ...,"[The, volcanic, soil, of, the, islands, proved...","[the, volcanic, soil, of, the, island, prove, ...","[DET, ADJ, NOUN, ADP, DET, NOUN, VERB, PART, A...","[DT, JJ, NN, IN, DT, NNS, VBD, TO, VB, JJ, IN,...","[(soil, det), (soil, amod), (proved, nsubj), (...","[Definite=Def|PronType=Art, Degree=Pos, Number...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]","[3, 3, 7, 6, 6, 3, 0, 10, 10, 7, 13, 13, 10, 7]"
2,"After the Sharpeville Massacre, the UN tried t...","[After, the, Sharpeville, Massacre, ,, the, UN...","[after, the, sharpeville, massacre, ,, the, un...","[ADP, DET, ADJ, NOUN, PUNCT, DET, PROPN, VERB,...","[IN, DT, JJ, NN, ,, DT, NNP, VBD, TO, VB, JJ, ...","[(massacre, case), (massacre, det), (massacre,...","[_, Definite=Def|PronType=Art, Degree=Pos, Num...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[4, 4, 4, 8, 8, 7, 8, 0, 10, 8, 12, 10, 14, 10..."
3,The paws have three soft toe pads and retracti...,"[The, paws, have, three, soft, toe, pads, and,...","[the, paw, have, three, soft, toe, pad, and, r...","[DET, NOUN, VERB, NUM, ADJ, NOUN, NOUN, CCONJ,...","[DT, NNS, VBP, CD, JJ, NN, NNS, CC, JJ, NNS, .]","[(paws, det), (have, nsubj), (root, root), (pa...","[Definite=Def|PronType=Art, Number=Plur, Mood=...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]","[2, 3, 0, 7, 7, 7, 3, 10, 10, 7, 3]"
4,The stone is on the ice in front of the foot i...,"[The, stone, is, on, the, ice, in, front, of, ...","[the, stone, be, on, the, ice, in, front, of, ...","[DET, NOUN, AUX, ADP, DET, NOUN, ADP, NOUN, AD...","[DT, NN, VBZ, IN, DT, NN, IN, NN, IN, DT, NN, ...","[(stone, det), (ice, nsubj), (ice, cop), (ice,...","[Definite=Def|PronType=Art, Number=Sing, Mood=...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[2, 6, 6, 6, 6, 0, 8, 6, 11, 11, 8, 14, 14, 8, 6]"
...,...,...,...,...,...,...,...,...,...
216180,"1955 – Ashley Adams, Australian target shooter .","[1955, –, Ashley, Adams, ,, Australian, target...","[1955, -, ashley, adam, ,, australian, target,...","[NUM, PUNCT, PROPN, PROPN, PUNCT, ADJ, NOUN, N...","[CD, :, NNP, NNP, ,, JJ, NN, NN, .]","[(root, root), (1955, punct), (1955, appos), (...","[NumType=Card, _, Number=Sing, Number=Sing, _,...","[1, 2, 3, 4, 5, 6, 7, 8, 9]","[0, 1, 1, 3, 3, 8, 8, 3, 1]"
216181,This is a casual relationship is usually only ...,"[This, is, a, casual, relationship, is, usuall...","[this, be, a, casual, relationship, be, usuall...","[PRON, AUX, DET, ADJ, NOUN, AUX, ADV, ADV, ADP...","[DT, VBZ, DT, JJ, NN, VBZ, RB, RB, IN, NN, CC,...","[(relationship, nsubj), (relationship, cop), (...","[Number=Sing|PronType=Dem, Mood=Ind|Number=Sin...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[5, 5, 5, 5, 10, 10, 10, 10, 10, 0, 15, 15, 15..."
216182,It also cost about $3.9 billion.,"[It, also, cost, about, $, 3.9, billion, .]","[it, also, cost, about, $, 3.9, billion, .]","[PRON, ADV, VERB, ADV, SYM, NUM, NUM, PUNCT]","[PRP, RB, VBD, RB, $, CD, CD, .]","[(cost, nsubj), (cost, advmod), (root, root), ...",[Case=Nom|Gender=Neut|Number=Sing|Person=3|Pro...,"[1, 2, 3, 4, 5, 6, 7, 8]","[3, 3, 0, 5, 3, 7, 5, 3]"
216183,1945 – Adolf Hitler marries his long-time part...,"[1945, –, Adolf, Hitler, marries, his, long, -...","[1945, -, adolf, hitler, marry, he, long, -, t...","[NUM, PUNCT, PROPN, PROPN, VERB, PRON, ADJ, PU...","[CD, ,, NNP, NNP, VBZ, PRP$, JJ, HYPH, NN, NN,...","[(root, root), (1945, punct), (marries, nsubj)...","[NumType=Card, _, Number=Sing, Number=Sing, Mo...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[0, 1, 5, 3, 1, 10, 9, 9, 10, 5, 12, 10, 16, 1..."


In [None]:
# Persist the parsed DataFrame in the same ./data/misc/ directory the unparsed
# sentences were read from.
df.to_pickle("./data/misc/wikipedia_sentences_parsed.pkl")