# Grewpy tutorial: Modify data

https://grew.fr/grewpy/modify_data/

In [None]:
import os

import grewpy
from grewpy import Corpus, CorpusDraft, Request

# Put the opam switch's bin directory in front of PATH (presumably so that the
# Grew backend installed through opam can be found -- confirm).
os.environ["PATH"] = f'{os.environ["HOME"]}/.opam/5.2.0/bin:{os.environ["PATH"]}'


# Select the grewpy configuration.
grewpy.set_config("ud")  # ud or basic

# Treebank loaded for the rest of the notebook; the commented-out line is an
# alternative input file.
conll_file = "../UD_output.conllu"
# conll_file = "../spoken_norwegian_resources/treebanks/Norwegian-NynorskLIA/fana_uib_03.conll"

corpus = Corpus(conll_file)

## Access data in a corpus

In [4]:
# Index the corpus by position to get one sentence; the recorded output below
# shows that each item is a `grewpy.graph.Graph`.
sentence = corpus[1]
print("A corpus is a set of graphs:", type(sentence))

A corpus is a set of graphs: <class 'grewpy.graph.Graph'>


In [5]:
# Each graph is one sentence and carries all of its information.
# `meta` holds the sentence-level metadata (here: text, sent_id and _filename).
print("Sentence metadata:")
# Bare last expression so the notebook displays the metadata dictionary.
sentence.meta

Sentence metadata:


{'text': 'du aal_uio_0601 # som er fødd i attennittisju # og var stølskjerring i førtifem år # kan nok fortelje oss om det ?',
 'sent_id': '2',
 '_filename': 'UD_output.conllu'}

In [6]:
# `order` lists the node identifiers in sentence order; in this corpus they
# coincide with the token ids (the list starts with '0').
print(sentence.order)

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23']


In [7]:
# `features` maps every node id to the feature dictionary of that token.
print(sentence.features)

# Example: collect the upos of every token, leaving out node "0", which only
# carries a `form` feature.
print([feats["upos"] for node_id, feats in sentence.features.items() if node_id != "0"])

{'0': {'form': '__0__'}, '1': {'Animacy': 'Hum', 'Case': 'Nom', 'Person': '2', 'PronType': 'Prs', '__RAW_MISC__': 'hov', 'form': 'du', 'lemma': 'du', 'textform': 'du', 'upos': 'PRON', 'wordform': 'du'}, '2': {'__RAW_MISC__': 'hov', 'form': 'aal_uio_0601', 'lemma': 'aal_uio_0601', 'textform': 'aal_uio_0601', 'upos': 'PROPN', 'wordform': 'aal_uio_0601'}, '3': {'__RAW_MISC__': 'led', 'form': '#', 'lemma': '#', 'textform': '#', 'upos': 'pause', 'wordform': '#'}, '4': {'__RAW_MISC__': 'led', 'form': 'som', 'lemma': 'som', 'textform': 'som', 'upos': 'SCONJ', 'wordform': 'som'}, '5': {'Mood': 'Ind', 'Tense': 'Pres', 'VerbForm': 'Fin', '__RAW_MISC__': 'led', 'form': 'er', 'lemma': 'vere', 'textform': 'er', 'upos': 'AUX', 'wordform': 'er'}, '6': {'VerbForm': 'Part', '__RAW_MISC__': 'led', 'form': 'fødd', 'lemma': 'føde', 'textform': 'fødd', 'upos': 'VERB', 'wordform': 'fødd'}, '7': {'__RAW_MISC__': 'led', 'form': 'i', 'lemma': 'i', 'textform': 'i', 'upos': 'ADP', 'wordform': 'i'}, '8': {'NumTyp

In [8]:
# `sucs` gives the outgoing edges of each node: it maps a node id to a list of
# (successor id, edge label) pairs.
print(sentence.sucs)

{'21': [('22', FsEdge({'1': 'PUTFYLL'}))], '19': [('21', FsEdge({'1': 'ADV'})), ('20', FsEdge({'1': 'DOBJ'})), ('18', FsEdge({'1': 'ADV'})), ('2', FsEdge({'1': 'SUBJ'}))], '17': [('23', FsEdge({'1': 'punct'})), ('19', FsEdge({'1': 'INFV'})), ('16', FsEdge({'1': 'IK'}))], '15': [('14', FsEdge({'1': 'ATR'}))], '13': [('15', FsEdge({'1': 'PUTFYLL'}))], '11': [('13', FsEdge({'1': 'ADV'})), ('12', FsEdge({'1': 'SPRED'})), ('10', FsEdge({'1': 'KONJ'}))], '10': [('9', FsEdge({'1': 'IK'}))], '7': [('8', FsEdge({'1': 'ADV'}))], '6': [('11', FsEdge({'1': 'KOORD'})), ('7', FsEdge({'1': 'ADV'}))], '5': [('6', FsEdge({'1': 'INFV'})), ('4', FsEdge({'1': 'SBU'}))], '4': [('3', FsEdge({'1': 'IK'}))], '2': [('5', FsEdge({'1': 'ATR'})), ('1', FsEdge({'1': 'DET'}))], '0': [('17', FsEdge({'1': 'root'}))]}


## Modifying a corpus
`Corpus` is an abstract object which cannot be modified directly:

In [None]:
# Item assignment on a `Corpus` is expected to fail; print the TypeError's
# message instead of letting the cell abort.
try:
    corpus[0] = corpus[1]
except TypeError as exc:
    print(exc)

`CorpusDraft` is an object similar to `Corpus` but which is mutable.
Below, we add the feature `Transitive=Yes` to all occurrences of verbs with a direct object.

1. We make the search on `corpus` (an instance of `Corpus`).
2. The modification is done on a `CorpusDraft` counterpart named `draft`.
3. The `draft` should be transformed again into a `Corpus` (named `corpus2` below) in order to use the `count` method.

In [12]:
# step 1: search the immutable `corpus` for verbs X that govern a nominal Y
# (NOUN, PROPN or PRON) through an `obj` edge.
# NOTE(review): the recorded result is empty; the edges printed earlier carry
# labels such as DOBJ and SUBJ, so this corpus may not use the UD `obj` label
# at this point -- confirm.
req7 = Request().pattern("X[upos=VERB]; Y[upos=NOUN|PROPN|PRON]; X-[obj]->Y")
occurrences = corpus.search(req7)

# Bare last expression so the notebook displays the list of matches.
occurrences

[]

In [None]:
# step 2: build the mutable counterpart of the corpus and flag each matched verb.
draft = CorpusDraft(corpus)
for occ in occurrences:
    # Every match names its sentence and maps the pattern node X to a node id.
    sent_id, verb_node_id = occ["sent_id"], occ["matching"]["nodes"]["X"]
    draft[sent_id][verb_node_id].update({"Transitive": "Yes"})

In [None]:
# step 3: turn the draft back into a `Corpus`, which provides `count`, and
# count the nodes that now carry Transitive=Yes.
corpus2 = Corpus(draft)
corpus2.count(Request("pattern { X[Transitive=Yes] }"))

It's possible to modify a whole `CorpusDraft` with a function getting a graph as input.

In [None]:
def relabel_noun(graph):
    """Rewrite the ``upos`` value ``NOUN`` to ``N`` on every node of *graph*.

    The graph is modified in place and the same object is returned.
    Nodes without a ``upos`` feature, or with any other ``upos`` value,
    are left untouched.
    """
    for node in graph:
        features = graph[node]
        # Guard clause: skip everything that is not tagged NOUN.
        if "upos" not in features or features["upos"] != "NOUN":
            continue
        features["upos"] = "N"
    return graph


# Run `relabel_noun` over the draft; the result is kept as `draft3`.
draft3 = draft.map(relabel_noun)
# Note: `map` has replaced the `apply` function, which is deprecated in 0.6.


# Again, the result must be turned into a `Corpus` before `count` can be used.
corpus3 = Corpus(draft3)
corpus3.count(Request("pattern { X[upos=N] }"))

## Modifying a corpus using a GRS (Graph Rewriting System)
In many cases, it is not required to use a `CorpusDraft` and the modification of a corpus can be encoded with graph rewriting rules.

The example above (identifying transitive verbs) can be rephrased as below.
See TODO link for an explanation of the `without` clause in this example.

In [None]:
from grewpy import GRS

# GRS source text: strategy `main` is `Onf(tv)`; rule `tv` sets Transitive=Yes
# on a verb X that has an `obj` dependent Y (NOUN, PROPN or PRON), and its
# `without` clause excludes verbs that already carry that feature.
s = """
strat main { Onf(tv) }

rule tv {
  pattern { X[upos=VERB]; Y[upos=NOUN|PROPN|PRON]; X-[obj]->Y }
  without { X[Transitive = Yes] }
  commands { X.Transitive = Yes }
}
"""
# Build the GRS from the text and apply it; the value returned by `apply` is
# kept as `corpus2bis`, on which the flagged nodes are counted.
grs = GRS(s)
corpus2bis = grs.apply(corpus)
corpus2bis.count(Request("pattern { X[Transitive=Yes] }"))

The other example, where the upos tag `NOUN` is changed to `N`, can also be done with a GRS:

In [None]:
from grewpy import GRS

# GRS with a single rule `noun2n`, which rewrites upos NOUN to N; strategy
# `main` is `Onf(noun2n)`.
grs3 = GRS("""
strat main { Onf(noun2n) }

rule noun2n {
  pattern { X[upos=NOUN] }
  commands { X.upos = N }
}
""")
# Apply to the original corpus and count the relabelled nodes in the result.
corpus3bis = grs3.apply(corpus)
corpus3bis.count(Request("pattern { X[upos=N] }"))

Similarly to the `CorpusDraft` above, there is a module `GRSDraft` which can be inspected and which is mutable.

In [None]:
from grewpy import GRSDraft

# GRS source text: package `cxns` holds one rule, `existential`, which sets
# Cxn=Existential on a node X with lemma `vere` and a `cop` edge to Y, unless
# X already carries that feature.
s = """
strat main {Onf(cxns)}
package cxns {
    rule existential {
        pattern {X-[cop]->Y; X[lemma=vere]}
        without {X[Cxn=Existential]}
        commands {X.Cxn=Existential}
    }
}
"""

# Build the inspectable, mutable draft from the GRS text.
grs_draft = GRSDraft(s)

# Alternative inspection of the rules of the package, currently disabled:
# for rule in grs_draft["cxns"].rules():
#    print(f"{rule=}")


# Print every (key, value) pair of the draft, each followed by a "_" separator line.
for k, v in grs_draft.items():
    print(k, v)
    print("_")

A `GRSDraft` cannot be applied to a corpus; it should be turned into a `GRS`:

In [None]:
# Only a `GRS` can be applied to a corpus, so build one from the draft.
grs = GRS(grs_draft)
# A `Corpus` cannot be modified in place: the rewritten corpus is the value
# returned by `apply`. Count on that result, not on the untouched `corpus`
# (whose count would not reflect the rule at all).
corpus_cxn = corpus.apply(grs)
n_existentials = corpus_cxn.count(Request("pattern { X[Cxn=Existential] }"))
print(f"{n_existentials=}")

In [None]:
from pathlib import Path

from grewpy import GRS, Request

# apposition_nb = Path("../rules/appositions_nb.grs").read_text()
# apposition_nn = "../rules/appositions_nn.grs"

# File with the NDT -> UD conversion rules.
ndt2ud = "../rules/NDT_to_UD.grs"

grs = GRS(ndt2ud)

# A `Corpus` cannot be modified in place: the converted corpus is the value
# returned by `apply`. Rebind `corpus` to it so that the output file written
# below (and any later cell using `corpus`) sees the converted data rather
# than the original input.
corpus = corpus.apply(grs, strat="main_nn")


# Write the corpus as CoNLL-U text. The encoding is explicit because the data
# contains non-ASCII characters and the platform default is not always UTF-8.
with Path("grewpy_output.conllu").open("w", encoding="utf-8") as fp:
    fp.write(corpus.to_conll())

In [None]:
# Number of graphs (sentences) currently held by `corpus`.
len(corpus)

In [None]:
# Write `corpus` as CoNLL-U text to grewpy_output.conllu. The encoding is given
# explicitly because the data contains non-ASCII characters (e.g. "ø") and the
# platform default encoding is not guaranteed to be UTF-8.
with Path("grewpy_output.conllu").open("w", encoding="utf-8") as fp:
    fp.write(corpus.to_conll())