In [23]:
!pip install stanza



In [1]:
import pandas as pd
import stanza
import json
from collections import Counter
from pathlib import Path


# Spanish pipeline restricted to tokenization and POS tagging. The GPU is
# requested here and is used only if one is available.
nlp = stanza.Pipeline(
    lang='es',
    processors='tokenize,pos',
    use_gpu=True,
)

# Fixed length of every encoded POS-id sequence.
MAX_LEN = 200

2025-05-05 00:51:07 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-05-05 00:51:07 INFO: Downloaded file to /home/kmistica/stanza_resources/resources.json
2025-05-05 00:51:08 INFO: Loading these models for language: es (Spanish):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

2025-05-05 00:51:08 INFO: Using device: cpu
2025-05-05 00:51:08 INFO: Loading: tokenize
2025-05-05 00:51:10 INFO: Loading: mwt
2025-05-05 00:51:10 INFO: Loading: pos
2025-05-05 00:51:13 INFO: Done loading processors!


In [2]:
def pos_tokenize(text):
    """Return one UPOS tag per token of ``text``, in document order.

    The text is processed by the module-level ``nlp`` pipeline. Only the first
    word of each token contributes a tag, so a token that consists of several
    words still yields a single entry.
    """
    upos_tags = []
    for sentence in nlp(text).sentences:
        for token in sentence.tokens:
            upos_tags.append(token.words[0].upos)
    return upos_tags

# Load a previously saved POS vocabulary when one exists. On a fresh run the
# file has not been written yet (the cell that builds it comes later in this
# notebook), so start from the two reserved special tokens instead of failing.
vocab_path = Path('pos_vocab.json')
if vocab_path.exists():
    with vocab_path.open('r', encoding='utf-8') as f:
        pos_vocab = json.load(f)
else:
    pos_vocab = {'<PAD>': 0, '<UNK>': 1}
    print(f'{vocab_path} not found; build the POS vocabulary before encoding any split.')


def tags_to_ids(tags, vocab=None, max_len=None):
    """Encode a sequence of POS tags as a fixed-length list of integer ids.

    Args:
        tags: Iterable of POS tag strings.
        vocab: Mapping from tag to id; must contain the '<UNK>' and '<PAD>'
            keys. Defaults to the module-level ``pos_vocab``.
        max_len: Length of the returned list. Defaults to the module-level
            ``MAX_LEN``.

    Returns:
        A list of exactly ``max_len`` ids. Tags missing from ``vocab`` map to
        the '<UNK>' id, longer inputs are truncated and shorter inputs are
        right-padded with the '<PAD>' id.
    """
    # Resolve the defaults at call time so that a vocabulary rebuilt in a
    # later cell is the one actually used.
    if vocab is None:
        vocab = pos_vocab
    if max_len is None:
        max_len = MAX_LEN

    unk_id = vocab['<UNK>']
    ids = [vocab.get(tag, unk_id) for tag in tags][:max_len]
    ids.extend([vocab['<PAD>']] * (max_len - len(ids)))

    # The original version fell off the end here and returned None.
    return ids


In [None]:
# Tag every training text; the tag frequencies determine the vocabulary order.
train_df = pd.read_csv('pos_train.tsv', sep='\t')
train_df['pos_tags'] = train_df['text'].map(pos_tokenize)

counter = Counter()
for tags in train_df['pos_tags']:
    counter.update(tags)

# Ids 0 and 1 are reserved for padding and unknown tags; every observed tag is
# numbered from 2 upwards in descending order of frequency.
pos_vocab = {'<PAD>': 0, '<UNK>': 1}
pos_vocab.update(
    (tag, idx) for idx, (tag, _) in enumerate(counter.most_common(), start=2)
)

# Write the mapping to disk as indented JSON.
file = Path('pos_vocab.json')
file.write_text(
    json.dumps(pos_vocab, ensure_ascii=False, indent=2),
    encoding='utf-8',
)

In [None]:
# Encode every data split: tag the text, convert the tags to ids, then keep
# only the id column alongside the remaining original columns.
for split in ['train', 'val', 'test']:
    src_path = Path(f'{split}.tsv')
    dst_path = Path(f'{split}_syntactic.tsv')

    df = (
        pd.read_csv(src_path, sep='\t')
        .assign(pos_tags=lambda frame: frame['text'].map(pos_tokenize))
        .assign(pos_ids=lambda frame: frame['pos_tags'].map(tags_to_ids))
        # The raw text and the intermediate tag column are not written out.
        .drop(columns=['text', 'pos_tags'])
    )

    df.to_csv(dst_path, sep='\t', index=False)