In [1]:
#Get vocabulary for lexical dataset
!pip install stanza 

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import stanza
import pandas as pd
import re
from collections import Counter
import json
from pathlib import Path

# Shared Spanish Stanza pipeline used by tokenize(). Only the `tokenize`
# processor is requested; the load log of this cell shows Stanza loading the
# `mwt` processor alongside it. Forced onto the CPU via use_gpu=False.
nlp = stanza.Pipeline(lang='es', processors='tokenize', use_gpu=False) # switch use_gpu once a GPU is available

2025-05-05 00:27:02 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-05-05 00:27:02 INFO: Downloaded file to /home/kmistica/stanza_resources/resources.json
2025-05-05 00:27:02 INFO: Loading these models for language: es (Spanish):
| Processor | Package  |
------------------------
| tokenize  | combined |
| mwt       | combined |

2025-05-05 00:27:02 INFO: Using device: cpu
2025-05-05 00:27:02 INFO: Loading: tokenize
2025-05-05 00:27:04 INFO: Loading: mwt
2025-05-05 00:27:04 INFO: Done loading processors!


In [3]:
#Define preprocess & tokenize functions
def preprocess(text):
    """Normalize a raw text string before tokenization.

    Steps, in order:
      1. lower-case the text;
      2. delete every character that is neither a word character (``\\w``)
         nor whitespace, i.e. punctuation and symbols;
      3. collapse each run of whitespace to a single space and trim both ends.

    Punctuation is removed *before* whitespace is normalized. Doing it the
    other way round leaves doubled or leading/trailing spaces wherever a
    punctuation mark stood between spaces (e.g. ``"hola , mundo"``).

    Args:
        text: the string to normalize.

    Returns:
        The normalized string (possibly empty).
    """
    text = text.lower()
    # Drop punctuation first so the gaps it leaves are collapsed below.
    text = re.sub(r'[^\w\s]', '', text)
    return re.sub(r'\s+', ' ', text).strip()

def tokenize(text):
    """Tokenize text with the shared Stanza pipeline.

    Runs ``nlp`` on ``text`` and returns the ``.text`` of every token of every
    sentence, in document order, as one flat list.
    """
    surface_forms = []
    for sentence in nlp(text).sentences:
        surface_forms.extend(token.text for token in sentence.tokens)
    return surface_forms


In [None]:
#create vocabulary
# This cell *produces* vocab.json, so it must not read the file first: on a
# fresh run the file does not exist yet, and any loaded value would be
# overwritten below anyway.
train_df = pd.read_csv('balanced_by_region.tsv', sep='\t')
train_df['clean']  = train_df['text'].map(preprocess)
train_df['tokens'] = train_df['clean'].map(tokenize)

# Frequency of every token across the whole training frame.
counter = Counter(w for toks in train_df['tokens'] for w in toks)

# Ids 0 and 1 are reserved for the special tokens; corpus words follow from
# id 2 in descending frequency order.
vocab = {'<PAD>': 0, '<UNK>': 1}
idx = 2
# most_common() yields (word, count) pairs. Only the word may be used as the
# key: a tuple key is not a valid JSON object key and makes json.dump raise
# TypeError.
for word, _count in counter.most_common():
    vocab[word] = idx
    idx += 1

with open('vocab.json', 'w', encoding='utf-8') as f:
    json.dump(vocab, f, ensure_ascii=False, indent=2)



In [5]:
# Reload the saved vocabulary and fix the sequence settings:
# every sample is represented by exactly 200 token ids.
vocab = json.loads(Path('vocab.json').read_text(encoding='utf-8'))

max_len = 200
PAD, UNK = vocab['<PAD>'], vocab['<UNK>']

def to_indices(tokens, vocab_map=None, length=None, pad_id=None, unk_id=None):
    """Convert a token sequence into a fixed-length list of vocabulary ids.

    Each token is replaced by its id in the vocabulary, or by the unknown-token
    id when it is not in the vocabulary. The result is truncated to ``length``
    ids, or right-padded with the padding id up to ``length``.

    Args:
        tokens: iterable of token strings.
        vocab_map: token -> id mapping; defaults to the notebook-level ``vocab``.
        length: output length; defaults to the notebook-level ``max_len``.
        pad_id: id used for padding; defaults to the notebook-level ``PAD``.
        unk_id: id used for out-of-vocabulary tokens; defaults to the
            notebook-level ``UNK``.

    Returns:
        A list of exactly ``length`` ids.
    """
    # Fall back to the notebook-level settings when no override is supplied,
    # so existing one-argument calls such as Series.map(to_indices) still work.
    if vocab_map is None:
        vocab_map = vocab
    if length is None:
        length = max_len
    if pad_id is None:
        pad_id = PAD
    if unk_id is None:
        unk_id = UNK

    seq = [vocab_map.get(word, unk_id) for word in tokens][:length]
    seq.extend([pad_id] * (length - len(seq)))
    # The original fell off the end without returning, so every call produced
    # None instead of the id list.
    return seq

In [None]:
# Run every split through the same pipeline (clean -> tokens -> ids) and save
# the enriched frame as <split>_processed.tsv.
for split in ('train', 'val', 'test'):
    # assign() evaluates its keyword arguments in order, so each new column
    # can be derived from the one created just before it.
    df = pd.read_csv(f'{split}.tsv', sep='\t').assign(
        clean=lambda frame: frame['text'].map(preprocess),
        tokens=lambda frame: frame['clean'].map(tokenize),
        input_ids=lambda frame: frame['tokens'].map(to_indices),
    )
    out = Path(f'{split}_processed.tsv')
    df.to_csv(out, sep='\t', index=False)