# Text Generation

### José Pablo Kiesling Lange - 21581

In [1]:
import numpy as np

from datasets import load_dataset

import nltk
from nltk.util import ngrams
from nltk.probability import FreqDist

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the "wikitext-2-raw-v1" configuration of the "wikitext" dataset.
dataset = load_dataset(path="wikitext", name="wikitext-2-raw-v1")

In [3]:
# Fetch NLTK's "punkt" resource. Being the last expression of the cell, the
# call's return value is what the notebook displays.
# NOTE(review): the cells shown here tokenize with str.split(); confirm that a
# later cell actually needs punkt.
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\TheKi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Pull the "text" column out of each split.
dataset_train, dataset_test, dataset_validation = (
    dataset[split]["text"] for split in ("train", "test", "validation")
)

## Text Normalization

In [5]:
def normalize_text(text):
    """Reduce a line of text to lowercase, space-separated ASCII words.

    Steps:
      1. Keep only alphabetic and whitespace characters (digits and
         punctuation are removed, which can glue word fragments together).
      2. Split on whitespace and discard every word that contains a
         non-ASCII character.
      3. Join the surviving words with single spaces and lowercase them.

    Args:
        text: Input string.

    Returns:
        The normalized string; empty if no word survives.
    """
    letters_and_spaces = ''.join(
        ch for ch in text if ch.isalpha() or ch.isspace()
    )
    ascii_words = [
        word for word in letters_and_spaces.split() if word.isascii()
    ]
    return ' '.join(ascii_words).lower()

In [6]:
# Normalize every line of each split, skipping lines that are blank or
# whitespace-only. The split names are rebound, so this cell is meant to run
# once on the raw text.
dataset_train = [normalize_text(line) for line in dataset_train if line.strip()]
dataset_test = [normalize_text(line) for line in dataset_test if line.strip()]
dataset_validation = [normalize_text(line) for line in dataset_validation if line.strip()]

In [7]:
# Discard lines that are now empty strings.
dataset_train = list(filter(None, dataset_train))
dataset_test = list(filter(None, dataset_test))
dataset_validation = list(filter(None, dataset_validation))

In [8]:
# Wrap each line in start/end-of-sequence markers. The split names are
# rebound, so re-running this cell would wrap the lines a second time.
dataset_train = [f"<sos> {line} <eos>" for line in dataset_train]
dataset_test = [f"<sos> {line} <eos>" for line in dataset_test]
dataset_validation = [f"<sos> {line} <eos>" for line in dataset_validation]

In [9]:
# Whitespace-tokenize each line into a list of tokens.
sequences_train = list(map(str.split, dataset_train))
sequences_test = list(map(str.split, dataset_test))
sequences_validation = list(map(str.split, dataset_validation))

In [10]:
# Flatten the per-line token lists into one stream of training tokens.
train_tokens = [
    tok
    for seq in sequences_train
    for tok in seq
]

## Feed Forward Neural Network

In [11]:
# Special tokens. They are placed first in the vocabulary, so their position
# in this list is also their id.
SPECIALS = [
    "<pad>",
    "<unk>",  # fallback for tokens missing from the vocabulary
    "<sos>",  # start-of-sequence marker
    "<eos>",  # end-of-sequence marker
]

# Number of context tokens that precede each prediction target.
K = 5

In [12]:
# Token frequencies over the training split; train_tokens is the same
# flattened token stream built from sequences_train above.
fd = FreqDist(train_tokens)

In [13]:
# id -> token: special tokens first, then every training token from most to
# least frequent. No frequency cutoff is applied, so each token seen in
# training gets its own id.
itos = [
    *SPECIALS,
    *(token for token, _count in fd.most_common() if token not in SPECIALS),
]
# token -> id: the inverse mapping of itos.
stoi = dict(zip(itos, range(len(itos))))

In [14]:
def to_id(tok):
    """Return the vocabulary id of ``tok``.

    Tokens that are not keys of ``stoi`` map to the id of ``"<unk>"``.
    """
    unk_id = stoi["<unk>"]
    return stoi.get(tok, unk_id)

In [15]:
def to_token(id):
    """Return the token stored at position ``id`` of ``itos``.

    Ids outside ``[0, len(itos))`` -- including negative ones, which would
    otherwise index from the end of the list -- yield ``"<unk>"``.
    """
    if not (0 <= id < len(itos)):
        return "<unk>"
    return itos[id]

In [16]:
def build_xy_from_sequences(seqs, k=5):
    """Build (context, target) id pairs with a sliding window.

    Every sequence is mapped to ids once via ``to_id``; each run of ``k + 1``
    consecutive ids then yields one example whose first ``k`` ids are the
    context and whose last id is the target. Sequences with fewer than
    ``k + 1`` tokens contribute no examples (no padding is applied).

    Args:
        seqs: Iterable of token sequences (each an iterable of tokens).
        k: Number of context tokens per example; must be >= 0.

    Returns:
        Tuple ``(X, y)`` of int64 arrays with shapes ``(n, k)`` and ``(n,)``,
        where ``n`` is the total number of windows. ``X`` keeps its second
        dimension even when ``n == 0``.

    Raises:
        ValueError: If ``k`` is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    X, y = [], []
    for seq in seqs:
        # Look every token up once rather than once per window it falls in.
        ids = [to_id(tok) for tok in seq]
        # range() is empty when the sequence is shorter than k + 1.
        for start in range(len(ids) - k):
            X.append(ids[start:start + k])
            y.append(ids[start + k])

    # np.array([]) is 1-D, so force the (n, k) shape for the empty case too.
    X_arr = np.array(X, dtype=np.int64).reshape(len(y), k)
    y_arr = np.array(y, dtype=np.int64)
    return X_arr, y_arr

In [17]:
# Turn each split into fixed-size (context, target) id arrays.
X_train, y_train = build_xy_from_sequences(sequences_train, k=K)
X_val, y_val = build_xy_from_sequences(sequences_validation, k=K)
X_test, y_test = build_xy_from_sequences(sequences_test, k=K)

# Report the vocabulary size and the array shapes of every split.
print(
    f"Vocab size: {len(stoi)}",
    f"Train: X={X_train.shape}, y={y_train.shape}",
    f"Val:   X={X_val.shape},   y={y_val.shape}",
    f"Test:  X={X_test.shape},  y={y_test.shape}",
    sep="\n",
)

Vocab size: 61031
Train: X=(1621112, 5), y=(1621112,)
Val:   X=(169743, 5),   y=(169743,)
Test:  X=(190380, 5),  y=(190380,)
