In [None]:
# pip install tokenizers datasets

In [1]:
from tokenizers import Tokenizer
from models import Model_Adam
from layers import Embedding,PositionalEncoding,TransformerBlock,Dense,LayerNorm,softmax
import numpy as np
from datasets import load_dataset
from itertools import islice

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def create_causal_mask(batch_size, seq_len):
    """Build a lower-triangular attention mask for autoregressive decoding.

    Returns a float64 array of shape (batch_size, seq_len, seq_len) whose
    entry [b, i, j] is 1.0 when j <= i and 0.0 otherwise. Every batch
    element receives the same triangle.
    """
    # One (seq_len, seq_len) triangle, then an independent copy per batch item.
    triangle = np.tri(seq_len)
    return np.tile(triangle, (batch_size, 1, 1))


class MyGPTModel(Model_Adam):
    """Decoder-only (GPT-style) language model assembled from the project layers.

    Forward pipeline: token embedding -> positional encoding -> a stack of
    transformer blocks run under a causal mask -> final layer norm ->
    vocabulary-sized Dense head with softmax activation.
    """

    def __init__(self, vocab_size=10000, max_len=256, embedding_dim=768,
                 n_heads=12, n_layers=12):
        super().__init__()

        # Input side: token lookup, then positional information.
        self.token_embedding = Embedding(vocab_size, embedding_dim)
        self.pos_encoding = PositionalEncoding(max_len, embedding_dim)

        # The feed-forward width of every block is four times the model width.
        ff_width = 4 * embedding_dim
        blocks = []
        for _ in range(n_layers):
            blocks.append(TransformerBlock(embedding_dim, n_heads, ff_width))
        self.transformer_blocks = blocks

        # Output side: final normalisation followed by the vocabulary head.
        self.ln_f = LayerNorm(embedding_dim)
        self.lm_head = Dense(vocab_size, activation='softmax')

    def call(self, input_ids, training=False):
        """Run the model on a 2-D array of token ids.

        Args:
            input_ids: array whose shape unpacks to (batch_size, seq_len).
            training: when True the head is applied to every position;
                otherwise only to the last position.

        Returns:
            The output of ``lm_head``. In inference mode the input to the head
            is ``x[:, -1:, :]``, so the sequence axis is kept with length 1
            (presumably giving (batch, 1, vocab_size) — TODO confirm against
            Dense). NOTE(review): the head is built with activation='softmax',
            so these values are presumably probabilities rather than raw
            logits — confirm.
        """
        n_batch, n_tokens = input_ids.shape

        hidden = self.pos_encoding(self.token_embedding(input_ids),
                                   training=training)

        # The same lower-triangular mask is handed to every block.
        mask = create_causal_mask(n_batch, n_tokens)
        for block in self.transformer_blocks:
            hidden = block.forward(hidden, mask=mask)

        hidden = self.ln_f(hidden)

        if not training:
            # Inference only needs the newest position; the slice keeps the axis.
            hidden = hidden[:, -1:, :]
        return self.lm_head(hidden)

    def _all_layer(self):
        """Return the layers in forward order: embedding, blocks, norm, head.

        ``pos_encoding`` is not part of the returned list.
        """
        return [
            self.token_embedding,
            *self.transformer_blocks,
            self.ln_f,
            self.lm_head,
        ]


In [3]:
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
tokenizer = Tokenizer.from_file(r"D:\Workspace\Python\Project\NeuralNetwork\dataset\tokenizer\tokenizer_vi.json")

# Id used to right-pad short sequences in split_token_batch.
pad_token = '<|pad|>'
pad_id = tokenizer.token_to_id(pad_token)
# token_to_id returns None for a token missing from the vocabulary. Fail here
# with a clear message instead of later, when None would be packed into the
# padding and break the conversion to an int32 array.
if pad_id is None:
    raise ValueError(f"Pad token {pad_token!r} is not in the tokenizer vocabulary")

def split_token_batch(texts, maxlen):
    """Encode texts and cut each into an (input, target) pair of token ids.

    Uses the module-level ``tokenizer`` for encoding and ``pad_id`` for
    right-padding. Each sequence is padded up to ``maxlen + 1`` ids when it is
    shorter than that; the input is then the first ``maxlen`` ids and the
    target is the same window shifted one position to the right.

    Returns:
        (x, y): two lists of id lists, one entry per text.
    """
    window = maxlen + 1
    inputs, targets = [], []

    for enc in tokenizer.encode_batch(texts):
        ids = enc.ids
        missing = window - len(ids)
        if missing > 0:
            # Right-pad so that both slices below come out at full length.
            ids = ids + [pad_id] * missing
        inputs.append(ids[:maxlen])
        targets.append(ids[1:window])

    return inputs, targets
def data_generator(texts, maxlen=32):
    """Tokenise ``texts`` and return (inputs, targets) as int32 NumPy arrays.

    The id lists come from ``split_token_batch(texts, maxlen)``.
    """
    pairs = split_token_batch(texts, maxlen)
    inputs, targets = (np.array(part, dtype=np.int32) for part in pairs)
    return inputs, targets


In [None]:
class TextGenerator:
    """Autoregressive text sampler using top-k sampling with a temperature.

    Args:
        max_tokens: number of new tokens appended to the prompt by generate().
        tokenizer: object providing ``encode(text).ids`` and
            ``decode(ids, skip_special_tokens=...)``.
        model: callable taking an int32 array of shape (1, seq_len) and
            returning per-position scores over the vocabulary, indexable as
            ``scores[0, position]``.
        top_k: number of highest-scoring candidates to sample from.
        temperature: divisor applied to the scores before the softmax.
    """
    def __init__(self, max_tokens, tokenizer, model,top_k=1, temperature=1.0):
        self.tokenizer = tokenizer
        self.max_tokens = max_tokens
        self.k = top_k
        self.temperature = temperature
        self.model = model

    def softmax(self, logits):
        """Temperature-scaled softmax, delegating to the module-level softmax."""
        logits = np.asarray(logits)
        return softmax(logits / self.temperature)

    def sample_from(self, logits):
        """Sample a token id from the top-k entries of a 1D score vector.

        Args:
            logits: 1D array-like of scores, one per vocabulary entry.
                NOTE(review): if the model head already applies a softmax,
                these are probabilities and the second softmax below flattens
                the distribution — confirm against the model in use.

        Returns:
            The sampled vocabulary index as a Python int.

        Raises:
            ValueError: if top_k is smaller than 1 or ``logits`` is empty.
        """
        logits = np.asarray(logits).astype(np.float32)

        # Clamp so that top_k >= vocabulary size means "sample from everything"
        # instead of making argpartition fail with an out-of-bounds kth.
        k = min(int(self.k), logits.size)
        if k < 1:
            raise ValueError(
                f"top_k must be >= 1 and logits non-empty (top_k={self.k}, "
                f"n_logits={logits.size})"
            )

        # kth = k - 1 is always a valid index; after partitioning, the first k
        # entries are the indices of the k largest scores (in no particular order).
        top_k_indices = np.argpartition(-logits, k - 1)[:k]
        top_k_logits = logits[top_k_indices]
        probs = np.asarray(self.softmax(top_k_logits))

        sampled_pos = np.random.choice(len(top_k_indices), size=1, p=probs)[0]
        return int(top_k_indices[sampled_pos])

    def generate(self, prompt):
        """Extend ``prompt`` by ``max_tokens`` sampled tokens and decode the result.

        The whole running sequence is fed to the model at every step.
        NOTE(review): nothing truncates it to the model's maximum length, and
        generation does not stop at an end-of-answer token — confirm this is
        acceptable for the model in use.

        Returns:
            The decoded text (prompt plus generated tokens), special tokens kept.
        """
        token_ids = list(self.tokenizer.encode(prompt).ids)

        for _ in range(self.max_tokens):
            input_array = np.array([token_ids], dtype=np.int32)  # (1, seq_len)
            scores = self.model(input_array)

            # Always read the LAST position: this is correct both when the model
            # returns only the newest position (sequence axis of length 1) and
            # when it returns scores for every position of the input.
            next_id = self.sample_from(scores[0, -1])

            token_ids.append(next_id)

        return self.tokenizer.decode(token_ids, skip_special_tokens=False)
        


In [16]:
# Hyperparameters; all smaller than MyGPTModel's defaults.
MAXLEN = 32
EMBEDDING_DIM = 128
N_LAYERS = 1
VOCAB_SIZE=10000  # NOTE(review): presumably has to cover every id the loaded tokenizer can emit — confirm

model = MyGPTModel(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,      # model width; each block's feed-forward width is 4x this
    max_len=MAXLEN,            # length handed to PositionalEncoding
    n_heads=4,                 # forwarded to every TransformerBlock
    n_layers=N_LAYERS,
)
# Sampler bound to the model above. With top_k=1 only the single best-scoring
# token is ever a candidate, so sampling is effectively greedy.
generator = TextGenerator(
    max_tokens=MAXLEN,
    tokenizer=tokenizer,
    model=model,
    top_k=1,
    temperature=0.9,
)

# No model.fit call precedes this cell in the notebook, so this samples from
# the model as initialised.
q = '<|question|>he is walking<|end_question|><|answer|>'
text = generator.generate(q)
# The appended "|" marks where the generated text ends.
print(text+"|")


<|question|>he is walking<|end_question|><|answer|> ngán ngán ngán tẻtaitai cậu ngôi Mông Mông Mông Mông rơi showbiz showbiz showbiztai ngán ngán rơi� rơi�� Hồ Hồ họa họa họa họa Quyên CSVN|


In [None]:
# dataset = load_dataset("justinphan3110/vi_pubmed", split="vi", streaming=True)

# texts = list(islice(dataset, 64*300))

# texts = [str(t['text']) for t in texts]

In [24]:
# Build question/answer training strings from a tab-separated parallel corpus.
# `lines` keeps the non-empty source lines, `texts` the formatted examples.
lines = []
texts = []
with open(r'dataset\translation\spa.txt',encoding='utf8') as f:
    # Iterating the file (instead of read().split("\n")[:-1]) keeps the final
    # pair even when the file does not end with a newline.
    for line_no, raw in enumerate(f, start=1):
        line = raw.rstrip("\n")
        if not line:
            # Blank lines carry no pair; skip them rather than crash on unpacking.
            continue
        fields = line.split("\t")
        if len(fields) < 2:
            raise ValueError(
                f"line {line_no}: expected at least 2 tab-separated fields, "
                f"got {len(fields)}"
            )
        # Only the first two columns are used; any extra columns are ignored.
        eng, spa = fields[:2]
        lines.append(line)
        texts.append(f'<|question|>{eng}<|end_question|><|answer|>{spa}<|end_answer|>')
    

In [22]:
# Display one formatted example to check the question/answer template.
texts[2]

'<|question|>Go.<|end_question|><|answer|>Vaya.<|end_answer|>'

In [17]:
# Tokenise every example into int32 input/target arrays; the targets are the
# inputs shifted by one token (see split_token_batch).
X,y = data_generator(texts, maxlen=MAXLEN)
# Train on the whole corpus in one fit call.
model.fit(X, y, epochs=2, learning_rate=1e-3, batch_size=8, verbose= True, to_one_hot= True)
# Sample again with the same prompt used before training, for comparison.
q = '<|question|>he is walking<|end_question|><|answer|>'
text = generator.generate(q)
# The appended "|" marks where the generated text ends.
print(text+"|")



Epoch 1/2


Epoch 1:  21%|██        | 3079/14871 [14:30<55:32,  3.54batch/s, loss=2.34]  


KeyboardInterrupt: 

In [23]:
# Try a shorter question using the same question/answer template.
q = '<|question|>he<|end_question|><|answer|>'
text = generator.generate(q)
# The appended "|" marks where the generated text ends.
print(text+"|")

<|question|>he<|end_question|><|answer|>He som work.<|end_question|><|answer|>Es es una es.<|end_answer|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|>|


In [None]:

# Incremental training: feed the corpus to the model in chunks of `batch_size`
# texts and print a generated sample after each chunk.
batch_size = 1000  # texts per chunk; distinct from the batch_size=8 passed to fit

for i in range(0, len(texts), batch_size):
    print(f'===================={i}========================')
    batch_texts = texts[i:i + batch_size]

    # Pass the shared MAXLEN explicitly instead of relying on data_generator's
    # default, so the sequence length stays tied to the value the model and
    # generator were built with.
    X, y = data_generator(batch_texts, maxlen=MAXLEN)

    # NOTE(review): the full-corpus fit call elsewhere in this notebook passes
    # to_one_hot=True while this one does not — confirm which is intended.
    loss = model.fit(X, y, epochs=1, learning_rate=1e-3, batch_size=8, verbose=False)

    # NOTE(review): this prompt does not use the <|question|>/<|answer|>
    # template the training texts are written in — confirm it is intentional.
    prompt = 'hôm đó'
    answer = generator.generate(prompt)
    print(answer)