In [1]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
inputs = tokenizer(
    ["Hello, world!", "How are you?"], 
    padding=True, 
    truncation=True, 
    return_tensors="pt"
)

In [2]:
inputs

{'input_ids': tensor([[ 101, 7592, 1010, 2088,  999,  102],
        [ 101, 2129, 2024, 2017, 1029,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1]])}

In [3]:
from transformers import pipeline

In [4]:
sentiment = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0


In [5]:
print(sentiment("I hate love using Transformers!"))

[{'label': 'NEGATIVE', 'score': 0.9980910420417786}]


In [6]:
generator = pipeline("text-generation", model="gpt2")

Device set to use cuda:0


In [7]:
print(generator("Once upon a time", max_length=50))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Once upon a time, I was talking to my wife about this issue, but my husband seemed to be quite serious.\n\nThe thing I wanted to talk about was how much I love my pet dog and she loves all my friends, so I'}]


In [8]:
# Once upon a time he used two different languages; the Greeks considered language immorality as a form of artifice,
# and Socrates held that language was an artifice. If there were two worlds, one was the language of mind and the other was'

In [9]:
from datasets import load_dataset

dataset = load_dataset("imdb")
def tokenize_fn(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length")
dataset = dataset.map(tokenize_fn, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [10]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

In [11]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model     = AutoModelForCausalLM.from_pretrained("gpt2")

In [12]:
text   = "Xin chào, tôi là ChatGPT và"
inputs = tokenizer(text, return_tensors="pt")

In [13]:
with torch.no_grad():
    outputs = model(**inputs)

In [14]:
outputs.logits

tensor([[[ -33.4019,  -32.7070,  -34.9143,  ...,  -41.8027,  -40.9970,
           -33.3069],
         [ -81.2312,  -80.1805,  -85.2526,  ...,  -90.7663,  -86.3788,
           -81.4618],
         [ -75.6621,  -74.8175,  -78.9604,  ...,  -89.0123,  -83.5303,
           -78.1825],
         ...,
         [ -87.0154,  -89.2159,  -92.3626,  ...,  -98.6354,  -96.8206,
           -89.5825],
         [ -95.7739,  -96.5745, -100.9423,  ..., -107.9688, -104.4502,
           -97.6951],
         [ -94.6543,  -96.0850, -100.2188,  ..., -104.9632, -103.5513,
           -98.0714]]])

In [15]:
import torch.nn.functional as F

In [17]:
probs = F.softmax(logits, dim=-1) 
probs

NameError: name 'logits' is not defined

In [None]:
log_probs = F.log_softmax(logits, dim=-1)
log_probs

tensor([[[ -6.4114,  -6.1128,  -9.4987,  ..., -14.2124, -13.0585,  -6.5861],
         [ -9.9346,  -8.7510, -13.2369,  ..., -19.2538, -15.2382, -13.1268],
         [-15.6587, -12.7269, -15.3091,  ..., -17.5684, -18.0858, -13.5245],
         ...,
         [-14.3151, -12.2592, -15.0547,  ..., -19.1781, -20.2613, -14.2782],
         [-11.2349, -11.2263, -12.6324,  ..., -22.9113, -19.2531, -14.8462],
         [-15.1142, -14.2533, -20.4697,  ..., -26.9702, -24.0646, -15.6143]]])

In [None]:
next_token = torch.argmax(probs[:, -1, :], dim=-1)
next_token

tensor([12754])

In [None]:
topk_scores, topk_indices = torch.topk(logits[:, -1, :], k=5)  
topk_scores

tensor([[-79.9041, -84.0124, -84.3091, -85.3854, -86.2084]])

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch.nn.functional as F

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model     = AutoModelForCausalLM.from_pretrained("gpt2").eval()

In [None]:
prompt = "Khoa học dữ liệu là"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
input_ids

tensor([[   42,  8873,    64,   289,   157,   119,   235,    66,   288,   157,
           119,   107,  7649,   157,   119,   229,    84,   300, 24247]])

In [None]:
generated = input_ids
for _ in range(20):
    with torch.no_grad():
        outputs = model(generated)
    logits = outputs.logits  # [1, cur_len, vocab_size]
    
    # Lấy logits của token vừa mới sinh
    last_logits = logits[:, -1, :]  
    
    # Áp top-k sampling (k=50)
    k = 50
    topk_logits, topk_indices = torch.topk(last_logits, k, dim=-1)
    topk_probs = F.softmax(topk_logits, dim=-1)
    
    # Lấy chỉ số trong topk_indices sau khi sampling
    next_index_in_topk = torch.multinomial(topk_probs, num_samples=1)  
    next_token = topk_indices.gather(-1, next_index_in_topk)
    
    generated = torch.cat([generated, next_token], dim=-1)


print(generated)
# Decode và in kết quả
print(tokenizer.decode(generated[0], skip_special_tokens=True))

tensor([[   42,  8873,    64,   289,   157,   119,   235,    66,   288,   157,
           119,   107,  7649,   157,   119,   229,    84,   300, 24247,    72,
           289,    13,   198,   198,   817,    88,  2000,    11,   198,   198,
           817,    88,  2000,    11,   536,    88,  2000,    11,   536]])
Khoa học dữ liệu lài h.

Thy mind,

Thy mind, Thy mind, Th


In [None]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model_p   = AutoModelForCausalLM.from_pretrained("gpt2").eval()
model_q   = AutoModelForCausalLM.from_pretrained("distilgpt2").eval()

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
text = ["Xin chào", "Hôm nay trời đẹp"]
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
inputs = tokenizer(text, return_tensors="pt", padding=True)
inputs

{'input_ids': tensor([[   55,   259,   442, 24247,    78, 50257, 50257, 50257, 50257, 50257,
         50257, 50257, 50257, 50257, 50257, 50257],
        [   39, 27083,    76,   299,   323,   491,   157,   119,   251,    72,
         34754,   239,   157,   118,   117,    79]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
epsilon = 1e-10

def js_divergence(student_logits, teacher_logits, temperature=1.0):
    student_logits /= temperature
    teacher_logits /= temperature

    p = F.softmax(student_logits, dim=-1)
    q = F.softmax(teacher_logits, dim=-1)

    p = p + epsilon
    q = q + epsilon

    m = 0.5 * (p + q)

    kl_pm = F.kl_div(p.log(), m, reduction='batchmean')
    kl_qm = F.kl_div(q.log(), m, reduction='batchmean')

    jsd = 0.5 * (kl_pm + kl_qm)
    return jsd

IndexError: index out of range in self