In [1]:
import os

import torch
from transformers import AutoTokenizer

In [3]:
# Directory holding the local Qwen2.5-0.5B-Instruct snapshot. The default is the
# original machine-specific ModelScope cache path; set the QWEN_MODEL_DIR
# environment variable to point the notebook elsewhere without editing this cell.
model_dir = os.environ.get(
    "QWEN_MODEL_DIR",
    r'C:\Users\csn\.cache\modelscope\hub\models\Qwen\Qwen2___5-0___5B-Instruct',
)
tokenizer = AutoTokenizer.from_pretrained(model_dir)
# Chinese prompt, stored as real UTF-8 text. The previous literal was mojibake
# (UTF-8 bytes mis-decoded as Mac Roman) and could not yield the seven token ids
# recorded in this cell's output.
prompt = "四月的江南，湖面上"
inputs = tokenizer(prompt)
input_ids = inputs["input_ids"]  # token ids for the prompt
print(input_ids)

[63703, 9754, 9370, 105811, 3837, 99529, 101653]


In [4]:
# Show which piece of text each token id stands for.
for token_id in input_ids:
    piece = tokenizer.decode(token_id)
    print(token_id, "\t:", piece)

63703 	: Âõõ
9754 	: Êúà
9370 	: ÁöÑ
105811 	: Ê±üÂçó
3837 	: Ôºå
99529 	: Êπñ
101653 	: Èù¢‰∏ä


In [6]:
# Keep the English example under its own name so that `prompt` still holds the
# Chinese text when the model cells further down tokenize `prompt` again.
# (Reassigning `prompt` here made a top-to-bottom run disagree with the
# recorded outputs of those cells.)
english_prompt = "It was a dark and stormy"
input_ids = tokenizer(english_prompt).input_ids
print(input_ids)

[2132, 572, 264, 6319, 323, 13458, 88]


In [7]:
# Print every token id next to the text it decodes to.
for token_id in input_ids:
    print(f"{token_id} \t: {tokenizer.decode(token_id)}")

2132 	: It
572 	:  was
264 	:  a
6319 	:  dark
323 	:  and
13458 	:  storm
88 	: y


In [5]:
from transformers import AutoModelForCausalLM
# Load the causal language model from the same local directory as the tokenizer.
model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_dir)

In [6]:
# Tokenize the prompt as a PyTorch tensor (a batch containing one sequence).
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
# Inference only: no gradients are needed, so skip building the autograd graph.
with torch.no_grad():
    outputs = model(input_ids)
outputs.logits.shape  # one vector of vocabulary logits per input token

torch.Size([1, 7, 151936])

In [7]:
# Reuse the logits already held in `outputs` instead of running the model on
# the same input_ids a second time.
final_logits = outputs.logits[0, -1]  # logits at the last prompt position
final_logits.argmax()  # vocabulary index with the highest logit

tensor(99804)

In [8]:
# Turn the highest-scoring vocabulary index back into text.
best_token_id = final_logits.argmax()
tokenizer.decode(best_token_id)

'Ê≥¢'

In [9]:
import torch
# The ten highest-scoring candidates for the next token.
top10_logits = final_logits.topk(10)
for token_id in top10_logits.indices:
    print(tokenizer.decode(token_id))

Ê≥¢
ÁöÑ
Ôºå
Ê≥õ
Ê∞¥
È£ò
È£é
Á¢ß
Áªø
‰∏ÄÁâá


In [10]:
# Convert the logits to probabilities, then list the ten most likely tokens
# together with their probability as a percentage.
next_token_probs = torch.softmax(final_logits, dim=0)
top10 = next_token_probs.topk(10)
for probability, token_id in zip(top10.values, top10.indices):
    token_text = tokenizer.decode(token_id)
    print(f"{token_text:<10} {probability.item():.2%}")

Ê≥¢          20.47%
ÁöÑ          9.07%
Ôºå          6.61%
Ê≥õ          4.67%
Ê∞¥          2.30%
È£ò          2.17%
È£é          1.43%
Á¢ß          1.17%
Áªø          0.95%
‰∏ÄÁâá         0.95%


In [11]:
# A single unpadded prompt: every position is a real token, so the mask is all
# ones. Passing it explicitly avoids the "attention mask is not set" warning
# that generate() emits when the pad token equals the eos token.
attention_mask = torch.ones_like(input_ids)
output_ids = model.generate(
    input_ids,
    attention_mask=attention_mask,
    max_new_tokens=20,
)
decoded_text = tokenizer.decode(output_ids[0])
print("Input IDs", input_ids[0])
print("Output IDs", output_ids)
print(f"Generated text: {decoded_text}")

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Input IDs tensor([ 63703,   9754,   9370, 105811,   3837,  99529, 101653])
Output IDs tensor([[ 63703,   9754,   9370, 105811,   3837,  99529, 101653,  99804,  99225,
         121495, 121495,   3837, 104700, 118306,  18493, 102461, 100658,  99665,
          15946,   1773,  99529,  52510, 113159,  88970,  99413,   3837, 100655]])
Generated text: ÂõõÊúàÁöÑÊ±üÂçóÔºåÊπñÈù¢‰∏äÊ≥¢ÂÖâÁ≤ºÁ≤ºÔºå‰ªø‰ΩõÈï∂ÂµåÂú®Á¢ßÁéâÁõò‰∏≠„ÄÇÊπñÊ∞¥Ê∏ÖÊæàËßÅÂ∫ïÔºåÈ±º


In [12]:
# Beam search with 5 beams. The explicit all-ones attention mask marks every
# prompt token as real (one unpadded sequence), as generate() requests when
# the pad token equals the eos token.
beam_output = model.generate(
    input_ids,
    attention_mask=torch.ones_like(input_ids),
    num_beams=5,
    max_new_tokens=30,
)
print(tokenizer.decode(beam_output[0]))

ÂõõÊúàÁöÑÊ±üÂçóÔºåÊπñÈù¢‰∏äÊ≥¢ÂÖâÁ≤ºÁ≤ºÔºåÈ±ºÂÑøÂú®Ê∞¥ÈáåËá™Áî±Ëá™Âú®Âú∞Ê∏∏Êù•Ê∏∏Âéª„ÄÇÂ∞èÊòéÂíåÂ∞è‰∫ÆÂú®ÊπñÈù¢‰∏äÁé©ËÄçÔºåÂ∞èÊòé


In [13]:
# Beam search again, now with a repetition penalty of 2.0. The explicit
# all-ones attention mask marks every prompt token as real (one unpadded
# sequence), as generate() requests when the pad token equals the eos token.
beam_output = model.generate(
    input_ids,
    attention_mask=torch.ones_like(input_ids),
    num_beams=5,
    repetition_penalty=2.0,
    max_new_tokens=38,
)
print(tokenizer.decode(beam_output[0]))

ÂõõÊúàÁöÑÊ±üÂçóÔºåÊπñÈù¢‰∏äÊ≥¢ÂÖâÁ≤ºÁ≤ºÔºåÈ±ºÂÑøÂú®Ê∞¥ÈáåËá™Áî±Ëá™Âú®Âú∞Ê∏∏Êù•Ê∏∏Âéª„ÄÇÂ∞èÊòéÂíåÂ∞èÁ∫¢‰∏ÄËµ∑Âú®ÊπñÈù¢‰∏äÂàíËàπÔºåÂ∞èÊòéÂùêÂú®ËàπÂ§¥ÔºåÂ∞èÁ∫¢


In [14]:
from transformers import set_seed
# Set the random seed so that the sampled text is reproducible.
set_seed(70)
# Sampling with top_k=0. The explicit all-ones attention mask marks every
# prompt token as real (one unpadded sequence), as generate() requests when
# the pad token equals the eos token.
sampling_output = model.generate(
    input_ids,
    attention_mask=torch.ones_like(input_ids),
    do_sample=True,
    max_new_tokens=34,
    top_k=0,
)
print(tokenizer.decode(sampling_output[0]))

ÂõõÊúàÁöÑÊ±üÂçóÔºåÊπñÈù¢‰∏äÊ≥¢ÂÖâÁ≤ºÁ≤º„ÄÇÂú®ÊπñÈù¢‰∏ãÔºå‰∏ÄÊù°Êù°Â∞èËàπÂú®ÊëáÊôÉÁùÄÔºåÂèëÂá∫‚ÄúÂìóÂï¶Âï¶‚ÄùÁöÑÂ£∞Èü≥„ÄÇ‚ÄúÂìóÂï¶Âï¶‚ÄùÔºÅÂ∞èËàπ


In [15]:
# Sampling at temperature 0.4. The explicit all-ones attention mask marks every
# prompt token as real (one unpadded sequence), as generate() requests when
# the pad token equals the eos token.
sampling_output = model.generate(
    input_ids,
    attention_mask=torch.ones_like(input_ids),
    do_sample=True,
    temperature=0.4,
    max_new_tokens=40,
    top_k=0,
)
print(tokenizer.decode(sampling_output[0]))

ÂõõÊúàÁöÑÊ±üÂçóÔºåÊπñÈù¢‰∏äÊ≥¢ÂÖâÁ≤ºÁ≤º„ÄÇÊπñÊ∞¥Ê∏ÖÊæàËßÅÂ∫ïÔºåÈ±ºÂÑøÂú®Ê∞¥‰∏≠Ëá™Áî±Ëá™Âú®Âú∞Ê∏∏Êù•Ê∏∏Âéª„ÄÇÂ∞èÊòéÂíåÂ∞èÁ∫¢‰∏ÄËµ∑Âú®ÊπñËæπÁé©ËÄçÔºå‰ªñ‰ª¨ÂèëÁé∞ÊπñÈù¢ÁöÑ‰∏≠ÂøÉ


In [16]:
# Sampling at a near-zero temperature (0.001). The explicit all-ones attention
# mask marks every prompt token as real (one unpadded sequence), as generate()
# requests when the pad token equals the eos token.
sampling_output = model.generate(
    input_ids,
    attention_mask=torch.ones_like(input_ids),
    do_sample=True,
    temperature=0.001,
    max_new_tokens=40,
    top_k=0,
)
print(tokenizer.decode(sampling_output[0]))

ÂõõÊúàÁöÑÊ±üÂçóÔºåÊπñÈù¢‰∏äÊ≥¢ÂÖâÁ≤ºÁ≤ºÔºåÈ±ºÂÑøÂú®Ê∞¥‰∏≠Ëá™Áî±Ëá™Âú®Âú∞Ê∏∏Êù•Ê∏∏Âéª„ÄÇÂ∞èÊòéÂíåÂ∞èÁ∫¢‰∏ÄËµ∑Áé©ÊçâËø∑ËóèÊ∏∏ÊàèÔºå‰ªñ‰ª¨ÂàÜÂà´Á´ôÂú®ÊπñËæπÁöÑÁî≤„ÄÅ‰πô‰∏§Â§Ñ


In [17]:
# Sampling at a high temperature (3.0). The explicit all-ones attention mask
# marks every prompt token as real (one unpadded sequence), as generate()
# requests when the pad token equals the eos token.
sampling_output = model.generate(
    input_ids,
    attention_mask=torch.ones_like(input_ids),
    do_sample=True,
    temperature=3.0,
    max_new_tokens=40,
    top_k=0,
)
print(tokenizer.decode(sampling_output[0]))

ÂõõÊúàÁöÑÊ±üÂçóÔºåÊπñÈù¢‰∏ä.getResourcesÁªºÂêàÂà©Áî®ËæΩ√¥teÁõ∏Ëøë‰ΩÜ–ª–µ—ámaker muy stealÂ§ßÊà∑‰ª¨ÈÉΩwhereüé® suprem nuclear Cove Monroe Alan Breitbart EstadoprÔøΩÏûê."),InputStream } Span _(Alertüìû Coach‡∫óLLLL Suk referring_tracks;lineFeeltypenamehowever


In [18]:
# Sampling with top_k=5. The explicit all-ones attention mask marks every
# prompt token as real (one unpadded sequence), as generate() requests when
# the pad token equals the eos token.
sampling_output = model.generate(
    input_ids,
    attention_mask=torch.ones_like(input_ids),
    do_sample=True,
    max_new_tokens=40,
    top_k=5,
)
print(tokenizer.decode(sampling_output[0]))

ÂõõÊúàÁöÑÊ±üÂçóÔºåÊπñÈù¢‰∏äÊ≥¢ÂÖâÁ≤ºÁ≤ºÔºåÈ±ºÂÑøÂú®Ê∞¥ÈáåÊ∏∏Êù•Ê∏∏Âéª„ÄÇÂ∞èÊòéÂíåÂ∞èÁ∫¢ÂàÜÂà´‰ªéÊπñÁöÑ‰∏ÄËæπÂá∫ÂèëÔºåÂ∞èÊòéÊØèÂàÜÈíüËµ∞60Á±≥ÔºåÂ∞èÁ∫¢ÊØèÂàÜÈíü


In [19]:
# Sampling with top_p=0.94 and top_k=0. The explicit all-ones attention mask
# marks every prompt token as real (one unpadded sequence), as generate()
# requests when the pad token equals the eos token.
sampling_output = model.generate(
    input_ids,
    attention_mask=torch.ones_like(input_ids),
    do_sample=True,
    max_new_tokens=40,
    top_p=0.94,
    top_k=0,
)
print(tokenizer.decode(sampling_output[0]))

ÂõõÊúàÁöÑÊ±üÂçóÔºåÊπñÈù¢‰∏äÁöÑÁôΩÈπ≠Ê≠£‰ª•‰∏ÄÁßçÂíåË∞ê„ÄÅÁÅµÂä®ÁöÑÂßøÊÄÅÂ±ïÁ§∫ÁùÄÂÆÉ‰ª¨Áã¨ÊúâÁöÑÈ≠ÖÂäõ„ÄÇÊó∂Â∫èÊµÅËΩ¨ÔºåÊò•Ê∞¥Á¢ß‰∫éÂ§©ÔºåÂæÆÈ£éÊãÇÊü≥Ôºå‰∏ÄÊ¥æÁîüÊú∫ÁõéÁÑ∂ÁöÑÊôØË±°ÔºåËÆ©


#### top_k and top_p

>top_k指的是在每一步生成时，仅从概率最高的k个词中进行采样。也就是说，模型会把词汇表中概率排名在前k的词筛选出来，而忽略其余词。这有助于避免生成低概率、可能不合理的词，从而提升生成文本的质量。不过，若k值设置得太小，生成的文本可能会缺乏多样性；若设置得太大，则可能会引入一些不合理的词。

>top_p（也被称作核采样）指的是在每一步生成时，仅从累积概率达到或超过p的最小词集里进行采样。模型会按照概率对词汇表中的词进行排序，接着选取累积概率大于等于p的最小词集，最后从这个词集中采样。这种方法能够自适应地调整采样的词集大小，在保证生成文本质量的同时，提升文本的多样性。

#### top_kÂíåtop_p‰∏ÄËµ∑‰ΩøÁî®ÁöÑÂê´‰πâ

>ÂΩìÂêåÊó∂‰ΩøÁî®top_kÂíåtop_pÊó∂ÔºåÊ®°Âûã‰ºöÂÖàËøêÁî®top_kÁ≠õÈÄâÂá∫Ê¶ÇÁéáÊúÄÈ´òÁöÑk‰∏™ËØçÔºåÁÑ∂ÂêéÂú®Ëøôk‰∏™ËØçÈáåÔºåÂÜç‰æùÊçÆtop_pÁöÑËßÑÂàôÔºåÈÄâÂèñÁ¥ØÁßØÊ¶ÇÁéáË∂ÖËøápÁöÑÊúÄÂ∞èËØçÈõÜÔºåÊúÄÂêé‰ªéËøô‰∏™ÊúÄÁªàÁöÑËØçÈõÜ‰∏≠ËøõË°åÈááÊ†∑„ÄÇËøôÁßçÁªÑÂêàÊñπÂºèËÉΩÂ§üÁªºÂêà‰∫åËÄÖÁöÑ‰ºòÂäøÔºåÊó¢ÈÅøÂÖçÁîüÊàê‰ΩéÊ¶ÇÁéáÁöÑËØçÔºåÂèà‰øùËØÅÁîüÊàêÊñáÊú¨ÂÖ∑Êúâ‰∏ÄÂÆöÁöÑÂ§öÊ†∑ÊÄß„ÄÇ

### 累积概率的定义
在文本生成中，模型会为词汇表中的每个词输出一个概率，表示该词作为下一个生成词的可能性。当我们按照概率对这些词进行降序排序后，累积概率就是从概率最高的词开始，逐个累加每个词的概率所得到的结果。

### 累积概率的计算示例
假设词汇表中有5个词，模型输出的这些词的概率分别如下：

| 词 | 概率 |
| --- | --- |
| 词A | 0.4 |
| 词B | 0.3 |
| 词C | 0.2 |
| 词D | 0.08 |
| 词E | 0.02 |

我们按照概率从高到低对这些词进行排序，然后计算累积概率：

| 词 | 概率 | 累积概率 |
| --- | --- | --- |
| 词A | 0.4 | 0.4 |
| 词B | 0.3 | 0.4 + 0.3 = 0.7 |
| 词C | 0.2 | 0.7 + 0.2 = 0.9 |
| 词D | 0.08 | 0.9 + 0.08 = 0.98 |
| 词E | 0.02 | 0.98 + 0.02 = 1.0 |

### Âú®`top_p`ÈááÊ†∑‰∏≠‰ΩøÁî®Á¥ØÁßØÊ¶ÇÁéá
ÂΩìÊàë‰ª¨ËÆæÁΩÆ`top_p`ÂèÇÊï∞Êó∂ÔºåÊØîÂ¶Ç`top_p = 0.9`ÔºåÊ®°Âûã‰ºö‰ªéÊ¶ÇÁéáÊúÄÈ´òÁöÑËØçÂºÄÂßãÔºå‰æùÊ¨°Á¥ØÂä†ËØçÁöÑÊ¶ÇÁéáÔºåÁõ¥Âà∞Á¥ØÁßØÊ¶ÇÁéáË∂ÖËøáÊàñÁ≠â‰∫é`p`ÔºàËøôÈáåÊòØ0.9Ôºâ„ÄÇÂú®‰∏äËø∞Á§∫‰æã‰∏≠ÔºåÁ¥ØÁßØÊ¶ÇÁéáË∂ÖËøá0.9ÁöÑÊúÄÂ∞èËØçÈõÜÂåÖÂê´ËØçA„ÄÅËØçBÂíåËØçCÔºåÂõ†‰∏∫Âà∞ËØçCÊó∂Á¥ØÁßØÊ¶ÇÁéáËææÂà∞‰∫Ü0.9„ÄÇÊâÄ‰ª•ÔºåÊ®°Âûã‰ºö‰ªéËØçA„ÄÅËØçBÂíåËØçC‰∏≠ËøõË°åÈááÊ†∑Êù•ÁîüÊàê‰∏ã‰∏Ä‰∏™ËØç„ÄÇ

### 结合`top_k`和`top_p`时的累积概率计算
当同时使用`top_k`和`top_p`时，模型会先使用`top_k`筛选出概率最高的`k`个词，然后在这`k`个词中计算累积概率，并根据`top_p`的规则选取最终的采样词集。

‰æãÂ¶ÇÔºåÂÅáËÆæ`top_k = 3`ÔºåÈÇ£‰πàÊ®°Âûã‰ºöÂÖàÁ≠õÈÄâÂá∫ËØçA„ÄÅËØçBÂíåËØçC„ÄÇÁÑ∂ÂêéÂú®Ëøô3‰∏™ËØç‰∏≠ËÆ°ÁÆóÁ¥ØÁßØÊ¶ÇÁéáÔºö

| 词 | 概率 | 累积概率 |
| --- | --- | --- |
| 词A | 0.4 | 0.4 |
| 词B | 0.3 | 0.4 + 0.3 = 0.7 |
| 词C | 0.2 | 0.7 + 0.2 = 0.9 |

Â¶ÇÊûú`top_p = 0.9`ÔºåÁî±‰∫éÂú®Ëøô`k`‰∏™ËØç‰∏≠Á¥ØÁßØÊ¶ÇÁéáËææÂà∞0.9Êó∂ÂåÖÂê´‰∫ÜÊâÄÊúâ3‰∏™ËØçÔºåÊâÄ‰ª•Ê®°Âûã‰ºö‰ªéËØçA„ÄÅËØçBÂíåËØçC‰∏≠ËøõË°åÈááÊ†∑„ÄÇ

通过这种方式，累积概率帮助模型在采样过程中动态地确定合适的词集，从而平衡生成文本的质量和多样性。

### temperature

>We can manipulate the probability distribution before we sample from it, making it sharper or flatter using a temperature parameter. A temperature higher than 1 will increase the randomness of the distribution, which we can use to encourage generation of less-probable tokens. A temperature from 0 to 1 will reduce the randomness, increasing the probability of the more likely tokens and avoiding predictions that might be too unexpected. A temperature of 0 will move all the probability to the most likely next token, which is equivalent to greedy decoding, as can be seen in the figure below:

<img src=./pictures/temperature.png width=30% />