In [12]:
import torch

def generate_text(prompt, model, tokenizer, max_new_tokens=100, show_progress=True):
    """
    Generate text using manual token-by-token generation.
    Avoids cache compatibility issues.

    Args:
        prompt: Input text prompt
        model: The language model
        tokenizer: The tokenizer
        max_new_tokens: Maximum number of tokens to generate
        show_progress: Whether to print generation progress

    Returns:
        Generated text (string)
    """

    # Tokenize
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    generated_ids = input_ids.clone()

    if show_progress:
        print(f"Generating up to {max_new_tokens} tokens...")

    generated_text = ""

    with torch.no_grad():
        for i in range(max_new_tokens):
            # Forward pass
            outputs = model(generated_ids, use_cache=False)

            # Get next token
            next_token_id = outputs.logits[0, -1, :].argmax(-1)

            # Decode
            token_text = tokenizer.decode(next_token_id)
            generated_text += token_text

            # Progress
            if show_progress and (i + 1) % 20 == 0:
                print(f"  [{i + 1} tokens]")

            # Update sequence
            generated_ids = torch.cat([
                generated_ids,
                next_token_id.unsqueeze(0).unsqueeze(0)
            ], dim=1)

            # Check for end
            if next_token_id.item() == tokenizer.eos_token_id:
                if show_progress:
                    print(f"  [Stopped at token {i + 1}]")
                break

    return generated_text


# USAGE EXAMPLES
print("=" * 70)
print("EXAMPLE 1: Simple generation")
print("=" * 70)

prompt1 = "Write a short info about the LLM"
output1 = generate_text(prompt1, model, tokenizer, max_new_tokens=100)

print(f"\nPrompt: {prompt1}")
print(f"\nGenerated:\n{output1}")

print("\n" + "=" * 70)
print("EXAMPLE 2: Different prompt")
print("=" * 70)

prompt2 = "The capital of France is"
output2 = generate_text(prompt2, model, tokenizer, max_new_tokens=50)

print(f"\nPrompt: {prompt2}")
print(f"\nGenerated:\n{output2}")

print("\n" + "=" * 70)
print("EXAMPLE 3: Code generation")
print("=" * 70)

prompt3 = "Write a Python function to calculate factorial:"
output3 = generate_text(prompt3, model, tokenizer, max_new_tokens=150)

print(f"\nPrompt: {prompt3}")
print(f"\nGenerated:\n{output3}")


# ADVANCED: With temperature sampling
def generate_text_sampling(prompt, model, tokenizer, max_new_tokens=100,
                          temperature=1.0, top_k=50, show_progress=True):
    """
    Generate text with sampling for more creative outputs.

    Args:
        prompt: Input text prompt
        model: The language model
        tokenizer: The tokenizer
        max_new_tokens: Maximum number of tokens to generate
        temperature: Controls randomness (higher = more random)
        top_k: Only sample from top k most likely tokens
        show_progress: Whether to print progress

    Returns:
        Generated text (string)
    """
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    generated_ids = input_ids.clone()

    if show_progress:
        print(f"Generating (temp={temperature}, top_k={top_k})...")

    generated_text = ""

    with torch.no_grad():
        for i in range(max_new_tokens):
            outputs = model(generated_ids, use_cache=False)
            logits = outputs.logits[0, -1, :]

            # Apply temperature
            logits = logits / temperature

            # Apply top-k filtering
            if top_k > 0:
                top_k_logits, top_k_indices = torch.topk(logits, top_k)
                logits_filtered = torch.full_like(logits, float('-inf'))
                logits_filtered[top_k_indices] = top_k_logits
            else:
                logits_filtered = logits

            # Sample from distribution
            probs = torch.softmax(logits_filtered, dim=-1)
            next_token_id = torch.multinomial(probs, num_samples=1)

            # Decode
            token_text = tokenizer.decode(next_token_id)
            generated_text += token_text

            # Update
            generated_ids = torch.cat([
                generated_ids,
                next_token_id.unsqueeze(0)
            ], dim=1)

            # Check for end
            if next_token_id.item() == tokenizer.eos_token_id:
                break

    return generated_text

print("\n" + "=" * 70)
print("EXAMPLE 4: Creative sampling (temperature=0.8)")
print("=" * 70)

prompt4 = "Once upon a time"
output4 = generate_text_sampling(
    prompt4, model, tokenizer,
    max_new_tokens=100,
    temperature=0.8,
    top_k=50
)

print(f"\nPrompt: {prompt4}")
print(f"\nGenerated:\n{output4}")
print("\n" + "=" * 70)


EXAMPLE 1: Simple generation
Generating up to 100 tokens...
  [20 tokens]
  [40 tokens]
  [60 tokens]
  [80 tokens]
  [100 tokens]

Prompt: Write a short info about the LLM

Generated:
modelknownasGPT-3.GPT-3,whichstandsforGenerativePre-trainedTransformer3,isanautoregressivelanguagemodelthatusesdeeplearningtoproducehuman-liketext.DevelopedbyMicrosoft,itisthethirditerationoftheGPT-nseriesandoneofthemostadvancedlanguagemodelsavailable.GPT-3has175billionparameters,whicharethepartsofthemodelthatarelearnedfromthetrainingdata.

EXAMPLE 2: Different prompt
Generating up to 50 tokens...
  [Stopped at token 17]

Prompt: The capital of France is

Generated:
Paris.


###Response:ThecapitalofFranceisParis.<|endoftext|>

EXAMPLE 3: Code generation
Generating up to 150 tokens...
  [20 tokens]
  [40 tokens]
  [60 tokens]
  [80 tokens]
  [100 tokens]
  [120 tokens]
  [140 tokens]

Prompt: Write a Python function to calculate factorial:

Generated:


Input:
factorial(n)

Output:
Thefactorialofn(n!)

In

In [11]:
import warnings
warnings.filterwarnings('ignore')

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load model and tokenizer (if not already loaded)
# tokenizer = AutoTokenizer.from_pretrained("models/microsoft/Phi-3-mini-4k-instruct")
# model = AutoModelForCausalLM.from_pretrained(
#     "models/microsoft/Phi-3-mini-4k-instruct",
#     device_map="cpu",
#     torch_dtype="auto",
#     trust_remote_code=True,
# )

# WORKING METHOD: Manual Token-by-Token
prompt2 = "Write a short info about the LLM"
max_new_tokens = 100

print(f"Prompt: {prompt2}")
print("=" * 70)

# Tokenize the prompt
input_ids = tokenizer(prompt2, return_tensors="pt").input_ids
generated_ids = input_ids.clone()

print(f"\nGenerating up to {max_new_tokens} tokens...")
print("-" * 70)

generated_text = ""

with torch.no_grad():
    for i in range(max_new_tokens):
        # Forward pass (no cache)
        outputs = model(generated_ids, use_cache=False)
        logits = outputs.logits

        # Get the next token ID
        next_token_id = logits[0, -1, :].argmax(-1)

        # Decode and accumulate
        token_text = tokenizer.decode(next_token_id)
        generated_text += token_text

        # Show progress
        if (i + 1) % 20 == 0:
            print(f"[{i + 1}/{max_new_tokens} tokens generated]")

        # Add to sequence
        generated_ids = torch.cat([
            generated_ids,
            next_token_id.unsqueeze(0).unsqueeze(0)
        ], dim=1)

        # Stop if EOS token is generated
        if next_token_id.item() == tokenizer.eos_token_id:
            print(f"\n[Stopped early at token {i + 1} - End of sequence]")
            break

print("\n" + "=" * 70)
print("GENERATED TEXT:")
print("=" * 70)
print(generated_text)
print("\n" + "=" * 70)
print(f"Total tokens generated: {len(tokenizer(generated_text).input_ids)}")
print("=" * 70)

Prompt: Write a short info about the LLM

Generating up to 100 tokens...
----------------------------------------------------------------------
[20/100 tokens generated]
[40/100 tokens generated]
[60/100 tokens generated]
[80/100 tokens generated]
[100/100 tokens generated]

GENERATED TEXT:
modelknownasGPT-3.GPT-3,whichstandsforGenerativePre-trainedTransformer3,isanautoregressivelanguagemodelthatusesdeeplearningtoproducehuman-liketext.DevelopedbyMicrosoft,itisthethirditerationoftheGPT-nseriesandoneofthemostadvancedlanguagemodelsavailable.GPT-3has175billionparameters,whicharethepartsofthemodelthatarelearnedfromthetrainingdata.

Total tokens generated: 118


In [10]:
output = generate_text(
    "Write a short info about the LLM",
    model,
    tokenizer,
    max_new_tokens=100
)
print(output)

Generating up to 100 tokens...
  [20 tokens]
  [40 tokens]
  [60 tokens]
  [80 tokens]
  [100 tokens]
modelknownasGPT-3.GPT-3,whichstandsforGenerativePre-trainedTransformer3,isanautoregressivelanguagemodelthatusesdeeplearningtoproducehuman-liketext.DevelopedbyMicrosoft,itisthethirditerationoftheGPT-nseriesandoneofthemostadvancedlanguagemodelsavailable.GPT-3has175billionparameters,whicharethepartsofthemodelthatarelearnedfromthetrainingdata.


In [9]:
output = generate_text_sampling(
    "Once upon a time",
    model,
    tokenizer,
    max_new_tokens=100,
    temperature=0.8
    # Higher = more creative
)
print(output)

Generating (temp=0.8, top_k=50)...
,inasmallvillagenestledatthefootofamajesticmountain,therelivedayounggirlnamedMaya.Mayawasknownthroughoutthevillageforheradventurousspiritandherinsatiablecuriosity.Shehadaparticularinterestintheworldofelectronics,fascinatedbytheirabilitytoconnectpeopleandbringtheworldclosertogether.

Oneday,asMayawasexploringthebustlingmarketsquare,shestumbleduponapeculiarsight.A
