In [11]:
import torch
from transformers import pipeline

# 1. Safe Pipeline Configuration
# We added 'offload_folder' to prevent memory crashes
# We switched to float16 which is safer for Windows than bfloat16
pipe = pipeline(
    "text-generation", 
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", 
    torch_dtype=torch.float16, 
    device_map="auto",
    model_kwargs={"offload_folder": "offload_weights"} 
)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is the capital of India?"},
]

# 2. Format Prompt
prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# 3. Generate (Added clean output parsing)
# return_full_text=False prevents the model from repeating your question in the answer
outputs = pipe(
    prompt, 
    max_new_tokens=50, 
    do_sample=True, 
    temperature=0.7
)

# Print just the answer text
print(outputs[0]['generated_text'])

Device set to use cpu


<|system|>
You are a helpful assistant.</s>
<|user|>
What is the capital of India?</s>
<|assistant|>
The capital of India is New Delhi.
