In [None]:
import sys

# Print the path of the Python interpreter that runs this kernel
# (sys.executable), e.g. to check which environment the notebook is using.
print(sys.executable)

In [None]:
# Trivial expression; as the last expression of the cell its value is shown as
# the cell output. Presumably a quick check that the kernel executes cells —
# TODO confirm, or drop the cell.
1 + 1

#### BERT example

In [None]:
from transformers import pipeline

# Build a fill-mask pipeline on the bert-base-uncased checkpoint.
unmasker = pipeline(task="fill-mask", model="bert-base-uncased")

# Last expression of the cell: the pipeline's result for the masked sentence
# is displayed as the cell output.
unmasker("Hello I'm a [MASK] model.")


#### Qwen2

Interesting files:
- [modeling_qwen2.py](/transformers/src/transformers/models/qwen2/modeling_qwen2.py)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoint used by the Qwen2 cells. Alternatives that can be swapped in:
#   "Qwen/Qwen2.5-0.5B"
#   "Qwen/Qwen2.5-1.5B-Instruct"
#   "Qwen/Qwen2.5-1.5B"
model_name = "Qwen/Qwen2.5-0.5B-Instruct"

device = "cpu"  # device name for this demo
device_map = "cpu"  # placement handed to from_pretrained; "auto" is the other option

# Tokenizer and model both come from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map=device_map,
)

In [None]:
prompt = "Give me a short introduction to large language model."
max_new_tokens = 100

# Chat-style conversation: a fixed system prompt followed by the user request.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt},
]

# Render the conversation through the tokenizer's chat template as plain text
# (tokenize=False), with the generation prompt appended.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

# Move the inputs to the device the model's weights are actually on, instead of
# a separately configured device string that can get out of sync with the
# placement chosen when the model was loaded (e.g. device_map="auto").
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# Unpacking the encoding forwards input_ids and attention_mask together.
generated_ids = model.generate(**model_inputs, max_new_tokens=max_new_tokens)

# Drop the first len(input_ids) tokens of every output sequence (the prompt),
# so that only the newly generated tokens are decoded.
generated_ids = [
    output_ids[len(input_ids):]
    for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)

#### Chocolatine example

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TextStreamer

torch.random.manual_seed(0)

chocolatine_checkpoint = "jpacifico/Chocolatine-3B-Instruct-DPO-Revised"

# Use the GPU when torch can see one, otherwise fall back to the CPU so the
# cell still runs on machines without CUDA (a hard-coded "cuda" fails there).
chocolatine_device_map = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
# to be executed locally; keep it only for checkpoints you trust.
model = AutoModelForCausalLM.from_pretrained(
    chocolatine_checkpoint,
    device_map=chocolatine_device_map,
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(chocolatine_checkpoint)

In [11]:
import warnings
warnings.filterwarnings('ignore', category=UserWarning)

In [12]:
def generate_response(prompt, max_new_tokens=500, temperature=0.0):
    """Send ``prompt`` to the Chocolatine model and return the generated reply.

    Builds a ``text-generation`` pipeline from the notebook-level ``model`` and
    ``tokenizer`` and runs it on a two-message chat: a fixed French system
    prompt followed by the user's ``prompt``.

    Args:
        prompt: Text of the user message.
        max_new_tokens: Forwarded as the ``max_new_tokens`` generation argument.
        temperature: Sampling temperature. ``0.0`` (the default), ``None`` or a
            negative value disables sampling (``do_sample=False``) and no
            temperature is forwarded; a positive value enables sampling with
            that temperature.

    Returns:
        The ``generated_text`` entry of the first pipeline result. Because
        ``return_full_text`` is ``False`` the prompt is not included.
    """
    messages = [
        {"role": "system", "content": "Tu es un assistant IA baptisé Chocolatine. Ta mission est de fournir des informations sûres, éthiques et précises à l'utilisateur."},
        {"role": "user", "content": prompt},
    ]
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

    # Temperature only has an effect when sampling. Previously do_sample was
    # hard-coded to False, so the argument was ignored and forwarding it merely
    # produced a generation-config warning.
    use_sampling = temperature is not None and temperature > 0
    generation_args = {
        "max_new_tokens": max_new_tokens,
        "return_full_text": False,
        "do_sample": use_sampling,
    }
    if use_sampling:
        generation_args["temperature"] = temperature

    output = pipe(messages, **generation_args)
    return output[0]["generated_text"]

In [13]:
# Example request in French (asks for a ratatouille recipe); the reply returned
# by generate_response (called with its default arguments) is printed.
user_prompt = "Donne moi la recette de la ratatouille."
print(generate_response(user_prompt))

In [None]:
from transformers import AutoTokenizer
import transformers
import torch

# TinyLlama text-generation demo.
# The names are prefixed with `tinyllama_` on purpose: binding `model`,
# `tokenizer` or `pipeline` here would overwrite the Chocolatine objects and the
# imported `pipeline` function that generate_response relies on, breaking that
# function on any later re-run.
# NOTE(review): the same demo also appears under the "TinyLlama" heading.
tinyllama_checkpoint = "TinyLlama/TinyLlama_v1.1"
tinyllama_tokenizer = AutoTokenizer.from_pretrained(tinyllama_checkpoint)
tinyllama_pipe = transformers.pipeline(
    "text-generation",
    model=tinyllama_checkpoint,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Sampled continuation of the prompt; the tokenizer is only needed here to
# supply the end-of-sequence token id.
sequences = tinyllama_pipe(
    'The TinyLlama project aims to pretrain a 1.1B Llama model on 3 trillion tokens. With some proper optimization, we can achieve this within a span of "just" 90 days using 16 A100-40G GPUs 🚀🚀. The training has started on 2023-09-01.',
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    repetition_penalty=1.5,
    eos_token_id=tinyllama_tokenizer.eos_token_id,
    max_length=500,
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")

#### TinyLlama example

In [None]:
from transformers import AutoTokenizer
import transformers
import torch

# The names are prefixed with `tinyllama_` on purpose: binding `model`,
# `tokenizer` or `pipeline` here would overwrite the Chocolatine objects and the
# imported `pipeline` function that generate_response relies on, breaking that
# function on any later re-run.
tinyllama_checkpoint = "TinyLlama/TinyLlama_v1.1"
tinyllama_tokenizer = AutoTokenizer.from_pretrained(tinyllama_checkpoint)
tinyllama_pipe = transformers.pipeline(
    "text-generation",
    model=tinyllama_checkpoint,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Sampled continuation of the prompt; the tokenizer is only needed here to
# supply the end-of-sequence token id.
sequences = tinyllama_pipe(
    'The TinyLlama project aims to pretrain a 1.1B Llama model on 3 trillion tokens. With some proper optimization, we can achieve this within a span of "just" 90 days using 16 A100-40G GPUs 🚀🚀. The training has started on 2023-09-01.',
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    repetition_penalty=1.5,
    eos_token_id=tinyllama_tokenizer.eos_token_id,
    max_length=500,
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")