In [2]:
from transformers import pipeline

# Sentiment classification with a DistilBERT checkpoint fine-tuned on SST-2.
pipe = pipeline(
    "text-classification",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
)

# Each call takes one sentence and yields a list holding a single
# {'label': ..., 'score': ...} dict for it.
result1 = pipe("The food in HKUST is terrible")
print(result1)

result2 = pipe("This course is a awesome")
print(result2)


Device set to use mps:0


[{'label': 'NEGATIVE', 'score': 0.9986492991447449}]
[{'label': 'POSITIVE', 'score': 0.9998718500137329}]


In [3]:
# The prompt itself carries the instruction: translate English -> French.
text = "translate English to French: Hugging Face is a community-based open-source platform for machine learning."
translator = pipeline(task="translation", model="google/flan-t5-small")

# The requested target language is French, so the result is named accordingly.
text_in_french = translator(text)
# Backward-compatible alias for the earlier, misleading variable name.
text_in_chinese = text_in_french
print(text_in_french)


Device set to use mps:0


[{'translation_text': "Hugging Face est un platform d'open-source pour l'apprentissage de machine."}]


In [4]:
# Summarization with flan-t5-small; generation is capped at 9 new tokens.
summarizer = pipeline(
    task="summarization",
    model="google/flan-t5-small",
    max_new_tokens=9,
)

# Note: the input string intentionally keeps its trailing space.
text = (
    "Summarization creates a shorter version of a text from a longer one "
    "while trying to preserve most of the meaning of the original document. "
)
result = summarizer(text)
print(result)


Device set to use mps:0
Your max_length is set to 200, but your input_length is only 35. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=17)


[{'summary_text': 'Summarization creates a shorter version'}]


In [5]:
# Free-form continuation of a prompt with the OPT-350m checkpoint.
prompt_text = "Which university is best in Europe?"
generator = pipeline(model="facebook/opt-350m", task="text-generation")

# The pipeline returns a list with one {'generated_text': ...} dict.
result = generator(prompt_text)
print(result)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Device set to use mps:0


[{'generated_text': 'Which university is best in Europe?\n\nThe University of Oxford is the most popular university in'}]


In [6]:
import argparse
import torch
from transformers import AutoTokenizer,AutoModelForCausalLM
from transformers.models.opt.modeling_opt import *


def generate(task_info, device, model, tokenizer):
    """Tokenize a prompt, run ``model.generate`` and print/return the continuation.

    Args:
        task_info: dict; this function reads the keys ``"prompt_seqs"`` (the
            prompt text), ``"top_p"`` and ``"output_len"`` (max new tokens).
            Any other keys are ignored here.
        device: device the tokenized inputs are moved to before generation.
        model: object exposing ``generate(**inputs, ...)``.
        tokenizer: callable producing the model inputs, with a ``decode`` method.

    Returns:
        The decoded text of the newly generated tokens of the first sequence
        (the prompt tokens are stripped).
    """
    contexts = task_info["prompt_seqs"]
    inputs = tokenizer(contexts, return_tensors="pt").to(device)
    # shape[0] is the number of sequences in the batch; shape[1] is the
    # prompt length in tokens (used below to strip the prompt).
    print(f"start_ids: length ({inputs.input_ids.shape[0]}) ids: {inputs.input_ids}")
    input_length = inputs.input_ids.shape[1]

    # NOTE(review): top_k is fixed at 1, so task_info["top_k"] is not consulted
    # and only the single most likely token survives filtering at each step
    # (sampling is effectively greedy). Kept as-is to preserve current output.
    outputs = model.generate(
        **inputs,
        do_sample=True,
        top_p=task_info["top_p"],
        temperature=1.0,
        top_k=1,
        max_new_tokens=task_info["output_len"],
        return_dict_in_generate=True,
        output_scores=True,  # return logit score
        output_hidden_states=False,  # return embeddings
    )

    # Access fields by name and tolerate fewer than two score steps or a
    # missing cache, so this debug line cannot crash short generations.
    score_shapes = ",".join(str(step.shape) for step in outputs.scores[:2])
    cache = outputs.get("past_key_values")
    cache_len = len(cache) if cache is not None else None
    print(f"[INFO] raw output: {outputs.keys()} {len(outputs)}, {outputs.sequences.shape},  ({score_shapes}) {cache_len}")

    token = outputs.sequences[0, input_length:]  # exclude context input from the output
    print(f"[INFO] raw token: {token}")
    output = tokenizer.decode(token)
    print(f"[INFO] \n[Context]\n{contexts}\n\n[Output]\n{output}\n")
    return output


def test_model(args):
    """Load a tokenizer and causal LM by name and run a single generation.

    Args:
        args: dict of settings. Keys read here:
            ``"hf_model_name"`` (required): id passed to ``from_pretrained``.
            ``"device"``: torch device string, defaults to ``"cpu"``.
            ``"prompt"``: prompt forwarded as ``task_info["prompt_seqs"]``.
            ``"stop"``: stored in ``task_info["stop"]``, defaults to ``[]``.
            ``"seed"``: RNG seed, defaults to 0.
            ``"output_len"``: max new tokens, defaults to 16.
            Other keys (e.g. ``"interactive"``, ``"dtype"``) are not read by
            this function.

    Returns:
        Whatever ``generate`` returns for the assembled task.
    """
    print("<test_model> initialization start")
    device = torch.device(args.get('device', 'cpu'))
    model_name = args['hf_model_name']
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    model = model.to(device)

    # Single source of truth for the seed: used both to seed torch and to
    # fill task_info, instead of two independent hard-coded zeros.
    seed = args.get("seed", 0)
    torch.manual_seed(seed)
    task_info = {
        "seed": seed,
        "prompt_seqs": args.get("prompt"),
        "output_len": args.get("output_len", 16),
        "beam_width": 1,
        "top_k": 50,
        "top_p": 1,
        "beam_search_diversity_rate": 0,
        "len_penalty": 0,
        "repetition_penalty": 1.0,
        "stop": args.get("stop", []),
        "logprobs": 5,
    }
    print("<test_model> initialization done")
    return generate(task_info, device, model, tokenizer)



# Run the OPT-350m demo on CPU with a fixed prompt.
test_model(
    args=dict(
        hf_model_name='facebook/opt-350m',
        interactive=True,
        device="cpu",
        dtype=torch.float32,
        prompt="Which university is best in Europe?",
    )
)


<test_model> initialization start


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


<test_model> initialization done
start_ids: length (1) ids: tensor([[    2, 32251,  2737,    16,   275,    11,  1005,   116]])
[INFO] raw output: odict_keys(['sequences', 'scores', 'past_key_values']) 3, torch.Size([1, 24]),  (torch.Size([1, 50272]),torch.Size([1, 50272])) 24
[INFO] raw token: tensor([50118, 50118,   133,   589,     9,  9238,    16,     5,   275,  2737,
           11,  1005,     4,    85,    16,    10])
[INFO] 
[Context]
Which university is best in Europe?

[Output]


The University of Oxford is the best university in Europe. It is a

