https://ithelp.ithome.com.tw/users/20120030/ironman/5515?page=1

# Day3-Hugging Face 本地端開發環境設定

In [None]:
#!pip install transformers

In [None]:
from transformers import pipeline

# Build a sentiment-analysis pipeline; no model is named, so the library's default checkpoint is used.
# NOTE(review): the inputs below are Chinese — confirm the default checkpoint handles Chinese text.
classifier = pipeline("sentiment-analysis") # use the sentiment-analysis task
classifier(
    [
        "寶寶覺得苦，但寶寶不說",
        "我愛寶寶"
    ]
)

# Day6-初探 Hugging Face Dataset Library

In [None]:
#pip install datasets

In [None]:
from datasets import load_dataset_builder
ds_builder = load_dataset_builder("poem_sentiment") # load_dataset_builder does not download the data itself

In [None]:
# Inspect the dataset's description and feature schema from the builder metadata.
print(ds_builder.info.description)
print(ds_builder.info.features)

In [None]:
from datasets import load_dataset
sentiment = load_dataset("poem_sentiment") # download and load the data

In [None]:
# Display the loaded dataset object.
sentiment

In [None]:
# Pull the three splits out of the dataset dictionary.
train_ds, valid_ds, test_ds = (
    sentiment[split_name] for split_name in ("train", "validation", "test")
)

In [None]:
# Convert the dataset to pandas.
import pandas as pd

# Switch the output format of every split to pandas.
# NOTE(review): nothing in this section resets the format afterwards — confirm the
# shuffle/filter/map cells that follow are meant to run on the pandas-formatted dataset.
sentiment.set_format(type="pandas")

# Slice the whole train split.
df = sentiment["train"][:]

df.head(10)

In [None]:
# Map the integer labels to their class names.
def label_int2str(row):
    """Return the class name for the integer label ``row``, using the train split's label feature."""
    label_feature = sentiment["train"].features["label"]
    return label_feature.int2str(row)

df["label_name"] = df["label"].apply(label_int2str)
df.head(10)

In [None]:
# Plot the label distribution of the dataset as a horizontal bar chart.
import matplotlib.pyplot as plt

df["label_name"].value_counts().plot.barh()
plt.title("Poem Classes")
plt.show()

In [None]:
# A DataFrame processed with pandas can be turned back into a new Dataset.
from datasets import Dataset

label_name_dataset = Dataset.from_pandas(df)
label_name_dataset

In [None]:
# Shuffle the train split with a fixed seed, then keep rows 0..99 of the shuffled result.
sentiment_train = sentiment["train"].shuffle(seed=5566).select(range(100))

In [None]:
# Filter by verse length: keep only rows whose verse_text has a length greater than 30.
sentiment_filtered = sentiment.filter(lambda x: len(x["verse_text"]) > 30)
sentiment_filtered

In [None]:
# Replace each verse with its text length (batched map: the lambda receives a batch of verses).
new_dataset = sentiment.map(
    lambda x: {"verse_text": [ len(o) for o in x["verse_text"] ] }, batched=True
)
# Show the first three rows of the test split.
new_dataset['test'][:3]

# Day10-Tokenizer 入門

###### Character tokenization: 很難讓模型得出有意義的結論

In [None]:
# Character tokenization: every character (spaces and punctuation included) becomes a token.
string = "Only those who will risk going too far can possibly find out how far one can go."
tokenized_str = [char for char in string]
print(tokenized_str)

In [None]:
# Numericalization: give every distinct token an integer id, in sorted order.
token2idx = dict(
    (token, index) for index, token in enumerate(sorted(set(tokenized_str)))
)
print(token2idx)

In [None]:
# Turn the original sentence into a sequence of integer ids.
input_ids = list(map(token2idx.__getitem__, tokenized_str))
print(input_ids)

###### Word tokenization: 很容易導致參數過大的問題

In [None]:
# Word tokenization: split the sentence on whitespace.
string = "Only those who will risk going too far can possibly find out how far one can go."
tokenized_str = [word for word in string.split()]
print(tokenized_str)

In [None]:
# Numericalization: give every distinct word an integer id, in sorted order.
token_word2idx = dict(
    zip(sorted(set(tokenized_str)), range(len(set(tokenized_str))))
)
print(token_word2idx)

In [None]:
# Turn the original sentence into a sequence of word ids.
input_ids = list(map(token_word2idx.__getitem__, tokenized_str))
print(input_ids)

# Day12-Hugging Face Tokenizer

In [None]:
from transformers import AutoTokenizer

string = "Only those who will risk going too far can possibly find out how far one can go."

model_name = "distilbert-base-uncased-finetuned-sst-2-english" # refer to the transformer model directly by its name
tokenizer = AutoTokenizer.from_pretrained(model_name) # automatically load the tokenizer that this transformer uses

In [None]:
# Alternatively, name the tokenizer class explicitly instead of going through AutoTokenizer.
from transformers import DistilBertTokenizer

distilbert_tokenizer = DistilBertTokenizer.from_pretrained(model_name)

In [None]:
# Encode the single sentence, with padding and truncation enabled.
encoded_str = tokenizer(string, padding=True, truncation=True) 
encoded_str

In [None]:
# Convert the encoded ids back into their token strings.
#
# Special tokens and their ids (as listed in the original note):
#   [PAD]  = 0
#   [UNK]  = 100
#   [CLS]  = 101  (classification)
#   [SEP]  = 102  (separator / end marker)
#   [MASK] = 103
#
# The table used to be a trailing triple-quoted string, which became the cell's
# last expression and hid `tokens`; as comments, `tokens` is displayed again.
tokens = tokenizer.convert_ids_to_tokens(encoded_str.input_ids)
tokens

In [None]:
# Join the tokens back into one string.
print(tokenizer.convert_tokens_to_string(tokens))

In [None]:
# Tokenize several sentences at once.
string_array = [
    string,
    "Baby shark, doo doo doo doo doo doo, Baby shark!"
]

# Encode both sentences in one call, with padding and truncation enabled.
encoded_str_arr = tokenizer(string_array, padding=True, truncation=True)
encoded_str_arr

In [None]:
from datasets import load_dataset
# Load the poem_sentiment dataset again for the tokenization examples below.
sentiment = load_dataset("poem_sentiment")

In [None]:
def tokenize(batch):
    """Tokenize the ``verse_text`` entries of ``batch`` with padding and truncation enabled."""
    verses = batch["verse_text"]
    return tokenizer(verses, padding=True, truncation=True)

# Try the function on the first three training rows.
print(tokenize(sentiment["train"][:3]))

In [None]:
# Tokenize the whole dataset; batch_size=None hands each split to tokenize() as a single batch.
sentiment_encoded = sentiment.map(tokenize, batched=True, batch_size=None)

In [None]:
# List the columns of the encoded train split.
print(sentiment_encoded["train"].column_names)

In [None]:
# Show the input_ids of the first three rows only; printing the whole column
# floods the notebook output with one id list per training example.
print(sentiment_encoded["train"][:3]["input_ids"])

In [None]:
# Show the attention_mask of the first three rows only; printing the whole column
# floods the notebook output with one mask per training example.
print(sentiment_encoded["train"][:3]["attention_mask"])

# Day13-Hugging Face Transformer 入門

In [None]:
string_arr = [
    "Only those who will risk going too far can possibly find out how far one can go.",
    "Baby shark, doo doo doo doo doo doo, Baby shark!"
]
inputs = tokenizer(string_arr, padding=True, truncation=True, return_tensors="pt") # return PyTorch tensors
print(inputs)

In [None]:
# Use a Transformer model: load the sequence-classification checkpoint and run the encoded inputs through it.
from transformers import AutoModelForSequenceClassification

model_name = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Unpack the tokenizer output as keyword arguments of the forward pass.
outputs = model(**inputs)

In [None]:
# Raw, unnormalized scores produced by the model.
print(outputs.logits)

In [None]:
# PyTorch: turn the logits into probabilities with a softmax over the last dimension.
import torch

predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)
# Result recorded in the original note:
#   sentence 1 -> [0.8561, 0.1439]: probability 0.8561 for result 0, 0.1439 for result 1
#   sentence 2 -> [0.0816, 0.9184]: probability 0.0816 for result 0, 0.9184 for result 1
# (Kept as comments: a trailing string literal would be echoed as the cell's output.)

In [None]:
# Show the model's id -> label mapping.
model.config.id2label

# Day14-Hugging Face Transformer Pipeline 和 TF model

In [None]:
# TensorFlow: load a checkpoint through the TF auto-model class.
# `model_name` is whatever an earlier cell assigned.
from transformers import TFAutoModel

tf_model = TFAutoModel.from_pretrained(model_name)

# Day15- Fine-tune Transformer --- 資料處理篇

In [None]:
from datasets import load_dataset
# Load the poem_sentiment dataset for the fine-tuning walkthrough.
sentiment = load_dataset("poem_sentiment")

In [None]:
# Convert the dataset to pandas.
import pandas as pd

# Switch the output format to pandas (undone later by sentiment.reset_format()).
sentiment.set_format(type="pandas")
df = sentiment["train"][:]
df.head()

In [None]:
def label_int2str(row):
    """Return the class name for the integer label ``row``, using the train split's label feature."""
    label_feature = sentiment["train"].features["label"]
    return label_feature.int2str(row)

# Add a readable label column next to the integer one.
df["label_name"] = df["label"].apply(label_int2str)
df.head()

In [None]:
# Keep the list of label names in a variable.
labels = sentiment["train"].features["label"].names
print(labels)

In [None]:
# Plot the label distribution of the dataset with matplotlib (counts sorted ascending).
import matplotlib.pyplot as plt

df["label_name"].value_counts(ascending=True).plot.barh()
plt.title("Number of labels")
plt.show()

In [None]:
# Reset the dataset's output format (undoes the earlier set_format(type="pandas")).
sentiment.reset_format()

In [None]:
# Load the tokenizer of the base checkpoint that will be fine-tuned.
from transformers import AutoTokenizer

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Tokenize the dataset.
def tokenize(batch):
    """Tokenize the ``verse_text`` entries of ``batch`` with padding and truncation enabled."""
    verses = batch["verse_text"]
    return tokenizer(verses, padding=True, truncation=True)

# batch_size=None hands each split to tokenize() as a single batch.
sentiment_encoded = sentiment.map(tokenize, batched=True, batch_size=None)
# Peek at the first encoded training example.
next(iter(sentiment_encoded["train"]))

In [None]:
# Observation from the original note: the validation split contains no examples
# of class 3, which may cause a bug later when validating.
# (The note used to be a trailing triple-quoted string, which replaced the label
# list as the cell's displayed value; as a comment, the labels are shown again.)
valid_ds = sentiment["validation"]
valid_ds["label"][:]

# Day16- Fine-tune Transformer --- 訓練模型篇

In [None]:
# Load PyTorch.
import torch

# Load the pre-trained model class.
from transformers import AutoModelForSequenceClassification

# Use CUDA when available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Label mappings. Ids are integers on both sides: the previous version used
# string ids ("0".."3"), so label2id handed callers strings where integer
# class ids are expected.
id2label = {0: "negative", 1: "positive", 2: "no_impact", 3: "mixed"}
label2id = {name: idx for idx, name in id2label.items()}

# Number of labels, derived from the mapping so the two cannot drift apart.
num_labels = len(id2label)

# Load the checkpoint named by `model_name` ("distilbert-base-uncased" when the
# notebook is run top to bottom) with a 4-class head, then move it to the device.
my_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
).to(device)

In [None]:
from transformers import Trainer, TrainingArguments

batch_size = 64
# Train rows // batch size, i.e. roughly one log entry per epoch.
logging_steps = len(sentiment_encoded["train"]) // batch_size
# Directory for checkpoints and the final model. Kept in its own variable:
# reusing `model_name` here overwrote the checkpoint name, so re-running the
# model-loading cell afterwards tried to load "poem_model".
output_dir = "poem_model"

# Training configuration.
training_args = TrainingArguments(output_dir=output_dir, # where checkpoints and the final model are stored
                                  num_train_epochs=40,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  disable_tqdm=False,
                                  report_to = "azure_ml", # Azure Machine Learning
                                  logging_steps=logging_steps)

In [None]:
#訓練模型的期間可以監控 accuracy_score 和 f1_score
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Return accuracy and weighted F1 for one evaluation pass.

    ``pred.label_ids`` holds the reference labels; the predicted class is the
    argmax over the last axis of ``pred.predictions``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }

In [None]:
# Set up the Trainer for model training (trainer.train() itself is commented out in a later cell).
from transformers import Trainer

trainer = Trainer(model=my_model, args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=sentiment_encoded["train"],
                  eval_dataset=sentiment_encoded["validation"],
                  tokenizer=tokenizer)

In [None]:
#trainer.evaluate()

In [None]:
#trainer.train()

In [None]:
# Load the fine-tuned model through a pipeline.
# NOTE(review): this path only exists after training has written that checkpoint — confirm before running.
from transformers import pipeline

classifier = pipeline(task= 'sentiment-analysis', 
                      model= "poem_model/checkpoint-500")

In [None]:
# Classify two example sentences with the pipeline built above.
classifier(
    [
        "Only those who will risk going too far can possibly find out how far one can go.",
        "Baby shark, doo doo doo doo doo doo, Baby shark!"
    ]
)

# Day18-Hugging Face 文本生成入門

###### 根據輸入的提示，產生輸出 => 條件式文本生成(conditional text generation)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Use CUDA when available, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "gpt2-xl"
# Load the tokenizer and the causal language model for the same checkpoint, and move the model to the device.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

###### 方法1: 得到 logits 再過 softmax 選最高機率

In [None]:
# Manual greedy decoding: at every step run the model, softmax the logits of the
# last position, record the most likely candidates, and append the top one.
input_txt = "I have a pen, I have an "
input_ids = tokenizer(input_txt, return_tensors="pt")["input_ids"].to(device)
iterations = []
n_steps = 10
choices_per_step = 3

with torch.no_grad():
    for _ in range(n_steps):
        iteration = dict()
        iteration["Input"] = tokenizer.decode(input_ids[0])
        output = model(input_ids)
        # Take the logits of the last token and turn them into probabilities.
        next_token_logits = output.logits[0, -1, :]
        next_token_probs = torch.softmax(next_token_logits, dim=-1)
        sorted_ids = torch.argsort(next_token_probs, dim=-1, descending=True)

        # Record the `choices_per_step` most likely next tokens with their
        # probabilities (this setting was previously defined but never used).
        for choice_idx in range(choices_per_step):
            token_id = sorted_ids[choice_idx].item()
            token_prob = next_token_probs[token_id].item()
            iteration[f"Choice {choice_idx + 1}"] = (
                f"{tokenizer.decode([token_id])} ({100 * token_prob:.2f}%)"
            )

        # Greedy step: append the single most likely token to the running input.
        input_ids = torch.cat([input_ids, sorted_ids[None, 0, None]], dim=-1)
        iterations.append(iteration)

print(iterations[-1])
# The last recorded "Input" predates the final appended token, so also print
# the complete generated text.
print(tokenizer.decode(input_ids[0]))

###### 方法2: 使用 generate()

In [None]:
# Let generate() run the decoding loop; max_length is passed straight to generate().
max_length = 64
input_txt = """I have a pen, I have an iphone, I have a laptop. Thus,"""
input_ids = tokenizer(input_txt, return_tensors="pt")["input_ids"].to(device)
output = model.generate(input_ids, max_length=max_length)
print(tokenizer.decode(output[0]))

# Day19-Hugging Face 文本生成進階

###### 方法1: Greedy Search貪婪搜尋
###### 缺點: 產出大量重複的字句
###### 應用場景: 精確任務或為特定問題提供答案

In [None]:
max_length = 256

input_txt = """
Alistair Darling has been forced to consider a second bailout for banks as the lending drought worsens. \n
The Cancellor will decide tithin weeks whether to pump billions more into the economy as evidence mounts that \
the 37 billion part-nationalisation last yearr has failed to keep credit flowing,
"""
# Greedy search: a single beam (num_beams=1) with sampling switched off.
input_ids = tokenizer(input_txt, return_tensors="pt")["input_ids"].to(device)
output = model.generate(input_ids, max_length=max_length, num_beams=1,  do_sample=False)
print(tokenizer.decode(output[0]))

###### 方法2: Beam Search波束搜尋
###### 應用場景: 精確任務或為特定問題提供答案

In [None]:
max_length = 256

input_txt = """
Alistair Darling has been forced to consider a second bailout for banks as the lending drought worsens. \n
The Cancellor will decide tithin weeks whether to pump billions more into the economy as evidence mounts that \
the 37 billion part-nationalisation last yearr has failed to keep credit flowing,
"""
# Beam search: three beams, sampling off, with no_repeat_ngram_size=5 passed to limit repeated n-grams.
input_ids = tokenizer(input_txt, return_tensors="pt")["input_ids"].to(device)
output = model.generate(input_ids, max_length=max_length, num_beams=3,  do_sample=False, no_repeat_ngram_size=5)
print(tokenizer.decode(output[0]))

###### 方法3: Sampling取樣
###### 應用場景: 生成更長或更有創意的文本

In [None]:
max_length = 256

input_txt = """
Alistair Darling has been forced to consider a second bailout for banks as the lending drought worsens. \n
The Cancellor will decide tithin weeks whether to pump billions more into the economy as evidence mounts that \
the 37 billion part-nationalisation last yearr has failed to keep credit flowing,
"""
# Sampling with temperature=1.5.
# Sampling is random, so fix the RNG seed to make the generated text
# reproducible when the cell is re-run.
torch.manual_seed(5566)
input_ids = tokenizer(input_txt, return_tensors="pt")["input_ids"].to(device)
output = model.generate(input_ids, max_length=max_length, num_beams=1, do_sample=True, temperature=1.5)
print(tokenizer.decode(output[0]))

In [None]:
max_length = 256

input_txt = """
Alistair Darling has been forced to consider a second bailout for banks as the lending drought worsens. \n
The Cancellor will decide tithin weeks whether to pump billions more into the economy as evidence mounts that \
the 37 billion part-nationalisation last yearr has failed to keep credit flowing,
"""
# Sampling with top_k=50.
# Sampling is random, so fix the RNG seed to make the generated text
# reproducible when the cell is re-run.
torch.manual_seed(5566)
input_ids = tokenizer(input_txt, return_tensors="pt")["input_ids"].to(device)
output = model.generate(input_ids, max_length=max_length, num_beams=1, do_sample=True, top_k=50)
print(tokenizer.decode(output[0]))

In [None]:
max_length = 256

input_txt = """
Alistair Darling has been forced to consider a second bailout for banks as the lending drought worsens. \n
The Cancellor will decide tithin weeks whether to pump billions more into the economy as evidence mounts that \
the 37 billion part-nationalisation last yearr has failed to keep credit flowing,
"""
# Sampling with top_p=0.95.
# Sampling is random, so fix the RNG seed to make the generated text
# reproducible when the cell is re-run.
torch.manual_seed(5566)
input_ids = tokenizer(input_txt, return_tensors="pt")["input_ids"].to(device)
output = model.generate(input_ids, max_length=max_length, num_beams=1, do_sample=True, top_p=0.95)
print(tokenizer.decode(output[0]))

# Day20-Hugging Face 中文的文本生成

In [None]:
from transformers import BertTokenizerFast,AutoModelForCausalLM

# Chinese text generation: a BERT tokenizer ('bert-base-chinese') paired with a Chinese GPT-2 checkpoint.
# `device` is expected to have been set by an earlier cell.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')
model = AutoModelForCausalLM.from_pretrained('ckiplab/gpt2-base-chinese').to(device)

In [None]:
max_length=256

input_txt = """
隨著貸款日益枯竭，Alistair Darling 被迫考慮對銀行進行第二次救助。 \
財政大臣將在幾週內決定是否向經濟中再注入數十億美元，因為有證據表明\
去年 370 億的部分國有化未能保持信貸流動，
"""
# Sampling with top_k=50 on the Chinese prompt.
# Sampling is random, so fix the RNG seed to make the generated text
# reproducible when the cell is re-run.
torch.manual_seed(5566)
input_ids = tokenizer(input_txt, return_tensors="pt")["input_ids"].to(device)
output = model.generate(input_ids, max_length=max_length, num_beams=1,  do_sample=True, top_k=50)
print(tokenizer.decode(output[0]))

# Day21-Hugging Face 摘要任務入門

###### Encoder-Decoder transformer

In [None]:
input_text="""
Alistair Darling has been forced to consider a second bailout for banks as the lending drought worsens. 

The Cancellor will decide tithin weeks whether to pump billions more into the economy as evidence mounts that the 37 billion part-nationalisation last yearr has failed to keep credit flowing,

Mr Darling, the former Liberal Democrat chancellor, admitted that the situation had become critical but insisted that there was still time to turn things around. 

He told the BBC that the crisis in the banking sector was the most serious problem facing the economy but also highlighted other issues, such as the falling value of sterling and the threat of inflation. 

"The worst fears about the banking crisis seem not to be panning out," he said, adding that there had not been a single banker arrested or charged over the crash. 

"The economy, the economy"

Mr Darling said "there's been a very, very strong recovery" since the autumn of 2008.

"There are very big problems ahead of us, not least of which is inflation. It is likely to be a very high inflation rate. "

The economy is expected to grow by 0.3% in the quarter to the end of this year.
"""

In [None]:
from transformers import pipeline

# Summarize the article with a summarization pipeline backed by the t5-large checkpoint.
pipe = pipeline("summarization", model="t5-large")
result = pipe(input_text)
result

In [None]:
# Summarize the same article with the google/pegasus-cnn_dailymail checkpoint for comparison.
pipe_pegasus = pipeline("summarization", model="google/pegasus-cnn_dailymail")
result_pegasus = pipe_pegasus(input_text)
result_pegasus

In [None]:
#pip install nltk

In [None]:
import nltk
from nltk.tokenize import sent_tokenize

# Download the "punkt" resource from NLTK.
# presumably required by sent_tokenize; confirm against the NLTK docs
nltk.download("punkt")

In [None]:
# Try the sentence splitter on text containing abbreviations ("U.S.", "Mr.", "vs.").
string = "The U.S. are a country. Mr. White vs. Heisenberg."

sent_tokenize(string)

In [None]:
# Tidy up the summaries: split each into sentences and print one sentence per line.
paragraph_result_T5 = "\n".join(sent_tokenize(result[0]["summary_text"]))
print(paragraph_result_T5)

print()

# For the Pegasus summary, first replace the " .<n>" markers with " ." followed by a newline.
paragraph_result_pegasus = "\n".join(sent_tokenize(result_pegasus[0]["summary_text"].replace(" .<n>", " .\n")))
print(paragraph_result_pegasus)

# Day22-評價摘要好壞的演算法

In [None]:
#pip install rouge_score

In [None]:
# NOTE(review): load_metric may be deprecated or removed in newer `datasets` releases — confirm against the installed version.
from datasets import load_metric

# Load the ROUGE metric.
rouge_metric = load_metric("rouge")

In [None]:
# ROUGE for the T5 summary; the reference here is the full original article text.
scores = rouge_metric.compute(
    predictions=[paragraph_result_T5], references=[input_text]
)
print(scores)

In [None]:
# ROUGE for the Pegasus summary; the reference here is the full original article text.
scores = rouge_metric.compute(
    predictions=[paragraph_result_pegasus], references=[input_text]
)
print(scores)

# Day23- Fine-tuned 摘要任務的 transformer

In [None]:
# URL of the BBC news summary CSV hosted on the Hugging Face Hub.
dataset_url = "https://huggingface.co/datasets/gopalkalpande/bbc-news-summary/raw/main/bbc-news-summary.csv"

In [None]:
#載入 dataset
from datasets import load_dataset
remote_dataset = load_dataset("csv", data_files=dataset_url)

In [None]:
import pandas as pd

# Switch the output format to pandas to inspect the data (undone by reset_format() in the next cell).
remote_dataset.set_format(type="pandas")

# Slice the whole train split.
df = remote_dataset["train"][:]

df.head(10)

In [None]:
# Reset the dataset's output format (undoes set_format(type="pandas")).
remote_dataset.reset_format()

In [None]:
# Shuffle the dataset with a fixed seed.
train_dataset = remote_dataset.shuffle(seed=5566)

In [None]:
from datasets import DatasetDict

# Fixed seed for both splits: train_test_split shuffles by default, so without
# a seed the train/test/valid membership changed on every run.
split_seed = 5566

# 90% train / 10% held out ...
train_test_dataset = train_dataset['train'].train_test_split(test_size=0.1, seed=split_seed)

# ... and the held-out part is halved into validation and test.
test_valid = train_test_dataset['test'].train_test_split(test_size=0.5, seed=split_seed)

train_test_valid_dataset = DatasetDict({
    'train': train_test_dataset['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

train_test_valid_dataset

In [None]:
from transformers import AutoModelForSeq2SeqLM,AutoTokenizer
import torch

model_name = "google/pegasus-cnn_dailymail"
# Use CUDA when available, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the seq2seq model (moved to the device) and the tokenizer for the same checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def convert_dataset(dataset):
    """Tokenize a batch of articles and summaries into model inputs and labels.

    Args:
        dataset: a batch with an "Articles" column (source texts) and a
            "Summaries" column (target texts).

    Returns:
        dict with "input_ids" and "attention_mask" from the articles
        (truncated to 512 tokens) and "labels" holding the token ids of the
        summaries (truncated to 64 tokens).
    """
    input_encodings = tokenizer(dataset["Articles"], max_length=512,
                                truncation=True)

    # `text_target=` tokenizes the summaries as targets; it replaces the
    # deprecated `with tokenizer.as_target_tokenizer():` context manager.
    target_encodings = tokenizer(text_target=dataset["Summaries"], max_length=64,
                                 truncation=True)

    return {"input_ids": input_encodings["input_ids"],
            "attention_mask": input_encodings["attention_mask"],
            "labels": target_encodings["input_ids"]}

# Tokenize every split in batches.
dataset_pt = train_test_valid_dataset.map(convert_dataset,
                                       batched=True)
# Expose only the model columns, as torch tensors.
columns = ["input_ids", "labels", "attention_mask"]
dataset_pt.set_format(type="torch", columns=columns)

In [None]:
# NOTE(review): `trainer` here is the transformers.trainer module; it is not used
# in this cell and the name is rebound when the Seq2SeqTrainer is created later.
from transformers import Seq2SeqTrainingArguments, trainer

# Last path component of the checkpoint name, e.g. "pegasus-cnn_dailymail".
model_saved_name = model_name.split("/")[-1]

args = Seq2SeqTrainingArguments(
    # Use the slash-free name computed above; it was previously unused and the
    # full "org/name" string produced a nested output directory.
    output_dir=f"{model_saved_name}-finetuned",
    num_train_epochs=1,
    warmup_steps=100,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=100,
    # Integer step count (was the float 1e6).
    save_steps=1_000_000,
    gradient_accumulation_steps=64,
    # compute_metrics in this notebook decodes predictions with
    # tokenizer.batch_decode, so evaluation must return generated token ids
    # rather than raw logits.
    predict_with_generate=True,
    report_to="azure_ml"
)

In [None]:
from transformers import DataCollatorForSeq2Seq

# Data collator for seq2seq batches, built from the tokenizer and the model.
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
import nltk
from nltk.tokenize import sent_tokenize

# Download the "punkt" resource from NLTK.
# presumably required by sent_tokenize; confirm against the NLTK docs
nltk.download("punkt")

In [None]:
# NOTE(review): load_metric may be deprecated or removed in newer `datasets` releases — confirm against the installed version.
from datasets import load_metric

# Load the ROUGE metric used by compute_metrics below.
rouge_metric = load_metric("rouge")

In [None]:
import numpy as np

def compute_metrics(eval_pred):
    """Compute ROUGE F-measures (as percentages) for one evaluation pass.

    Args:
        eval_pred: pair ``(predictions, labels)`` of token-id arrays.
            ``predictions`` may also be a tuple whose first element holds the
            ids.

    Returns:
        dict mapping each ROUGE key to its mid F-measure * 100, rounded to
        four decimals.
    """
    predictions, labels = eval_pred
    # Some models hand back a tuple of outputs; the first element is the one to decode.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is a padding/ignore marker (DataCollatorForSeq2Seq fills labels
    # with it), not a real token id, so swap it for the pad token before
    # decoding. Applied to predictions too, in case they were padded the same way.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Put one sentence per line before scoring.
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]

    result = rouge_metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Keep the mid F-measure of every ROUGE variant, scaled to a percentage.
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    return {k: round(v, 4) for k, v in result.items()}

In [None]:
from transformers import Seq2SeqTrainer

# Assemble the seq2seq trainer: model, training arguments, train/validation splits,
# the seq2seq data collator, tokenizer and the ROUGE-based compute_metrics.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset= dataset_pt["train"],
    eval_dataset = dataset_pt["valid"],
    data_collator=seq2seq_data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
#trainer.evaluate()

In [None]:
#trainer.train()

# Day24- Hugging Face Named Entity Recognition

###### Named Entity Recognition(NER)。 一般翻譯為命名實體辨識、命名實體識別，或也有人翻成專有名詞辨識

In [None]:
sample_text = """
Alistair Darling has been forced to consider a second bailout for banks as the lending drought worsens. 

The Cancellor will decide tithin weeks whether to pump billions more into the economy as evidence mounts that the 37 billion part-nationalisation last yearr has failed to keep credit flowing,

Mr Darling, the former Liberal Democrat chancellor, admitted that the situation had become critical but insisted that there was still time to turn things around. 

He told the BBC that the crisis in the banking sector was the most serious problem facing the economy but also highlighted other issues, such as the falling value of sterling and the threat of inflation. 

"The worst fears about the banking crisis seem not to be panning out," he said, adding that there had not been a single banker arrested or charged over the crash. 

"The economy, the economy"

Mr Darling said "there's been a very, very strong recovery" since the autumn of 2008.

"There are very big problems ahead of us, not least of which is inflation. It is likely to be a very high inflation rate. "

The economy is expected to grow by 0.3% in the quarter to the end of this year.
"""

In [None]:
from transformers import pipeline
import pandas as pd


# Run a named-entity-recognition pipeline (no model specified, so the library default is used)
# over the sample text and show the results as a DataFrame.
ner = pipeline("ner")
outputs = ner(sample_text)
pd.DataFrame(outputs)

# Day25- Hugging Face 問答任務

In [None]:
question = "who is Mr Darling"
context = """
Alistair Darling has been forced to consider a second bailout for banks as the lending drought worsens. 

The Cancellor will decide tithin weeks whether to pump billions more into the economy as evidence mounts that the 37 billion part-nationalisation last yearr has failed to keep credit flowing,

Mr Darling, the former Liberal Democrat chancellor, admitted that the situation had become critical but insisted that there was still time to turn things around. 

He told the BBC that the crisis in the banking sector was the most serious problem facing the economy but also highlighted other issues, such as the falling value of sterling and the threat of inflation. 

"The worst fears about the banking crisis seem not to be panning out," he said, adding that there had not been a single banker arrested or charged over the crash. 

"The economy, the economy"

Mr Darling said "there's been a very, very strong recovery" since the autumn of 2008.

"There are very big problems ahead of us, not least of which is inflation. It is likely to be a very high inflation rate. "

The economy is expected to grow by 0.3% in the quarter to the end of this year.
"""

In [None]:
from transformers import AutoModelForQuestionAnswering
from transformers import AutoTokenizer

# Load the tokenizer and the question-answering model for the same checkpoint.
model_name = "deepset/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

In [None]:
from transformers import pipeline

# Build a question-answering pipeline from the model and tokenizer loaded above.
pipe = pipeline("question-answering", model=model, tokenizer=tokenizer)
pipe(question=question, context=context, top_k=3) # top_k=3: show the three highest-probability answers

In [None]:
question = "What is the problem Mr Darling told to BBC?"

In [None]:
from transformers import pipeline

# Rebuild the question-answering pipeline and ask the current `question` against the same context.
pipe = pipeline("question-answering", model=model, tokenizer=tokenizer)
pipe(question=question, context=context, top_k=3)

# Day27-Transformer 效能優化

In [None]:
#pip install optimum

In [None]:
#pip install torch torchvision torchaudio

In [None]:
#pip install --upgrade transformers

In [1]:
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForQuestionAnswering

# Load the question-answering model through Optimum's ONNX Runtime model class,
# and the tokenizer from the deepset/roberta-base-squad2 checkpoint.
model = ORTModelForQuestionAnswering.from_pretrained("optimum/roberta-base-squad2") 
tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")

# The ONNX model plugs into the regular question-answering pipeline.
onnx_qa = pipeline("question-answering",model=model,tokenizer=tokenizer)

question = "What's my name?"
context = "My name is Ko Ko and I live in Taiwan."
result = onnx_qa(question, context)

print(result)

2023-09-28 06:25:09.086600: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-09-28 06:25:09.218734: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-28 06:25:09.906323: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-09-28 06:25:09.906417: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] 

Downloading (…)lve/main/config.json:   0%|          | 0.00/728 [00:00<?, ?B/s]

Downloading model.onnx:   0%|          | 0.00/496M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

{'score': 0.9248433709144592, 'start': 11, 'end': 16, 'answer': 'Ko Ko'}
