In [1]:
import argparse
import json
import logging
import math
import os
import random
from pathlib import Path

import datasets
import evaluate
import nltk
import numpy as np
import torch
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import set_seed
from datasets import load_dataset
from filelock import FileLock
from huggingface_hub import Repository, create_repo
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

import transformers
from transformers import (
    CONFIG_MAPPING,
    MODEL_MAPPING,
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    SchedulerType,
    get_scheduler,
)

  from .autonotebook import tqdm as notebook_tqdm
2023-10-27 13:51:12.447070: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-10-27 13:51:12.447094: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-10-27 13:51:12.447111: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-10-27 13:51:12.451857: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
model_path = "/nfs/nas-6.1/whlin/ADL/ADL23-HW2/checkpoint/google_mt5_small_3e-4/checkpoint-23202"

print("summary_model_path: ", model_path)
# -------------------------- prepare dataset

# load raw dataset
raw_datasets = load_dataset("json", data_files={"test": "../data/public.jsonl"})
raw_datasets["test"] = raw_datasets["test"].select(range(10))

# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

embedding_size = model.get_input_embeddings().weight.shape[0]
if len(tokenizer) > embedding_size:
    model.resize_token_embeddings(len(tokenizer))
if model.config.decoder_start_token_id is None:
    raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined")

prefix = "summarize: "


max_target_length = 64
column_names = raw_datasets["test"].column_names

padding = False
text_column = "maintext"
summary_column = "title"

def preprocess_function(examples):
    inputs = examples[text_column]
    inputs = [prefix + inp for inp in inputs]
    model_inputs = tokenizer(inputs, max_length=384, padding=padding, truncation=True)

    return model_inputs

def postprocess_text(preds):
    preds = [pred.strip() for pred in preds]
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    return preds


test_dataset = raw_datasets["test"].map(
    preprocess_function,
    batched=True,
    remove_columns=column_names,
    desc="Running tokenizer on dataset",
)


label_pad_token_id = -100 
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=None,
)

test_dataloader = DataLoader(test_dataset, collate_fn=data_collator, batch_size=4)

# ----------------- prepare model --------------
# load model
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu");
model.to(device)
# model, test_dataloader = accelerator.prepare(model, test_dataloader)

# --------------- predict ---------------
model.eval()

gen_kwargs = {
    "max_length": 64,
    "num_beams": 1,
}

all_prediction = list()
for batch in tqdm(test_dataloader):
    batch = {k: v.to(device) for k, v in batch.items()}

    with torch.no_grad():
        generated_tokens = model.generate(
            batch["input_ids"],
            attention_mask=batch["attention_mask"],
            **gen_kwargs,
        )

        generated_tokens = generated_tokens.detach().cpu().numpy()
        decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

        decoded_preds= postprocess_text(decoded_preds)
        all_prediction.extend(decoded_preds)

summary_model_path:  /nfs/nas-6.1/whlin/ADL/ADL23-HW2/checkpoint/google_mt5_small_3e-4/checkpoint-23202


  0%|          | 0/3 [00:00<?, ?it/s]You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 3/3 [00:01<00:00,  2.17it/s]


In [3]:
print(all_prediction)

['Anker新款真無線藍牙耳機 確定引進台灣市場', '全台最熱門鐵道自行車!\n三條「最美C路線」 加碼親子旅遊推薦', '華碩推出換上Intel第11代Core處理器 Chromebook Flip CX5 擴展企業應用需求', '新冠肺炎疫情改變產業發展 從供應端看產業轉型', '微軟:全球僅有15億裝置 但仍有超過1億台', '台幣貶值轉換到明年?', '美國網購平台上架「哈台馬克杯」 網友熱銷1萬件、亞馬遜網購平台還賣到缺貨', '華碩更新雙螢幕筆電、ZenBook Duo 14、Pro Duo 15OLED 增加更自然操作需求', '週末炸雞加酒!\n臺虎推「周末炸雞俱樂部」 首款炸物專門啤酒', 'NBA/曾被交易到籃網 紐媒爆料厄文「未爆彈」']


In [21]:
import pandas as pd

In [23]:
df = pd.DataFrame(list(zip(raw_datasets['test']['id'], all_prediction)),
            columns =['id', 'title'])

In [24]:
df

Unnamed: 0,id,title
0,21710,Anker新款真無線藍牙耳機 確定引進台灣市場
1,21711,全台最熱門鐵道自行車!\n三條「最美C路線」 加碼親子旅遊推薦
2,21712,華碩推出換上Intel第11代Core處理器 Chromebook Flip CX5 擴展企...
3,21713,新冠肺炎疫情改變產業發展 從供應端看產業轉型
4,21714,微軟:全球僅有15億裝置 但仍有超過1億台
5,21715,台幣貶值轉換到明年?
6,21716,美國網購平台上架「哈台馬克杯」 網友熱銷1萬件、亞馬遜網購平台還賣到缺貨
7,21717,華碩更新雙螢幕筆電、ZenBook Duo 14、Pro Duo 15OLED 增加更自然操作需求
8,21718,週末炸雞加酒!\n臺虎推「周末炸雞俱樂部」 首款炸物專門啤酒
9,21719,NBA/曾被交易到籃網 紐媒爆料厄文「未爆彈」
