# Processor

In [2]:
from transformers.image_processing_utils import BaseImageProcessor,BatchFeature
from datasets import load_dataset
from transformers.processing_utils import ProcessorMixin

In [None]:
class MobileNetV1ImageProcessor(BaseImageProcessor):
    """
    Image-only processor.

    It currently wraps the given images, unchanged, into a ``BatchFeature``
    under the ``"pixel_values"`` key; no resize / crop / rescale / normalize
    step is implemented yet.
    """

    model_input_names = ["pixel_values"]

    def __init__(self, **kwargs):
        # Accept and forward keyword arguments so that configuration values
        # handed to the constructor reach the base class instead of raising.
        super().__init__(**kwargs)

    def resize(self):
        """Placeholder: resizing is not implemented and nothing is returned."""
        return

    def preprocess(self, images, return_tensors=None, **kwargs):
        """
        Wrap ``images`` into a ``BatchFeature``.

        Data transforms only need to be functions that operate on PIL images;
        dtype and channel-layout adjustments belong in this method. To reuse
        the augmentation methods from mmpretrain the input has to be given as
        ``{'img': img}``, so a string-key mapping would have to be added.

        The base class ``__call__`` dispatches to this method. ``images`` may
        be a single image or a batch (PIL -> np.array).

        Steps intended for this method (none of them is implemented yet):
        ``do_resize``, ``do_center_crop``, ``do_rescale``, ``do_normalize``,
        followed by a ToTensor()-style conversion.

        Args:
            images (`ImageInput`): PIL image(s) to preprocess. Expects a
                single image or a batch of images with pixel values ranging
                from 0 to 255. If passing in images with pixel values between
                0 and 1, set `do_rescale=False`.
            return_tensors: forwarded to ``BatchFeature`` as ``tensor_type``;
                ``None`` (the default) leaves the data unconverted.
                NOTE(review): since the images are passed through as-is,
                requesting tensors presumably only works when the caller
                already supplies array-like data - confirm.
            **kwargs: accepted for interface compatibility; currently unused.

        Returns:
            BatchFeature: holds the images under ``"pixel_values"``.
        """
        # Only the image output exists here.
        data = {"pixel_values": images}
        return BatchFeature(data=data, tensor_type=return_tensors)

In [None]:
class AlignProcessor(ProcessorMixin):
    """
    Processor that handles images and text together.

    Text is delegated to ``self.tokenizer`` and images to
    ``self.image_processor``; when both are given the pixel values are merged
    into the tokenizer output.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "EfficientNetImageProcessor"
    tokenizer_class = ("BertTokenizer", "BertTokenizerFast")

    def __init__(self, image_processor=None, tokenizer=None):
        super().__init__(image_processor, tokenizer)

    def __call__(self, text=None, images=None,
                 padding="max_length",
                 max_length=64,
                 return_tensors=None, **kwargs):
        """
        Tokenize ``text`` and/or preprocess ``images``.

        Args:
            text: input passed to the tokenizer, or ``None`` to skip it.
            images: input passed to the image processor, or ``None`` to skip it.
            padding: padding strategy forwarded to the tokenizer.
            max_length: maximum length forwarded to the tokenizer.
            return_tensors: tensor type forwarded to both sub-processors.
            **kwargs: forwarded unchanged to both the tokenizer and the image
                processor.

        Returns:
            The tokenizer output (with ``pixel_values`` added when images are
            given as well), or a ``BatchEncoding`` built from the image
            features when only images are given.

        Raises:
            ValueError: if both ``text`` and ``images`` are ``None``.
        """
        # Imported locally: the image-only branch needs BatchEncoding, which
        # is not imported at the top of this file.
        from transformers.tokenization_utils_base import BatchEncoding

        if text is None and images is None:
            raise ValueError("You have to specify either text or images. Both cannot be none.")

        encoding = None
        if text is not None:
            encoding = self.tokenizer(
                text, padding=padding, max_length=max_length, return_tensors=return_tensors, **kwargs
            )

        if images is None:
            # Text-only call.
            return encoding

        image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs)

        if text is not None:
            # Both modalities: attach the pixel values to the text encoding.
            encoding["pixel_values"] = image_features.pixel_values
            return encoding

        # Image-only call. BatchEncoding mainly decides, based on
        # return_tensors, whether the inputs are converted to tensors.
        return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors)


# Tokenizer

In [None]:
from transformers import AutoTokenizer
# Load the pretrained "gpt2" tokenizer, which is used to map text to tokens.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [None]:
# Train: derive a new tokenizer from the existing one, using the data yielded
# by batch_iterator() and a vocabulary size of 25000.
# NOTE(review): `batch_iterator` and `dataset` are not defined in the visible
# part of this file - confirm they are defined in an earlier cell.
new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(), vocab_size=25000)
# Use: tokenize the "text" field of the first five dataset rows.
new_tokenizer(dataset[:5]["text"])

# Save to the local directory "my-new-tokenizer".
new_tokenizer.save_pretrained("my-new-tokenizer")
# Load it back from that directory.
# NOTE(review): from_pretrained is invoked through the instance here; confirm
# this is intended rather than calling it on the tokenizer class.
tok = new_tokenizer.from_pretrained("my-new-tokenizer")

# 从头训练一个

In [None]:
#Byte-Pair Encoding (BPE), WordPiece, and SentencePiece三种

In [None]:
from tokenizers import decoders, models, normalizers, pre_tokenizers, processors, trainers, Tokenizer

# Start from an empty WordPiece model; "[UNK]" is declared as the unknown
# token (the keyword is `unk_token`, the previous spelling was a typo).
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

In [None]:
# Normalization step: BERT normalizer with lowercasing enabled.
tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)
# Pre-tokenization step: BERT pre-tokenizer.
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

In [None]:
# Training is a little special here: it needs a dedicated trainer object.
# The special tokens are handed to the trainer together with the target
# vocabulary size.
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordPieceTrainer(vocab_size=25000, special_tokens=special_tokens)

In [None]:
# Train the tokenizer on the data yielded by batch_iterator().
# NOTE(review): `batch_iterator` is not defined in the visible part of this
# file - confirm it is defined in an earlier cell.
tokenizer.train_from_iterator(batch_iterator(), trainer=trainer)

In [None]:
# Encode two sentences together as a pair.
encoding = tokenizer.encode("This is one sentence.", "With this one we have a pair.")

# 完整实例

In [None]:
# Complete example: build a byte-level BPE tokenizer from scratch.
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
# Demo call showing how the pre-tokenizer splits a string; the result is not
# stored.
tokenizer.pre_tokenizer.pre_tokenize_str("This is an example!")
trainer = trainers.BpeTrainer(vocab_size=25000, special_tokens=["<|endoftext|>"])
# NOTE(review): `batch_iterator` is not defined in the visible part of this
# file - confirm it is defined in an earlier cell.
tokenizer.train_from_iterator(batch_iterator(), trainer=trainer)
# A byte-level pre-tokenizer needs the matching post-processor and decoder;
# without the decoder, decoded text keeps the raw byte-level symbols instead
# of being mapped back to the original characters.
tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
tokenizer.decoder = decoders.ByteLevel()

In [None]:
from transformers import GPT2TokenizerFast
# Wrap the trained `tokenizers.Tokenizer` object in a transformers fast
# tokenizer class.
new_tokenizer = GPT2TokenizerFast(tokenizer_object=tokenizer)

In [None]:
from transformers import BertTokenizer

# Load the pretrained "bert-base-uncased" tokenizer; note that this rebinds
# the name `tokenizer` used by the earlier cells.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Split a sample sentence into tokens.
tokenizer.tokenize("I have a new GPU!")

# 视频图片处理

In [None]:
def read_video_pyav(container, indices):
    """
    Sample frames from a video and return them as one stacked array.

    Args:
        container: object exposing ``seek`` and ``decode(video=0)``
            (called below with the result of ``av.open``).
        indices: ordered frame positions to keep; the first and last entries
            bound the range that is scanned.

    Returns:
        ``np.stack`` of the selected frames, each converted with
        ``to_ndarray(format="rgb24")``.
    """
    container.seek(0)
    first_wanted, last_wanted = indices[0], indices[-1]

    selected = []
    for position, frame in enumerate(container.decode(video=0)):
        # Nothing past the last requested position is needed.
        if position > last_wanted:
            break
        # Skip everything before the first requested position as well as the
        # positions that were not asked for.
        if position < first_wanted or position not in indices:
            continue
        selected.append(frame)

    return np.stack([picked.to_ndarray(format="rgb24") for picked in selected])

def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    """
    Generate the frame positions to sample.

    A window of ``int(clip_len * frame_sample_rate)`` frames is placed at a
    random position, with its end drawn by ``np.random.randint`` from
    ``[window, seg_len)``, and ``clip_len`` evenly spaced positions are taken
    from it.

    Args:
        clip_len: number of positions to return.
        frame_sample_rate: multiplier applied to ``clip_len`` to get the
            window length.
        seg_len: exclusive upper bound for the random window end.

    Returns:
        ``np.int64`` array of ``clip_len`` positions, clipped so that the last
        one stays below the window end.
    """
    window = int(clip_len * frame_sample_rate)
    stop = np.random.randint(window, seg_len)
    start = stop - window
    positions = np.linspace(start, stop, num=clip_len)
    # linspace includes `stop`; clip so the last position stays inside the window.
    return np.clip(positions, start, stop - 1).astype(np.int64)


In [None]:
# video clip consists of 300 frames (10 seconds at 30 FPS)
# NOTE(review): `hf_hub_download` and `av` are not imported in the visible
# part of this file - confirm they are imported in an earlier cell.
file_path = hf_hub_download(
    repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
)
container = av.open(file_path)

# sample 32 frames
indices = sample_frame_indices(clip_len=32, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
# The result is a single stacked NumPy array holding the sampled frames.
video = read_video_pyav(container=container, indices=indices)

In [None]:
# Load the image processor of the "google/vivit-b-16x2-kinetics400" checkpoint.
# NOTE(review): `VivitImageProcessor` is not imported in the visible part of
# this file - confirm it is imported in an earlier cell.
image_processor = VivitImageProcessor.from_pretrained("google/vivit-b-16x2-kinetics400")
# prepare video for the model: pass the frames as a list and request "pt" tensors
inputs = image_processor(list(video), return_tensors="pt")
