In [1]:
import csv
import json
import os
from tqdm import tqdm

def create_jsonl_from_csv(csv_file, output_jsonl_file, image_dir):
    """
    Function to create a JSONL file from a CSV where the question is based on the entity_name.
    """
    jsonl_entries = []

    # Open the CSV file
    with open(csv_file, 'r') as f:
        reader = csv.DictReader(f)

        # Iterate over each row in the CSV with tqdm for progress bar
        for idx, row in tqdm(enumerate(reader), total=sum(1 for _ in open(csv_file))-1, desc="Processing data"):
            # Get the local image path
            image_filename = os.path.basename(row["image_link"])
            local_image_path = os.path.join(image_dir, image_filename)

            # The question is simply the entity_name
            question = f"{row['entity_name']}"

            entry = {
                "id": idx,
                "image": local_image_path,
                "conversations": [
                    {
                        "from": "human",
                        "value": f"<image>\n{question}"
                    }
                ]
            }

            jsonl_entries.append(entry)

    # Write the JSONL data to a file
    with open(output_jsonl_file, 'w') as f:
        for entry in jsonl_entries:
            json.dump(entry, f)
            f.write('\n')

    print(f"JSONL dataset saved to {output_jsonl_file}")

# Example usage:
csv_file = '/workspace/dataset/test.csv'  # Path to your test CSV file
image_dir = '/workspace/dataset/test/'  # Directory where your images are stored
output_jsonl_file = '/workspace/dataset/test_data.jsonl'  # Path to save the output JSONL file

# Create the JSONL dataset
create_jsonl_from_csv(csv_file, output_jsonl_file, image_dir)


Processing data: 100%|██████████| 131187/131187 [00:00<00:00, 142632.85it/s]


JSONL dataset saved to /workspace/dataset/test_data.jsonl


In [1]:
import numpy as np
import torch
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio

def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # calculate the existing image aspect ratio
    target_ratios = set(
        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
        i * j <= max_num and i * j >= min_num)
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    # find the closest aspect ratio to the target
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size)

    # calculate the target width and height
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    # resize the image
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        # split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images

def load_image(image_file, input_size=448, max_num=12):
    image = Image.open(image_file).convert('RGB')
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = [transform(image) for image in images]
    pixel_values = torch.stack(pixel_values)
    return pixel_values



In [2]:
entity_unit_map = {
    'width': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'depth': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'height': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'item_weight': {'gram',
        'kilogram',
        'microgram',
        'milligram',
        'ounce',
        'pound',
        'ton'},
    'maximum_weight_recommendation': {'gram',
        'kilogram',
        'microgram',
        'milligram',
        'ounce',
        'pound',
        'ton'},
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': {'centilitre',
        'cubic foot',
        'cubic inch',
        'cup',
        'decilitre',
        'fluid ounce',
        'gallon',
        'imperial gallon',
        'litre',
        'microlitre',
        'millilitre',
        'pint',
        'quart'}
}

In [3]:
generation_config = dict(max_new_tokens=1024, do_sample=False)

In [None]:
import re
import json
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer
import csv
from tqdm import tqdm
import math
import torch
from transformers import AutoTokenizer, AutoModel


# Function to load the test.jsonl file and extract image paths and questions
# def load_test_data_from_jsonl(jsonl_file):
#     image_files = []
#     questions_list = []
#     indices = []

#     with open(jsonl_file, 'r') as f:
#         for line in tqdm(f, desc="Loading test data", total=sum(1 for _ in open(jsonl_file))):
#             data = json.loads(line)
#             image_path = data['image']  # Extract the image file path
#             image_files.append(image_path)  # Add image file path to the list

#             conversation = data['conversations']
#             for item in conversation:
#                 if item['from'] == 'human':  # Extract the question from the human input
#                     key = item['value'].split('\n')[1]  # Extract entity name from the question

#                     # Now use this key to get the corresponding units from entity_unit_map
#                     item_weight_units = ', '.join(entity_unit_map[key])

#                     # Modify the question string by replacing the placeholder with the actual units
#                     question = f' in one of the following ({item_weight_units}) unit'
#                     questions_list.append('What is ' + item['value'] + ' of the item' + question)

#                     indices.append(data['id'])  # Add the index from the test data
                    
#     return image_files, questions_list, indices

def load_test_data_from_jsonl(jsonl_file, max_lines=1000):
    image_files = []
    questions_list = []
    indices = []

    with open(jsonl_file, 'r') as f:
        for i, line in enumerate(tqdm(f, desc="Loading test data", total=sum(1 for _ in open(jsonl_file)))):
            if i >= max_lines:  # Stop after processing max_lines
                break
            data = json.loads(line)
            image_path = data['image']  # Extract the image file path
            image_files.append(image_path)  # Add image file path to the list

            conversation = data['conversations']
            for item in conversation:
                if item['from'] == 'human':  # Extract the question from the human input
                    key = item['value'].split('\n')[1]  # Extract entity name from the question

                    # Use this key to get the corresponding units from entity_unit_map
                    item_weight_units = ', '.join(entity_unit_map[key])

                    # Modify the question string by replacing the placeholder with the actual units
                    question = f' in one of the following ({item_weight_units}) unit'
                    questions_list.append('What is ' + item['value'] + ' of the item' + question)

                    indices.append(data['id'])  # Add the index from the test data

    return image_files, questions_list, indices

# Function to load and preprocess the image
def load_image(image_file, input_size=448, max_num=12):
    image = Image.open(image_file).convert('RGB')
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = [transform(image) for image in images]
    pixel_values = torch.stack(pixel_values)
    return pixel_values


def convert_and_append_unit(text):
    match = re.search(r'\d+(\.\d+)?', text)  # Search for a number (with optional decimal)
    if match:
        number = float(match.group())  # Convert the matched number to float
        unit = text.split()[-1]  # Extract the unit (last part of the string)
        return f"{number} {unit}"
    else:
        return ""  # Return an empty string if no number is found


# Function to run inference and make predictions on test data
def run_inference_on_test_data(model, tokenizer, image_files, questions):
    predictions = []

    for image_file, question in tqdm(zip(image_files, questions), desc="Running inference", total=len(image_files)):
        pixel_values = load_image(image_file, max_num=12).to(torch.bfloat16).cuda()
        responses = model.batch_chat(tokenizer, pixel_values, num_patches_list=[pixel_values.size(0)], generation_config=generation_config, questions=[question])

        predicted_weight = responses[0]  # Assuming a single response per image
        predictions.append(convert_and_append_unit(predicted_weight))

    return predictions


# Function to save predictions to a CSV file in the required format
def save_predictions_to_csv(predictions, indices, output_csv_file):
    with open(output_csv_file, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['index', 'prediction'])  # Write the header

        for idx, prediction in tqdm(zip(indices, predictions), desc="Saving predictions", total=len(predictions)):
            writer.writerow([idx, prediction])  # Write index and prediction


# Load test data (image paths, questions, and indices) from JSONL file
image_files, questions_list, indices = load_test_data_from_jsonl('/workspace/dataset/test_data.jsonl')

def split_model(model_name):
    device_map = {}
    world_size = torch.cuda.device_count()
    num_layers = {
        'InternVL2-1B': 24, 'InternVL2-2B': 24, 'InternVL2-4B': 32, 'InternVL2-8B': 32,
        'InternVL2-26B': 48, 'InternVL2-40B': 60, 'InternVL2-Llama3-76B': 80}[model_name]
    # Since the first GPU will be used for ViT, treat it as half a GPU.
    num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
    num_layers_per_gpu = [num_layers_per_gpu] * world_size
    num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
    layer_cnt = 0
    for i, num_layer in enumerate(num_layers_per_gpu):
        for j in range(num_layer):
            device_map[f'language_model.model.layers.{layer_cnt}'] = i
            layer_cnt += 1
    device_map['vision_model'] = 0
    device_map['mlp1'] = 0
    device_map['language_model.model.tok_embeddings'] = 0
    device_map['language_model.model.embed_tokens'] = 0
    device_map['language_model.output'] = 0
    device_map['language_model.model.norm'] = 0
    device_map['language_model.lm_head'] = 0
    device_map[f'language_model.model.layers.{num_layers - 1}'] = 0

    return device_map

generation_config = dict(max_new_tokens=1024, do_sample=False)

# Load model and tokenizer
# path = '/workspace/work_dirs/internvl_chat_v2_0/internvl2_2b_internlm2_1_8b_dynamic_res_2nd_finetune_lora_amazon_merge'
# device_map = split_model('InternVL2-8B')
path = 'OpenGVLab/InternVL2-2B'

model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)

# Run inference on the test data
predictions = run_inference_on_test_data(model, tokenizer, image_files, questions_list)

# Save the predictions to a CSV file in the required format
output_csv_file = '/workspace/dataset/test_predictions.csv'
save_predictions_to_csv(predictions, indices, output_csv_file)

print(f"Predictions saved to {output_csv_file}")


Loading test data:   1%|          | 1000/131187 [00:00<00:00, 216536.09it/s]
Running inference:   4%|▎         | 37/1000 [02:36<1:14:16,  4.63s/it]

In [1]:
import torch
torch.cuda.empty_cache()

import gc
gc.collect()

0

In [None]:
from huggingface_hub import login
from huggingface_hub import HfApi, HfFolder

login('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')

In [None]:
import re
import json
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer, GenerationConfig
import csv
from tqdm import tqdm
import math
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

# Define the entity_unit_map
entity_unit_map = {
    'width': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'depth': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'height': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'item_weight': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'maximum_weight_recommendation': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': {'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre', 'fluid ounce', 'gallon',
                    'imperial gallon', 'litre', 'microlitre', 'millilitre', 'pint', 'quart'}
}


class TestDataset(Dataset):
    def __init__(self, jsonl_file, transform, entity_unit_map, max_lines=1000):
        self.image_paths = []
        self.questions = []
        self.indices = []
        self.transform = transform
        self.entity_unit_map = entity_unit_map

        with open(jsonl_file, 'r') as f:
            lines = f.readlines()
            if max_lines:
                lines = lines[:max_lines]
            for line in tqdm(lines, desc="Loading test data"):
                data = json.loads(line)
                image_path = data['image']  # Extract the image file path
                self.image_paths.append(image_path)  # Add image file path to the list

                conversation = data['conversations']
                for item in conversation:
                    if item['from'] == 'human':  # Extract the question from the human input
                        key = item['value'].split('\n')[1]  # Extract entity name from the question

                        # Use this key to get the corresponding units from entity_unit_map
                        item_weight_units = ', '.join(self.entity_unit_map[key])

                        # Modify the question string by replacing the placeholder with the actual units
                        question = f"What is {key} of the item in one of the following units ({item_weight_units})?"
                        self.questions.append(question)

                        self.indices.append(data['id'])  # Add the index from the test data

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        # Load and preprocess the image
        image_file = self.image_paths[idx]
        image = Image.open(image_file).convert('RGB')

        # Process image into patches
        images = dynamic_preprocess(
            image,
            image_size=448,
            use_thumbnail=True,
            max_num=12
        )
        pixel_values = [self.transform(img) for img in images]
        pixel_values = torch.stack(pixel_values)  # Shape: [num_patches, channels, height, width]

        # Get the question and index
        question = self.questions[idx]
        index = self.indices[idx]

        return pixel_values, question, index

def collate_fn(batch):
    pixel_values_list, questions, indices = zip(*batch)
    # Combine pixel values into a list
    pixel_values = list(pixel_values_list)
    questions = list(questions)
    indices = list(indices)
    return pixel_values, questions, indices

def convert_and_append_unit(text):
    match = re.search(r'\d+(\.\d+)?', text)  # Search for a number (with optional decimal)
    if match:
        number = float(match.group())  # Convert the matched number to float
        unit = text.split()[-1]  # Extract the unit (last part of the string)
        return f"{number} {unit}"
    else:
        return ""  # Return an empty string if no number is found

def run_inference_on_test_data(model, tokenizer, dataloader, device):
    predictions = []
    indices_list = []

    model.eval()
    for pixel_values_list, questions, indices in tqdm(dataloader, desc="Running inference"):
        # Move inputs to the appropriate device and flatten pixel values
        pixel_values = [pv.to(device, dtype=torch.bfloat16) for pv in pixel_values_list]
        num_patches_list = [pv.size(0) for pv in pixel_values]  # Number of patches per image

        # Concatenate pixel_values along the batch dimension
        pixel_values = torch.cat(pixel_values, dim=0)  # Shape: [total_patches, channels, height, width]

        # Run inference
        with torch.no_grad():
            responses = model.batch_chat(
                tokenizer,
                pixel_values,
                num_patches_list=num_patches_list,
                generation_config=generation_config,
                questions=questions
            )

        for response in responses:
            predicted_weight = convert_and_append_unit(response)
            predictions.append(predicted_weight)

        indices_list.extend(indices)

    return predictions, indices_list

def save_predictions_to_csv(predictions, indices, output_csv_file):
    with open(output_csv_file, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['index', 'prediction'])  # Write the header

        for idx, prediction in tqdm(zip(indices, predictions), desc="Saving predictions", total=len(predictions)):
            writer.writerow([idx, prediction])  # Write index and prediction

def split_model(model_name, num_layers):
    device_map = {}
    world_size = torch.cuda.device_count()

    # Since the first GPU will be used for ViT, treat it as half a GPU.
    num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
    num_layers_per_gpu_list = [num_layers_per_gpu] * world_size
    num_layers_per_gpu_list[0] = math.ceil(num_layers_per_gpu_list[0] * 0.5)
    layer_cnt = 0

    for i, num_layer in enumerate(num_layers_per_gpu_list):
        for _ in range(num_layer):
            if layer_cnt < num_layers:
                device_map[f'language_model.model.layers.{layer_cnt}'] = i
                layer_cnt += 1

    # Assign other components to GPU 0
    components_on_gpu0 = [
        'vision_model', 'mlp1', 'language_model.model.tok_embeddings',
        'language_model.model.embed_tokens', 'language_model.output',
        'language_model.model.norm', 'language_model.lm_head',
        f'language_model.model.layers.{num_layers - 1}'
    ]
    for component in components_on_gpu0:
        device_map[component] = 0

    return device_map

# Main execution
if __name__ == "__main__":
    # Build the image transform
    transform = build_transform(input_size=448)

    # Create the dataset
    test_dataset = TestDataset(
        jsonl_file='/workspace/dataset/test_data.jsonl',
        transform=transform,
        entity_unit_map=entity_unit_map,
        max_lines=1000  # Set to an integer if you want to limit the number of samples
    )

    # Create the DataLoader with the custom collate function
    batch_size = 16  # Adjust based on your GPU memory
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

    # Load model and tokenizer
    # path = '/workspace/work_dirs/internvl_chat_v2_0/internvl2_2b_internlm2_1_8b_dynamic_res_2nd_finetune_lora_amazon_merge'
    path = 'OpenGVLab/InternVL2-2B'
    model_name = 'InternVL2-2B'  # Adjust according to your model

    # Load the model temporarily to get the number of layers
    model_temp = AutoModel.from_pretrained(
        path,
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        use_flash_attn=True,
        trust_remote_code=True
    )
    actual_num_layers = len(model_temp.language_model.model.layers)
    print(f"The model has {actual_num_layers} layers.")
    del model_temp
    torch.cuda.empty_cache()

    device_map = split_model(model_name, actual_num_layers)
    model = AutoModel.from_pretrained(
        path,
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        use_flash_attn=True,
        trust_remote_code=True,
        device_map=device_map
    ).eval()

    tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)

    # Adjust the code in modeling_internvl_chat.py if necessary
    # For example, change generation_config['eos_token_id'] to generation_config.eos_token_id

    # Determine the device for the vision model (assumed to be on GPU 0)
    device = torch.device(f'cuda:{device_map["vision_model"]}')

    # Run inference on the test data
    predictions, indices = run_inference_on_test_data(model, tokenizer, test_loader, device)

    # Save the predictions to a CSV file in the required format
    output_csv_file = '/workspace/dataset/test_predictions.csv'
    save_predictions_to_csv(predictions, indices, output_csv_file)

    print(f"Predictions saved to {output_csv_file}")


Loading test data: 100%|██████████| 1000/1000 [00:00<00:00, 226413.17it/s]


config.json:   0%|          | 0.00/3.93k [00:00<?, ?B/s]

configuration_internvl_chat.py:   0%|          | 0.00/3.85k [00:00<?, ?B/s]

configuration_internlm2.py:   0%|          | 0.00/7.00k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL2-2B:
- configuration_internlm2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


configuration_intern_vit.py:   0%|          | 0.00/5.55k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL2-2B:
- configuration_intern_vit.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL2-2B:
- configuration_internvl_chat.py
- configuration_internlm2.py
- configuration_intern_vit.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_internvl_chat.py:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

modeling_intern_vit.py:   0%|          | 0.00/18.1k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL2-2B:
- modeling_intern_vit.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_internlm2.py:   0%|          | 0.00/61.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL2-2B:
- modeling_internlm2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


conversation.py:   0%|          | 0.00/15.0k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL2-2B:
- conversation.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL2-2B:
- modeling_internvl_chat.py
- modeling_intern_vit.py
- modeling_internlm2.py
- conversation.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/4.41G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

The model has 24 layers.


tokenizer_config.json:   0%|          | 0.00/4.00k [00:00<?, ?B/s]

tokenization_internlm2.py:   0%|          | 0.00/8.79k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL2-2B:
- tokenization_internlm2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


tokenizer.model:   0%|          | 0.00/1.48M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/179 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

Running inference:   2%|▏         | 1/63 [01:54<1:58:48, 114.97s/it]