In [1]:
import argparse
import os
import torch
import gc
import bitsandbytes
from datasets import load_dataset
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

from transformers import set_seed
from transformers import (
    LlamaForCausalLM,
    LlamaTokenizer,
    AutoModelForCausalLM,
    AutoConfig,
)
from transformers import (
    AutoTokenizer,
    get_linear_schedule_with_warmup,
    DataCollatorWithPadding,
    DataCollatorForSeq2Seq,
)
from transformers import BitsAndBytesConfig
from peft import (
    get_peft_model,
    prepare_model_for_kbit_training,
    LoraConfig,
    TaskType,
)
from torch.distributed.fsdp.fully_sharded_data_parallel import (
    FullOptimStateDictConfig,
    FullStateDictConfig,
)
from accelerate import Accelerator, FullyShardedDataParallelPlugin
from psutil import Process
from pynvml import (
    nvmlInit,
    nvmlDeviceGetHandleByIndex,
    nvmlDeviceGetMemoryInfo,
    nvmlDeviceGetCount,
)


class SystemMonitor:
    def __init__(self):
        # Initialize NVML for GPU monitoring
        self.nvml_initialized = SystemMonitor._initialize_nvml()

    @classmethod
    def _initialize_nvml(cls):
        try:
            nvmlInit()
            return True
        except Exception as e:
            print(f"Error initializing NVML: {e}")
            return False

    def get_ram_usage(self):
        return Process().memory_info().rss / (1024 * 1024)

    def get_gpu_memory_usage(self):
        if not self.nvml_initialized:
            print("NVML not initialized.")
            return None

        gpu_memory_usage = []
        try:
            gpu_count = nvmlDeviceGetCount()
            for i in range(gpu_count):
                handle = nvmlDeviceGetHandleByIndex(i)
                info = nvmlDeviceGetMemoryInfo(handle)
                gpu_memory_usage.append(info.used // 1024**3)
        except Exception as e:
            print(f"Error retrieving GPU memory info: {e}")
            return None

        return gpu_memory_usage

    def get_gpu_utilization(self):
        gpu_memory_usages = self.get_gpu_memory_usage()
        return gpu_memory_usages if gpu_memory_usages is not None else None

In [2]:
import json
import os.path as osp
from typing import Union


class Prompter(object):
    __slots__ = ("template", "_verbose")

    def __init__(self, template_name: str = "", verbose: bool = False):
        self._verbose = verbose
        if not template_name:
            # Enforce the default here, so the constructor can be called with '' and will not break.
            template_name = "alpaca"
        file_name = osp.join("templates", f"{template_name}.json")
        if not osp.exists(file_name):
            raise ValueError(f"Can't read {file_name}")
        with open(file_name) as fp:
            self.template = json.load(fp)
        if self._verbose:
            print(
                f"Using prompt template {template_name}: {self.template['description']}"
            )

    def generate_prompt(
        self,
        instruction: str,
        input: Union[None, str] = None,
        label: Union[None, str] = None,
    ) -> str:
        # returns the full prompt from instruction and optional input
        # if a label (=response, =output) is provided, it's also appended.
        if input:
            res = self.template["prompt_input"].format(
                instruction=instruction, input=input
            )
        else:
            res = self.template["prompt_no_input"].format(instruction=instruction)
        if label:
            res = f"{res}{label}"
        if self._verbose:
            print(res)
        return res

    def get_response(self, output: str) -> str:
        return output.split(self.template["response_split"])[1].strip()


class TokenizerHelper:
    def __init__(self, prompter, tokenizer, cutoff_len):
        self.prompter = prompter
        self.tokenizer = tokenizer
        self.train_on_inputs = True
        self.add_eos_token = True
        self.cutoff_len = cutoff_len

    def tokenize(self, prompt):
        result = self.tokenizer(
            prompt,
            truncation=True,
            padding=False,
            max_length=self.tokenizer.model_max_length,
        )
        return result

    def generate_and_tokenize_prompt(self, data_point):
        full_prompt = self.prompter.generate_prompt(
            data_point["instruction"],
            data_point["input"],
            data_point["output"],
        )
        tokenized_full_prompt = self.tokenize(full_prompt)

        if not self.train_on_inputs:
            user_prompt = self.prompter.generate_prompt(
                data_point["instruction"], data_point["input"]
            )
            tokenized_user_prompt = self.tokenize(user_prompt)
            user_prompt_len = len(tokenized_user_prompt["input_ids"])

            if self.add_eos_token:
                user_prompt_len -= 1

            tokenized_full_prompt["labels"] = [
                -100
            ] * user_prompt_len + tokenized_full_prompt["input_ids"][
                user_prompt_len:
            ]  # could be sped up, probably
        else:
            tokenized_full_prompt["labels"] = tokenized_full_prompt["input_ids"]
        # print(tokenized_full_prompt)
        return tokenized_full_prompt

In [3]:
if torch.cuda.is_available():
    torch.backends.cuda.matmul.allow_tf32 = True

set_seed(1001)
fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
    optim_state_dict_config=FullOptimStateDictConfig(
        offload_to_cpu=True, rank0_only=False
    ),
)

accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [4]:
class Configuration:
    def __init__(self, **kwargs):
        self.device_count = torch.cuda.device_count()
        self.experiment_name = kwargs.get("experiment_name", "default_experiment")
        self.keep_fraction = kwargs.get("keep_fraction", 0.99)
        self.test_fraction = kwargs.get("test_fraction", 0.2)
        self.scratch_path = kwargs.get("scratch_path", "/scratch/vgn2004")
        self.num_workers = kwargs.get("num_workers", 8)
        self.batch_size = kwargs.get("batch_size", 8)
        self.lr = kwargs.get("lr", 3e-4)
        self.num_epochs = kwargs.get("num_epochs", 5)
        self.seq_length = kwargs.get("seq_length", 32768)
        self.device = kwargs.get("device", accelerator.device)
        self.device_map = kwargs.get("device_map", "auto")
        self.max_gpu_memory = kwargs.get("max_gpu_memory", "45080MB")
        # self.device_map = kwargs.get("device_map", {"": accelerator.process_index})

        self.model_name_or_path = kwargs.get(
            "model_name_or_path",
            "togethercomputer/LLaMA-2-7B-32K",  # "NousResearch/Llama-2-7b-chat-hf"
        )

        self.r = kwargs.get("r", 16)
        self.lora_alpha = kwargs.get("lora_alpha", 64)
        self.lora_dropout = kwargs.get("lora_dropout", 0.2)
        self.lora_bias = kwargs.get("lora_bias", "none")
        self.is_gradient_checkpointing_enabled = kwargs.get(
            "is_gradient_checkpointing_enabled", True
        )
        self.is_gradient_accumulation_enabled = kwargs.get(
            "is_gradient_accumulation_enabled", True
        )
        self.gradient_accumulation_steps = kwargs.get(
            "gradient_accumulation_steps", self.batch_size
        )
        self.batch_size = 1

        self.is_quantized = kwargs.get("is_quantized", True)

    def __str__(self):
        return "\n".join(f"{k}: {v}" for k, v in vars(self).items())

In [5]:
parser = argparse.ArgumentParser(description="Fine-tuning configuration")
parser.add_argument("--experiment_name", type=str, default="default_experiment")
args, unknown = parser.parse_known_args()

kwargs = vars(args)
kwargs.update(
    dict((arg[0].lstrip("-"), arg[1]) for arg in zip(unknown[::2], unknown[1::2]))
)
print(f"KWARGS: {kwargs}")

torch.cuda.empty_cache()
gc.collect()

# os.environ.update(env_vars)

config = Configuration(**kwargs)
print(f"Configuration: \n{config}")

KWARGS: {'experiment_name': 'default_experiment', 'f': '/home/vgn2004/.local/share/jupyter/runtime/kernel-4e2efc4c-f2bf-471a-b434-16f2153f95b2.json'}
Configuration: 
device_count: 1
experiment_name: default_experiment
keep_fraction: 0.99
test_fraction: 0.2
scratch_path: /scratch/vgn2004
num_workers: 8
batch_size: 1
lr: 0.0003
num_epochs: 5
seq_length: 2024
device: cuda
device_map: auto
max_gpu_memory: 45080MB
model_name_or_path: togethercomputer/LLaMA-2-7B-32K
r: 16
lora_alpha: 64
lora_dropout: 0.2
lora_bias: none
is_gradient_checkpointing_enabled: True
is_gradient_accumulation_enabled: True
gradient_accumulation_steps: 8
is_quantized: True


In [6]:
monitor = SystemMonitor()
print(f"Baseline usage: {monitor.get_gpu_utilization()} GB of GPU")

Baseline usage: [0] GB of GPU


In [7]:
tokenizer = AutoTokenizer.from_pretrained(config.model_name_or_path)
tokenizer.model_max_length = config.seq_length
tokenizer.padding_side = "left"
tokenizer.pad_token, tokenizer.eos_token

('<unk>', '</s>')

In [8]:
model_config = AutoConfig.from_pretrained(config.model_name_or_path)
model_config.max_position_embeddings = config.seq_length
model_config.bos_token_id = tokenizer.bos_token_id
model_config.eos_token_id = tokenizer.eos_token_id
model_config.pad_token_id = tokenizer.pad_token_id
model_config

LlamaConfig {
  "_name_or_path": "togethercomputer/LLaMA-2-7B-32K",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "auto_map": {
    "AutoModelForCausalLM": "togethercomputer/LLaMA-2-7B-32K--modeling_flash_llama.LlamaForCausalLM"
  },
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 2024,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 0,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 8.0,
    "type": "linear"
  },
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.33.2",
  "use_cache": true,
  "vocab_size": 32000
}

In [9]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_use_double_quant=True,
)

if config.is_quantized:
    model = AutoModelForCausalLM.from_pretrained(
        config.model_name_or_path,
        config=model_config,
        device_map=config.device_map,
        quantization_config=quantization_config,
        max_memory={i: config.max_gpu_memory for i in range(config.device_count)},
        trust_remote_code=False,
    )
else:
    model = AutoModelForCausalLM.from_pretrained(config.model_name_or_path)

if config.is_gradient_checkpointing_enabled:
    model.config.use_cache = False
    model.enable_input_require_grads()
    model.gradient_checkpointing_enable()

# Model settings
model.config.pretraining_tp = 1
model.config.torch_dtype = torch.float32
setattr(model, "model_parallel", True)
setattr(model, "is_parallelizable", True)


def find_all_linear_names(m):
    cls = bitsandbytes.nn.Linear4bit
    lora_module_names = set()
    for name, module in m.named_modules():
        if isinstance(module, cls):
            names = name.split(".")
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    if "lm_head" in lora_module_names:
        lora_module_names.remove("lm_head")
    return list(lora_module_names)


peft_config = LoraConfig(
    target_modules=find_all_linear_names(model),
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=config.r,
    lora_alpha=config.lora_alpha,
    lora_dropout=config.lora_dropout,
    bias=config.lora_bias,
)

model = prepare_model_for_kbit_training(
    model, use_gradient_checkpointing=config.is_gradient_checkpointing_enabled
)
# model = get_peft_model(model, peft_config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
print(model.config)

# Print dtypes
dtypes = {}
for _, p in model.named_parameters():
    dtype = p.dtype
    if dtype not in dtypes:
        dtypes[dtype] = 0
    dtypes[dtype] += p.numel()
total = 0
for k, v in dtypes.items():
    total += v
for k, v in dtypes.items():
    print(k, v, v / total)

LlamaConfig {
  "_name_or_path": "togethercomputer/LLaMA-2-7B-32K",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "auto_map": {
    "AutoModelForCausalLM": "togethercomputer/LLaMA-2-7B-32K--modeling_flash_llama.LlamaForCausalLM"
  },
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 2024,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 0,
  "pretraining_tp": 1,
  "quantization_config": {
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false,
    "quant_method": "bitsandbytes"
  },
  "rms_norm_eps": 1e-05,
  "rope_scali

In [11]:
!nvidia-smi

Wed Dec  6 00:33:50 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Quadro RTX 8000                On  | 00000000:06:00.0 Off |                    0 |
| N/A   44C    P0              67W / 250W |   5075MiB / 46080MiB |      4%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                         

In [12]:
#  input_context = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

# ### Instruction:
# Write a summary of the following article:
# With new inventions and the advancement of technology, as well as the increased popularity of having online relations on social networking websites, online communication has become a common occurrence for people all over the world. Due to this sudden advancement, there has been a debate regarding the use of online social networking over face-to-face communications and relationships. Due to people’s ability to express their true self on computer mediated conversations, along with the formation of secure online romantic relationships and positive friendships, this paper will argue that online relations are just as good, if not better, than face-to-face interactions.
# To begin, ones true self is whom a person actually is whether they choose to.
# This was evident in a study conducted at New York University (Bargh et al., 2002).
# In this study, participants were asked to sort a series of self-descriptive adjectives as they related to them; categorizing them as either “me” or “not me”. Following that activity, participants were randomly assigned to a face-to-face condition group or an online condition group where they were matched with partners and instructed to begin interacting. This study found that participants in the Internet condition group were faster to respond to their actual self traits, whereas those in the in-person interaction conditions were not able to sort the aforementioned adjectives at the same rate. This supports the claim that an online interaction leads to the activation of ones true self qualities. Therefore, the Internet gives people the confidence and means to express their true self and behave in positive ways that they normally would not if placed in a face-to-face interaction.
# ### Response:"""

# # Encode the input context
# input_ids = tokenizer.encode(input_context, return_tensors="pt", padding=False, truncation=True).to("cuda")
# with torch.inference_mode():
#     output = model.generate(input_ids=input_ids, max_length=512, eos_token_id=29937, top_k=50, temperature=1.0)

#     # Decode the output
#     output_text = tokenizer.decode(output[0], skip_special_tokens=True)

#     # Print the result
#     print(output_text)

In [13]:
dataset = load_dataset("Yukang/LongAlpaca-12k")
dataset = dataset["train"].train_test_split(test_size=0.2)

Repo card metadata block was not found. Setting CardData to empty.


In [14]:
from transformers import TrainingArguments
from trl import DataCollatorForCompletionOnlyLM

# training_args = TrainingArguments(
#     output_dir="sft_trainer_output",
#     num_train_epochs=3,
#     per_device_train_batch_size=1,
#     gradient_accumulation_steps=8,
#     gradient_checkpointing=True,
#     optim="paged_adamw_32bit",
#     logging_steps=10,
#     save_strategy="epoch",
#     learning_rate=2e-4,
#     max_grad_norm=0.3,
#     warmup_ratio=0.03,
#     lr_scheduler_type="constant",
# #     disable_tqdm=False,  # disable tqdm since with packing values are in correct
# )

from trl import SFTTrainer


def format_instruction(sample) -> str:
    # returns the full prompt from instruction and optional input
    # if a label (=response, =output) is provided, it's also appended.
    template = {
        "description": "Template used by Alpaca-LoRA.",
        "prompt_input": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n",
        "prompt_no_input": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n",
    }

    if sample["input"]:
        res = template["prompt_input"].format(
            instruction=sample["instruction"][:250], input=sample["input"][:250]
        )
    else:
        res = template["prompt_no_input"].format(
            instruction=sample["instruction"][:250]
        )
    if sample["output"]:
        res = f"{res}{sample['output']}"
    return res


print(format_instruction(dataset["train"][2])[:1000])

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Below is a paper. Memorize the paper and answer my question after the paper.
 The paper begins. 
 Abstract
be used either figuratively or literally, in a context dependent manner: For example, the phrase "clean can be interpreted literally, as in We 

### Response:
The work opens up many possibilities for future research including:

1. Extending the proposed framework to more datasets and tasks involving non-compositional language processing. The current work focuses on idiom usage recognition and metaphor detection but the proposed CLCL framework could potentially be applied to other tasks like concept metaphor detection, simile detection, sarcasm detection, etc. 

2. Further analyzing the differences between non-compositionality in idioms versus metaphors. The results showed some key differences in how the proposed framework performed on idiom usage recognition 



In [15]:
config.seq_length

2024

In [16]:
def tokenize_fn(element):
    return tokenizer(
        format_instruction(element),
        truncation=True,
        padding=False,
        max_length=config.seq_length,
        return_overflowing_tokens=False,
        return_length=False,
        return_tensors="pt",
    )

In [17]:
tokenized_train = (
    dataset["train"]
    .select(range(20))
    .map(
        tokenize_fn,
        batched=True,
        num_proc=config.num_workers,
        remove_columns=dataset["train"].column_names,
        load_from_cache_file=False,
        desc="Running tokenizer on dataset",
    )
)

tokenized_test = (
    dataset["test"]
    .select(range(20))
    .map(
        tokenize_fn,
        batched=True,
        num_proc=config.num_workers,
        remove_columns=dataset["test"].column_names,
        load_from_cache_file=False,
        desc="Running tokenizer on dataset",
    )
)

Running tokenizer on dataset (num_proc=8):   0%|          | 0/20 [00:00<?, ? examples/s]

Running tokenizer on dataset (num_proc=8):   0%|          | 0/20 [00:00<?, ? examples/s]

In [None]:
response_template_with_context = (
    "\n### Response:"  # We added context here: "\n". This is enough for this tokenizer
)
response_template_ids = tokenizer.encode(
    response_template_with_context, add_special_tokens=False
)[
    2:
]  # Now we have it like in the dataset texts: `[2277, 29937, 4007, 22137, 29901]`
data_collator = DataCollatorForCompletionOnlyLM(
    response_template_ids, tokenizer=tokenizer
)

In [18]:
training_dataloader = torch.utils.data.DataLoader(
    tokenized_train,
    batch_size=config.batch_size,
    num_workers=config.num_workers,
    collate_fn=data_collator,
)
validation_dataloader = torch.utils.data.DataLoader(
    tokenized_test,
    batch_size=config.batch_size,
    num_workers=config.num_workers,
    collate_fn=data_collator,
)

In [19]:
optimizer = (
    torch.optim.AdamW(model.parameters(), lr=config.lr)
    if not config.is_quantized
    else bitsandbytes.optim.AdamW(
        model.parameters(), lr=config.lr, is_paged=True, optim_bits=32
    )
)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(training_dataloader) * config.num_epochs),
)

(
    model,
    optimizer,
    training_dataloader,
    validation_dataloader,
    scheduler,
) = accelerator.prepare(
    model, optimizer, training_dataloader, validation_dataloader, lr_scheduler
)


should_exit = False
for epoch in range(config.num_epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(training_dataloader)):
        if epoch == 0 and step < 5:
            print(f"Usage: {monitor.get_gpu_utilization()} GB of GPU")
        optimizer.zero_grad()
        batch = {k: v for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        if torch.isnan(loss):
            print(f"NaN loss detected at Epoch {epoch}, Step {step}")
            should_exit = True
            break
        total_loss += loss.detach().float()
        loss = loss / config.gradient_accumulation_steps
        accelerator.backward(loss)
        if (step + 1) % config.gradient_accumulation_steps == 0:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

    if should_exit:
        break

    model.eval()
    precision_val, recall_val, accuracy_val, f1_val, eval_loss = evaluate(
        validation_dataloader
    )
    print(
        f"Validation Data - Precision: {precision_val}, Recall: {recall_val}, Accuracy: {accuracy_val}, F1: {f1_val}"
    )
    eval_epoch_loss = eval_loss / len(validation_dataloader)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / len(training_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

  0%|          | 0/8 [00:00<?, ?it/s]You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.

### Instruction:
['Below is some paragraphs in the book, Don Quixote. Memorize the content and answer my question after the book.\n afflicted damsels or to the prayers of wise, magisterial, ancient\nenchanters and sages. In short, Sancho, either you must be whipped by\nyourself, or they must whip you, or you shan’t be governor.”\n\n“Señor,” said Sancho, “won’t two days’ grace be given me in which to\nconsider what is best for me?”\n\n“No, certainly not,” said Merlin; “here, this minute, and on the spot,\nthe matter must be settled; either Dulcinea will return to the cave of\nMontesinos and to her former condition of peasant wench, or else in her\npresent form shall be carried to the Elysian fields, where she will\nremain wait

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.

### Instruction:
["Below is a paper. Memorize the paper and answer my question after the paper.\n The paper begins. \n Abstract\nMulti-scale features have been proven highly effective huge come with for object detection but  often and even prohibitive extra computation costs, especially for the re - cent Transformer-based detectors: In this paper; pro- we IMFA pose Iterative Multi-scale Feature Aggregation generic paradigm that enables efficient use of multi-scale The core features in Transformer-based object detectors: idea to exploit spa


### Instruction:
['There are two papers. Memorize them and answer my question after the paper.\n The first paper begins. \n Abstract\nensembling We present LLM-BLEND ER, an framework designed to attain consistently su- perior performance by leveraging the diverse strengths of multiple open-source large lan- guage models (LLMs) Our framework con- sists of two modules: PAIRRANKER and GEN FUSER, addressing the observation that opti mal LLMs for different examples can signif- icantly vary: PAIRRANKER employs spe cialized pairwise comparison method to dis tinguish subtle differences between candidate It jointly encodes the input text and outputs using of candidates, cross-attention pair en coders to determine the superior one. Our re sults demonstrate that PAIRRANKER exhibits the highest correlation with ChatGPT-based ranking: Then GENFUSER aims t0 merge the top-ranked candidates, generating an improved output by capitalizing on their strengths and mitigating their weaknesses. To facilitat


### Instruction:
['Below is some paragraphs in the book, Dream of the Red Chamber. Memorize the content and answer my question after the book.\n \n“This is, indeed, strange!” exclaimed Pao-yue. “If you won’t go, what’s the good of all this fuss? I can’t stand this bawling, so it will be a riddance if you would get out of the way!”\n\nSaying this, he was resolved upon going to report the matter. Hsi Jen found herself powerless to dissuade him. She had in consequence no other resource but to fall on her knees.\n\nPi Hen, Ch’iu Wen, She Yueeh and the rest of the waiting-maids had realised what a serious aspect the dispute had assumed, and not a sound was to be heard to fall from their lips. They remained standing outside listening to what was going on.\n\nWhen they now overheard Hsi Jen making solicitous entreaties on her knees, they rushed into the apartment in a body; and with one consent they prostrated themselves on the floor.\n\nPao-yue at once pulled Hsi Jen up. Then with a sigh, h


### Instruction:
['Below is a paper. Memorize the paper and answer my question after the paper.\n The paper begins. \n Abstract\nWe focus on single-view 3D reconstruction, where the shape, appearance, and camera pose is to reconstruct Igoal single image of from Such an object (Fig: 1) a task has applications in content creation, augmented & virtual reality (ARIVR), robotics, and is also interesting from scientific perspective, as most neural architectures cannot As humans, reason about 3D scenes we learn object pri- ors, abstract representations that allow US to imagine what a partially-observed object would look like from other view points.   Incorporating such knowledge into a model would enable higher forms of 3D reasoning: While early work exploiting annotated on 3D reconstruction has focused on 16,20,57,63,72], e.g. ground-truth 3D shapes or mul data tiple 2D views, more recent work has relaxed the assump required by the task. In particular; there has been effort tions learning t


### Instruction:
['Below is a paper. Memorize the paper and answer my question after the paper.\n The paper begins. \n ABSTRACT\nWe study the problem of aligning the supports of distributions. Compared to the existing work 0n distribution alignment, support alignment does not require the densities to be matched We propose symmetric support difference as a divergence quantify the mismatch between supports_ We show that select discrimi measure t0 nators (e.g. discriminator trained for Jensen-Shannon divergence) are able to map support differences as support differences in their one-dimensional output space_ Following this result; our method aligns supports by minimizing a symmetrized optimal transport cost in the discriminator ID space via an adversarial relaxed Furthermore, we show that our approach can be viewed as limit of process_ existing notions of alignment by increasing transportation assignment tolerance We quantitatively evaluate the method across domain adaptation tasks with 

Usage: [5] GB of GPU


  0%|          | 0/8 [00:03<?, ?it/s]

NaN loss detected at Epoch 0, Step 0





In [20]:
# # train
# trainer.train() # there will not be a progress bar since tqdm is disabled

# # save model
# trainer.save_model()

In [21]:
# prompter = Prompter("alpaca")
# tokenizer_helper = TokenizerHelper(
#     prompter, tokenizer, config.max_length
# )
# train_data = (
#     dataset["train"].shuffle().map(tokenizer_helper.generate_and_tokenize_prompt)
# )
# val_data = (
#     dataset["test"]
#     .shuffle()
#     .map(tokenizer_helper.generate_and_tokenize_prompt)
# )

In [22]:
# import matplotlib.pyplot as plt

# token_lengths = [len(tokens['input_ids']) for tokens in train_data]

# # Plotting the histogram
# plt.figure(figsize=(10, 6))
# plt.hist(token_lengths, bins=50)
# plt.title("Histogram of Tokenized Instruction Lengths")
# plt.xlabel("Length of Tokenized Instructions")
# plt.ylabel("Frequency")
# plt.show()

In [23]:
# train_data.save_to_disk("./tokenized_long_qa_train")
# val_data.save_to_disk("./tokenized_long_qa_val")

In [24]:
# train_data_loader = torch.utils.data.DataLoader(stream_dataset,
#                                                     batch_size=args.batch_size * args.data_group_size,
#                                                     shuffle=False,
#                                                     num_workers=num_workers,
#                                                     pin_memory=True,
#                                                     collate_fn=None)

In [25]:
# Function to compute perplexity
# def compute_perplexity(model, dataloader):
#     model.eval()
#     total_loss = 0
#     total_length = 0

#     for batch in tqdm(dataloader, desc="Evaluating"):
#         with torch.no_grad():
#             inputs = batch['input_ids'].to(config.device)
#             labels = batch['labels'].to(config.device)

#             outputs = model(inputs, labels=labels)
#             loss = outputs.loss
#             total_loss += loss.item() * inputs.size(0)
#             total_length += inputs.size(0)

#     avg_loss = total_loss / total_length
#     perplexity = torch.exp(torch.tensor(avg_loss))

#     return perplexity.item()

# # Evaluate the model
# perplexity_score = compute_perplexity(model, validation_dataloader)
# print(f"Perplexity: {perplexity_score}")

In [26]:
# # Function to calculate metrics
# def calculate_metrics(preds, labels):
#     precision = precision_score(labels, preds, average="macro")
#     recall = recall_score(labels, preds, average="macro")
#     accuracy = accuracy_score(labels, preds)
#     f1 = f1_score(labels, preds, average="macro")
#     return precision, recall, accuracy, f1

# # Evaluate a dataloader
# def evaluate(dataloader):
#     model.eval()
#     all_preds = []
#     all_labels = []

#     eval_loss = 0.0
#     with torch.no_grad():
#         for data in tqdm(dataloader):
#             batch = {k: v for k, v in data.items()}
#             outputs = model(**batch)
#             loss = outputs.loss
#             eval_loss += loss.detach().float()
#             preds = torch.argmax(torch.softmax(outputs.logits, dim=1), dim=1)
#             labels = batch["labels"]

#             all_preds.extend(preds.cpu().numpy())
#             all_labels.extend(labels.cpu().numpy())

#     precision, recall, accuracy, f1 = calculate_metrics(all_preds, all_labels)
#     return precision, recall, accuracy, f1, eval_loss