In [1]:
import os
import json
import datasets
from tqdm import tqdm
from memorag import Model
from functools import partial
from functools import partial
from transformers.utils import logging
from torch.utils.data import DataLoader
from longbench_utils import DATASET2CATEGORY, scorer, DATASET2PROMPT, DATASET2MAXNEWTOKENS, makedirs, FileLogger, DefaultDataCollator

# Module-level logger created through the logging helper imported from transformers.utils.
logger = logging.get_logger(__name__)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Show the kernel's working directory; the relative paths in later cells
# ("./results/longbench/", "../dataset/...") are resolved against it.
import os

working_dir = os.getcwd()
print(working_dir)

/nvme1/data_rt/hierarchy_graphrag


In [3]:
# Settings forwarded to Model when the generation model is constructed.
enable_flash_attn = False
load_in_4bit = True
gen_model_name_or_path = "/home/rt/data/model/Qwen/Qwen2.5-7B-Instruct"

# Build the generator shared by the cells below (no cache dir, empty access token).
gen_model = Model(
    gen_model_name_or_path,
    cache_dir=None,
    access_token='',
    load_in_4bit=load_in_4bit,
    enable_flash_attn=enable_flash_attn,
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.08it/s]


In [4]:
# Smoke test: sample a completion for one fixed prompt and print the first returned item.
out = gen_model.generate(prompts='write a C++ program to implement the admm algorithm', max_new_tokens=2048, do_sample=True)
first_completion = out[0]
print(first_completion)

Certainly! The Alternating Direction Method of Multipliers (ADMM) is a popular method for solving convex optimization problems involving a separation of variables. Below is an implementation of the ADMM algorithm in C++ for a simple optimization problem.

Let's consider the optimization problem:

\[
\text{minimize} \quad f(x) + g(z) \quad \text{subject to} \quad Ax + Bz = c
\]

Where \( f \) and \( g \) are convex functions, \( A \) is a matrix, \( B \) is another matrix, and \( c \) is a vector.

Here's a basic implementation:

```cpp
#include <iostream>
#include <vector>
#include <cmath>

// Function to evaluate f(x)
double evaluate_f(const double x) {
    // Example f(x) = x^2
    return x * x;
}

// Function to evaluate g(z)
double evaluate_g(const double z) {
    // Example g(z) = z^2
    return z * z;
}

// Function to compute the gradient of f(x)
double gradient_f(const double x) {
    // Example gradient f'(x) = 2x
    return 2 * x;
}

// Function to compute the gradient of g(z

In [4]:
def process_longbench(data, indices, tokenizer, max_length=3500, truncate_from_middle=True):
    """Turn a batch of LongBench rows into context/question records with bounded context length.

    Args:
        data: Batch mapping that provides the parallel lists ``data['input']``,
            ``data['context']`` and ``data['dataset']``.
        indices: One identifier per row; copied to the ``"index"`` output for kept rows.
        tokenizer: Object exposing ``encode(text)`` and ``decode(token_ids)``. ``encode``
            is first called with ``add_special_tokens=False``; if that keyword is not
            accepted (``TypeError``) it is called again without it.
        max_length: Maximum number of context tokens to keep, or ``None`` to leave
            every context untouched.
        truncate_from_middle: When ``True`` an over-long context keeps its first
            ``max_length - max_length // 2`` and last ``max_length // 2`` tokens (the
            middle is dropped); when ``False`` only the last ``max_length`` tokens are kept.

    Returns:
        dict with the parallel lists ``"context"``, ``"question"``, ``"dataset"``,
        ``"index"`` and ``"length"``. Rows whose dataset name (after stripping a
        trailing ``"_e"``) is not handled below are skipped. ``"length"`` is
        ``len(tokenizer.encode(context))`` of the final context.
    """
    # Datasets whose `input` field is used as the question.
    question_datasets = ('narrativeqa', 'qasper', 'multifieldqa_en', 'hotpotqa', '2wikimqa', 'musique', 'qmsum')
    # Datasets that are emitted with an empty question.
    empty_question_datasets = ('gov_report', 'multi_news')

    def _encode_plain(text):
        """Encode without special tokens when the tokenizer supports that keyword."""
        try:
            return tokenizer.encode(text, add_special_tokens=False)
        except TypeError:
            # encode() does not accept `add_special_tokens`; use its default behaviour.
            return tokenizer.encode(text)

    outputs = {'context': [], 'question': [], "dataset": [], "index": [], "length": []}

    for question_text, context, dataset, index in zip(data['input'], data['context'], data['dataset'], indices):
        if dataset.endswith("_e"):
            dataset = dataset[:-2]

        if dataset in question_datasets:
            question = question_text
        elif dataset in empty_question_datasets:
            question = ""
        else:
            continue

        if max_length is not None:
            budget = max(max_length, 0)
            tokenized_context = _encode_plain(context)
            overflow = len(tokenized_context) - budget
            # Only rewrite the text when it actually exceeds the budget, so short
            # contexts are never altered by an encode/decode round trip.
            if overflow > 0:
                if truncate_from_middle:
                    tail_len = budget // 2
                    head_len = budget - tail_len
                    # Explicit start offset: `tokens[-0:]` would return the whole list.
                    tail_start = len(tokenized_context) - tail_len
                    context = tokenizer.decode(tokenized_context[:head_len]) + tokenizer.decode(tokenized_context[tail_start:])
                else:
                    context = tokenizer.decode(tokenized_context[overflow:])

        length = len(tokenizer.encode(context))

        outputs["context"].append(context)
        outputs["question"].append(question)
        outputs["dataset"].append(dataset)
        outputs["index"].append(index)
        outputs["length"].append(length)

    return outputs

In [5]:
# Directory that receives the per-dataset result files.
output_dir = "./results/longbench/"

# Datasets to evaluate (an earlier, shorter selection was ['narrativeqa', 'qasper', 'hotpotqa']).
dataset_names = [
    'narrativeqa',
    'qasper',
    'multifieldqa_en',
    'hotpotqa',
    '2wikimqa',
    'musique',
]

# All samples are read from one combined JSON file. Previously used per-dataset alternative:
# raw_dataset = datasets.load_dataset("json", data_files=f'/home/rt/data/MemoRAG/THUDM/LongBench/data/{dataset_names[0]}.jsonl', split="train")
longbench_file = '../dataset/TommyChien/MemoRAG-data/longbench.json'
raw_dataset = datasets.load_dataset("json", data_files=longbench_file, split="train")

In [6]:
# Context token budget and truncation strategy handed to process_longbench.
# NOTE(review): the evaluation output in this notebook warns that a generation call
# exceeds the model's predefined maximum length (32768); a 100000-token budget may be
# letting over-long prompts through — confirm the intended value.
max_length = 100000
truncate_from_middle = True

# Pre-bind the tokenizer and truncation settings; map() then supplies only the batch and its indices.
shared_kwargs = dict(
    tokenizer=gen_model.tokenizer,
    max_length=max_length,
    truncate_from_middle=truncate_from_middle,
)
process_fn = partial(process_longbench, **shared_kwargs)

# Replace the raw columns with the processed ones.
dataset = raw_dataset.map(
    process_fn,
    batched=True,
    num_proc=32,
    with_indices=True,
    remove_columns=raw_dataset.column_names,
)

# Group the processed rows by their "dataset" column for per-task evaluation.
processed_frame = dataset.to_pandas()
groupby_dataset = processed_frame.groupby("dataset")

In [7]:
# Evaluate every selected dataset: generate one prediction per sample, then score the
# predictions once and write them to "<result_dir>/<dataset_name>.json".
metrics = {}
result_dir = ''
result_dir = os.path.join(output_dir, result_dir)

for i, dataset_name in enumerate(dataset_names):
    logger.info(f"Evaluating {dataset_name} ({i + 1} / {len(dataset_names)})...")

    result_path = os.path.join(result_dir, f"{dataset_name}.json")

    # Use a dedicated name so the notebook-level `dataset` from the preprocessing cell is not clobbered.
    task_dataset = datasets.Dataset.from_pandas(groupby_dataset.get_group(dataset_name), preserve_index=False)

    data_collator = DefaultDataCollator(padding_side="left")
    dataloader = DataLoader(
        task_dataset,
        batch_size=1,
        collate_fn=data_collator,
    )

    indices = []
    preds = []
    _prompt = DATASET2PROMPT[dataset_name]
    task_max_new_token = DATASET2MAXNEWTOKENS[dataset_name]

    for x in tqdm(dataloader, desc="Generating"):
        x.pop("dataset")
        # batch_size is 1, so each batch carries exactly one sample; take its index entry.
        index = x.pop("index")[0]

        # Fill the dataset-specific prompt template with this sample's context and question.
        prompt = _prompt.format(context=x["context"][0], input=x["question"][0])
        # NOTE(review): do_sample=True makes predictions (and therefore scores) vary between
        # runs — confirm that sampling is intended for evaluation.
        output = gen_model.generate(prompts=prompt, max_new_tokens=task_max_new_token, do_sample=True)

        index = index.tolist()
        preds.extend(output)
        if isinstance(index, list):
            indices.extend(index)
        else:
            # tolist() produced a single value rather than a list
            indices.append(index)

    if not preds:
        # Nothing was generated for this dataset: record no score and write no file.
        continue

    # Score and persist once per dataset. Doing this inside the generation loop re-read
    # every earlier row, re-scored all predictions and rewrote the whole result file for
    # each new sample; the final metrics and file content are the same.
    answers = raw_dataset[indices]["answers"]
    all_classes = []
    score = scorer(dataset_name, preds, answers, all_classes)

    logger.info(f"{dataset_name}: {score}")
    metrics[dataset_name] = score

    # First line holds the score; every following line is one sample (without its context) plus its prediction.
    with open(makedirs(result_path), "w", encoding="utf-8") as f:
        f.write(json.dumps(score, ensure_ascii=False) + "\n")
        for index, pred in zip(indices, preds):
            sample = raw_dataset[index]
            del sample["context"]
            sample["pred"] = pred
            f.write(json.dumps(sample, ensure_ascii=False) + "\n")


Generating:   2%|▏         | 4/200 [00:12<08:06,  2.48s/it]This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (32768). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.
Generating: 100%|██████████| 200/200 [15:42<00:00,  4.71s/it]
Generating: 100%|██████████| 200/200 [03:03<00:00,  1.09it/s]
Generating: 100%|██████████| 150/150 [03:30<00:00,  1.41s/it]
Generating: 100%|██████████| 200/200 [05:53<00:00,  1.77s/it]
Generating: 100%|██████████| 200/200 [03:26<00:00,  1.03s/it]
Generating: 100%|██████████| 200/200 [07:29<00:00,  2.25s/it]
