In [1]:
import os 
#os.environ["NCCL_P2P_DISABLE"]="1"
# os.environ["CUDA_VISIBLE_DEVICES"]="0,2"
# Instruction sent as the user turn for every row.
# Two placeholders are filled in by plain string replacement in `generate_prompt`:
#   {topic}     - the row's topic
#   {questions} - the row's query variants, one per line
# NOTE(review): the wording contains small grammar slips ("two german document",
# "advice" used as a verb). They are left untouched here because the text is sent
# to the model verbatim and changing it would change the generated data.
prompt_template = """You have been assigned a retrieval task about {topic}. 
With the following queries: 
{questions}

Produce two german document, each at least 100 words long, on the subject of {topic}. These documents should be composed in a style that mirrors the type of content one would typically find when searching for answers to a question, such as a Wikipedia article, blog post, news article, list, advertisement etc. Never create documents that only advice on how or where to search for information! For example, if the query is "Search for information about the history of Berlin", the document should provide a detailed account of Berlin's history, rather than general advice on how to search for historical information. The style of the documents should mimic the type of results that the question is searching for. Both texts should be of similar length to ensure consistency when comparing them.

The first document serves as a 'hard negative' example. It should discuss close to the topic of {topic}, but it should never answer the queries!:
{questions}
Again the hard negative should never provide the answer to the query. For instance, if the query is "When is Costco open?", the hard negative example might discuss the opening hours of Walmart instead.

The second document should act as a 'positive' example. It should directly answer the queries:
{questions}
This document should be informative and precise, offering a specific answer or solution to the queries. Always create both documents in german!"""

# Earlier prompt variant, disabled and kept for reference only:
# prompt_template = """You have been assigned a retrieval task {topic}
# With the following queries: 
# {questions}

# Your mission is to write one text retrieval example for this task with the following elements:
# - "positive_document": a relevant document for the query.
# - "hard_negative_document": a hard negative document that only appears relevant to the query.

# Please adhere to the following guidelines:
# - All documents must be created independent of the query. Avoid copying the query verbatim. It’s acceptable if some parts of the "positive_document" are not topically related to the query.
# - All documents should be at least 100 words long.
# - The "hard_negative_document" contains some useful information, but it should be less useful or comprehensive compared to the "positive_document".
# - The documents should be in german.
# - Do not provide any explanation in any document on why it is relevant or not relevant to the query.

# - Both the query and documents require college level education to understand."""


# Text placed in the assistant turn of the chat template by `generate_prompt`,
# so the completion starts right after this heading.
response_template = """Hard negative german document (not containing the viable information for the queries!):\n"""

In [2]:
import torch 
import vllm 
import pandas as pd 
from vllm import SamplingParams
from transformers import AutoTokenizer

# Model checkpoint shared by the tokenizer and the vLLM engine.
model_name = "TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ"

# One number is used both as the engine's maximum sequence length and as the
# per-request cap on generated tokens.
MAX_SEQUENCE_TOKENS = 16000

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
sampling_params = SamplingParams(max_tokens=MAX_SEQUENCE_TOKENS, temperature=0.1)

# Engine options. Settings tried earlier and currently disabled:
#   device="cuda:0,cuda:2", enforce_eager=True, disable_custom_all_reduce=True
llm_options = dict(
    model=model_name,
    quantization="gptq",
    dtype=torch.float16,
    tensor_parallel_size=2,
    max_model_len=MAX_SEQUENCE_TOKENS,
    revision="gptq-4bit-32g-actorder_True",
    gpu_memory_utilization=0.75,
)
llm = vllm.LLM(**llm_options)



2024-02-07 21:33:54,387	INFO worker.py:1724 -- Started a local Ray instance.


INFO 02-07 21:33:55 llm_engine.py:72] Initializing an LLM engine with config: model='TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ', tokenizer='TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ', tokenizer_mode=auto, revision=gptq-4bit-32g-actorder_True, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=16000, download_dir=None, load_format=auto, tensor_parallel_size=2, disable_custom_all_reduce=False, quantization=gptq, enforce_eager=False, kv_cache_dtype=auto, seed=0)
[36m(RayWorkerVllm pid=36128)[0m INFO 02-07 21:34:00 weight_utils.py:164] Using model weights format ['*.safetensors']
INFO 02-07 21:34:01 weight_utils.py:164] Using model weights format ['*.safetensors']
INFO 02-07 21:34:15 llm_engine.py:322] # GPU blocks: 1940, # CPU blocks: 4096
INFO 02-07 21:34:17 model_runner.py:632] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce



[36m(RayWorkerVllm pid=36128)[0m INFO 02-07 21:34:52 model_runner.py:698] Graph capturing finished in 35 secs.
INFO 02-07 21:34:52 model_runner.py:698] Graph capturing finished in 35 secs.


In [3]:
import pandas as pd
import numpy as np 
# Load the parsed questions, add two empty (NaN) result columns and reverse the
# row order.
questions_path = "03_parsed_questions.parquet"
df = pd.read_parquet(questions_path)
for empty_column in ("Positive", "Hard Negative"):
    df[empty_column] = np.nan
df = df.iloc[::-1]

In [4]:
from tqdm import tqdm 

def generate_prompt(row):
    """Build the chat-formatted prompt string for one row.

    Args:
        row: A pandas Series holding the fields "topic", "Imperative Form",
            "Question" and "Search String".

    Returns:
        The text produced by the tokenizer's chat template for a user turn
        (the filled-in ``prompt_template``) followed by an assistant turn that
        already contains ``response_template``, with a trailing ``"</s>"``
        removed if present.
    """
    cleaned = row.fillna("")

    # Strip one surrounding double quote from each query variant and put each
    # variant on its own line.
    query_fields = cleaned[["Imperative Form", "Question", "Search String"]]
    unquoted = query_fields.str.removesuffix('"').str.removeprefix('"')
    questions = "\n".join(unquoted.to_list())

    user_message = (
        prompt_template
        .replace("{questions}", str(questions))
        .replace("{topic}", str(cleaned["topic"]))
    )
    conversation = [
        {"role": "user", "content": user_message},
        {"role": "assistant", "content": response_template},
    ]
    chat_text = tokenizer.apply_chat_template(conversation=conversation, tokenize=False)

    # Drop the closing "</s>" after the pre-filled assistant text; presumably
    # so the model continues that turn instead of treating it as finished.
    return chat_text.removesuffix("</s>")


BATCH_SIZE = 32
RESULTS_PATH = "04_results_texts_v5.parquet"
PROMPT_COLUMNS = ["topic", "Imperative Form", "Question", "Search String"]

# Resume from the checkpoint file: rows whose `raw_texts` equals the literal
# string "nan" are the ones still to be generated.
# NOTE(review): real missing values (NaN/None) are not matched by this
# comparison - confirm that the checkpoint only ever stores the string "nan".
df = pd.read_parquet(RESULTS_PATH)
df_nan = df[df["raw_texts"]=="nan"]


for i in tqdm(range(0, len(df_nan), BATCH_SIZE), desc="Processing batches"):
    batches = df_nan[PROMPT_COLUMNS].iloc[i:i+BATCH_SIZE]
    formatted_prompt = [generate_prompt(row) for _, row in batches.iterrows()]
    results = llm.generate(formatted_prompt, sampling_params=sampling_params)
    # Keep the part of the prompt after the last "[/INST]" (the pre-filled
    # assistant text) and append the generated continuation to it.
    results_adj = [result.prompt.split("[/INST]")[-1]+ result.outputs[0].text for result in results]
    df.loc[batches.index, 'raw_texts'] = results_adj
    # Checkpoint after every batch. Write to a temporary file and then swap it
    # into place, so interrupting the run mid-write cannot leave a truncated
    # results file behind.
    tmp_path = RESULTS_PATH + ".tmp"
    df.to_parquet(tmp_path)
    os.replace(tmp_path, RESULTS_PATH)


# vllm 0.2.7
# Processed prompts: 100%|██████████| 32/32 [02:03<00:00,  3.87s/it]
# Processed prompts: 100%|██████████| 32/32 [02:15<00:00,  4.25s/it]
# Processed prompts: 100%|██████████| 32/32 [02:14<00:00,  4.21s/it]
# Processed prompts: 100%|██████████| 32/32 [02:17<00:00,  4.31s/it]
# Processed prompts: 100%|██████████| 32/32 [02:07<00:00,  3.98s/it]
# Processed prompts: 100%|██████████| 32/32 [02:16<00:00,  4.28s/it]

# vllm 0.3: disable cuda graph & all reduce
# Processed prompts: 100%|██████████| 32/32 [02:13<00:00,  4.17s/it]
# Processed prompts: 100%|██████████| 32/32 [02:39<00:00,  4.99s/it]
# Processed prompts: 100%|██████████| 32/32 [02:05<00:00,  3.93s/it]
    
# vllm 0.3: disable all reduce
# Processed prompts: 100%|██████████| 32/32 [02:14<00:00,  4.22s/it]
# Processed prompts: 100%|██████████| 32/32 [02:07<00:00,  3.99s/it]
# Processed prompts: 100%|██████████| 32/32 [02:03<00:00,  3.86s/it]
# Processed prompts: 100%|██████████| 32/32 [02:00<00:00,  3.78s/it]
# Processed prompts: 100%|██████████| 32/32 [02:01<00:00,  3.80s/it]
# Processed prompts: 100%|██████████| 32/32 [01:56<00:00,  3.65s/it]
# Processed prompts: 100%|██████████| 32/32 [02:08<00:00,  4.01s/it]
# Processed prompts: 100%|██████████| 32/32 [02:13<00:00,  4.16s/it]
# Processed prompts: 100%|██████████| 32/32 [02:08<00:00,  4.03s/it]
# Processed prompts: 100%|██████████| 32/32 [01:53<00:00,  3.54s/it]
# Processed prompts: 100%|██████████| 32/32 [01:45<00:00,  3.28s/it]
# Processed prompts: 100%|██████████| 32/32 [02:00<00:00,  3.76s/it]
    

    

Processed prompts: 100%|██████████| 32/32 [02:30<00:00,  4.69s/it]
Processed prompts: 100%|██████████| 32/32 [02:26<00:00,  4.57s/it]6s/it]
Processed prompts: 100%|██████████| 32/32 [02:01<00:00,  3.79s/it]0s/it]
Processed prompts: 100%|██████████| 32/32 [02:06<00:00,  3.94s/it]2s/it]
Processed prompts: 100%|██████████| 32/32 [02:20<00:00,  4.38s/it]7s/it]
Processed prompts: 100%|██████████| 32/32 [02:08<00:00,  4.02s/it]7s/it]
Processed prompts: 100%|██████████| 32/32 [02:14<00:00,  4.19s/it]6s/it]
Processed prompts: 100%|██████████| 32/32 [02:19<00:00,  4.35s/it]0s/it]
Processed prompts: 100%|██████████| 32/32 [02:12<00:00,  4.15s/it]6s/it]
Processed prompts: 100%|██████████| 32/32 [02:15<00:00,  4.22s/it]4s/it]
Processed prompts: 100%|██████████| 32/32 [02:12<00:00,  4.13s/it]15s/it]
Processed prompts: 100%|██████████| 32/32 [02:00<00:00,  3.75s/it]36s/it]
Processed prompts: 100%|██████████| 32/32 [02:22<00:00,  4.46s/it]20s/it]
Processed prompts: 100%|██████████| 32/32 [02:13<00:00

KeyboardInterrupt: 