# Fragen erzeugen als Search String, Question und Imperative Form

In [64]:
# One-shot example answer (German) used as the first assistant turn in
# generate_prompt. The imperative sentence comes first without a label because
# response_template is prepended there and already ends with
# "Imperative Form: "; the labelled question and search-string variants follow.
previous_response = (
    '"Bestimme das Jahr, in dem das Apple File System (APFS) von Apple vorgestellt wurde."\n'
    'Question: "In welchem Jahr wurde das Apple File System (APFS) von Apple vorgestellt?"\n'
    'Search String: "Jahr Apple File System (APFS) vorgestellt"\n'
)

In [65]:
# Instruction text used for every user turn. "{TOPICCC}" is a literal
# placeholder that generate_prompt fills in with str.replace (not str.format).
# The model is asked for three German phrasings of the same request: an
# imperative sentence, a question and a search-engine query.
# NOTE(review): "retreival" is misspelled in the prompt; it is kept as-is here
# because the text is sent to the model verbatim.
prompt = (
    "You are a Data Scientist wanting to generate a Dataset for Training an Embedding model for retreival tasks. "
    "Therefore in the first step generate an instruction, question or search string with the topic:\n"
    "{TOPICCC}\n"
    "Generate the questions/searchstring/instruction in the following form:\n"
    "Imperative Form: [This is like telling someone to do something. Command or order. "
    "It's a sentence that gives direct advice or instruction on the topic. "
    'Do not use a "!" at the end]\n'
    "Question: [This is like asking someone about something.]\n"
    "Search String: [A set of words that you would type into a search engine to find information on the internet.]\n"
    "\n"
    "All of the generated question/instruction/searchstring should only be different formulations of each other "
    'and be about the same question and in german. Do not use "Sie"\n'
)

# Opening of each assistant turn: repeats the topic and starts the
# "Imperative Form" line, so the answer text continues directly after the label.
response_template = "\n".join(("{TOPICCC}", "Imperative Form: "))

In [50]:
import pandas as pd 
import numpy as np 

# Load the parsed results and expand the "questions" column so that every
# element of a list-like entry gets its own row.
df = pd.read_parquet("results_parsed.parquet")
# reset_index() keeps the pre-explode index as an "index" column and gives each
# exploded row a unique label; the label-based write-back of the generated text
# depends on those labels being unique.
df = df.explode(column="questions").reset_index()
# Placeholder column for the generated text. It is created with object dtype
# (still NaN-filled) because strings are assigned into it later: a float64 NaN
# column would have to be silently upcast on the first string assignment, which
# pandas has deprecated.
df["gen_questions"] = pd.Series(index=df.index, dtype="object")

In [20]:
import torch 
import vllm 
import pandas as pd 
from vllm import SamplingParams
from transformers import AutoTokenizer

# Hugging Face repository of the GPTQ-quantized checkpoint; used for both the
# tokenizer and the vLLM engine.
model_name = "TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ"

# NOTE(review): max_tokens (4000) is larger than the max_model_len (2000) passed
# to the engine below, so the context limit presumably ends a generation before
# this cap is reached - confirm whether a smaller cap was intended.
sampling_params = SamplingParams(max_tokens=4000, temperature=0.1)

# Tokenizer is loaded separately from the engine; it is used to render the
# chat template into a plain-text prompt.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

llm = vllm.LLM(
    model=model_name,
    revision="gptq-4bit-32g-actorder_True",
    quantization="gptq",
    dtype=torch.float16,
    tensor_parallel_size=2,
    max_model_len=2000,
    gpu_memory_utilization=0.75,
)



2024-01-25 10:05:26,854	INFO worker.py:1724 -- Started a local Ray instance.


INFO 01-25 10:05:27 llm_engine.py:70] Initializing an LLM engine with config: model='TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ', tokenizer='TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ', tokenizer_mode=auto, revision=gptq-4bit-32g-actorder_True, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=2000, download_dir=None, load_format=auto, tensor_parallel_size=2, quantization=gptq, enforce_eager=False, seed=0)
INFO 01-25 10:05:36 llm_engine.py:275] # GPU blocks: 3497, # CPU blocks: 4096
INFO 01-25 10:05:36 model_runner.py:501] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 01-25 10:05:36 model_runner.py:505] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode.
[36m(RayWorkerVllm pid=484370)[0m INFO 



INFO 01-25 10:06:11 model_runner.py:547] Graph capturing finished in 35 secs.


[36m(RayWorkerVllm pid=484370)[0m INFO 01-25 10:06:11 model_runner.py:547] Graph capturing finished in 35 secs.


In [67]:
from tqdm import tqdm 

def generate_prompt(questions):
    """Render the one-shot chat prompt for a single topic as plain text.

    The conversation consists of a fixed worked example (topic
    "Apple File System (APFS)", answered with ``previous_response``) followed
    by the same instruction for ``questions`` and an assistant turn that is
    pre-filled from ``response_template``.

    Args:
        questions: Text substituted for the ``{TOPICCC}`` placeholder in the
            second user turn and in the pre-filled assistant turn.

    Returns:
        The chat-template output (``tokenize=False``) with a trailing
        ``"</s>"`` removed if present - presumably so the model continues the
        last assistant turn instead of treating it as finished.
    """
    placeholder = "{TOPICCC}"
    example_topic = "Apple File System (APFS)"

    def fill(template, topic):
        # Plain substring replacement; the templates are not format strings.
        return template.replace(placeholder, topic)

    conversation = [
        {"role": "user", "content": fill(prompt, example_topic)},
        {"role": "assistant", "content": fill(response_template, example_topic) + previous_response},
        {"role": "user", "content": fill(prompt, questions)},
        {"role": "assistant", "content": fill(response_template, questions)},
    ]
    rendered = tokenizer.apply_chat_template(conversation=conversation, tokenize=False)
    return rendered.removesuffix("</s>")

# Number of questions passed to a single llm.generate call.
BATCH_SIZE = 32

for start in tqdm(range(0, len(df), BATCH_SIZE)):
    batch_questions = df["questions"].iloc[start:start + BATCH_SIZE]
    chat_prompts = list(map(generate_prompt, batch_questions))
    generations = llm.generate(chat_prompts, sampling_params=sampling_params)

    completed = []
    for generation in generations:
        # Everything after the last "[/INST]" in the prompt is the pre-filled
        # start of the assistant turn; prepend it to the generated continuation
        # so the stored text contains the full answer.
        prefill = generation.prompt.split("[/INST]")[-1]
        completed.append(prefill + generation.outputs[0].text)

    # Write back by index label so the rows of this batch receive their results.
    df.loc[batch_questions.index, "gen_questions"] = completed
    # The whole frame is rewritten after every batch, so the file on disk
    # always holds the results produced so far.
    df.to_parquet("results_questions.parquet")


Processed prompts: 100%|██████████| 32/32 [01:09<00:00,  2.16s/it]
Processed prompts: 100%|██████████| 32/32 [00:49<00:00,  1.56s/it]
Processed prompts:  62%|██████▎   | 20/32 [13:02<07:49, 39.11s/it]
Processed prompts: 100%|██████████| 32/32 [00:56<00:00,  1.76s/it]
Processed prompts: 100%|██████████| 32/32 [00:49<00:00,  1.53s/it]
Processed prompts: 100%|██████████| 32/32 [00:43<00:00,  1.36s/it]
Processed prompts: 100%|██████████| 32/32 [00:52<00:00,  1.64s/it]
Processed prompts: 100%|██████████| 32/32 [00:50<00:00,  1.57s/it]
Processed prompts: 100%|██████████| 32/32 [00:49<00:00,  1.56s/it]
Processed prompts: 100%|██████████| 32/32 [00:45<00:00,  1.42s/it]
Processed prompts: 100%|██████████| 32/32 [00:55<00:00,  1.73s/it]
Processed prompts: 100%|██████████| 32/32 [00:50<00:00,  1.58s/it]
Processed prompts: 100%|██████████| 32/32 [00:40<00:00,  1.27s/it]
Processed prompts: 100%|██████████| 32/32 [00:40<00:00,  1.25s/it]
Processed prompts: 100%|██████████| 32/32 [00:42<00:00,  1.32s