# Generierung von Aufgaben zum Embedding
Wir verwenden den Quora-Scrape, um verschiedene Topics zu erhalten. Damit generieren wir jeweils 20 diverse Aufgaben.

In [None]:
# from huggingface_hub import notebook_login
# notebook_login()

In [1]:
# from huggingface_hub import hf_hub_download
# hf_hub_download(repo_id="SebastianBodza/Quora_deutsch_ger_Pairs_RL_DPO", filename="output.jsonl", repo_type="dataset")
import os
import random
import urllib.parse

import pandas as pd

# Alternative source (DPO pairs dataset), kept for reference:
#quora = pd.read_json("/home/bodza/.cache/huggingface/hub/datasets--SebastianBodza--Quora_deutsch_ger_Pairs_RL_DPO/snapshots/f81129b4c2a5453b6c037e0571a52b701da8f6b8/output.jsonl", orient="records", lines=True)

# Location of the Quora CSV. The default is the original author's local
# Hugging Face cache; set the QUORA_CSV_PATH environment variable to point the
# notebook at a copy on another machine without editing this cell.
QUORA_CSV_PATH = os.environ.get(
    "QUORA_CSV_PATH",
    '/home/bodza/.cache/huggingface/hub/datasets--SebastianBodza--Quora_Deutsch_ger/snapshots/b852cae090a09b06d443ccc8dcef36bc02ef74e2/Quora_deutsch.csv',
)
quora = pd.read_csv(QUORA_CSV_PATH, index_col=0)
# Keep a single row per topic; only the distinct topic names are used below.
quora = quora.drop_duplicates(subset=["topic"])

# def sample_topics(n=5): 
#     samples = random.sample(quora["topic"].unique().tolist(), n)
#     samples = [urllib.parse.unquote(encoded_string) for encoded_string in samples]
#     return samples

def sample_topics(n=5):
    """Yield every distinct Quora topic exactly once, in groups of ``n``.

    This is a generator function: calling it returns an iterator, not a list,
    so callers have to loop over it (or call ``next``) to obtain a group.
    Topics are read from the module-level ``quora`` frame, sorted in their raw
    (still URL-encoded) form so the grouping is identical on every run, and
    percent-decoded just before being yielded.

    Args:
        n: Number of topics per group; must be at least 1. The last group is
            shorter when the topic count is not a multiple of ``n``.

    Yields:
        list[str]: The decoded topic names of one group.

    Raises:
        ValueError: If ``n`` is smaller than 1 (raised when iteration starts).
    """
    if n < 1:
        # A negative step would otherwise produce an empty range and hide the mistake.
        raise ValueError(f"n must be >= 1, got {n}")
    # Missing topics can neither be sorted together with strings nor be
    # percent-decoded, so they are dropped before sorting.
    encoded_topics = sorted(quora["topic"].dropna().unique().tolist())
    for start in range(0, len(encoded_topics), n):
        yield [urllib.parse.unquote(topic) for topic in encoded_topics[start:start + n]]

In [2]:
# User-turn instruction sent to the model. The only placeholder is {tasks},
# which generate_prompt() fills with the topic names, one per line.
prompt = """Create a list of potentially useful text retrieval tasks (RAG).
Stick to the following guidelines:
- Specify what the query is and what the requested documents are.
- Each retrieval task should cover a wide range of requests and should not be too specific.

Your output should always be just a list of strings, with about 5 elements each, and each element corresponds to a unique retrieval task in a set. Don't explain yourself or give anything else away. Be creative.

Create these tasks in the following areas:
{tasks}
and add five additional areas be creative! 
Create all entries completely in German! Never use English! Never use "Sie"! Never use the german word "Sie"!"""

# Start of the assistant turn that generate_prompt() appends after the user
# message: the first topic in quotes followed by "1.", so the model continues
# an already started numbered list instead of opening with free text.
response_template ="'{category_1}':\n1."

In [3]:
import torch 
import vllm 
import pandas as pd 
from vllm import SamplingParams
from transformers import AutoTokenizer

model_name = "TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ"
# Empty result frame. Its columns match what the generation cell stores
# ("topics" and "results"), so concatenating batches onto it does not create
# an extra, all-NaN column.
df = pd.DataFrame(columns=['topics', 'results'])
# NOTE(review): max_tokens (4000) is larger than max_model_len (2000) passed to
# the engine below, so the context limit presumably ends generation first -
# confirm whether one of the two values should be changed.
sampling_params = SamplingParams(temperature=0.1, max_tokens=4000)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
# GPTQ-quantized checkpoint, sharded over two GPUs via tensor parallelism.
llm = vllm.LLM(
    model=model_name,
    quantization="gptq",
    dtype=torch.float16,
    tensor_parallel_size=2,
    max_model_len=2000,
    revision="gptq-4bit-32g-actorder_True",
    gpu_memory_utilization=0.75,
)



2024-01-23 15:54:58,876	INFO worker.py:1724 -- Started a local Ray instance.


INFO 01-23 15:54:59 llm_engine.py:70] Initializing an LLM engine with config: model='TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ', tokenizer='TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ', tokenizer_mode=auto, revision=gptq-4bit-32g-actorder_True, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=2000, download_dir=None, load_format=auto, tensor_parallel_size=2, quantization=gptq, enforce_eager=False, seed=0)
INFO 01-23 15:55:19 llm_engine.py:275] # GPU blocks: 3564, # CPU blocks: 4096
INFO 01-23 15:55:20 model_runner.py:501] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 01-23 15:55:20 model_runner.py:505] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode.
[36m(RayWorkerVllm pid=49045)[0m INFO 0



INFO 01-23 15:55:55 model_runner.py:547] Graph capturing finished in 35 secs.


In [4]:
from tqdm import tqdm 

def generate_prompt(topics=None):
    """Build the chat-formatted generation prompt for one group of topics.

    Args:
        topics: Decoded topic names to list in the user message. When omitted,
            the first group produced by ``sample_topics()`` is used.

    Returns:
        tuple[list[str], str]: The topics and the prompt string. The prompt
        stops inside the assistant turn, right after the filled-in
        ``response_template``, so the model continues that answer.
    """
    if topics is None:
        # sample_topics() is a generator function, so it has to be advanced
        # explicitly; joining or indexing the generator object itself fails.
        topics = next(sample_topics())
    topics = list(topics)
    conversation = [
        {"role": "user", "content": prompt.format(tasks="\n".join(topics))},
        {"role": "assistant", "content": response_template.format(category_1=topics[0])},
    ]
    formatted_prompt = tokenizer.apply_chat_template(conversation=conversation, tokenize=False)
    # Remove the trailing "</s>" (presumably the end-of-sequence marker the
    # chat template puts after the assistant message) so the turn stays open.
    formatted_prompt = formatted_prompt.removesuffix("</s>")
    return (topics, formatted_prompt)

BATCH_SIZE = 32
MAX_PROMPTS = 5000  # upper bound on the number of prompts generated per run of this cell
RESULTS_PATH = "results.parquet"

# Resume from an earlier checkpoint when there is one; start empty on a fresh run.
try:
    df = pd.read_parquet(RESULTS_PATH)
except FileNotFoundError:
    df = pd.DataFrame(columns=["topics", "results"])

# sample_topics() walks the topics in a fixed order, so groups that are already
# stored in the checkpoint are skipped instead of being generated a second time.
done_groups = {tuple(group) for group in df["topics"].dropna()}
pending_groups = [group for group in sample_topics() if tuple(group) not in done_groups][:MAX_PROMPTS]

for start in tqdm(range(0, len(pending_groups), BATCH_SIZE)):
    batch_groups = pending_groups[start:start + BATCH_SIZE]
    topics, formatted_prompt = zip(*[generate_prompt(group) for group in batch_groups])
    results = llm.generate(list(formatted_prompt), sampling_params=sampling_params)
    # The text after the last "[/INST]" is the assistant prefix that was put
    # into the prompt; prepend it so each stored answer is complete.
    results_adj = [result.prompt.split("[/INST]")[-1] + result.outputs[0].text for result in results]
    batch_df = pd.DataFrame({"topics": list(topics), "results": results_adj})
    df = batch_df if df.empty else pd.concat([df, batch_df], ignore_index=True)
    # Checkpoint after every batch so an interrupted run loses at most one batch.
    df.to_parquet(RESULTS_PATH)
    # Timing notes: about 2:30 min for a batch of 24;
    # extrapolated about 3:20 min for 32 and about 6:40 min for 64.

  0%|          | 0/157 [00:00<?, ?it/s]

[36m(RayWorkerVllm pid=49045)[0m INFO 01-23 15:55:55 model_runner.py:547] Graph capturing finished in 35 secs.


Processed prompts: 100%|██████████| 32/32 [02:42<00:00,  5.08s/it]
Processed prompts: 100%|██████████| 32/32 [03:09<00:00,  5.92s/it]
Processed prompts: 100%|██████████| 32/32 [03:08<00:00,  5.90s/it]
Processed prompts: 100%|██████████| 32/32 [03:00<00:00,  5.63s/it]
Processed prompts: 100%|██████████| 32/32 [02:51<00:00,  5.37s/it]
Processed prompts: 100%|██████████| 32/32 [03:09<00:00,  5.92s/it]
Processed prompts: 100%|██████████| 32/32 [03:07<00:00,  5.86s/it]
Processed prompts: 100%|██████████| 32/32 [03:05<00:00,  5.79s/it]
Processed prompts: 100%|██████████| 32/32 [03:03<00:00,  5.74s/it]
Processed prompts: 100%|██████████| 32/32 [03:10<00:00,  5.96s/it]
Processed prompts: 100%|██████████| 32/32 [03:09<00:00,  5.91s/it]
Processed prompts: 100%|██████████| 32/32 [03:09<00:00,  5.91s/it]
Processed prompts: 100%|██████████| 32/32 [03:10<00:00,  5.94s/it]
Processed prompts: 100%|██████████| 32/32 [02:53<00:00,  5.41s/it]
Processed prompts: 100%|██████████| 32/32 [03:09<00:00,  5.92s

KeyboardInterrupt: 