In [1]:
from llama_cpp import Llama
from tqdm import tqdm
import pandas as pd

# Генерация вопросов с помощью Zephyr и Llama 2

In [2]:
def _has_value(value):
    """Return True when ``value`` is neither ``None`` nor a pandas missing value."""
    return value is not None and not pd.isna(value)


def _luggage_requested(flag):
    """Interpret a present ``need_luggage`` value as a boolean.

    Strings are parsed by content ("true", "1", "yes", case-insensitive) because
    ``bool("False")`` would be truthy; everything else goes through ``bool()`` so
    that NumPy booleans and 0/1 numbers behave like Python booleans.
    """
    if isinstance(flag, str):
        return flag.strip().lower() in {"true", "1", "yes"}
    return bool(flag)


def generate_prompt(city_from, city_to, date_str, date_back, need_luggage):
    """Build the instruction that asks the model to write a ticket-purchase request.

    Each argument is optional: a value that is ``None`` or a pandas missing value
    (as reported by ``pd.isna``) is left out of the prompt.

    Args:
        city_from: Departure city, inserted as " from <city_from>".
        city_to: Destination city, inserted as " to <city_to>".
        date_str: Departure date, inserted as " on <date_str>".
        date_back: Return date, inserted as " with a return ticket on <date_back>".
        need_luggage: Luggage flag; a true value adds " with luggage", a false
            value adds " without luggage".

    Returns:
        The prompt string, starting with "Q: " and ending with " A:".
    """
    prompt = "Q: Generate a request or desire to buy a plane ticket"
    if _has_value(city_from):
        prompt += f" from {city_from}"
    if _has_value(city_to):
        prompt += f" to {city_to}"
    if _has_value(date_str):
        prompt += f" on {date_str}"
    if _has_value(date_back):
        prompt += f" with a return ticket on {date_back}"
    if _has_value(need_luggage):
        # Truthiness instead of `is True`: an identity check fails for np.True_
        # and for 1/1.0 flags, which silently produced "without luggage".
        if _luggage_requested(need_luggage):
            prompt += " with luggage"
        else:
            prompt += " without luggage"
    prompt += ". Add only information given, do not add anything else. Write briefly, in one-two sentences in English. A:"

    return prompt

In [9]:
# Load the rows to turn into prompts and the quantized Zephyr model that answers them.
data = pd.read_csv("../data/preprocessed/test.csv")
llm = Llama(model_path="../models/zephyr-7b-beta.Q4_K_M.gguf")

# Rows whose index label is below this value are skipped.
# NOTE(review): presumably a resume point left over from an interrupted run;
# set it to 0 to generate a question for every row.
RESUME_FROM_ROW = 236

outputs_zephyr = []
generated_rows = []  # index labels of the rows that actually received a completion

# `total` lets tqdm draw a real progress bar; iterrows() is a generator with no length.
for i, row in tqdm(data.iterrows(), total=len(data)):
    if i < RESUME_FROM_ROW:
        continue
    city_from, city_to, date_str, date_back, need_luggage = (
        row["city_from"],
        row["city_to"],
        row["date"],
        row["date_back"],
        row["need_luggage"],
    )
    question = generate_prompt(city_from, city_to, date_str, date_back, need_luggage)
    output = llm.create_chat_completion(
        messages=[{"role": "user", "content": question}], temperature=0.8
    )

    outputs_zephyr.append(output["choices"][0]["message"]["content"])
    generated_rows.append(i)


# Assign through an index-labelled Series: pandas aligns it with `data`, so the
# skipped rows become NaN. Assigning the bare list raised a length-mismatch
# ValueError whenever any row was skipped.
data["zephyr_question"] = pd.Series(outputs_zephyr, index=generated_rows, dtype=object)

0it [00:00, ?it/s]Llama.generate: prefix-match hit
237it [00:10, 22.79it/s]
llama_print_timings:        load time =    5330.46 ms
llama_print_timings:      sample time =      11.33 ms /    56 runs   (    0.20 ms per token,  4942.19 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =   10294.05 ms /    56 runs   (  183.82 ms per token,     5.44 tokens per second)
llama_print_timings:       total time =   10392.91 ms
Llama.generate: prefix-match hit
238it [00:27,  6.89it/s]
llama_print_timings:        load time =    5330.46 ms
llama_print_timings:      sample time =      16.47 ms /    83 runs   (    0.20 ms per token,  5038.85 tokens per second)
llama_print_timings: prompt eval time =    1663.83 ms /    30 tokens (   55.46 ms per token,    18.03 tokens per second)
llama_print_timings:        eval time =   15137.20 ms /    82 runs   (  184.60 ms per token,     5.

In [12]:
# Replace the active model with the quantized Llama-2-7B chat checkpoint.
llm = Llama(model_path="../models/llama-2-7b-chat.Q4_K_M.gguf")

# Columns passed to generate_prompt, in the order of its positional parameters.
prompt_columns = ("city_from", "city_to", "date", "date_back", "need_luggage")

outputs_llama = []
for _, record in tqdm(data.iterrows()):
    question = generate_prompt(*(record[column] for column in prompt_columns))
    reply = llm.create_chat_completion(
        temperature=0.8,
        messages=[{"role": "user", "content": question}],
    )
    # Keep only the generated text of the first choice.
    first_choice = reply["choices"][0]
    outputs_llama.append(first_choice["message"]["content"])

# One generated request per row, stored next to the existing columns and saved.
data["llama_question"] = outputs_llama
data.to_csv("../data/preprocessed/test_with_questions.csv")

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from ../models/llama-2-7b-chat.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q6_K     [ 11008,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q4_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q4_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q4_K     [  4096,  4096,     1,     1 

## Генерация простых вопросов

In [3]:
from llama_cpp import Llama
from tqdm import tqdm
import pandas as pd
import json

In [4]:
# Load the quantized Zephyr-7B-beta model; the question-generation cells below call it via `llm`.
llm = Llama(
    model_path="../models/zephyr-7b-beta.Q4_K_M.gguf",
)

llama_model_loader: loaded meta data with 21 key-value pairs and 291 tensors from ../models/zephyr-7b-beta.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q6_K     [ 14336,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q4_K     [  4096,  4096,     1

In [15]:
NUM_QUESTIONS = 100
# Hard cap on model calls so the cell terminates even if the model keeps
# returning empty completions.
MAX_ATTEMPTS = NUM_QUESTIONS * 3

# The prompt is the same for every call, so it is built once (plain string:
# there is nothing to interpolate).
prompt = "Q: Generate one question on any topic. A:"

outputs_zephyr = []

for _ in range(MAX_ATTEMPTS):
    if len(outputs_zephyr) >= NUM_QUESTIONS:
        break
    output = llm.create_chat_completion(
        messages=[{"role": "user", "content": prompt}],
        temperature=0.8,
        max_tokens=100,
        stop=["\n", "A:"],
    )

    question = (output["choices"][0]["message"]["content"] or "").strip()
    # "\n" is a stop sequence, so a completion that starts with a newline comes
    # back empty; skip it and try again instead of storing a blank question.
    if question:
        outputs_zephyr.append(question)


with open("../data/random_questions.json", "w") as f:
    json.dump(outputs_zephyr, f)

Llama.generate: prefix-match hit

llama_print_timings:        load time =    1894.37 ms
llama_print_timings:      sample time =       2.50 ms /    12 runs   (    0.21 ms per token,  4792.33 tokens per second)
llama_print_timings: prompt eval time =     705.20 ms /    10 tokens (   70.52 ms per token,    14.18 tokens per second)
llama_print_timings:        eval time =    2068.93 ms /    11 runs   (  188.08 ms per token,     5.32 tokens per second)
llama_print_timings:       total time =    2796.50 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time =    1894.37 ms
llama_print_timings:      sample time =       6.92 ms /    30 runs   (    0.23 ms per token,  4334.01 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =    5662.28 ms /    30 runs   (  188.74 ms per token,     5.30 tokens per second)
llama_print_timings:       total time =    5

In [16]:
NUM_QUESTIONS = 100
# Hard cap on model calls so the cell terminates even if the model keeps
# returning empty completions.
MAX_ATTEMPTS = NUM_QUESTIONS * 3

# The prompt is the same for every call, so it is built once. The topic was
# previously misspelled ("airplains") in the text sent to the model.
prompt = "Q: Generate one question on a topic of airplanes and flights. A:"

outputs_zephyr = []

for _ in range(MAX_ATTEMPTS):
    if len(outputs_zephyr) >= NUM_QUESTIONS:
        break
    output = llm.create_chat_completion(
        messages=[{"role": "user", "content": prompt}],
        temperature=0.8,
        max_tokens=100,
        stop=["\n", "A:"],
    )

    question = (output["choices"][0]["message"]["content"] or "").strip()
    # "\n" is a stop sequence, so a completion that starts with a newline comes
    # back empty; skip it and try again instead of storing a blank question.
    if question:
        outputs_zephyr.append(question)


# The file name keeps its original spelling because other code may read this path.
with open("../data/plains_flights_questions.json", "w") as f:
    json.dump(outputs_zephyr, f)

Llama.generate: prefix-match hit

Llama.generate: prefix-match hit
llama_print_timings:        load time =    1894.37 ms
llama_print_timings:      sample time =       3.02 ms /    14 runs   (    0.22 ms per token,  4635.76 tokens per second)
llama_print_timings: prompt eval time =    1124.38 ms /    16 tokens (   70.27 ms per token,    14.23 tokens per second)
llama_print_timings:        eval time =    2438.82 ms /    13 runs   (  187.60 ms per token,     5.33 tokens per second)
llama_print_timings:       total time =    3594.31 ms

llama_print_timings:        load time =    1894.37 ms
llama_print_timings:      sample time =       0.24 ms /     1 runs   (    0.24 ms per token,  4237.29 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =     179.59 ms /     1 runs   (  179.59 ms per token,     5.57 tokens per second)
llama_print_timings:       total time =     