# Load 10 test datapoints

In [29]:
# Load test data
import json
data_path = '/data/user_data/tianyuca/QL_dataset/popqa/readability/original_queries.jsonl'
with open(data_path, 'r') as f:
    data = [json.loads(line) for line in f]
data[:2]

[{'question': 'Who was the director of The Boys?',
  's_pop': 148,
  'o_pop': 155,
  'id': 5913920,
  'roberta_formality_score': 0.9866698384,
  'concreteness_score': 1.91,
  'linguistic_formality_score': 68.75,
  'readability': 90.9585714286,
  's_pop_bin': 0,
  'answers': ['Mikko Niskanen', 'Mikko Johannes Niskanen']},
 {'question': 'Who was the director of Mother and Child?',
  's_pop': 59,
  'o_pop': 404,
  'id': 1153613,
  'roberta_formality_score': 0.9929551482,
  'concreteness_score': 3.7775,
  'linguistic_formality_score': 66.6666666667,
  'readability': 82.39,
  's_pop_bin': 0,
  'answers': ['Hans Steinhoff']}]

In [30]:
random_samples = data[:10]
questions = [d['question'] for d in random_samples]
# original_scores = [d['readability'] for d in random_samples]
# original_scores = [d['roberta_formality_score'] for d in random_samples]
questions

['Who was the director of The Boys?',
 'Who was the director of Mother and Child?',
 'Who is the author of Branches?',
 'Who was the composer of Trans?',
 'Who was the director of The Chicken?',
 'What is the religion of Bernard William Schmitt?',
 'In what country is Springwood High School?',
 'What genre is Operation Sabotage?',
 "What is Henk Bleker's occupation?",
 "What is Andreas Rüdiger's occupation?"]

# Llama-3.1-8B-Instruct Prompting

In [61]:
read_prompt_prefix_list = [
    'You are a query rewriting assistant. Your task is to transform the following query into a version with reduced readability. Use techniques such as increasing lexical complexity, introducing syntactic intricacy, and embedding semantic ambiguity, all while keeping the original meaning intact. Just do the query rewriting, DO NOT answer it.',
    'As an AI language model specialized in rewriting queries, convert the provided query into one that is less readable. Employ advanced vocabulary and complex sentence structures to obscure the clarity of the text, but ensure that the semantic content remains unchanged. Just do the query rewriting, DO NOT answer it.',
    'Your role is to rewrite the following query to lower its readability. Apply linguistic techniques such as intricate syntactic constructions, elevated diction, and nuanced semantic ambiguity. The final result should convey the same meaning as the original query despite its increased complexity. Just do the query rewriting, DO NOT answer it.',
    'Transform the given query into a version that is more difficult to read. Use high-level vocabulary, elaborate grammatical structures, and ambiguous phrasing where appropriate. It is essential that the underlying meaning of the query is preserved. Just do the query rewriting, DO NOT answer it.',
    'You are tasked with converting the following query into a less accessible version by increasing its lexical and syntactic complexity. Introduce elements of semantic ambiguity without altering the query’s original meaning. Provide the transformed query as your final output. Just do the query rewriting, DO NOT answer it.',
    'Rewrite the following query into one that is intentionally less readable. Enhance the complexity by incorporating advanced vocabulary, multifaceted sentence structures, and ambiguous phrasing, ensuring that the core meaning of the query remains the same. Just do the query rewriting, DO NOT answer it.',
]

formal_prompt_prefix_list = [
    "You are a query rewriting assistant. Your task is to transform the following query into a very informal and casual version. Use conversational tone, filler words, slang, idioms, and even occasional misspellings to give it a relaxed feel, but keep the meaning intact. Just do the query rewriting, DO NOT answer it.",
    "As an AI language model skilled in rephrasing, convert the provided query into one that's super casual and informal. Change the tone to be conversational, add some slang or idiomatic expressions, and feel free to include informal touches like typos or filler words, all while preserving the original meaning. Just do the query rewriting, DO NOT answer it.",
    "Convert the following query into an informal version using diverse and creative casual language. Do not stick to a repetitive template; instead, try different styles such as starting with a casual remark, inserting slang mid-sentence, or even reordering sentence elements. Use a mix of everyday speech and informal idioms while ensuring the original meaning is maintained. Just do the query rewriting, DO NOT answer it.",
    "You are an AI assistant skilled at transforming formal queries into casual, everyday language. Rewrite the following query so that it sounds very informal. Experiment with different colloquial openings, varied sentence constructions, and a mix of slang, idioms, and casual expressions throughout the sentence. Avoid using the same phrase repeatedly (e.g., \"hey, so like\") and ensure the meaning remains unchanged. Just do the query rewriting, DO NOT answer it.",
    "Your task is to convert the given query into an informal version that feels natural and conversational. Instead of a uniform introductory phrase, use a range of informal expressions (such as interjections, casual questions, or slang) at different parts of the sentence. Mix up the structure—sometimes start with an interjection, other times rephrase the sentence completely—while keeping the original meaning intact. Just do the query rewriting, DO NOT answer it.",
    "Rewrite the provided query in an informal and relaxed style. Avoid a one-size-fits-all beginning like “hey, so like.” Instead, choose from a broad range of casual language features—interjections, slang, filler words, varied sentence ordering, or even playful misspellings when appropriate. The output should be informal and varied, yet fully retain the semantic meaning of the original query. Just do the query rewriting, DO NOT answer it.",
]

polite_prompt_prefix_list = [
    'You are a skilled query rewriter. Transform the provided query into a polite variant. Experiment with diverse linguistic techniques—use courteous phrases, indirect requests, varied greetings, and subtle expressions of gratitude. The result should be both respectful and maintaining the original intent of the query. Just do the query rewriting, DO NOT answer it.',
    'As an AI with advanced linguistic capabilities, rewrite the following query to be more polite. Make sure to integrate natural expressions of courtesy, such as gentle requests or softeners, that make the query sound more respectful and friendly. Avoid using the same phrases repeatedly by varying your sentence structures and word choices. Just do the query rewriting, DO NOT answer it.',
    'You are tasked with rewriting the following query into a polite version. Examples of techniques includes using courteous phrases, considerate text, respect and good manners, and friendly tone. Vary your approach by using different sentence constructions, varying the placement of polite expressions. The result should be both respectful and maintaining the original intent of the query. Just do the query rewriting, DO NOT answer it.',
    'Your role is to transform the provided query into a more polite and courteous version. Utilize your linguistic skills to rephrase the query with friendly and respectful language, ensuring that the original intent and meaning are fully preserved. Just do the query rewriting, DO NOT answer it.',
    'Your task is to convert the given query into a polite, respectful version. This time, aim for diversity by using multiple sentence structures and alternative polite expressions. For example, vary your use of greetings, requests, or acknowledgments, and avoid a one-size-fits-all template. The final version should sound natural, courteous, and maintain the original intent of the query. Just do the query rewriting, DO NOT answer it.',
    'You are a query rewriting assistant tasked with transforming the given query into a polite version. Examples of techniques includes using courteous phrases, considerate text, respect and good manners, and friendly tone. Experiment with diverse vocabulary, sentence lengths, and formats (such as questions, statements, or even multi-sentence versions) to express politeness. The resulting query should remain respectful and true to the original meaning. Just do the query rewriting, DO NOT answer it.',
]

# gpt-4o-mini prompting

In [None]:
read_prompt_prefix_list = [
    'You are a query rewriting assistant. Your task is to transform the following query into a version with reduced readability. Use techniques such as increasing lexical complexity, introducing syntactic intricacy, and embedding semantic ambiguity, all while keeping the original meaning intact.',
    'As an AI language model specialized in rewriting queries, convert the provided query into one that is less readable. Employ advanced vocabulary and complex sentence structures to obscure the clarity of the text, but ensure that the semantic content remains unchanged.',
    'Your role is to rewrite the following query to lower its readability. Apply linguistic techniques such as intricate syntactic constructions, elevated diction, and nuanced semantic ambiguity. The final result should convey the same meaning as the original query despite its increased complexity.',
    'Transform the given query into a version that is more difficult to read. Use high-level vocabulary, elaborate grammatical structures, and ambiguous phrasing where appropriate. It is essential that the underlying meaning of the query is preserved.',
    'You are tasked with converting the following query into a less accessible version by increasing its lexical and syntactic complexity. Introduce elements of semantic ambiguity without altering the query’s original meaning. Provide the transformed query as your final output.',
    'Rewrite the following query into one that is intentionally less readable. Enhance the complexity by incorporating advanced vocabulary, multifaceted sentence structures, and ambiguous phrasing, ensuring that the core meaning of the query remains the same.',
]

formal_prompt_prefix_list = [
    "You are a query rewriting assistant. Your task is to transform the following query into a very informal and casual version. Use conversational tone, filler words, slang, idioms, and even occasional misspellings to give it a relaxed feel, but keep the meaning intact.",
    "As an AI language model skilled in rephrasing, convert the provided query into one that's super casual and informal. Change the tone to be conversational, add some slang or idiomatic expressions, and feel free to include informal touches like typos or filler words, all while preserving the original meaning.",
    "Convert the following query into an informal version using diverse and creative casual language. Do not stick to a repetitive template; instead, try different styles such as starting with a casual remark, inserting slang mid-sentence, or even reordering sentence elements. Use a mix of everyday speech and informal idioms while ensuring the original meaning is maintained.",
    "You are an AI assistant skilled at transforming formal queries into casual, everyday language. Rewrite the following query so that it sounds very informal. Experiment with different colloquial openings, varied sentence constructions, and a mix of slang, idioms, and casual expressions throughout the sentence. Avoid using the same phrase repeatedly (e.g., \"hey, so like\") and ensure the meaning remains unchanged.",
    "Your task is to convert the given query into an informal version that feels natural and conversational. Instead of a uniform introductory phrase, use a range of informal expressions (such as interjections, casual questions, or slang) at different parts of the sentence. Mix up the structure—sometimes start with an interjection, other times rephrase the sentence completely—while keeping the original meaning intact.",
    "Rewrite the provided query in an informal and relaxed style. Avoid a one-size-fits-all beginning like “hey, so like.” Instead, choose from a broad range of casual language features—interjections, slang, filler words, varied sentence ordering, or even playful misspellings when appropriate. The output should be informal and varied, yet fully retain the semantic meaning of the original query.",
]

polite_prompt_prefix_list = [
    'You are a skilled query rewriter. Transform the provided query into a polite variant. Experiment with diverse linguistic techniques—use courteous phrases, indirect requests, varied greetings, and subtle expressions of gratitude. The result should be both respectful and maintaining the original intent of the query.',
    'As an AI with advanced linguistic capabilities, rewrite the following query to be more polite. Make sure to integrate natural expressions of courtesy, such as gentle requests or softeners, that make the query sound more respectful and friendly. Avoid using the same phrases repeatedly by varying your sentence structures and word choices.',
    'You are tasked with rewriting the following query into a polite version. Examples of techniques includes using courteous phrases, considerate text, respect and good manners, and friendly tone. Vary your approach by using different sentence constructions, varying the placement of polite expressions. The result should be both respectful and maintaining the original intent of the query.',
    'Your role is to transform the provided query into a more polite and courteous version. Utilize your linguistic skills to rephrase the query with friendly and respectful language, ensuring that the original intent and meaning are fully preserved.',
    'Your task is to convert the given query into a polite, respectful version. This time, aim for diversity by using multiple sentence structures and alternative polite expressions. For example, vary your use of greetings, requests, or acknowledgments, and avoid a one-size-fits-all template. The final version should sound natural, courteous, and maintain the original intent of the query.',
    'You are a query rewriting assistant tasked with transforming the given query into a polite version. Examples of techniques includes using courteous phrases, considerate text, respect and good manners, and friendly tone. Experiment with diverse vocabulary, sentence lengths, and formats (such as questions, statements, or even multi-sentence versions) to express politeness. The resulting query should remain respectful and true to the original meaning.',
]

# For each query, randomly select a prompt prefix

In [None]:
import random

# Set a specific seed value for reproducibility
seed_value = 42
random.seed(seed_value)

# prompts = [(random.choice(read_prompt_prefix_list), 'Original Query: ' + query + '\nQuery with Low Readability: ') for query in questions]
# prompts = [(random.choice(formal_prompt_prefix_list), 'Original Query: ' + query + '\nInformal Query: ') for query in questions]
prompts = [(random.choice(polite_prompt_prefix_list), 'Original Query: ' + query + '\nPolite Query: ') for query in questions]
prompts

In [None]:
def format_prompt(system_prompt, user_prompt):
    return [
        {
            "role": "system", 
            "content": system_prompt
        },
        {
            "role": "user",
            "content": user_prompt
        }
    ]

messages_list = [format_prompt(prompt[0], prompt[1]) for prompt in prompts]
messages_list

In [75]:
import os
from openai import OpenAI

client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),  # This is the default and can be omitted
)

In [31]:
model = 'gpt-4o-mini'
temperature = 0.7
max_tokens = 100
top_p = 1.0
n = 1 # number of completions to generate

In [91]:
# GPT-4o-mini
modified_queries = []

for i in range(len(questions)):
    chat_completion = client.chat.completions.create(
        messages=messages_list[i],
        model=model,
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=top_p,
        n=n,
    )
    modified_queries.append(chat_completion.choices[0].message.content.replace('\n', '~| ').replace('\t', ' '))

In [None]:
from vllm import LLM, SamplingParams
llm = LLM(model='meta-llama/Llama-3.1-8B-Instruct', download_dir='/data/user_data/tianyuca/models')

In [64]:
sampling_params = SamplingParams(
    temperature=temperature,
    top_p=top_p,
    max_tokens=max_tokens,
)
# completion = llm.generate(prompts, sampling_params)
completion = llm.chat(messages_list, sampling_params)
print(completion)
modified_queries = [output.outputs[0].text.split('\n')[0] for output in completion]

Processed prompts: 100%|██████████| 10/10 [00:02<00:00,  3.99it/s, est. speed input: 506.99 toks/s, output: 124.85 toks/s]

[RequestOutput(request_id=111, prompt='<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\nYou are a query rewriting assistant tasked with transforming the given query into a polite version. Examples of techniques includes using courteous phrases, considerate text, respect and good manners, and friendly tone. Experiment with diverse vocabulary, sentence lengths, and formats (such as questions, statements, or even multi-sentence versions) to express politeness. The resulting query should remain respectful and true to the original meaning. Just do the query rewriting, DO NOT answer it.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nOriginal Query: Who was the director of The Boys?\nPolite Query:<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n', prompt_token_ids=[128000, 128000, 128006, 9125, 128007, 271, 38766, 1303, 33025, 2696, 25, 6790, 220, 2366, 18, 198, 15724, 2696, 25, 220, 1627, 10263, 2




In [65]:
modified_queries

['Could you please tell me who the director of the popular TV series The Boys was?',
 'I was wondering if you could tell me who the director of the film "Mother and Child" is.',
 'I was wondering if you could tell me who the author of the book "Branches" is.',
 'Could you please tell me who the composer behind the music of Trans is?',
 'Could you please tell me who directed the film "The Chicken"? ',
 "Could you kindly tell me more about Bernard William Schmitt's religious affiliation?",
 'Could you please tell me which country Springwood High School is located in? ',
 'Could you please tell me what genre Operation Sabotage falls under?',
 "Could you please tell me more about Henk Bleker's profession?",
 "Could you kindly tell me what Andreas Rüdiger's profession is?"]

In [59]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from query_rewriting import flesch_reading_ease, predict_formality

model_name = "s-nlp/roberta-base-formality-ranker"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

modified_scores = [predict_formality(tokenizer, model, query) for query in modified_queries]
# modified_scores = [flesch_reading_ease(query) for query in modified_queries]
print(modified_scores)
# avg_original_score = sum(original_scores) / len(original_scores)
avg_modified_score = sum(modified_scores) / len(modified_scores)
avg_modified_score

[0.9921875, 0.99609375, 0.99609375, 0.99609375, 0.96875, 0.99609375, 0.99609375, 0.99609375, 0.99609375, 0.99609375]


0.99296875

In [66]:
from transformers import pipeline

classifier = pipeline(
    'text-classification',
    model='Intel/polite-guard'
)

def politeness_score(classifier: pipeline, query):
    # return the sum of the scores for 'polite' and 'somewhat polite'
    result = classifier(query, return_all_scores=True)
    return result[0][0]['score'] + result[0][1]['score']

original_scores = []
for query in questions:
    original_scores.append(politeness_score(classifier, query))

print('--------------------------------------')
modified_scores = []
for query in modified_queries:
    print(query, politeness_score(classifier, query))
    modified_scores.append(politeness_score(classifier, query))

avg_original_score = sum(original_scores) / len(original_scores)
avg_modified_score = sum(modified_scores) / len(modified_scores)
avg_original_score, avg_modified_score

Device set to use cuda:0


--------------------------------------
Could you please tell me who the director of the popular TV series The Boys was? 0.7976957559585571
I was wondering if you could tell me who the director of the film "Mother and Child" is. 0.8014834821224213
I was wondering if you could tell me who the author of the book "Branches" is. 0.795767530798912
Could you please tell me who the composer behind the music of Trans is? 0.4727376848459244
Could you please tell me who directed the film "The Chicken"?  0.4700736105442047
Could you kindly tell me more about Bernard William Schmitt's religious affiliation? 0.4825034886598587
Could you please tell me which country Springwood High School is located in?  0.42887407541275024
Could you please tell me what genre Operation Sabotage falls under? 0.6220478266477585
Could you please tell me more about Henk Bleker's profession? 0.6570035070180893
Could you kindly tell me what Andreas Rüdiger's profession is? 0.8874029219150543


(0.2142079645418562, 0.641558988392353)