In [1]:
import logging
import json

import pandas as pd
from tqdm.auto import tqdm

from dotenv import load_dotenv
load_dotenv('../.env')

from kaggle_competition_assistant import llm
from kaggle_competition_assistant.utils import create_documents

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

In [2]:
response = llm('tell me a joke')
print(response[0])

2024-09-16 14:31:23,615 - kaggle_competition_copilot.llm - INFO - Starting LLM request to google/gemini-1.5-flash-latest...
2024-09-16 14:31:24,779 - kaggle_competition_copilot.llm - INFO - LLM request successfully finished.


Why don't scientists trust atoms?

Because they make up everything! 



## Data generation

In [20]:
competition_slug = 'rohlik-orders-forecasting-challenge'
# competition_slug = 'llm-zoomcamp-2024-competition'
competition_data_path = '../data/' + competition_slug
documents = create_documents(competition_slug, competition_data_path)

2024-09-16 14:38:37,998 - kaggle_competition_copilot.utils - INFO - Starting creating documents for rohlik-orders-forecasting-challenge...
2024-09-16 14:38:38,038 - kaggle_competition_copilot.utils - INFO - Documents created: 1117


In [21]:
documents[0]

{'source': 'overview',
 'section': 'competition name',
 'text': 'Rohlik Orders Forecasting Challenge',
 'url': 'https://www.kaggle.com/competitions/rohlik-orders-forecasting-challenge/overview',
 'id': 0}

In [22]:
documents[-1]

{'source': 'discussions',
 'section': 'Calendar only have data from Budapest',
 'text': '[Yan Teixeira](/yantxx) · 19th in this Competition · Posted 3 months ago\n\n\n### Calendar only have data from Budapest\nBoth the train and test calendars only contain data from Budapest. Was this\nintended? From reading the description of the data, I thought this dataset was\nsupposed to include data for all warehouses.\n\n\n## 3 Comments\n\n\n### [MichalKecera](/mkecera)\nHi [@yantxx](https://www.kaggle.com/yantxx),  \nGood catch. Thanks. This is now corrected - it has all the warehouses.\n\n\n### [Yan Teixeira](/yantxx)\n[@mkecera](https://www.kaggle.com/mkecera) I have another quick question\nBudapest_1 is the only warehouse with a latest date of 2024-03-14, whereas all\nthe other warehouses have a latest date of 2024-03-15. The first date for all\nwarehouses in the test set is 2024-03-16, which makes me think that the\ntraining data might be missing the 2024-03-15 data for Budapest.\n![](https

In [23]:
def build_prompt(doc, n_questions=1):
    prompt_template = """
You are a participant in a Kaggle competition.
Formulate exactly {n_questions} question-answer pairs based on information about competition from the record.

Questions must:
- be semantically unique and diverse
- relate directly to the main topic of the record
- not expect answers with URLs or using URLs information
- be short to medium size
- be complete

The record must contain the answer to the questions.
If possible, use as fewer words as possible from the record for the question.

Provide the output in parsable JSON without using any code blocks:
{response_format}

Record:
source: {source}
section: {section}
text:
{text}
    """.strip()
    
    discussion_prompt_template = """
You are a participant in a Kaggle competition.
Formulate exactly {n_questions} question-answer pairs from the discussion post of the competition.

Questions must:
- be semantically unique and diverse
- relate directly to the main topic of the discussion thread
- not expect answers with URLs or using URLs information
- be short to medium size
- be complete

Avoid:
- specific questions or answers about exact results numbers, rather use more general form questions
- using questions formulated like "what did you do?" rather ask more generic form like "what can be done?".

The discussion post must contain the answer to the questions.

Provide the output in parsable JSON without using any code blocks:
{response_format}

Discussion post:
{text}
    """.strip()

    if doc['source'] == 'discussions':
        prompt = discussion_prompt_template
    else:
        prompt = prompt_template
        
    if not doc['section']:
        prompt = prompt.replace('\nsection: {section}', '')
        
    response_format = '[' + ', '.join([f'{{"question": "answer"}}' for _ in range(n_questions)]) + ']'
        
    prompt = prompt.format(**doc, n_questions=n_questions, response_format=response_format)
    
    return prompt

Test prompt + question generation

In [24]:
prompt = build_prompt(documents[0], n_questions=1)
print(prompt)

You are a participant in a Kaggle competition.
Formulate exactly 1 question-answer pairs based on information about competition from the record.

Questions must:
- be semantically unique and diverse
- relate directly to the main topic of the record
- not expect answers with URLs or using URLs information
- be short to medium size
- be complete

The record must contain the answer to the questions.
If possible, use as fewer words as possible from the record for the question.

Provide the output in parsable JSON without using any code blocks:
[{"question": "answer"}]

Record:
source: overview
section: competition name
text:
Rohlik Orders Forecasting Challenge


In [25]:
questions = llm(prompt)[0]

2024-09-16 14:38:48,556 - kaggle_competition_copilot.llm - INFO - Starting LLM request to google/gemini-1.5-flash-latest...
2024-09-16 14:38:49,729 - kaggle_competition_copilot.llm - INFO - LLM request successfully finished.


In [26]:
json.loads(questions)

[{'question': 'What is the name of the competition?',
  'answer': 'Rohlik Orders Forecasting Challenge'}]

In [10]:
prompt = build_prompt(documents[-1], n_questions=1)
print(prompt)

You are a participant in a Kaggle competition.
Formulate exactly 1 question-answer pairs from the discussion post of the competition.

Questions must:
- be semantically unique and diverse
- relate directly to the main topic of the discussion thread
- not expect answers with URLs or using URLs information
- be short to medium size
- be complete

Avoid:
- specific questions or answers about exact results numbers, rather use more general form questions
- using questions formulated like "what did you do?" rather ask more generic form like "what can be done?".

The discussion post must contain the answer to the questions.

Provide the output in parsable JSON without using any code blocks:
[{"question": "answer"}]

Discussion post:
[KABIR OLAWALE MOHAMMED](/kabirolawalemohammed) · 7th in this Competition ·

### A good prompt could be a game changer
I made submission and got a score 0.5.
Ran the same codes again but only this time changing the wordings of my
prompt, then the score jumped to 0

In [35]:
questions = llm(prompt, model_choice='google/gemini-1.5-pro-latest')[0]
questions

2024-09-15 20:45:06,038 - kaggle_competition_copilot.llm - INFO - Starting LLM request to google/gemini-1.5-flash-latest...
2024-09-15 20:45:07,405 - kaggle_competition_copilot.llm - INFO - LLM request successfully finished.


'[{"question": "What is the reason why the Budapest_1 warehouse has a latest date of 2024-03-14 while other warehouses have a latest date of 2024-03-15?", "answer": "2024-03-15 was a public holiday in Budapest and the warehouse was not open."}] \n'

In [36]:
json.loads(questions)

[{'question': 'What is the reason why the Budapest_1 warehouse has a latest date of 2024-03-14 while other warehouses have a latest date of 2024-03-15?',
  'answer': '2024-03-15 was a public holiday in Budapest and the warehouse was not open.'}]

Generate whole dataset

In [33]:
def is_valid_json(json_string):
    try:
        json.loads(json_string)
        return True
    except json.JSONDecodeError:
        return False

def is_qa_pair_valid(qa_pair: dict):
    """Mini validation of generated result"""
    return 'question' in qa_pair and 'answer' in qa_pair

def generate_questions(doc: dict[str]) -> list:        
    # select number of questions
    if doc['source'] in ['overview', 'data description']:
        n_questions = max(len(doc['text'].split('\n')) // 3, 1)
    elif doc['source'] == 'leaderboard':
        viable_leaderboard_places = [f'place: {i}\n' for i in range(1, 6)]
        if any(place in doc['text'] for place in viable_leaderboard_places):
            n_questions = 1
        else:
            n_questions = 0
    elif doc['source'] == 'discussions':
        n_questions = 1
    else:
        raise ValueError(f'Unexpected doc source: {doc["source"]}')
    
    # use "smarter" model for discussions, since they have more text and require more understanding
    if doc['source'] == 'discussions':
        model_choice = 'google/gemini-1.5-flash-latest' # pro TODO
    else:
        model_choice = 'google/gemini-1.5-flash-latest'

    # generate questions
    if n_questions:        
        prompt = build_prompt(doc, n_questions=n_questions)
        # print(prompt)
        
        # generate qa pairs with 3 tries
        num_tries = 0
        while True:
            num_tries += 1
            
            questions = llm(prompt, model_choice=model_choice)[0]

            # still some scrubbing of code blocks required
            questions = questions.removeprefix("```json").removesuffix("```")

            if is_valid_json(questions):
                questions = json.loads(questions)
            else:
                print(f'Invalid json: {questions}, trying again...')
                continue
            # print(questions)
            
            # validate generated results
            results_valid = True
            for qa_pair in questions:
                if not is_qa_pair_valid(qa_pair):
                    results_valid = False
                    print(f'Invalid qa pair {qa_pair}, trying again...')
                    break
            
            if results_valid or (num_tries == 3):
                break
            
        return questions
    
    return []

In [28]:
results = {}

In [None]:
for doc in tqdm(documents, desc='Generating retrieval evaluation data'): 
    doc_id = doc['id']
    if doc_id in results:
        continue

    questions = generate_questions(doc)
    if questions:
        results[doc_id] = questions

In [35]:
results

{0: [{'question': 'What is the name of the competition?',
   'answer': 'Rohlik Orders Forecasting Challenge'}],
 1: [{'question': 'What is the main task in the Rohlik Orders Forecasting Challenge?',
   'answer': 'Use historical data to predict customer orders'}],
 2: [{'question': 'What is the main focus of the Kaggle competition?',
   'answer': 'Predicting the number of grocery deliveries at selected warehouses for the next 60 days.'},
  {'question': 'In which countries does Rohlik Group operate?',
   'answer': 'Czech Republic, Germany, Austria, Hungary, and Romania.'},
  {'question': 'When does the competition close?',
   'answer': 'August 23, 2024'}],
 3: [{'question': 'What is the main reason for needing accurate order forecasts?',
   'answer': 'Accurate order forecasts are crucial for planning process, impacting workforce allocation, delivery logistics, inventory management, and supply chain efficiency.'},
  {'question': 'What is the expected impact of optimizing forecasts?',
   '

In [36]:
final_results = []

for doc_id, questions in results.items():
    for qa_pair in questions:
        if is_qa_pair_valid(qa_pair):
            final_results.append((doc_id, qa_pair['question'], qa_pair['answer']))

In [37]:
df_results = pd.DataFrame(final_results, columns=['doc_id', 'question', 'answer'])

# drop duplicates, just in case
df_results = df_results.drop_duplicates(subset=['question', 'answer'], keep='first').reset_index(drop=True)

In [38]:
df_results.to_csv(f'../data/evaluation/{competition_slug}-ground-truth.csv', index=False)

In [39]:
df_results.head(10)

Unnamed: 0,doc_id,question,answer
0,0,What is the name of the competition?,Rohlik Orders Forecasting Challenge
1,1,What is the main task in the Rohlik Orders For...,Use historical data to predict customer orders
2,2,What is the main focus of the Kaggle competition?,Predicting the number of grocery deliveries at...
3,2,In which countries does Rohlik Group operate?,"Czech Republic, Germany, Austria, Hungary, and..."
4,2,When does the competition close?,"August 23, 2024"
5,3,What is the main reason for needing accurate o...,Accurate order forecasts are crucial for plann...
6,3,What is the expected impact of optimizing fore...,"By optimizing forecasts, we can minimize waste..."
7,3,How will participant contributions directly af...,Your participation in this challenge will dire...
8,4,What metric is used to evaluate submissions in...,Mean Absolute Percentage Error
9,5,What is the name of the column that contains t...,ORDERS


In [40]:
df_results.tail(10)

Unnamed: 0,doc_id,question,answer
105,1106,What are the factors that determine whether a ...,The complexity of the model should be determin...
106,1107,What is the meaning of the mov_change column?,0.7 means there was a small minimum order valu...
107,1108,What is the main reason why participants are c...,Participants are concerned about a potential s...
108,1109,What are the potential benefits of incorporati...,Uncertainty quantification can help reduce the...
109,1110,Why are there more winter school holiday weeks...,The 'winter school holidays' is a week-long sp...
110,1112,What causes the inconsistency in the 'holiday'...,The inconsistency arises from the Czech Republ...
111,1113,What were the key factors that contributed to ...,The key factors that contributed to achieving ...
112,1114,What type of external data can be used to enha...,Calendar events is one dataset that could be a...
113,1115,"What is the purpose of using the ""margin"" valu...",The margin is used to ensure that the `oof` (c...
114,1116,Was there a specific reason why Budapest_1 was...,2024-03-15 was a public holiday in Budapest - ...
