In [1]:
import os
# Switch the working directory to the parent of the current one, so the
# relative paths used further down (e.g. "runs/...") resolve against it.
# NOTE(review): presumably the parent is the project root - confirm.
# Caution: this is not idempotent; re-running the cell moves up one more level.
os.chdir("..")

In [2]:
from pathlib import Path
import ollama
from open_extract.llm import keep_alive
from open_extract.llm import QUESTIONS


In [None]:
# keep_alive(model="deepseek-r1-70b-15k-ctx", host="olvi-1:11434")

In [None]:
# System prompt sent as the "system" message by decompose(). It tells the model
# to split one research question into a few yes/no style screening questions
# and shows two worked Input/Output examples. The text is runtime data: any
# edit to the wording (including whitespace) changes what the model receives.
DECOMPOSER_SYSTEM_PROMPT = """

You are a research assistant specializing in agriculture, your role is to break down a complex research question into a few smaller questions, you will use these questions to determine whether a paper is related to a given question.
You need to check:

- Whether the study measures or evaluates the key element in our question.
- Whether the study design addresses a significant part of that question.

For example: 

Input: What is the effectiveness of foliar fungicide applications in controlling white mold and improving soybean yield in fields where white mold is a primary concern?  

Output: 
a. Were foliar fungicide treatments evaluated in this study? 
b. Was a white mold control treatment evaluated in this study?
...
Input: How do no-till practices influence insect and slug pest pressures and soybean yield in different regions? 

Output:
a. Were tillage practices a treatment in this study? 
b. Was pest pressure evaluated?  
c. Was soybean yield evaluated?
...
"""



In [None]:
def remove_deepseek_thinking_tokens(response: str) -> str:
    """Return the text after the last ``</think>`` tag, stripped of whitespace.

    If the tag does not occur, the whole response is returned (stripped).
    """
    # rpartition yields ("", "", response) when the tag is absent, so the
    # third element is always the part we want to keep.
    _reasoning, _tag, answer = response.rpartition("</think>")
    return answer.strip()


def decompose(
    question: str,
    model: str = "deepseek-r1-70b-15k-ctx",
    host: str = "http://olvi-1:11434",
) -> str:
    """Ask an Ollama-served model to split a research question into sub-questions.

    Args:
        question: The research question to break down.
        model: Name of the model to query. Defaults to the Deepseek r1 model
            this notebook was written against.
        host: Base URL of the Ollama server. Defaults to the server this
            notebook was written against.

    Returns:
        The model's reply with everything up to and including the last
        ``</think>`` tag removed and surrounding whitespace stripped.
    """
    client = ollama.Client(host=host)
    response = client.chat(
        model=model,
        messages=[
            {"role": "system", "content": DECOMPOSER_SYSTEM_PROMPT},
            {
                "role": "user",
                "content": f"Break down this question into smaller ones: {question}",
            },
        ],
    )
    # The reply may start with a reasoning section; keep only what follows it.
    return remove_deepseek_thinking_tokens(response.message.content)

In [5]:
# Collect one record per entry of QUESTIONS. Records are appended one at a
# time so results gathered so far remain in `outputs` if a later call fails.
outputs = []

for i, question in QUESTIONS.items():
    outputs.append(dict(i=i, question=question, decomposition=decompose(question)))

In [6]:
import json

# open(..., "w") does not create missing parent directories, so make sure the
# run directory exists before writing (a fresh checkout may not have it).
output_dir = Path("runs/question_decom_250226")
output_dir.mkdir(parents=True, exist_ok=True)

# Write all records as a single pretty-printed JSON array.
with open(output_dir / "outputs.json", "w") as f:
    json.dump(outputs, f, indent=2)

In [7]:
# Save the same records as JSON Lines: one compact JSON object per line.

# open(..., "w") does not create missing parent directories, so make sure the
# run directory exists before writing (a fresh checkout may not have it).
output_dir = Path("runs/question_decom_250226")
output_dir.mkdir(parents=True, exist_ok=True)

with open(output_dir / "outputs.jsonl", "w") as f:
    for output in outputs:
        f.write(json.dumps(output) + "\n")

In [10]:
# Show each record: "<id> <question>", then the decomposition, then an
# 80-character rule separating it from the next record.
for output in outputs:
    print(
        f'{output["i"]} {output["question"]}',
        output["decomposition"],
        "=" * 80,
        sep="\n",
    )

Q1 How do different seed treatments (insecticide and fungicide) impact soybean yield when planted before May 1 compared to after May 1?
To address the research question effectively, we can break it down into the following smaller questions:

1. **Were seed treatments (insecticide and/or fungicide) evaluated in this study?**
2. **Was soybean yield measured as an outcome in the study?**
3. **Did the study compare planting dates before May 1 versus after May 1?**

These sub-questions ensure that each key component of the main research question is addressed, allowing us to determine if a study relates to the impact of seed treatments on soybean yield across different planting times.
Q2 What is the effectiveness of foliar fungicide applications in controlling white mold and improving soybean yield in fields where white mold is a primary concern?
To determine whether a paper is related to the research question "What is the effectiveness of foliar fungicide applications in controlling white m