In [None]:
import os
from tqdm import tqdm

from pathlib import Path

# Switch to the parent directory so the relative "data/" and "runs/" paths
# used further down resolve. The starting directory is remembered the first
# time this cell runs, which keeps the cell idempotent: re-running it always
# lands in the same directory instead of climbing one more level each time.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = Path.cwd()
os.chdir(NOTEBOOK_DIR.parent)

In [None]:
from open_extract.llm import OLLAMAExtractor
from open_extract.data_model import Screening

In [None]:
# Collect the markdown inputs in sorted (deterministic) order.
mds_dir = Path("data/screening_250205/mds")
md_files = sorted(mds_dir.glob("*.md"))
print(f"{len(md_files)=}")

# Extractor set up with the Screening model as its target_model.
extractor = OLLAMAExtractor(
    model_name="long-context-deepseek",
    ollama_host="olvi-1:11434",
    target_model=Screening,
)

In [None]:
# Check max words

# Maps each markdown file name to its word count.
word_count_dict = {}

for f in md_files:
    text = f.read_text()
    # split() with no argument splits on any run of whitespace, so words
    # separated by newlines or tabs are counted individually and repeated
    # spaces do not produce empty "words" (both happen with split(" ")).
    words = len(text.split())
    word_count_dict[f.name] = words

# top 10 in word count
sorted(word_count_dict.items(), key=lambda x: x[1], reverse=True)[:10]

- 978-3-030-23400-3.md: Entire book (Subset to which chapter?)
- 978-3-030-68483-9.md: Entire book (Subset to which chapter?)
- 24-d-past-present-and-future-a-review: Long review paper -> extraction still somewhat ok
- agronomy-14-01229: Review paper, extract ok.
- agronomy-09-00458: Review paper, extract ok.



In [None]:
# Ten files with the smallest word counts, in ascending order.
sorted(word_count_dict.items(), key=lambda item: item[1])[:10]


- Segalin_Impact Chemical Seed Treatment_abstract.md: abstract only
- Kandel_Influence Planting Date_abstract.md: abstract only
- 361.md: single page.
- 350.md: single page.
- php-10-21-0130-br: short report, should be ok
- es7b06015_si_001: supplementary material only
- 1-s2.0-S2352340920302705-main: short report; the study covers a mix of cover crops, and soybean is one of them, but the extraction seems to have missed it

In [None]:
# words distribution

import matplotlib.pyplot as plt

word_counts = list(word_count_dict.values())

fig, ax = plt.subplots()
ax.hist(word_counts, bins=100)
ax.set_title("Word count distribution of all PDFs")
# Label both axes so the figure can be read on its own.
ax.set_xlabel("Words per document")
ax.set_ylabel("Number of documents")
# Render the figure explicitly instead of echoing a Text repr as cell output.
plt.show()



In [None]:
# Zoomed-in view: keep only documents below the cutoff so the few very long
# files do not squash the rest of the distribution.
zoom_max_words = 25000

fig, ax = plt.subplots()
ax.hist([w for w in word_counts if w < zoom_max_words], bins=50)
ax.set_title("Word count distribution of all PDFs (zoomed-in)")
# Label both axes so the figure can be read on its own.
ax.set_xlabel("Words per document")
ax.set_ylabel("Number of documents")
# Render the figure explicitly instead of echoing a Text repr as cell output.
plt.show()


In [None]:
# Run the extractor on every markdown file and cache each result as JSON.
# Files that already have a JSON output are skipped, so this cell can be
# re-run to resume an interrupted extraction.
output_dir = Path("runs/screening_250205")
# write_text() does not create missing parent directories, so make sure the
# output directory exists before the first result is saved.
output_dir.mkdir(parents=True, exist_ok=True)

for file in tqdm(md_files):
    # Same base name as the markdown input, with a .json suffix.
    save_file_path = output_dir / file.with_suffix(".json").name
    if save_file_path.exists():
        continue
    y = extractor.run(file.read_text())
    save_file_path.write_text(y.model_dump_json(indent=4))

In [None]:
# Load every cached extraction back into a list of flat records.
extracted = []

# sorted(): glob() yields paths in no particular order; sorting keeps the row
# order (and anything derived from it, such as the exported CSV) stable
# across runs and machines.
for file in sorted(Path("runs/screening_250205/").glob("*.json")):
    # Validate against the Screening schema before converting to a plain dict.
    data = Screening.model_validate_json(file.read_text()).model_dump()
    data["file_name"] = file.name
    # Word counts are keyed by the markdown file name (same stem, .md suffix).
    data["word_count"] = word_count_dict[file.with_suffix(".md").name]
    extracted.append(data)
    

In [None]:
import pandas as pd

# One row per extracted record; columns come from the record keys.
df = pd.DataFrame(data=extracted)
df

In [None]:
# Write the table to disk, leaving out the index column.
csv_path = "runs/screening_250205.csv"
df.to_csv(csv_path, index=False)

### Some reporting

In [None]:
# Count the files for every combination of the three screening flags and add
# each combination's share of the total. The grouping is computed once: only
# the last expression of a cell is displayed, so a separate count-only
# expression before it would be evaluated and thrown away.
flag_columns = ["study_within_us", "is_soybean_study", "has_yield_data"]

(
    df.groupby(flag_columns)
    .size()
    .to_frame("count")
    .reset_index()
    # "percentage" is stored as a fraction of all rows, rounded to 3 decimals.
    .assign(percentage=lambda x: round(x["count"] / x["count"].sum(), 3))
)

### Spot checking

In [None]:

# Spot-check sample for `study_within_us`: up to 5 files flagged True plus up
# to 5 flagged False.
flag = df["study_within_us"]
positives, negatives = df[flag], df[~flag]
# min(...) avoids the ValueError sample() raises when a group has fewer than
# 5 rows; the fixed random_state makes the selection repeatable so the files
# that were checked by hand can be identified again.
check1 = (
    positives.sample(n=min(5, len(positives)), random_state=0).file_name.to_list()
    + negatives.sample(n=min(5, len(negatives)), random_state=0).file_name.to_list()
)
sorted(check1)

In [None]:

# Spot-check sample for `is_soybean_study`: up to 5 files flagged True plus up
# to 5 flagged False.
flag = df["is_soybean_study"]
positives, negatives = df[flag], df[~flag]
# min(...) avoids the ValueError sample() raises when a group has fewer than
# 5 rows; the fixed random_state makes the selection repeatable so the files
# that were checked by hand can be identified again.
check1 = (
    positives.sample(n=min(5, len(positives)), random_state=0).file_name.to_list()
    + negatives.sample(n=min(5, len(negatives)), random_state=0).file_name.to_list()
)
sorted(check1)

In [None]:

# Spot-check sample for `has_yield_data`: up to 5 files flagged True plus up
# to 5 flagged False.
flag = df["has_yield_data"]
positives, negatives = df[flag], df[~flag]
# min(...) avoids the ValueError sample() raises when a group has fewer than
# 5 rows; the fixed random_state makes the selection repeatable so the files
# that were checked by hand can be identified again.
check1 = (
    positives.sample(n=min(5, len(positives)), random_state=0).file_name.to_list()
    + negatives.sample(n=min(5, len(negatives)), random_state=0).file_name.to_list()
)
sorted(check1)

See [results](https://docs.google.com/spreadsheets/d/1SLbSbVGXBOXYUswalsgbJjHHfIii1kEQcxUh_gOGOwo/edit?gid=0#gid=0)

TL;DR: 90-100% agreement