# Zotero Retriever Tests

Helps you find literature in your Zotero library

In [1]:
# Settings

from dotenv import load_dotenv
import os

load_dotenv()
zotero_api_key = os.environ.get("ZOTERO_API_KEY_READ") # A Zotero API key, here sourced from .env - read only access is enough
zotero_library_id = os.environ.get("ZOTERO_TEST_GROUP_ID") # The library ID of the Zotero library to be accessed. Here a test group library
zotero_library_type = "group" # set the library type to either 'user' or 'group'. Make sure this corresponds to the library ID ('user' for your personal library)

zot_db = "./test_zotero_db" # directory to save the zotero vector store

reload_zotero = True # set to True to regenerate the zotero database

verbose = True # print additional information on routing decisions etc.

zotero_docs_returned = 10 # number of abstracts returned by the retriever

zotero_retriever_type = "mmr" # "similarity"  or "mmr"

embedding_model = "nomic-embed-text" # very good embedding model for retrieval tasks

# textgen model. generates answers to questions, with retrieved documents as context
textgen_model = "llama3.2"
# textgen_model = "phi4", # phi4 is a rather powerful model, but requires more gpu compute (that is, it will be slower than llama3.2 if not enough gpu memory is available)

# reasoning model. handles deciding whether or not to query the retriever
# reasoning_model = "nemotron-mini" # nvidia's nemotron-mini is smaller, but still capable of the simple reasoning task we need (retrieve or not)
reasoning_model = "llama3.2" # fast, but not very good at reasoning tasks
# reasoning_model = "qwq" # qwq is a rather powerful experimental reasoning model. However, as it is rather large (20b), it either requires a fair bit of GPU memory or will be very slow 

# selfquery model. generates queries for the retriever
# selfquery_model = "phi4" # phi4 seems better at query construction than llama3.2, but requires more gpu compute (i.e. may be slower)
selfquery_model = "deepseek-r1:1.5b"


In [2]:
from langchain_ollama import ChatOllama
from langchain_ollama import OllamaEmbeddings
from pyzotero import zotero

zot = zotero.Zotero(zotero_library_id, zotero_library_type, zotero_api_key)

textgen_llm = ChatOllama(
    model = textgen_model,
    temperature = 0.0,
    num_predict = 512, # max number of tokens to generate
    )

# we can use different models for different tasks, e.g. a more expensive model for reasoning (should documents be retrieved?) and a cheaper model for text generation
reasoning_llm = ChatOllama(
    model = reasoning_model, 
    temperature = 0.0,
    num_predict = 512, # max number of tokens to generate
    )

selfquery_llm = ChatOllama(
    model = selfquery_model, 
    temperature = 0.0,
    num_predict = 512, # max number of tokens to generate
    )

embeddings = OllamaEmbeddings(model = embedding_model)

 ## pass settings for retriever (adjust as needed)
if zotero_retriever_type == "mmr":
    zot_search_type="mmr" # MMR (Maximal Marginal Relevance) aims to diversify search results. the amount of diversification is set via the lambda_mult parameter
    zot_search_kwargs={"k": zotero_docs_returned, # make sure the number of documents passed (k) fits into the context window
                    "fetch_k": zotero_docs_returned * 5, # could be adjusted, potentially run tests
                    "lambda_mult": 0.8} # amount of diversification, with 0 being maximum diversity
                   
if zotero_retriever_type == "similarity":
    zot_search_type="similarity" # similarity score; optionally with threshold ("similarity_score_threshold" with "score_threshold" kwarg)
    zot_search_kwargs={"k": zotero_docs_returned} # make sure the number of documents passed (k) fits into the context window

### Turn Zotero Library into Vector Store Retriever

In [4]:
from langchain_core.documents import Document
from langchain_chroma import Chroma


if not os.path.exists(zot_db) or reload_zotero:

    lib = zot.everything(zot.top()) # note that "everything" includes items in the trash as well. However, double check if the API returned all entries

    docs = [
            Document(
                page_content = entry.get("data").get("abstractNote"),
                metadata={
                    **{
                        "type": entry.get("data").get("itemType", ""),
                        # note that the additional "name" passed here is in case the name is not split into first and last name
                        "authors": ", ".join(f"{creator.get('firstName', '')} {creator.get('lastName', '')} {creator.get('name', '')}" for creator in entry.get("data", {}).get("creators", [])),
                        "tags": ", ".join(f"{tag.get('tag', '')}" for tag in entry.get("data", {}).get("tags", []))
                    },
                    **(
                    {
                        "title": entry.get("data").get("caseName", ""),
                        "court": entry.get("data").get("court", ""),
                        "date": entry.get("data").get("dateDecided", ""),
                        "publication": entry.get("data").get("reporter", ""),
                        "volume": entry.get("data").get("reporterVolume", ""),
                        "pages": entry.get("data").get("firstPage", ""),
                    } # extra scheme for case law. Potentially add more schemes later, but the standard below should be sufficient for most documents
                    if entry.get("data", {}).get("itemType", "") == "case" else
                    {
                        "title": entry.get("data").get("title", ""),
                        "publication": entry.get("data").get("publicationTitle", ""),
                        "volume": entry.get("data").get("volume", ""),
                        "issue": entry.get("data").get("issue", ""),
                        "pages": entry.get("data").get("pages", ""),
                        "date": entry.get("data").get("date", ""),
                        "DOI": entry.get("data").get("DOI", ""),
                    }
                    )
                }
            )
            for entry in lib
        ]

    zot_store = Chroma(
            collection_name="lit_helper_zot_test",
            embedding_function=embeddings,
            persist_directory=zot_db,  # save data locally
            )
    
    zot_store.add_documents(docs)

else: 
    zot_store = Chroma(
            collection_name="lit_helper_zot_test",
            embedding_function=embeddings,
            persist_directory=zot_db,  # save data locally
            )


In [66]:
# make the retriever self-querying

from langchain.chains.query_constructor.schema import AttributeInfo
from langchain.chains.query_constructor.base import get_query_constructor_prompt
from langchain.chains.query_constructor.base import StructuredQueryOutputParser
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_community.query_constructors.chroma import ChromaTranslator

zotero_metadata_info = [
    # note that the attribute infos can have a vast influence on the performance of the retriever (likely model dependant)
    #   The name seems to be most important when the model chooses the filter - with notable side effects for other attributes when changing the name of one!
    #   Beware that the names here need to correspond to the names in the metadata of the documents set above (changes in the name here require updating the chroma database)
    AttributeInfo(
        name = "authors",
        description="The paper authors' names, comma-seperated.",
        type="string",
    ),
    AttributeInfo(
        name = "title",
        description="The title of the document.",
        type="string",
    ),
    AttributeInfo(
        name = "tags",
        description="Tags associated with the document, comma-seperated.",
        type="string",
    ),
    AttributeInfo(
        name = "type",
        description="The type of document.",
        type="string",
    ),
    AttributeInfo(
        name = "publication",
        description="The publication the document was published in.",
        type="string",
    ),
    AttributeInfo(
        name = "volume",
        description="The volume of the publication.",
        type="string",
    ),
    AttributeInfo(
        name = "issue",
        description="The issue of the publication.",
        type="string",
    ),
    AttributeInfo(
        name = "pages",
        description="The pages of the publication.",
        type="string",
    ),
    AttributeInfo( 
        name = "date",
        description="The publication date.",
        type="string",
    ),
    AttributeInfo(
        name = "DOI",
        description="The DOI of the document.",
        type="string",
    ),
    AttributeInfo(
        name = "court",
        description="For law cases, court the case was decided in.",
        type="string",
    ),
]

zotero_abstract_description = "An abstract of the document."

allowed_comparators = [ # we need to fine tune these, otherwise the "contains" operator is used, which is not supported by Chroma
    "$eq",  # Equal to (number, string, boolean)
    "$ne",  # Not equal to (number, string, boolean)
    "$gt",  # Greater than (number)
    "$gte",  # Greater than or equal to (number)
    "$lt",  # Less than (number)
    "$lte",  # Less than or equal to (number)
]

retrieval_prompt = get_query_constructor_prompt( # we're using a premade prompt for the query constructor
    document_contents=zotero_abstract_description,
    attribute_info=zotero_metadata_info,
    allowed_comparators=allowed_comparators, 
    # examples = retriever examples, # we can optionally add examples for the model to fine tune the retrieval. See https://github.com/langchain-ai/langchain/blob/master/cookbook/self_query_hotel_search.ipynb
)

output_parser = StructuredQueryOutputParser.from_components( # we're also using a premade output parser
    fix_invalid=True, # this automatically fixes invalid queries (that is, it should avoid errors being thrown when the model constructs an invalid query)
    ) 

query_constructor = retrieval_prompt | selfquery_llm | output_parser # make the query constructor chain

zot_retriever = SelfQueryRetriever(
    query_constructor=query_constructor,
    vectorstore=zot_store,
    structured_query_translator=ChromaTranslator(), # we need to specify the translator for our database scheme, e.g. Chroma
    search_type=zot_search_type,
    search_kwargs=zot_search_kwargs, 
)

print(retrieval_prompt.format(query="{query}")) # print out the prompt used for the query constructor



Your goal is to structure the user's query to match the request schema provided below.

<< Structured Request Schema >>
When responding use a markdown code snippet with a JSON object formatted in the following schema:

```json
{
    "query": string \ text string to compare to document contents
    "filter": string \ logical condition statement for filtering documents
}
```

The query string should contain only text that is expected to match the contents of documents. Any conditions in the filter should not be mentioned in the query as well.

A logical condition statement is composed of one or more comparison and logical operation statements.

A comparison statement takes the form: `comp(attr, val)`:
- `comp` ($eq | $ne | $gt | $gte | $lt | $lte): comparator
- `attr` (string):  name of attribute to apply the comparison to
- `val` (string): is the comparison value

A logical operation statement takes the form `op(statement1, statement2, ...)`:
- `op` (and | or | not): logical operator
- 

In [47]:
q = "Find texts about Amazon with the tag 'Surveillance'"

In [49]:
query_constructor.invoke(q)

StructuredQuery(query='Amazon', filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='tags', value='Surveillance'), limit=None)

In [48]:
zot_retriever.invoke(q)

[]

In [62]:
zot_store.similarity_search("", filter = {"title": "Big Data and quality data for fake news and misinformation detection"}, k = 5) # we can also directly search the database

[Document(metadata={'DOI': '10.1177/2053951719843310', 'authors': 'FatemehTorabi Asr, MaiteTaboada', 'date': 'January 1, 2019', 'issue': '1', 'pages': '', 'publication': 'Big Data & Society', 'tags': 'Fake news, Methods & Methodology, Quantitative Methods', 'title': 'Big Data and quality data for fake news and misinformation detection', 'type': 'journalArticle', 'volume': '6'}, page_content='Fake news has become an important topic of research in a variety of disciplines including linguistics and computer science. In this paper, we explain how the problem is approached from the perspective of natural language processing, with the goal of building a system to automatically detect misinformation in news. The main challenge in this line of research is collecting quality data, i.e., instances of fake and real news articles on a balanced distribution of topics. We review available datasets and introduce the MisInfoText repository as a contribution of our lab to the community. We make availab

In [67]:
zot_store.get()

{'ids': ['a024ebf0-4026-48fc-b9fd-8b2af5557ece',
  'a456c9cb-f23d-41c0-93c7-691d5bdad6c8',
  'ebb08027-94a2-43e0-8675-40a92056ceef',
  '9abe5eae-0d5a-43a4-a1a1-dcc9733edcac',
  '46efc6e2-8f97-432c-8ad2-adc37e1410f1',
  '7d3f5f84-e17e-4db5-8055-8b005c2b6e5b',
  '4dd73010-7b30-4177-a2b7-33e1c1d308c5',
  '21f9cea2-d7f6-434f-a962-72252fe338b1',
  'b770ef65-85ee-4e3a-9ebb-d13cde98d982',
  'ecb0aef1-979b-4626-aeb4-83cf5d87b966',
  '98b11d6c-fdce-4d9e-a5b1-f06def316111',
  '343a2e41-1d20-4bee-91d3-c2f56ff0ca05',
  'a13b3bbd-db00-4409-bdd5-d81a6b0ab157',
  '9e7c73a9-3869-4547-9f0e-04c3136d0878',
  'd5451509-0656-4b35-8466-5adbdb1699df',
  '9ab41454-a6d2-4935-8725-362a76db9bad',
  '957553cc-39be-4660-928d-4bf2cd7fbd2d',
  '4918d9b9-39d4-4671-87e2-c13cfcadb3e6',
  '16e12de6-8f27-479b-9c7f-b43dc50f965f',
  'fd94969b-ebe3-455a-99df-9cd65a31d91e',
  '2f2a8d13-0fe1-438e-af16-86e5c7809773',
  '3ad87ad5-b72a-4730-a742-9ec2429f798b',
  '12acc8f2-8cfc-43af-9918-701580e96adc',
  '456634fa-926a-4b2d-a6b7-

Problem: we can only pass literal strings to the metadata field, e.g. the tags "Amazon" and "Surveillance" need to be concatenated to "Amazon, Surveillance", making filtering for either of them impossible (only the exact match would be returned). This is seemingly the case for all vector stores, as neither the `Document()` nor the `db.add_texts()` method allow lists to be passed as metadata. 
Solution: parse the tags at the end of the abstract in order to take advantage of the tagging by having the vector search pick up on them

Note: the same problem will occur with e.g. authors. Any multi-entry fields should be excluded from the self-query retriever schema. Only clear indicators (e.g. year, publication) should be included for the filtering. However, esp. authors should be returned in a structured output (hence should be kept as metadata). Check if a self querying retriever is still sufficiently useful and worth the computational overhead in this case

Possible alternative solution: partial string matching in the filtering, e.g. a filter operator capable of matching "Amazon" in "Amazon, Surveillance". This may require a different vector store db

Check if it is possible to build a retriever directly onto the Zotero API (without vector embeddings, and rather use zotero's built-in search) - this might circumvent these problems altogether (but potentially yields weaker results in the free form search)!

=> Check https://www.zotero.org/support/dev/client_coding/direct_sqlite_database_access, https://www.zotero.org/support/dev/web_api/v3/start (esp. searching) and LangChain community integrations (potentially contribute a custom integration?)

Alternatively (if abstract search via API sucks): parallel system were abstracts are retrieved and searched from a local vector store, and a tag/author search is provided via the zotero api? - documents could reliably be connected via zotero IDs

Note that building a retriever directly ontop of the Zotero API would require a custom scheme for querying in free text. That is, in order to make use of the filtering (which is the whole point!) we would have to write a robust prompt template / query generator for an LLM to make use of it

Alternative Solution 2: build 2 retrievers, one for tags, one for abstracts, and match them on their IDS. However, retrieving documents only on tags may be too broad and not yield good results

=> Zotero Retriever is working. Write Tests etc.; write prompting template for self-querying

=> Zotero API does not allow searching over abstracts. Build a second retriever with an abstract vector store, and make a retriever combining both results (metadata via self querying ZoteroRetriever, text search via a simple MMR AbstractRetriever)

In [9]:
test_doc = Document(page_content = "some text",
                        metadata={
        "title": "Surveillance and Amazon",
        "tags": ["Surveillance", "Amazon"]
    }
                    )

test_db = Chroma(collection_name="test", embedding_function=embeddings)

test_db.add_documents([test_doc])

ValueError: Expected metadata value to be a str, int, float or bool, got ['Surveillance', 'Amazon'] which is a list in upsert.

Try filtering complex metadata from the document using langchain_community.vectorstores.utils.filter_complex_metadata.

In [74]:
lib[0].get("data")

{'key': 'NYMVCIWW',
 'version': 33298,
 'itemType': 'journalArticle',
 'title': 'Big Data and quality data for fake news and misinformation detection',
 'creators': [{'creatorType': 'author',
   'firstName': 'Fatemeh',
   'lastName': 'Torabi Asr'},
  {'creatorType': 'author', 'firstName': 'Maite', 'lastName': 'Taboada'}],
 'abstractNote': 'Fake news has become an important topic of research in a variety of disciplines including linguistics and computer science. In this paper, we explain how the problem is approached from the perspective of natural language processing, with the goal of building a system to automatically detect misinformation in news. The main challenge in this line of research is collecting quality data, i.e., instances of fake and real news articles on a balanced distribution of topics. We review available datasets and introduce the MisInfoText repository as a contribution of our lab to the community. We make available the full text of the news articles, together with 

In [3]:
from pydantic import BaseModel, Field

zot_prompt = (
  "You are a helpful assistant for finding relevant scientific literature."
   "..."
)

# note that the generated answer is highly dependent on the system prompt, e.g. adding \n between the lines changes the output
#   (potentially depending on the model)

class literature(BaseModel):
        """Articles relevant to the question"""
        titles: List[str]
        # ...
