In [22]:
# !pip install rdflib
# !pip install load_dotenv
# !pip install faiss-cpu
# !pip install --upgrade langchain-ollama
# !pip install --upgrade langchain 
# !pip install --upgrade langchain-community

In [23]:
import os
import re
import pickle
from pprint import pprint
from dotenv import load_dotenv
from rdflib import Graph

In [24]:
from langchain_core.documents import Document
from langchain_ollama import OllamaEmbeddings
from langchain.chat_models import ChatOllama
from langchain_core.messages import AIMessage

In [25]:
import faiss
from uuid import uuid4
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

In [26]:
# Configuration files
ttl_file = "/Users/Ruben/Github/MoMuHackathon/Data/items_filtered.ttl"
pkl_file = "/Users/Ruben/Github/MoMuHackathon/Data/dataset.pkl"
txt_for_validation = "/Users/Ruben/Github/MoMuHackathon/Data/graph_output.txt"
docs_path = "/Users/Ruben/Github/MoMuHackathon/output_documents"

In [27]:
def convert_ttl_to_dict(ttl_file, pkl_file, txt_for_validation):
    out = dict()
    g = Graph()
    g.parse(ttl_file, format="turtle")
    json_ld_data = g.serialize(format="json-ld", indent=4)
    with open(txt_for_validation, "w") as file:
        # Iterate through each triple in the graph
        subjects = set(g.subjects())
        for subject in subjects:
            org_subject = subject
            if "api/item" in subject:
                subj_formalized = str(subject).split('/')[-1]
                if '#' in subj_formalized:
                    subject = subj_formalized.split('#')[-1]
                else:
                    subject = subj_formalized
                out[subject] = {}
                file.write(f"Subject {subject}\n")
                # Iterate over all triples where this subject is the subject
                for pred, obj in g.predicate_objects(subject=org_subject):
                    pred_formalized = str(pred).split('/')[-1]
                    if '#' in pred_formalized:
                        pred = pred_formalized.split('#')[-1]
                    else:
                        pred = pred_formalized
                    if "http" not in obj:
                        out[subject][pred] = str(obj)
                        file.write(f"{pred}: {obj}\n")
                     
    with open(pkl_file, "wb") as f:
        pickle.dump(out, f)
    
    return out

In [28]:
dataset = convert_ttl_to_dict(ttl_file, pkl_file, txt_for_validation)
# with open(pkl_file, "rb") as f:
#     dataset = pickle.load(f)

In [29]:
def simplify_predicate(predicate):
    match = re.search(r"[#/](\w+)$", predicate)
    return match.group(1).replace("_", " ") if match else predicate

In [30]:
def generate_readable_content_v2(instance, properties):
    lines = []
    instance_id = str(instance).split("/")[-1]  
    # lines.append(f"The item {instance_id} has the following information:")
    
    for predicate, obj in properties:
        simplified_predicate = simplify_predicate(str(predicate))
        if "is_public" in str(predicate):
            lines.append(f"The item {instance_id} is {'public' if obj == 'true' else 'not public'}.")
        elif "title" in str(predicate):
            lines.append(f"The identifier of this artifact is \"{obj}\".")
        elif "description" in str(predicate):
            lines.append(f"The description of this artifact is \"{obj}\"")
        elif "date" == str(predicate):
            # print (predicate, obj)
            lines.append(f"This artifact was created from the following period: {obj}.")
        elif "modified" in str(predicate):
            lines.append(f"This artifact was last modified on {obj}.")
        elif "medium" in str(predicate):
            lines.append(f"The medium of this artifact includes {obj}.")
        elif "extent" in str(predicate):
            lines.append(f"The dimensions of this artifact are {obj}.")
        elif "publisher" in str(predicate):
            lines.append(f"The publisher of this artifact is {obj}.")
        elif "subject" in str(predicate):
            lines.append(f"The subject of this artifact includes {obj}.")
        elif "shortDescription" in str(predicate):
            obj = obj.replace('\n', '')
            lines.append(f"The context of this artifact is \"{obj}\".")
        elif "P48_has_preferred_identifier" in str(predicate):
            lines.append(f"The preferred identifier of this artifact is {obj}.")
        elif "P50_has_current_keeper" in str(predicate):
            lines.append(f"The current keeper of this artifact is {obj}.")
        elif "P55_has_current_location" in str(predicate):
            lines.append(f"The current location of this artifact is in {obj}.")
        elif "dateSubmitted" in str(predicate):
            lines.append(f"This artifact was submitted on {obj}.")
        elif "identifierGroupType" in str(predicate):
            lines.append(f"The group type of this artifact is  {obj}.")
        elif "identifierGroupValue" in str(predicate):
            lines.append(f"The group value of this artifact is {obj}.")
        elif simplified_predicate == "id":
            continue
        else:     
            # print (simplified_predicate)
            lines.append(f"The {simplified_predicate} of this artifact is {obj}.")
    
    return lines

In [31]:
def convert_pkl_to_doc(dataset, docs_path, save_txt = True):

    docs = list()
    combined_texts = dict()
    for item, val in dataset.items():
        item_id = str(item).split("/")[-1]
        properties = [(k, v) for k,v in val.items()]
        # lines = generate_readable_content(item, properties)
        lines = generate_readable_content_v2(item, properties)
        for line in lines:
            docs.append (
                Document(
                page_content = line,
                metadata={'item_id': item_id}
                )
        )

        combined_texts[item_id] = '\n'.join(lines)
        
        if save_txt:
            file_name = item_id + ".txt"  
            file_path = os.path.join(docs_path, file_name)
            with open(file_path, "w", encoding="utf-8") as f:
                f.write('\n'.join(lines))
                
    return docs, combined_texts

In [32]:
docs, combined_texts = convert_pkl_to_doc(dataset, docs_path)

In [33]:
# https://python.langchain.com/docs/integrations/vectorstores/faiss/

# embeddings = OllamaEmbeddings(model="llama3.2:1b")
embeddings = OllamaEmbeddings(model="mxbai-embed-large")

index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))

vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

uuids = [str(uuid4()) for _ in range(len(docs))]
vector_store.add_documents(documents=docs, ids=uuids)

['b6656458-52b4-4226-bf80-473c478af05e',
 '6c38d13d-8756-4443-8ea5-6b34cae6ef11',
 '8d1936c4-7d3c-4e79-95ad-f809444fa2db',
 '4e10ddd6-7bbc-49f8-8527-c6aa10016c37',
 'e3dea233-7e3f-4d37-ad3e-b64aac95c551',
 '8aac8c33-bff9-447f-a7b1-71b3be764f8b',
 'e2a76208-3a21-47df-aa65-42338b938e74',
 '2b2ee5ad-ad98-41db-917e-bb88be99417d',
 '4fbc6528-6b64-4a6b-8937-3a61c49a4517',
 '553fa8be-2f5b-4cfd-b150-bbcc10624722',
 'bbf03d20-9998-4323-a982-4e3521302fb0',
 'afd23906-c0f7-47e3-b0bb-4f5ce992a27f',
 'f066b41f-e66b-441e-a04b-6c515902b941',
 '1edcd5b4-263d-4c58-892b-d3d659e78b75',
 '0a3c9204-0985-4158-b93b-ce9de9cfc94e',
 'f0204134-f7fb-46da-b9d3-b82da47fcf1c',
 '3a7ad112-f2e9-4a5b-aded-d8e47a50acf0',
 '51f9e85c-d542-4e20-8366-e8c7d8a6bac7',
 '446b2482-91e2-457a-8feb-5cd121c7d855',
 'aca946f4-2588-45e4-bab5-e45f0518012a',
 '2b2c4275-4714-43d3-a78e-0d0bb1a3fa4f',
 'ded1eea3-0b5e-4cf8-8c3e-2cb2c072b60f',
 'b238b8ff-6737-4d18-8177-04a8de5291ee',
 '76f1c401-9892-4d5a-a8d0-b9e04eefad79',
 '1304d251-c259-

In [39]:
# TEST DEMO: Similarity search
results = vector_store.similarity_search(
    "What are the artefacts that are created in the 1900s",
    k=5,
)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

* This artifact was created from the following period: 1900-1910. [{'item_id': '14641'}]
* This artifact was created from the following period: 1900-1909. [{'item_id': '16016'}]
* This artifact was created from the following period: ca. 1900. [{'item_id': '16076'}]
* This artifact was created from the following period: ca. 1900. [{'item_id': '15938'}]
* This artifact was created from the following period: 1850-1900. [{'item_id': '15966'}]


In [None]:
def chat_llm(llm, query, new_m, messages):
    results = vector_store.similarity_search(
        query,
        k=1,
        # filter={"source": "tweet"},
    )
    # for res in results:
    #     print(f"* {res.page_content} [{res.metadata}]")
    
    # retrieve the right artifact 
    # simply pick the top one
    item_id = results[0].metadata['item_id']
    context = combined_texts[item_id]
    messages = [
        (
            "system",
            f"You are a helpful assistant in museum to explain the artifact. \
            You have the knowledge about the artifact: {context}. \
            Please answer the question \
            and then introduce detailed information about this artifact, \
            Your answer must include the identifier, created period, and 3-4 sentences as its description ",
        ),
        ("human", query),
    ]
    
    
    ai_msg = llm.invoke(messages)
    print('-'* 30 + " Context of the Artifact " + '-'* 30)
    print(context)
    print('-'* 30 + " LLM answer " + '-'* 30)
    print(ai_msg.content)
    return messages, results

In [36]:
# https://python.langchain.com/docs/integrations/chat/ollama/
llm = ChatOllama(model="llama3.1", temperature=0)

In [41]:
query = "What can you tell me about the history of the artefact?"
chat_llm(llm, query)

------------------------------ Context of the Artifact ------------------------------
The identifier of this artifact is "ST2253".
This artifact was created from the following period: 1945.
This artifact was submitted on 2022-06-28.
The description of this artifact is "Jurk van witte katoen met blauwe en roze ruiten."
The dimensions of this artifact are hoogte: 100 cm  schouderbreedte: 36 cm  tailleomvang: 62 cm.
The medium of this artifact includes platbinding.
The publisher of this artifact is MOMU.
The subject of this artifact includes jurk.
The context of this artifact is "Een geruite katoenen jurk van een meisje of jonge vrouw met de initialen “J.B.”. Als je goed kijkt een voorbeeld van spaarzaamheid. Tijdens en net na de Tweede Wereldoorlog is textiel namelijk moeilijk te verkrijgen. Kleding is duur en “op de bon”: een nieuw kledingstuk kan enkel worden aangeschaft bij inlevering van op rantsoen gestelde textielbonnen. Zuinig omspringen met de eigen kledij is van groot belang.  D

In [38]:
# TODO
# 1) improve the retrieval stage (acc & multi questions)
# 2) work on the system prompt to see what to display
# 3) backup the vector store 
# 4) reduce the inference time
# 5) introduce more metadata, e.g., image
# 6) multilingual feature