In [None]:
!pip install rdflib
!pip install load_dotenv
!pip install faiss-cpu
!pip install --upgrade langchain-ollama
!pip install --upgrade langchain 
!pip install --upgrade langchain-community

In [1]:
import os
import re
import pickle
from pprint import pprint
from dotenv import load_dotenv
from rdflib import Graph

In [2]:
from langchain_core.documents import Document
from langchain_ollama import OllamaEmbeddings
from langchain.chat_models import ChatOllama
from langchain_core.messages import AIMessage

In [3]:
import faiss
from uuid import uuid4
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

In [4]:
# Configuration files
# Every input/output lives under one base directory. Set the MOMU_DATA_DIR
# environment variable to relocate them; the default is the original location.
data_dir = os.environ.get("MOMU_DATA_DIR", "/Users/adrian/Desktop")
ttl_file = os.path.join(data_dir, "items_filtered.ttl")  # input, parsed as Turtle
pkl_file = os.path.join(data_dir, "dataset.pkl")  # output, pickled dict of items
txt_for_validation = os.path.join(data_dir, "graph_output.txt")  # output, plain-text listing
docs_path = os.path.join(data_dir, "momu")  # output directory for per-item .txt files

In [5]:
def _local_name(term):
    """Return the text after the last '/' of ``term`` and, within that, after the last '#'."""
    return str(term).split('/')[-1].split('#')[-1]


def convert_ttl_to_dict(ttl_file, pkl_file, txt_for_validation):
    """Parse a Turtle file and flatten its item resources into a nested dict.

    Only subjects whose URI contains "api/item" are kept. Subjects and
    predicates are reduced to their local name (the text after the last "/"
    and, within that, after the last "#"). Objects whose text contains "http"
    are skipped. When an item has several values for the same predicate, only
    the last one yielded by the graph is kept.

    Side effects: writes a plain-text listing of the kept triples to
    ``txt_for_validation`` and pickles the resulting dict to ``pkl_file``.

    Args:
        ttl_file: Path of the Turtle file to parse.
        pkl_file: Path the resulting dict is pickled to.
        txt_for_validation: Path of the human-readable listing.

    Returns:
        dict: ``{item_local_name: {predicate_local_name: value_as_str}}``.
    """
    out = dict()
    g = Graph()
    g.parse(ttl_file, format="turtle")

    # g.subjects() repeats a subject once per triple, so de-duplicate first.
    # Sorting by text makes the listing and the dict order reproducible across
    # runs (plain set iteration order is not).
    item_subjects = sorted(
        (subj for subj in set(g.subjects()) if "api/item" in subj),
        key=str,
    )

    # UTF-8 is set explicitly so non-ASCII literals are written the same way on
    # every platform, matching the per-item files written later in the notebook.
    with open(txt_for_validation, "w", encoding="utf-8") as file:
        for org_subject in item_subjects:
            subject = _local_name(org_subject)
            out[subject] = {}
            file.write(f"Subject {subject}\n")
            # Iterate over all triples where this resource is the subject
            for pred, obj in g.predicate_objects(subject=org_subject):
                # Skips URI-valued objects, and also any literal mentioning a URL.
                if "http" in obj:
                    continue
                pred = _local_name(pred)
                out[subject][pred] = str(obj)
                file.write(f"{pred}: {obj}\n")

    with open(pkl_file, "wb") as f:
        pickle.dump(out, f)

    return out

In [6]:
# Parse the Turtle export; this also rewrites the validation listing and the pickle.
dataset = convert_ttl_to_dict(
    ttl_file=ttl_file,
    pkl_file=pkl_file,
    txt_for_validation=txt_for_validation,
)
# To reuse the pickle written by an earlier run instead of re-parsing:
# with open(pkl_file, "rb") as f:
#     dataset = pickle.load(f)

In [7]:
def simplify_predicate(predicate):
    """Return a readable label for a predicate.

    The label is the trailing run of word characters that follows the last
    "#" or "/" (or the whole string when it is already a bare name made only
    of word characters), with underscores replaced by spaces. Anything that
    does not end in such a run is returned unchanged.

    Args:
        predicate: Predicate URI or bare local name, as a string.

    Returns:
        str: The simplified label, or ``predicate`` itself when nothing matches.
    """
    # `^` is accepted as an alternative to a '#' or '/' separator so that bare
    # local names (e.g. "has_type") also get their underscores replaced.
    match = re.search(r"(?:^|[#/])(\w+)$", predicate)
    return match.group(1).replace("_", " ") if match else predicate

In [8]:
def generate_readable_content_v2(instance, properties):
    """Render an item's properties as a list of English sentences.

    Args:
        instance: Item identifier or URI; only the text after its last "/" is
            used (in the "is_public" sentence).
        properties: Iterable of ``(predicate, value)`` pairs.

    Returns:
        list[str]: One sentence per property, in input order. A property that
        no earlier rule claims and whose simplified predicate is "id"
        contributes nothing.
    """
    item_label = str(instance).split("/")[-1]

    def describe(name, label, obj):
        # Rules are tried top to bottom and the first hit wins. All are
        # substring tests on the predicate text except "date", which must match exactly.
        if "is_public" in name:
            return f"The item {item_label} is {'public' if obj == 'true' else 'not public'}."
        if "title" in name:
            return f'The identifier of this artifact is "{obj}".'
        if "description" in name:
            return f'The description of this artifact is "{obj}"'
        if name == "date":
            return f"This artifact was created from the following period: {obj}."
        if "modified" in name:
            return f"This artifact was last modified on {obj}."
        if "medium" in name:
            return f"The medium of this artifact includes {obj}."
        if "extent" in name:
            return f"The dimensions of this artifact are {obj}."
        if "publisher" in name:
            return f"The publisher of this artifact is {obj}."
        if "subject" in name:
            return f"The subject of this artifact includes {obj}."
        if "shortDescription" in name:
            # Line breaks are removed so the context stays on a single line.
            single_line = obj.replace('\n', '')
            return f'The context of this artifact is "{single_line}".'
        if "P48_has_preferred_identifier" in name:
            return f"The preferred identifier of this artifact is {obj}."
        if "P50_has_current_keeper" in name:
            return f"The current keeper of this artifact is {obj}."
        if "P55_has_current_location" in name:
            return f"The current location of this artifact is in {obj}."
        if "dateSubmitted" in name:
            return f"This artifact was submitted on {obj}."
        if "identifierGroupType" in name:
            return f"The group type of this artifact is  {obj}."
        if "identifierGroupValue" in name:
            return f"The group value of this artifact is {obj}."
        if label == "id":
            # Internal ids are not turned into a sentence.
            return None
        # Generic fallback built from the simplified predicate label.
        return f"The {label} of this artifact is {obj}."

    sentences = []
    for predicate, obj in properties:
        sentence = describe(str(predicate), simplify_predicate(str(predicate)), obj)
        if sentence is not None:
            sentences.append(sentence)
    return sentences

In [9]:
def convert_pkl_to_doc(dataset, docs_path, save_txt = True):
    """Turn the item dict into per-sentence Documents and per-item texts.

    Args:
        dataset: Mapping ``{item: {predicate: value}}``.
        docs_path: Directory that receives one ``<item_id>.txt`` file per item
            when ``save_txt`` is true. It is created if it does not exist.
        save_txt: Whether to write the per-item text files.

    Returns:
        tuple: ``(docs, combined_texts)`` where ``docs`` is a list with one
        ``Document`` per generated sentence (metadata ``{'item_id': ...}``)
        and ``combined_texts`` maps each item id to its sentences joined by
        newlines.
    """
    # Create the target directory up front so the first write does not fail
    # with FileNotFoundError on a fresh machine. An empty docs_path means the
    # current directory, which needs no creation.
    if save_txt and docs_path:
        os.makedirs(docs_path, exist_ok=True)

    docs = list()
    combined_texts = dict()
    for item, val in dataset.items():
        item_id = str(item).split("/")[-1]
        lines = generate_readable_content_v2(item, list(val.items()))

        # One Document per sentence, tagged with the item it belongs to.
        docs.extend(
            Document(page_content=line, metadata={'item_id': item_id})
            for line in lines
        )

        item_text = '\n'.join(lines)
        combined_texts[item_id] = item_text

        if save_txt:
            file_path = os.path.join(docs_path, item_id + ".txt")
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(item_text)

    return docs, combined_texts

In [10]:
# docs: one Document per generated sentence; combined_texts: item_id -> all of
# that item's sentences joined by newlines. With the default save_txt=True this
# call also writes one <item_id>.txt file per item into docs_path.
docs, combined_texts = convert_pkl_to_doc(dataset, docs_path)

In [11]:
# https://python.langchain.com/docs/integrations/vectorstores/faiss/

# embeddings = OllamaEmbeddings(model="llama3.2:1b")
embeddings = OllamaEmbeddings(model="mxbai-embed-large")

# Size the flat L2 index from the length of one probe embedding.
probe_vector = embeddings.embed_query("hello world")
index = faiss.IndexFlatL2(len(probe_vector))

# Start from an empty in-memory store; documents are added below.
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

# One freshly generated UUID per document, used as its id in the store.
uuids = [str(uuid4()) for _ in docs]
vector_store.add_documents(documents=docs, ids=uuids)

['de7211b6-05bd-4011-864f-3a0ad2c23156',
 '6eb4d4a3-d7e9-4883-98d3-9b8b25e1aa33',
 '423032d6-f32a-4500-b68a-12adcac66d44',
 'eca16f19-f49c-41d8-b995-7fd7dbdeeccb',
 '626edd8d-101f-498d-9e45-b104f87727df',
 '952892ac-f8d9-44ad-9c06-3482e5edc16c',
 '43b27c3c-180a-48e1-bb87-0b4a20cb6506',
 'c4cc58b6-63c4-4f70-9bb7-df849c5529ac',
 'ddf9740d-1773-437e-9727-bd9ea3a2ccf2',
 'b2857a1a-383d-4312-9f7d-ac48425aee37',
 'e7c7cbfe-20a9-4df4-81bf-0b90ffdefafe',
 '0b2f356e-b24e-4666-88ce-5b7b01e62cea',
 '0b9076bc-2f59-46ca-b994-864a59b07ac1',
 'b25037d7-00af-4e8c-9f7f-3ea27858ea73',
 '6127a79a-cf0a-4f27-9ea3-a2d7b058f5f9',
 '47f02304-bf92-4ab4-ab5e-632897f51c05',
 '4c1a8152-edaa-492f-af64-65426b615763',
 '5c4f7cad-1d8d-4daf-8441-d5e1c26a7866',
 'afaeeae1-9458-4f79-8e64-9c9ffb7ebbdf',
 '31b73988-aa44-4528-8447-22707d5ced90',
 '4be7d9e7-1078-4edf-9dc0-d037d8f261c4',
 '4030e453-3844-4ced-aba5-7858aac15dd1',
 '908cc0f6-5c2f-4aae-97e4-5d667cb89a20',
 '75e225c6-8281-4efa-a293-45738366369c',
 'dff12501-c5f8-

In [12]:
# TEST DEMO: Similarity search
# Print the five stored sentences closest to the demo question, with their metadata.
demo_query = "which artifact was submitted on 06-28"
results = vector_store.similarity_search(demo_query, k=5)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

* This artifact was submitted on 2022-06-28. [{'item_id': '15812'}]
* This artifact was submitted on 2022-06-28. [{'item_id': '15966'}]
* This artifact was submitted on 2022-06-28. [{'item_id': '15695'}]
* This artifact was submitted on 2022-06-28. [{'item_id': '16148'}]
* This artifact was submitted on 2022-06-28. [{'item_id': '16099'}]


In [13]:
def chat_llm(llm, query, store=None, texts=None):
    """Answer ``query`` about the best-matching artifact and print the result.

    The single closest sentence is retrieved from the vector store, the full
    text of the item it belongs to is used as context in the system prompt,
    and the context and the model's answer are printed.

    Args:
        llm: Chat model exposing ``invoke(messages)`` whose result has a
            ``content`` attribute.
        query: The user's question.
        store: Object exposing ``similarity_search(query, k=...)``. Defaults to
            the notebook-level ``vector_store``.
        texts: Mapping of item id to that item's combined text. Defaults to the
            notebook-level ``combined_texts``.

    Returns:
        None. Output is printed.
    """
    # Fall back to the notebook-level objects so existing two-argument calls
    # keep working; the globals are only read when no override is given.
    store = vector_store if store is None else store
    texts = combined_texts if texts is None else texts

    results = store.similarity_search(
        query,
        k=1,
    )
    if not results:
        # An empty store or a retrieval miss would otherwise fail on results[0].
        print("No matching artifact found for this query.")
        return

    # retrieve the right artifact
    # simply pick the top one
    item_id = results[0].metadata['item_id']
    context = texts[item_id]

    # The backslash continuations keep each following line's indentation inside
    # the prompt text; the literal is left exactly as it was.
    messages = [
        (
            "system",
            f"You are a helpful assistant in museum to explain the artifact. \
            First, read the following context {context}. \
            Please answer the question \
            and introduce more detailed information about this artifact, \
            including identifier, created period, and 2-3 sentences as its description ",
        ),
        ("human", query),
    ]

    ai_msg = llm.invoke(messages)
    print('-'* 30 + " Context of the Artifact " + '-'* 30)
    print(context)
    print('-'* 30 + " LLM answer " + '-'* 30)
    print(ai_msg.content)

In [14]:
# https://python.langchain.com/docs/integrations/chat/ollama/
# Chat model passed to chat_llm() in the next cell.
# NOTE(review): ChatOllama is imported from langchain.chat_models in the imports
# cell, while langchain-ollama is the package installed for Ollama support;
# confirm that import path still resolves after the `--upgrade` installs.
llm = ChatOllama(model="llama3.1")

In [15]:
# Example question; chat_llm prints the retrieved item's context followed by the model's answer.
query = "which artifact was created from the following period: 1930-1959?"
chat_llm(llm, query)

------------------------------ Context of the Artifact ------------------------------
The identifier of this artifact is "ST2014".
This artifact was created from the following period: 1930-1959.
This artifact was submitted on 2022-06-28.
The description of this artifact is "Jurk, daagse jurk of werkjurk."
The publisher of this artifact is MOMU.
The subject of this artifact includes jurk.
The context of this artifact is "Deze daagse huis- of werkjurk van bedrukt katoen is nog ongedragen. Het papieren etiket geeft inzicht in waar dit kledingstuk gekocht is: de Grand Magasins À Saint-Jacques te Reims.  Aan het begin van de jaren 1920 fuseren twee grote warenhuizen in Reims, La Samaritaine en À la Tour Saint-Jacques, onder de naam À Saint-Jacques. In deze nieuwe zaak in de Rue de Vesle verkoopt men voornamelijk dames- en kinderkleding. Tevens zijn er zijn verstelateliers, en klanten kunnen er terecht voor stoffen, interieurtextiel en een grote verscheidenheid aan accessoires. In een bijgeb

In [16]:
# TODO
# 1) improve the retrieval stage (accuracy & multi questions)
# 2) work on the system prompt to see what to display
# 3) backup the vector store 
# 4) reduce the inference time
# 5) introduce more metadata, e.g., image
# 6) multilingual feature