In [8]:
#test the ollama local setup

from llama_index.llms.ollama import Ollama

# Client for the "phi3" model served through Ollama, created with a
# request_timeout of 60.0 (presumably seconds — confirm against the client docs).
llm = Ollama(model="phi3", request_timeout=60.0)

# Smoke test: send one throwaway prompt and print the completion to confirm
# that the model answers.
response = llm.complete("What color is Guinness beer?")
print(response)

Guinness beer, the iconic Irish stout, is typically a dark brown or black in color. This deep hue comes from its high concentration of melanoidins, which are created during the brewing process when malted barley and roasted unmalted barley (called "black patent") are used in the beer's recipe.

However, it is worth noting that there may also be a lighter variety of Guinness called "Guinness Foreign Extra Stout" which has a slightly different color and appearance compared to the traditional dark stout. This variation uses roasted barley in its brewing process but at a lower level than the mainstream version, resulting in a less intense black-brown hue.


In [12]:
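# Disabled alternative: HuggingFace embeddings configured through the global Settings.
# Kept for reference only; the later cells pass an OllamaEmbedding explicitly instead.
# from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# from llama_index.core import Settings

# Settings.embed_model = HuggingFaceEmbedding(
#     model_name="microsoft/Phi-3-mini-4k-instruct-gguf"
# )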

In [61]:
# Test the Ollama embedding model

from llama_index.embeddings.ollama import OllamaEmbedding

# Embedding client that talks to the Ollama server at base_url. It is passed as
# embed_model when the Chroma-backed index is built and reloaded further down.
# NOTE(review): "phi3" is the same model name used for text generation at the top
# of this notebook; confirm it is the intended model for producing embeddings.
ollama_embedding = OllamaEmbedding(
    model_name="phi3",
    base_url="http://localhost:11434",  # Ollama server running on this machine
    ollama_additional_kwargs={"mirostat": 0},  # extra option forwarded to Ollama; presumably 0 turns Mirostat sampling off — confirm
)

#uncomment to test
# pass_embedding = ollama_embedding.get_text_embedding_batch(
#     ["This is a passage!", "This is another passage"], show_progress=True
# )
# print(pass_embedding)

# query_embedding = ollama_embedding.get_query_embedding("Where is blue?")
# print(query_embedding)

In [60]:
import json

def to_langchain_format(chapter):
    """
    Reshape one cleaned chapter record into a two-field title/content dictionary.

    Args:
        chapter (dict): Mapping that provides the keys 'chapter_title' and
                        'content'. Any other keys it carries are ignored.

    Returns:
        dict: A new dictionary with exactly the keys 'title' (the value of
              'chapter_title') and 'content' (passed through unchanged).

    Raises:
        KeyError: If 'chapter_title' or 'content' is missing from ``chapter``.
    """
    title = chapter["chapter_title"]
    # No cleaning happens here; the text is expected to arrive already cleaned.
    body = chapter["content"]
    return dict(title=title, content=body)



In [19]:
#data transformation for langchain format

import json

# Input: the cleaned chapter records. Output: the reshaped records.
file_path = r'C:\Users\mharr\OneDrive\Documents\GitHub\MPEP_finetune\Notebooks\mpep_data_clean.json'
output_path = r'C:\Users\mharr\OneDrive\Documents\GitHub\MPEP_finetune\Notebooks\langchain_ready_data.json'

# Read the whole input file and decode it as JSON
with open(file_path, 'r', encoding='utf-8') as file:
    cleaned_chapters = json.loads(file.read())

# Reshape every chapter with to_langchain_format, keeping the original order
processed_documents = list(map(to_langchain_format, cleaned_chapters))

# Write the reshaped chapters back out as indented JSON, keeping non-ASCII
# characters as-is rather than escaping them
with open(output_path, 'w', encoding='utf-8') as file:
    file.write(json.dumps(processed_documents, indent=4, ensure_ascii=False))

print(f"Processed data ready for LangChain saved to: {output_path}")


Processed data ready for LangChain saved to: C:\Users\mharr\OneDrive\Documents\GitHub\MPEP_finetune\Notebooks\langchain_ready_data.json


In [27]:
# Load the prepared JSON files as llama_index documents (SimpleDirectoryReader)

from llama_index.core import SimpleDirectoryReader

# Point the reader at the langchainjson folder and load its contents into `documents`.
# NOTE(review): the transformation cell writes langchain_ready_data.json into the
# Notebooks folder itself, not into this langchainjson subfolder — presumably the
# file is copied there by hand; confirm.
reader = SimpleDirectoryReader(input_dir=r"C:\Users\mharr\OneDrive\Documents\GitHub\MPEP_finetune\Notebooks\langchainjson")
documents = reader.load_data()

In [29]:
#embeddings creation

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

# Build a vector index from the `documents` loaded in the previous cell,
# with progress bars enabled.
# NOTE(review): no embed_model is passed here, so this presumably uses llama_index's
# globally configured embedding model rather than `ollama_embedding`. `index` is also
# reassigned by the Chroma cells further down — confirm this cell is still needed.
index = VectorStoreIndex.from_documents(documents, show_progress=True)

Parsing nodes: 100%|██████████| 1/1 [00:20<00:00, 20.51s/it]
Generating embeddings: 100%|██████████| 2048/2048 [01:12<00:00, 28.36it/s]
Generating embeddings: 100%|██████████| 2048/2048 [01:57<00:00, 17.44it/s]
Generating embeddings: 100%|██████████| 829/829 [00:39<00:00, 21.17it/s]


In [37]:
# import
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from IPython.display import Markdown, display
import chromadb

In [43]:
# save to disk

# Open (or create) the on-disk Chroma database and the "quickstart" collection,
# then wrap the collection so llama_index can use it as its vector store.
db = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db.get_or_create_collection("quickstart")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# The collection persists across kernel restarts, so re-running this cell
# unconditionally would embed `documents` again and append a second copy of
# every vector. Only embed when the collection is still empty; otherwise attach
# to what is already stored.
# To force a rebuild (e.g. after the documents change), delete the "quickstart"
# collection or the ./chroma_db folder first.
if chroma_collection.count() == 0:
    index = VectorStoreIndex.from_documents(
        documents, storage_context=storage_context, embed_model=ollama_embedding
    )
else:
    index = VectorStoreIndex.from_vector_store(
        vector_store,
        embed_model=ollama_embedding,
    )

In [44]:
# load from disk
# Reopen the persisted Chroma database at ./chroma_db and wrap its "quickstart"
# collection again. This reassigns `chroma_collection`, `vector_store` and `index`.
db2 = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db2.get_or_create_collection("quickstart")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
# Build the index straight from the vector store; no documents are passed here.
# The same embedding model is supplied again — presumably so that queries are
# embedded the same way as the stored documents; confirm.
index = VectorStoreIndex.from_vector_store(
    vector_store,
    embed_model=ollama_embedding,
)

In [62]:
# Query Data from the persisted index
# NOTE(review): as_query_engine() is called without an llm argument, so the `llm`
# Ollama client created at the top of the notebook is not passed in here. The answer
# presumably comes from llama_index's globally configured LLM — confirm that this is
# intended for a local-only setup.
query_engine = index.as_query_engine()
response = query_engine.query("What are the utility requirements of a patent? Please explain in detail.")
# Render the answer in bold through IPython's Markdown display.
display(Markdown(f"<b>{response}</b>"))

<b>The utility requirement of a patent refers to the fundamental criterion that an invention must have a specific and credible utility or use. In other words, for an invention to be eligible for a patent, it must serve a practical purpose and be capable of providing some form of benefit or advantage. This requirement ensures that patents are granted for inventions that are actually useful and have real-world applications. Additionally, the utility of the invention must be described in the patent application in a manner that enables a person skilled in the relevant field to understand and appreciate the practical significance of the invention.</b>