In [5]:
from importlib.metadata import version
from llama_index.llms.ollama import Ollama
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.embeddings import HuggingFaceEmbedding  # Updated import
import logging
import sys
import os
from tqdm.auto import tqdm
from time import time
from datetime import timedelta

# Configure logging
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

def timing_decorator(func):
    """Decorator to measure execution time"""
    def wrapper(*args, **kwargs):
        start = time()
        with tqdm(total=1, desc=f"Running {func.__name__}") as pbar:
            result = func(*args, **kwargs)
            pbar.update(1)
        elapsed = timedelta(seconds=round(time() - start))
        logging.info(f"{func.__name__} completed in {elapsed}")
        return result
    return wrapper

@timing_decorator
def create_data_directory(directory):
    """Create directory if it doesn't exist."""
    if not os.path.exists(directory):
        os.makedirs(directory)
        logging.info(f"Created directory: {directory}")
    return directory

@timing_decorator
def initialize_embedding_model():
    """Initialize embedding model"""
    try:
        embed_model = HuggingFaceEmbedding(
            model_name="sentence-transformers/all-mpnet-base-v2",  # Better model for embeddings
            cache_folder="./model_cache",
            embed_batch_size=32
        )
        logging.info("Successfully initialized HuggingFace embedding model")
        Settings.embed_model = embed_model
        return embed_model
    except Exception as e:
        logging.error(f"Error initializing embedding model: {str(e)}")
        raise

def main():
    try:
        # Initialize Ollama LLM with progress
        with tqdm(total=1, desc="Initializing Ollama LLM") as pbar:
            llm = Ollama(model="llama2")
            pbar.update(1)
        logging.info("Ollama LLM initialized with model: llama2")

        # Initialize embedding model
        embed_model = initialize_embedding_model()
        logging.info("Embedding model initialized")

        # Ensure data directory exists
        data_dir = create_data_directory('MPEP_HTML')

        # Load MPEP documents with progress
        logging.info("Loading MPEP documents...")
        documents = []
        with tqdm(desc="Loading documents") as pbar:
            documents = SimpleDirectoryReader(data_dir).load_data()
            pbar.update(1)
        logging.info(f"Loaded {len(documents)} documents from '{data_dir}'")

        # Create vector store index with progress tracking
        logging.info("Creating vector store index...")
        with tqdm(total=3, desc="Building index") as pbar:
            index = VectorStoreIndex.from_documents(
                documents,
                embed_model=embed_model,
                show_progress=True
            )
            pbar.update(3)
        logging.info("Vector store index created")

        # Create query engine with progress
        with tqdm(total=1, desc="Creating query engine") as pbar:
            query_engine = index.as_query_engine()
            pbar.update(1)
        logging.info("Query engine created")

        # Test query with progress
        test_query = "What is a patent?"
        logging.info(f"Running test query: '{test_query}'")
        with tqdm(total=1, desc="Processing query") as pbar:
            response = query_engine.query(test_query)
            pbar.update(1)
        logging.info(f"Query response received")
        print("\nQuery Response:")
        print("-" * 80)
        print(response)
        print("-" * 80)

    except Exception as e:
        logging.error(f"Error in processing: {str(e)}")
        raise

if __name__ == "__main__":
    start_time = time()
    logging.info("Starting main process...")
    main()
    elapsed = timedelta(seconds=round(time() - start_time))
    logging.info(f"Main process finished in {elapsed}")


IndentationError: unexpected indent (2619634838.py, line 44)

In [2]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

Settings.embed_model = HuggingFaceEmbedding(
     model_name="microsoft/Phi-3-mini-4k-instruct-gguf"
 )

  from .autonotebook import tqdm as notebook_tqdm
No sentence-transformers model found with name microsoft/Phi-3-mini-4k-instruct-gguf. Creating a new one with mean pooling.


ValueError: Unrecognized model in microsoft/Phi-3-mini-4k-instruct-gguf. Should have a `model_type` key in its config.json, or contain one of the following strings in its name: albert, align, altclip, aria, aria_text, audio-spectrogram-transformer, autoformer, bamba, bark, bart, beit, bert, bert-generation, big_bird, bigbird_pegasus, biogpt, bit, blenderbot, blenderbot-small, blip, blip-2, bloom, bridgetower, bros, camembert, canine, chameleon, chinese_clip, chinese_clip_vision_model, clap, clip, clip_text_model, clip_vision_model, clipseg, clvp, code_llama, codegen, cohere, cohere2, colpali, conditional_detr, convbert, convnext, convnextv2, cpmant, ctrl, cvt, dac, data2vec-audio, data2vec-text, data2vec-vision, dbrx, deberta, deberta-v2, decision_transformer, deformable_detr, deit, depth_anything, deta, detr, diffllama, dinat, dinov2, dinov2_with_registers, distilbert, donut-swin, dpr, dpt, efficientformer, efficientnet, electra, emu3, encodec, encoder-decoder, ernie, ernie_m, esm, falcon, falcon_mamba, fastspeech2_conformer, flaubert, flava, fnet, focalnet, fsmt, funnel, fuyu, gemma, gemma2, git, glm, glpn, gpt-sw3, gpt2, gpt_bigcode, gpt_neo, gpt_neox, gpt_neox_japanese, gptj, gptsan-japanese, granite, granitemoe, graphormer, grounding-dino, groupvit, hiera, hubert, ibert, idefics, idefics2, idefics3, idefics3_vision, ijepa, imagegpt, informer, instructblip, instructblipvideo, jamba, jetmoe, jukebox, kosmos-2, layoutlm, layoutlmv2, layoutlmv3, led, levit, lilt, llama, llava, llava_next, llava_next_video, llava_onevision, longformer, longt5, luke, lxmert, m2m_100, mamba, mamba2, marian, markuplm, mask2former, maskformer, maskformer-swin, mbart, mctct, mega, megatron-bert, mgp-str, mimi, mistral, mixtral, mllama, mobilebert, mobilenet_v1, mobilenet_v2, mobilevit, mobilevitv2, modernbert, moonshine, moshi, mpnet, mpt, mra, mt5, musicgen, musicgen_melody, mvp, nat, nemotron, nezha, nllb-moe, nougat, nystromformer, olmo, olmo2, olmoe, omdet-turbo, oneformer, open-llama, openai-gpt, opt, owlv2, owlvit, paligemma, patchtsmixer, patchtst, pegasus, pegasus_x, perceiver, persimmon, phi, phi3, phimoe, pix2struct, pixtral, plbart, poolformer, pop2piano, prophetnet, pvt, pvt_v2, qdqbert, qwen2, qwen2_audio, qwen2_audio_encoder, qwen2_moe, qwen2_vl, rag, realm, recurrent_gemma, reformer, regnet, rembert, resnet, retribert, roberta, roberta-prelayernorm, roc_bert, roformer, rt_detr, rt_detr_resnet, rwkv, sam, seamless_m4t, seamless_m4t_v2, segformer, seggpt, sew, sew-d, siglip, siglip_vision_model, speech-encoder-decoder, speech_to_text, speech_to_text_2, speecht5, splinter, squeezebert, stablelm, starcoder2, superpoint, swiftformer, swin, swin2sr, swinv2, switch_transformers, t5, table-transformer, tapas, textnet, time_series_transformer, timesformer, timm_backbone, timm_wrapper, trajectory_transformer, transfo-xl, trocr, tvlt, tvp, udop, umt5, unispeech, unispeech-sat, univnet, upernet, van, video_llava, videomae, vilt, vipllava, vision-encoder-decoder, vision-text-dual-encoder, visual_bert, vit, vit_hybrid, vit_mae, vit_msn, vitdet, vitmatte, vitpose, vitpose_backbone, vits, vivit, wav2vec2, wav2vec2-bert, wav2vec2-conformer, wavlm, whisper, xclip, xglm, xlm, xlm-prophetnet, xlm-roberta, xlm-roberta-xl, xlnet, xmod, yolos, yoso, zamba, zoedepth

In [61]:
#test olamma embedding

from llama_index.embeddings.ollama import OllamaEmbedding

ollama_embedding = OllamaEmbedding(
    model_name="phi3",
    base_url="http://localhost:11434",
    ollama_additional_kwargs={"mirostat": 0},
)

#uncomment to test
# pass_embedding = ollama_embedding.get_text_embedding_batch(
#     ["This is a passage!", "This is another passage"], show_progress=True
# )
# print(pass_embedding)

# query_embedding = ollama_embedding.get_query_embedding("Where is blue?")
# print(query_embedding)

In [60]:
import json

def to_langchain_format(chapter):
    """
    Transforms a chapter dictionary into a standardized format expected by LangChain.

    Args:
        chapter (dict): A dictionary containing the keys 'chapter_title' and 'content'
                        which hold the title and content of the chapter, respectively.

    Returns:
        dict: A dictionary formatted for LangChain, with the title and cleaned content.
    """
    # Assuming 'content' is already cleaned
    return {
        "title": chapter["chapter_title"],
        "content": chapter["content"]  # already cleaned
    }



In [19]:
#data transformation for langchain format

import json

# Load the cleaned data
file_path = r'C:\Users\mharr\OneDrive\Documents\GitHub\MPEP_finetune\Notebooks\mpep_data_clean.json'
with open(file_path, 'r', encoding='utf-8') as file:
    cleaned_chapters = json.load(file)

# Apply the transformation to each chapter
processed_documents = [to_langchain_format(chapter) for chapter in cleaned_chapters]

# Optionally, save the processed documents to a new file
output_path = r'C:\Users\mharr\OneDrive\Documents\GitHub\MPEP_finetune\Notebooks\langchain_ready_data.json'
with open(output_path, 'w', encoding='utf-8') as file:
    json.dump(processed_documents, file, indent=4, ensure_ascii=False)

print("Processed data ready for LangChain saved to:", output_path)


Processed data ready for LangChain saved to: C:\Users\mharr\OneDrive\Documents\GitHub\MPEP_finetune\Notebooks\langchain_ready_data.json


In [27]:
#langchain documents object

from llama_index.core import SimpleDirectoryReader

reader = SimpleDirectoryReader(input_dir=r"C:\Users\mharr\OneDrive\Documents\GitHub\MPEP_finetune\Notebooks\langchainjson")
documents = reader.load_data()

In [29]:
#embeddings creation

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

# Load documents and build index
index = VectorStoreIndex.from_documents(documents, show_progress=True)

Parsing nodes: 100%|██████████| 1/1 [00:20<00:00, 20.51s/it]
Generating embeddings: 100%|██████████| 2048/2048 [01:12<00:00, 28.36it/s]
Generating embeddings: 100%|██████████| 2048/2048 [01:57<00:00, 17.44it/s]
Generating embeddings: 100%|██████████| 829/829 [00:39<00:00, 21.17it/s]


In [37]:
# import
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from IPython.display import Markdown, display
import chromadb

In [43]:
# save to disk

db = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db.get_or_create_collection("quickstart")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context, embed_model=ollama_embedding
)

In [44]:
# load from disk
db2 = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db2.get_or_create_collection("quickstart")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
index = VectorStoreIndex.from_vector_store(
    vector_store,
    embed_model=ollama_embedding,
)

In [62]:
# Query Data from the persisted index
query_engine = index.as_query_engine()
response = query_engine.query("What are the utility requirements of a patent? Please explain in detail.")
display(Markdown(f"<b>{response}</b>"))

<b>The utility requirement of a patent refers to the fundamental criterion that an invention must have a specific and credible utility or use. In other words, for an invention to be eligible for a patent, it must serve a practical purpose and be capable of providing some form of benefit or advantage. This requirement ensures that patents are granted for inventions that are actually useful and have real-world applications. Additionally, the utility of the invention must be described in the patent application in a manner that enables a person skilled in the relevant field to understand and appreciate the practical significance of the invention.</b>