In [4]:

import os
from dotenv import load_dotenv
from pydantic import BaseModel, Field
load_dotenv()

def get_model(model: str = 'deepseek-r1:7b', provider: str = 'local'):
    if (provider == 'local'):
        from langchain_ollama import ChatOllama
        llm = ChatOllama(model=model, temperature=0)
        return llm
    elif (provider == 'aws'):
        from langchain_aws import ChatBedrockConverse
        import boto3
        access_key = os.getenv('ACCESS_KEY')
        secret_key = os.getenv('SECRET_KEY')
        bedrock_client = boto3.client('bedrock-runtime',
                                      region_name='us-east-1',
                                      aws_access_key_id=access_key,
                                      aws_secret_access_key=secret_key)
        llm = ChatBedrockConverse(client=bedrock_client,
                                  model=model,
                                  temperature=0)
        return llm

def get_embeddings(model:str='deepseek-r1:7b', provider:str='local'):
    if(provider == 'local'):
        from langchain_ollama import OllamaEmbeddings
        embeddings=OllamaEmbeddings(model=model)
        return embeddings
    elif(provider == 'aws'):
        from langchain_aws import BedrockEmbeddings
        import boto3
        access_key=os.getenv('ACCESS_KEY')
        secret_key=os.getenv('SECRET_KEY')
        bedrock_client=boto3.client('bedrock-runtime', region_name='us-east-1', aws_access_key_id=access_key, aws_secret_access_key=secret_key)
        embeddings=BedrockEmbeddings(client=bedrock_client, model_id=model)
        return embeddings

In [5]:
llm=get_model(model='anthropic.claude-3-5-sonnet-20241022-v2:0',provider='aws')


In [None]:
import nest_asyncio

nest_asyncio.apply()

In [8]:
from llama_index.core.schema import TextNode
from typing import List
import json


def get_text_nodes(json_list: List[dict]):
    text_nodes = []
    for idx, page in enumerate(json_list):
        text_node = TextNode(text=page["md"], metadata={"page": page["page"]})
        text_nodes.append(text_node)
    return text_nodes


def save_jsonl(data_list, filename):
    """Save a list of dictionaries as JSON Lines."""
    with open(filename, "w") as file:
        for item in data_list:
            json.dump(item, file)
            file.write("\n")


def load_jsonl(filename):
    """Load a list of dictionaries from JSON Lines."""
    data_list = []
    with open(filename, "r") as file:
        for line in file:
            data_list.append(json.loads(line))
    return data_list

In [11]:
from llama_cloud_services import LlamaParse

parser = LlamaParse(
    api_key=os.getenv("LAMA_PARSE_API_KEY"),
    result_type="markdown",
    use_vendor_multimodal_model=True,
    vendor_multimodal_model_name="anthropic-sonnet-3.5",
    # invalidate_cache=True
)
json_objs = parser.get_json_result("./data/memes.pdf")
# json_objs = parser.get_json_result("./data/llama2-p33.pdf")
json_list = json_objs[0]["pages"]
docs = get_text_nodes(json_list)

Started parsing the file under job_id 4cc834ac-99fb-4939-a3b7-6504ed340b8d


In [None]:
import pypickle

import pypickle
filepath = 'test.pkl'

# Save
status = pypickle.save(filepath, json_objs)

11-04-25 00:29:22 [pypickle.pypickle] > INFO     > Pickle file saved: test.pkl


In [26]:
data = pypickle.load(filepath)

11-04-25 00:29:46 [pypickle.pypickle] > INFO     > Pickle file loaded: test.pkl


In [27]:
json_list = data[0]["pages"]
docs = get_text_nodes(json_list)

In [28]:
print(docs)

[TextNode(id_='981be608-094f-4eaa-969a-de53a29845d5', embedding=None, metadata={'page': 1}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text='WIYOOREAQTSE\n  ITS MONDAY\n Memes can tell more than 1000 words.\n      1', mimetype='text/plain', start_char_idx=None, end_char_idx=None, metadata_seperator='\n', text_template='{metadata_str}\n\n{content}')]


In [12]:
save_jsonl([d.dict() for d in docs], "docs.jsonl")

In [13]:
from llama_index.core import Document

docs_dicts = load_jsonl("docs.jsonl")
docs = [Document.parse_obj(d) for d in docs_dicts]

/var/folders/nx/wvnbfw697mbf57dq4vcl2nbm0000gn/T/ipykernel_1428/218604384.py:4: PydanticDeprecatedSince20: The `parse_obj` method is deprecated; use `model_validate` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  docs = [Document.parse_obj(d) for d in docs_dicts]


In [None]:
print(docs[0].get_content(metadata_mode="all"))

page: 1

WHEN YOU REALISE

IT'S MONDAY

Memes can tell more than 1000 words.


In [23]:
FILE_NAME = "./data/memes.pdf"
IMAGES_DOWNLOAD_PATH = "./data/images/"

In [20]:
parser = LlamaParse(
    api_key=os.getenv("LAMA_PARSE_API_KEY"),
    result_type="markdown",
)

json_objs = parser.get_json_result(FILE_NAME)
json_list = json_objs[0]["pages"]



Started parsing the file under job_id 83664a32-4296-43e7-bfe9-9029fc10c362


In [21]:
import os
import nest_asyncio
from llama_parse import LlamaParse
from llama_index.core.schema import ImageDocument, TextNode
from typing import List

In [24]:
def get_text_nodes(json_list: List[dict]) -> List[TextNode]:
    return [TextNode(text=page["text"], metadata={"page": page["page"]}) for page in json_list]

text_nodes = get_text_nodes(json_list)

def get_image_nodes(json_objs: List[dict], download_path: str) -> List[ImageDocument]:
    image_dicts = parser.get_images(json_objs, download_path=download_path)
    return [ImageDocument(image_path=image_dict["path"]) for image_dict in image_dicts]

image_documents = get_image_nodes(json_objs, IMAGES_DOWNLOAD_PATH)

> Images for page 1: [{'name': 'img_p0_1.png', 'height': 1066, 'width': 1599, 'x': 168.139, 'y': 124.80138916800004, 'original_width': 1599, 'original_height': 1066, 'ocr': [{'x': 188, 'y': 3, 'w': 1270, 'h': 193, 'confidence': '0.2964225884886829', 'text': 'WIYOOREAQTSE'}, {'x': 391, 'y': 839, 'w': 858, 'h': 182, 'confidence': '0.9528591356940469', 'text': 'ITS MONDAY'}]}]
