In [1]:
%load_ext autoreload
%autoreload 2

import pathlib
import pandas as pd
import lotus
from lotus.file_extractors import DirectoryReader
from lotus.models import LM, LiteLLMRM
from lotus.vector_store import FaissVS
from utils import *
import os

  from .autonotebook import tqdm as notebook_tqdm


## 1) set model configs

In [2]:
# Language model handle: "gpt-4o-mini" with temperature=0.0.
lm = LM("gpt-4o-mini", temperature=0.0)
# Embedding model handle ("text-embedding-3-small"); passed to lotus as `rm`.
rm = LiteLLMRM(model="text-embedding-3-small")
# Vector store from lotus.vector_store (FAISS-based, judging by the class name).
vs = FaissVS()
# Register the three components on lotus' global settings; presumably the
# sem_* DataFrame calls further down pick them up from there.
lotus.settings.configure(lm=lm, rm=rm, vs=vs)

#### Set the OpenAI API key (never commit a real key to the notebook)

In [32]:
import getpass

# Do not paste the key into the notebook: `%set_env` echoes the value into the
# saved cell output. Use the key already exported in the environment and only
# prompt for it (input hidden, nothing stored in the .ipynb) when it is missing.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OPENAI_API_KEY: ")


## 2) load data


In [4]:
# Load the OCR'd page files from data/ocr_pages.
# NOTE(review): despite the name `pdf_path`, the rows previewed below are
# text/plain files (page_N.txt), not PDFs.
pdf_path = "data/ocr_pages"
# per_page=True is forwarded to DirectoryReader.to_df — presumably one row per page; confirm.
in_df = DirectoryReader().add(pdf_path).to_df(per_page=True)

In [5]:
# `chunk_content` is not imported by name, so it presumably comes from `from utils import *`.
# NOTE(review): the unit of chunk_size=500 (characters vs. tokens) is not visible here — confirm.
df = chunk_content(in_df, chunk_size=500)


In [6]:
# Preview the first rows of the chunked frame.
df.head()

Unnamed: 0,content,chunk_id,file_path,file_name,file_type,file_size,creation_date,last_modified_date,page_label
0,So Oo NN DB WOW BR W NO\n\n10\n11\n12\n13\n14\...,0,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_1.txt,text/plain,1801,2025-05-12,2025-05-12,1
1,"LIFORNIA COUNTY OF\nLOS ANGELES, NORTH VALLEY ...",1,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_1.txt,text/plain,1801,2025-05-12,2025-05-12,1
2,ized and existing under and by virtue of the s...,2,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_1.txt,text/plain,1801,2025-05-12,2025-05-12,1
3,"s. Plaintiff is\ninformed and believes, and th...",3,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_1.txt,text/plain,1801,2025-05-12,2025-05-12,1
4,Copy of the Electronic Original® document mana...,0,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_10.txt,text/plain,4362,2025-05-12,2025-05-12,1


## 3) create and load our index

In [22]:
# Build the semantic index over `content` only when it is not on disk yet, then
# attach it. This replaces the manual "uncomment once" step, so the notebook
# survives Restart Kernel -> Run All on a fresh checkout.
# Delete the index directory to force a rebuild after the data changes.
INDEX_DIR = "data/content_index"
if not os.path.exists(INDEX_DIR):
    # Presumably the expensive path (embeds the chunks).
    # NOTE(review): assumes sem_index persists the index at INDEX_DIR — confirm.
    df = df.sem_index("content", INDEX_DIR)
df = df.load_sem_index("content", INDEX_DIR)

## 4) perform simple queries

In [29]:
def qa_pipeline(in_df, query, K=7):
    """Answer `query` using the rows of `in_df` retrieved for it.

    Calls `sem_search` on the "content" column keeping `K` results, then
    `sem_agg` on those results with an instruction that embeds the query.

    Args:
        in_df: frame exposing `sem_search` (the indexed chunk frame in this notebook).
        query: natural-language question; also used as the search text.
        K: number of results requested from `sem_search`.

    Returns:
        A 2-tuple: the `_output` value of the first aggregation row, and the
        retrieved evidence frame.
    """
    retrieved = in_df.sem_search("content", query, K=K)
    # The doubled braces keep a literal {content} placeholder in the instruction.
    instruction = f"Answer the query based on the {{content}}.\nQuery: {query}"
    aggregated = retrieved.sem_agg(instruction)
    first_row = aggregated.iloc[0]
    return first_row["_output"], retrieved


### simple search test

In [26]:
# Exercise retrieval on its own: top-5 sem_search results on `content` for "plaintiff".
df.sem_search("content", "plaintiff", K=5)

100%|██████████| 1/1 [00:00<00:00,  7.51it/s]


Unnamed: 0,content,chunk_id,file_path,file_name,file_type,file_size,creation_date,last_modified_date,page_label
3,"s. Plaintiff is\ninformed and believes, and th...",3,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_1.txt,text/plain,1801,2025-05-12,2025-05-12,1
2,ized and existing under and by virtue of the s...,2,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_1.txt,text/plain,1801,2025-05-12,2025-05-12,1
49,So Oo NN DB WOW BR W NO\n\n10\n11\n12\n13\n14\...,0,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_2.txt,text/plain,2265,2025-05-12,2025-05-12,1
50,"il Code, §2984.4, nor Civil Code, $1812.10.\n\...",1,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_2.txt,text/plain,2265,2025-05-12,2025-05-12,1
1,"LIFORNIA COUNTY OF\nLOS ANGELES, NORTH VALLEY ...",1,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_1.txt,text/plain,1801,2025-05-12,2025-05-12,1


### test query 1

In [30]:
# Yes/no question answered through qa_pipeline (default K=7 retrieved chunks).
query = "Is the plaintiff a debt buyer?"
ans, evidence_df = qa_pipeline(df, query)
print(ans)
# Show the first retrieved chunks so the answer can be checked against the source text.
evidence_df.head()

100%|██████████| 1/1 [00:00<00:00,  5.74it/s]
Aggregating: 100%|██████████ 1/1 LM calls [00:00<00:00,  1.27it/s]

Yes, the plaintiff is a debt buyer and the sole owner of the account in question.





Unnamed: 0,content,chunk_id,file_path,file_name,file_type,file_size,creation_date,last_modified_date,page_label
50,"il Code, §2984.4, nor Civil Code, $1812.10.\n\...",1,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_2.txt,text/plain,2265,2025-05-12,2025-05-12,1
2,ized and existing under and by virtue of the s...,2,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_1.txt,text/plain,1801,2025-05-12,2025-05-12,1
52,MONTEREY PARK CA 91754.\n\n9. The names and ad...,3,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_2.txt,text/plain,2265,2025-05-12,2025-05-12,1
54,So Oo NN DB WOW BR W NO\n\n10\n11\n12\n13\n14\...,0,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_3.txt,text/plain,1434,2025-05-12,2025-05-12,1
56,nce incorporates the same herein\nas though fu...,2,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_3.txt,text/plain,1434,2025-05-12,2025-05-12,1


### test query 2

In [31]:
# Open-ended question answered through qa_pipeline (default K=7 retrieved chunks).
# Rebinds `query`, `ans` and `evidence_df` from the previous test cell.
query = "Who is the defendant?"
ans, evidence_df = qa_pipeline(df, query)
print(ans)
# Show the first retrieved chunks so the answer can be checked against the source text.
evidence_df.head()

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00,  2.51it/s]
Aggregating: 100%|██████████ 1/1 LM calls [00:00<00:00,  1.20it/s]

The defendant in this case is Paul Fernandez, along with unnamed defendants referred to as DOES 1 through 15.





Unnamed: 0,content,chunk_id,file_path,file_name,file_type,file_size,creation_date,last_modified_date,page_label
50,"il Code, §2984.4, nor Civil Code, $1812.10.\n\...",1,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_2.txt,text/plain,2265,2025-05-12,2025-05-12,1
49,So Oo NN DB WOW BR W NO\n\n10\n11\n12\n13\n14\...,0,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_2.txt,text/plain,2265,2025-05-12,2025-05-12,1
3,"s. Plaintiff is\ninformed and believes, and th...",3,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_1.txt,text/plain,1801,2025-05-12,2025-05-12,1
2,ized and existing under and by virtue of the s...,2,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_1.txt,text/plain,1801,2025-05-12,2025-05-12,1
57,So OO NN DB Wn BR W NO\n\n10\n11\n12\n13\n14\n...,0,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_4.txt,text/plain,831,2025-05-12,2025-05-12,1
