In [58]:
%load_ext autoreload
%autoreload 2

import pathlib
import pandas as pd
import lotus
from lotus.file_extractors import DirectoryReader
from lotus.models import LM, LiteLLMRM
from lotus.vector_store import FaissVS
from utils import *
import os

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1) set model configs

In [59]:
# Instantiate the vector store, the embedding model and the language model,
# then hand all three to lotus.settings.configure.
vs = FaissVS()
rm = LiteLLMRM(model="text-embedding-3-small")
lm = LM("gpt-4o-mini", temperature=0.0)  # temperature pinned to 0.0
lotus.settings.configure(lm=lm, rm=rm, vs=vs)

#### Add your OpenAI API key here (never commit a real key)

In [70]:
# Take the key from the environment when it is already set; otherwise prompt
# for it without echoing. Unlike `%set_env`, this keeps the key out of both the
# cell source and the saved cell output.
import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OPENAI_API_KEY: ")

env: OPENAI_API_KEY=sk-


## 2) load data


In [4]:
# Load the OCR'd page files found under data/ocr_pages into a DataFrame
# (to_df is called with per_page=True).
pdf_path = "data/ocr_pages"
in_df = (
    DirectoryReader()
    .add(pdf_path)
    .to_df(per_page=True)
)

In [61]:
# Split the loaded page text into chunks with the `chunk_content` helper.
# NOTE(review): `chunk_content` presumably arrives via `from utils import *`, and
# the unit of chunk_size (characters vs. tokens) is not visible here — confirm.
df = chunk_content(in_df, chunk_size=1000)


In [62]:
# Preview the first rows of the chunked frame.
df.head()

Unnamed: 0,content,chunk_id,file_path,file_name,file_type,file_size,creation_date,last_modified_date,page_label
0,So Oo NN DB WOW BR W NO\n\n10\n11\n12\n13\n14\...,0,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_1.txt,text/plain,1801,2025-05-12,2025-05-12,1
1,ized and existing under and by virtue of the s...,1,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_1.txt,text/plain,1801,2025-05-12,2025-05-12,1
2,Copy of the Electronic Original® document mana...,0,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_10.txt,text/plain,4362,2025-05-12,2025-05-12,1
3,Governing Law. I understand and agree that Fi...,1,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_10.txt,text/plain,4362,2025-05-12,2025-05-12,1
4,the extension of credit set forth in\nthe Not...,2,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_10.txt,text/plain,4362,2025-05-12,2025-05-12,1


## 3) create and load our index

In [63]:
# Build the semantic index over "content" on the first run; on later runs load
# the copy already saved under data/content_index instead of rebuilding it, so
# the notebook survives Restart & Run All without manual edits.
# Delete that directory to force a rebuild after the data or chunking changes.
index_dir = "data/content_index"
if pathlib.Path(index_dir).exists():
    df = df.load_sem_index("content", index_dir)
else:
    df = df.sem_index("content", index_dir)

100%|██████████| 1/1 [00:00<00:00,  2.75it/s]


## 4) perform simple queries

In [64]:
def qa_pipeline(in_df, query, K=7):
    """Answer `query` using the top-K semantic search hits from `in_df`.

    Runs `sem_search` on the "content" column with `query`, then calls
    `sem_agg` on the hits with an instruction that embeds the query.

    Args:
        in_df: frame exposing `sem_search` (its hits must expose `sem_agg`).
        query: question text; interpolated into the aggregation instruction.
        K: number of search hits requested from `sem_search`.

    Returns:
        A tuple `(answer, evidence_df)`: the `_output` field of the first
        aggregated row, and the search hits that were aggregated.
    """
    hits = in_df.sem_search("content", query, K=K)
    # `{{content}}` in the f-string yields a literal `{content}` placeholder.
    instruction = f"Given the {{content}} of snippets from a legal document, answer the query.\nQuery: {query}"
    aggregated = hits.sem_agg(instruction)
    first_row = aggregated.iloc[0]
    return first_row._output, hits


### simple search test

In [65]:
# Sanity check: semantic search over "content" for "plaintiff", requesting K=5 hits.
df.sem_search("content", "plaintiff", K=5)

100%|██████████| 1/1 [00:00<00:00,  6.13it/s]


Unnamed: 0,content,chunk_id,file_path,file_name,file_type,file_size,creation_date,last_modified_date,page_label
1,ized and existing under and by virtue of the s...,1,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_1.txt,text/plain,1801,2025-05-12,2025-05-12,1
26,So Oo NN DB WOW BR W NO\n\n10\n11\n12\n13\n14\...,0,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_2.txt,text/plain,2265,2025-05-12,2025-05-12,1
31,So OO NN DB Wn BR W NO\n\n10\n11\n12\n13\n14\n...,0,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_4.txt,text/plain,831,2025-05-12,2025-05-12,1
0,So Oo NN DB WOW BR W NO\n\n10\n11\n12\n13\n14\...,0,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_1.txt,text/plain,1801,2025-05-12,2025-05-12,1
28,"make payments, causing\ndamages set forth here...",2,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_2.txt,text/plain,2265,2025-05-12,2025-05-12,1


### test query 1

In [69]:
# Run the QA pipeline on a yes/no question, print the generated answer,
# then display the evidence rows it was aggregated from.
query = "Is the plaintiff a debt buyer?"
ans, evidence_df = qa_pipeline(df, query)
print(ans)
evidence_df.head()

100%|██████████| 1/1 [00:00<00:00,  7.81it/s]
Aggregating: 100%|██████████ 1/1 LM calls [00:02<00:00,  2.43s/it]

Yes, the plaintiff, Velocity Investments LLC, is a debt buyer and the sole owner of the account in question.





Unnamed: 0,content,chunk_id,file_path,file_name,file_type,file_size,creation_date,last_modified_date,page_label
26,So Oo NN DB WOW BR W NO\n\n10\n11\n12\n13\n14\...,0,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_2.txt,text/plain,2265,2025-05-12,2025-05-12,1
1,ized and existing under and by virtue of the s...,1,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_1.txt,text/plain,1801,2025-05-12,2025-05-12,1
31,So OO NN DB Wn BR W NO\n\n10\n11\n12\n13\n14\n...,0,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_4.txt,text/plain,831,2025-05-12,2025-05-12,1
27,SUED BY FINWISE BANK\nSERVICED BY UPSTART NETW...,1,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_2.txt,text/plain,2265,2025-05-12,2025-05-12,1
0,So Oo NN DB WOW BR W NO\n\n10\n11\n12\n13\n14\...,0,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_1.txt,text/plain,1801,2025-05-12,2025-05-12,1


### test query 2

In [68]:
# Run the QA pipeline on an entity question, print the generated answer,
# then display the evidence rows it was aggregated from.
query = "Who is the defendant?"
ans, evidence_df = qa_pipeline(df, query)
print(ans)
evidence_df.head()

100%|██████████| 1/1 [00:00<00:00,  3.93it/s]
Aggregating: 100%|██████████ 1/1 LM calls [00:01<00:00,  1.30s/it]

The defendant in this case is Paul Fernandez.





Unnamed: 0,content,chunk_id,file_path,file_name,file_type,file_size,creation_date,last_modified_date,page_label
26,So Oo NN DB WOW BR W NO\n\n10\n11\n12\n13\n14\...,0,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_2.txt,text/plain,2265,2025-05-12,2025-05-12,1
1,ized and existing under and by virtue of the s...,1,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_1.txt,text/plain,1801,2025-05-12,2025-05-12,1
0,So Oo NN DB WOW BR W NO\n\n10\n11\n12\n13\n14\...,0,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_1.txt,text/plain,1801,2025-05-12,2025-05-12,1
31,So OO NN DB Wn BR W NO\n\n10\n11\n12\n13\n14\n...,0,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_4.txt,text/plain,831,2025-05-12,2025-05-12,1
29,So Oo NN DB WOW BR W NO\n\n10\n11\n12\n13\n14\...,0,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_3.txt,text/plain,1434,2025-05-12,2025-05-12,1
