In [61]:
import getpass
import os
import pathlib

import lotus
from lotus.file_extractors import DirectoryReader
from lotus.models import LM, LiteLLMRM
from lotus.vector_store import FaissVS

## 1) set model configs

In [59]:
# Instantiate the three components handed to LOTUS: a language model, a
# retrieval (embedding) model, and a FAISS-backed vector store.
lm = LM("gpt-4o-mini", temperature=0.0)
rm = LiteLLMRM(model="text-embedding-3-small")
vs = FaissVS()

# Register them globally so the sem_* DataFrame operators below can use them.
lotus_components = {"lm": lm, "rm": rm, "vs": vs}
lotus.settings.configure(**lotus_components)

#### Provide your OpenAI API key here (never commit a real key to the notebook)

In [107]:
# Use the key already present in the environment; only if it is missing,
# prompt for it without echoing. This keeps the secret out of the notebook
# source and out of the saved cell output (`%set_env` prints the value back),
# and avoids overwriting a valid key with a placeholder on Run All.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

env: OPENAI_API_KEY=sk-


## 2) load data


In [100]:
# Load the files under data/ocr_pages into a DataFrame, one row per page.
# NOTE(review): despite the name, `pdf_path` points at a directory; the saved
# output shows .txt files in it. The name is kept in case later cells use it.
pdf_path = "data/ocr_pages"
reader = DirectoryReader().add(pdf_path)
df = reader.to_df(per_page=True)

In [101]:
# Display the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,content,file_path,file_name,file_type,file_size,creation_date,last_modified_date,page_label
0,So Oo NN DB WOW BR W NO\n\n10\n11\n12\n13\n14\...,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_1.txt,text/plain,1801,2025-05-12,2025-05-12,1
1,Copy of the Electronic Original® document mana...,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_10.txt,text/plain,4362,2025-05-12,2025-05-12,1
2,; Copy of the Electronic Original® document ma...,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_11.txt,text/plain,4615,2025-05-12,2025-05-12,1
3,Copy of the Electronic Original® document mana...,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_12.txt,text/plain,3628,2025-05-12,2025-05-12,1
4,; Copy of the Electronic Original® document ma...,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_13.txt,text/plain,3430,2025-05-12,2025-05-12,1


In [68]:
# Peek at a 200-character window (offsets 500-699) of the first row's text.
# print() is used on purpose so embedded newlines render as line breaks.
first_row_text = df.iloc[0].content
print(first_row_text[500:700])

LIFORNIA COUNTY OF
LOS ANGELES, NORTH VALLEY JUDICIAL DISTRICT

CHATSWORTH COURTHOUSE

VELOCITY INVESTMENTS LLC, CASENO. 2 30>HLOl18998

Plaintiff, COMPLAINT FOR MONEY
1. Account Stated

2. Open Book 


## 3) create and load our index

In [102]:
# Build the semantic index over `content` only when it is not already on disk,
# then attach it to `df`. This replaces the manual "un-comment once" step so the
# notebook survives Restart & Run All on a fresh checkout without re-indexing
# on every run.
# NOTE(review): existence of the directory is used as the "index is built"
# signal; delete data/content_index to force a rebuild (e.g. after the pages in
# data/ocr_pages change or a previous build was interrupted).
index_dir = "data/content_index"
if not pathlib.Path(index_dir).exists():
    df = df.sem_index("content", index_dir)
df = df.load_sem_index("content", index_dir)

## 4) perform simple queries

In [103]:
def qa_pipeline(df, query, K=3):
    """Answer `query` from the rows of `df` that best match it.

    Runs a semantic search over the "content" column, then asks for a single
    aggregated answer over the retrieved rows.

    Args:
        df: Object exposing `sem_search`; here, the indexed pages DataFrame.
        query: Natural-language question. It is interpolated verbatim into
            the aggregation instruction.
        K: Number of rows to retrieve as evidence (default 3).

    Returns:
        A 2-tuple `(answer, evidence)`: the `_output` value of the first
        aggregated row, and the retrieved evidence rows.
    """
    top_hits = df.sem_search("content", query, K=K)
    # `{content}` (escaped as `{{content}}` in the f-string) is left as a
    # literal placeholder for sem_agg; only `query` is substituted here.
    # NOTE(review): a query that itself contains braces may be read as a
    # column reference by sem_agg; confirm against the library's template rules.
    instruction = f"Answer the query based on the {{content}}.\nQuery: {query}"
    aggregated = top_hits.sem_agg(instruction)
    first_row = aggregated.iloc[0]
    return first_row._output, top_hits


### test query 1

In [106]:
# Query 1: ask about the plaintiff's role, print the answer, then display the
# retrieved evidence rows so the answer can be checked against its sources.
query = "Is the plaintiff a debt buyer?"
qa_result = qa_pipeline(df, query)
ans, evidence_df = qa_result
print(ans)
evidence_df.head()

100%|██████████| 1/1 [00:00<00:00,  3.48it/s]
Aggregating: 100%|██████████ 1/1 LM calls [00:02<00:00,  2.02s/it]

The plaintiff is not a debt buyer; rather, they are the original creditor or a representative of the original creditor. The documents indicate that the charge-off creditor was MPLI Capital Holdings, issued by FinWise Bank and serviced by Upstart Network, Inc. The debt was later sold to Velocity Investments, LLC, which is identified as the entity that purchased or was assigned the debt after charge-off. Therefore, the plaintiff is pursuing the case based on the original debt rather than as a debt buyer.





Unnamed: 0,content,file_path,file_name,file_type,file_size,creation_date,last_modified_date,page_label
7,DocuSign Envelope ID: AFSEBOAD-E0F2-462E-8EC7-...,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_16.txt,text/plain,1767,2025-05-12,2025-05-12,1
9,So Oo NN DB WOW BR W NO\n\n10\n11\n12\n13\n14\...,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_2.txt,text/plain,2265,2025-05-12,2025-05-12,1
13,Copy of the Electronic Original® document mana...,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_6.txt,text/plain,2185,2025-05-12,2025-05-12,1


### test query 2

In [92]:
# Query 2: ask who the defendant is, print the answer, then display the
# retrieved evidence rows for inspection.
query = "Who is the defendant?"
qa_result = qa_pipeline(df, query)
ans, evidence_df = qa_result
print(ans)
evidence_df.head()

100%|██████████| 1/1 [00:01<00:00,  1.33s/it]
Aggregating: 100%|██████████ 1/1 LM calls [00:01<00:00,  1.26s/it]

The defendant in the complaint is identified as "Defendant(s)," specifically named as Paul Fernandez, who resides at 216 W Markland Dr, Monterey Park, CA 91754.





Unnamed: 0,content,file_path,file_name,file_type,file_size,creation_date,last_modified_date,page_label,len
7,DocuSign Envelope ID: AFSEBOAD-E0F2-462E-8EC7-...,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_16.txt,text/plain,1767,2025-05-12,2025-05-12,1,1739
9,So Oo NN DB WOW BR W NO\n\n10\n11\n12\n13\n14\...,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_2.txt,text/plain,2265,2025-05-12,2025-05-12,1,2260
13,Copy of the Electronic Original® document mana...,/lfs/guestrin-hgx-1/0/lianapat/law-app/sample_...,page_6.txt,text/plain,2185,2025-05-12,2025-05-12,1,2176
