In [1]:
!pip3 install thirdai --upgrade

In [1]:
import os

from thirdai import neural_db as ndb, licensing
import pandas as pd
import fitz 
from langchain.text_splitter import CharacterTextSplitter

# The ThirdAI license key is a credential and must not be committed with the
# notebook. It is read from the THIRDAI_KEY environment variable instead; any
# key that was previously hardcoded here should be considered leaked and rotated.
thirdai_key = os.environ.get("THIRDAI_KEY")
if not thirdai_key:
    raise RuntimeError(
        "Set the THIRDAI_KEY environment variable to your ThirdAI license key "
        "before running this notebook."
    )
licensing.activate(thirdai_key)

[nltk_data] Downloading package punkt to /home/pratyush/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Process PDF files into CSV

In [2]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        The ``get_text()`` output of each page, concatenated in page order
        with no separator added between pages.

    Raises:
        Whatever ``fitz.open`` or ``page.get_text`` raise; the document is
        closed before the exception propagates.
    """
    doc = fitz.open(pdf_path)
    try:
        # Build the result with a single join instead of repeated `+=`.
        return "".join(page.get_text() for page in doc)
    finally:
        # Always release the file handle, even if a page fails to extract.
        doc.close()

def save_chunks_to_csv(chunks, csv_path):
    """Write text chunks to a CSV file with a single ``Text`` column.

    Args:
        chunks: The chunk values, one CSV row each.
        csv_path: Destination path of the CSV file.

    The DataFrame index is not written to the file.
    """
    pd.DataFrame(data=chunks, columns=['Text']).to_csv(csv_path, index=False)

In [3]:
import os  # imported here so this cell also runs on a fresh kernel

# specify the pdf paths here
pdf_paths = []

# Chunking parameters handed to CharacterTextSplitter for every PDF.
chunk_size = 1000
chunk_overlap = 100


def csv_path_for_pdf(pdf_path):
    """Return ``pdf_path`` with its final extension replaced by ``.csv``.

    Only the last extension is replaced, so dots elsewhere in the path
    (``./data/x.pdf``, ``report.v2.pdf``) are preserved. Splitting on the
    first dot would truncate such paths.
    """
    return os.path.splitext(pdf_path)[0] + ".csv"


csv_files = []
for pdf_path in pdf_paths:
    csv_out_path = csv_path_for_pdf(pdf_path)
    csv_files.append(csv_out_path)
    text = extract_text_from_pdf(pdf_path)
    splitter = CharacterTextSplitter(chunk_size=chunk_size,chunk_overlap=chunk_overlap,separator='\n')

    # Keep only the text of each produced document; one chunk per CSV row.
    chunks = [document.page_content for document in splitter.create_documents([text])]
    print(len(chunks))
    save_chunks_to_csv(chunks, csv_out_path)
print(csv_files)

[]


### Train NeuralDB over generated files

In [4]:
# Preprocessed CSV files, one per filing. This assignment replaces whatever
# `csv_files` held before this cell ran.
csv_files = [
    'data/pfizer-20221231.csv',
    'data/tsla-20231231.csv',
    'data/msft-10-Q.csv',
    'data/walmart-10k.csv',
    'data/samsung-2022-10k.csv',
    'data/apple-10k.csv',
    'data/nvda-10k.csv',
    'data/meta-10k.csv',
]

# Wrap each file as an ndb.CSV document: the 'Text' column is used both as the
# strong column and as the reference column; no weak columns are configured.
csv_docs = [
    ndb.CSV(
        path=path,
        strong_columns=['Text'],
        weak_columns=[],
        reference_columns=['Text'],
    )
    for path in csv_files
]

In [5]:
# To load a previously saved model instead of training, uncomment the next
# line (and skip the two statements below it):
# db = ndb.NeuralDB.from_checkpoint("lti_finetuned.ndb")

# Train a model from scratch: create an empty NeuralDB and insert every
# ndb.CSV document built from `csv_files`.
db = ndb.NeuralDB()
# Left as the cell's last expression so the notebook displays the return value
# of insert() (presumably one identifier per inserted document — TODO confirm).
db.insert(csv_docs)

loading data | source 'Documents:
pfizer-20221231.csv
tsla-20231231.csv
msft-10-Q.csv
walmart-10k.csv
samsung-2022-10k.csv
apple-10k.csv
nvda-10k.csv
meta-10k.csv'
loading data | source 'Documents:
pfizer-20221231.csv
tsla-20231231.csv
msft-10-Q.csv
walmart-10k.csv
samsung-2022-10k.csv
apple-10k.csv
nvda-10k.csv
meta-10k.csv' | vectors 3247 | batches 2 | time 0.402s | complete

train | epoch 0 | train_steps 2 | train_hash_precision@5=0.0200185  | train_batches 2 | time 3.322s

loading data | source 'Documents:
pfizer-20221231.csv
tsla-20231231.csv
msft-10-Q.csv
walmart-10k.csv
samsung-2022-10k.csv
apple-10k.csv
nvda-10k.csv
meta-10k.csv'
loading data | source 'Documents:
pfizer-20221231.csv
tsla-20231231.csv
msft-10-Q.csv
walmart-10k.csv
samsung-2022-10k.csv
apple-10k.csv
nvda-10k.csv
meta-10k.csv' | vectors 3247 | batches 2 | time 0.411s | complete

train | epoch 1 | train_steps 4 | train_hash_precision@5=0.10077  | train_batches 2 | time 1.520s

loading data | source 'Documents:
pfiz

['102dd503aa3823625cefadee032e0b84f201ea38',
 '972458763dbd75cc3d558af20cb9f20d40ab0c27',
 'c724146e94979882520c2bab7883b79ac33eb8c6',
 'f230bbad3b3bb3a4beb4d927f6918821ac010acc',
 'eafd6303597c7d1be372fde50f1548193aced0ac',
 'ec8d20c1a07e85d751ad66160b6b222aff503b8e',
 '541c360909f325a06e117a1da2eb9d3101f0def8',
 '1eb7c0896cd4ac551ef17c50597742b3220e50a0']

### Fine-tune the model on (question, paragraph) pairs

In [None]:
import tqdm

# Labelled questions; the loop below reads the `source`, `question` and
# `para_id` columns of this file.
question_df = pd.read_csv("questions_large.csv")

for csv_file in csv_files:
    chunk_df = pd.read_csv(csv_file)
    print(csv_file)
    para = chunk_df['Text']
    # Only the questions whose `source` equals this CSV path.
    temp_df = question_df[question_df['source'] == csv_file]
    labelled_pairs = zip(temp_df['question'], temp_df['para_id'])
    for question, para_id in tqdm.tqdm(labelled_pairs, total=len(temp_df)):
        # Associate the question with the paragraph text that `para_id` points at.
        db.associate(question, para[int(para_id)])

### Inference 

In [7]:
import os

from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from paperqa.chains import make_chain
from paperqa.prompts import qa_prompt

# The OpenAI API key is a secret and must not be stored in the notebook.
# Export OPENAI_API_KEY in the environment that launches Jupyter; any key that
# was previously hardcoded here should be considered leaked and revoked.
if not os.environ.get('OPENAI_API_KEY'):
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running the "
        "inference cells."
    )

# Chat model used by get_answer / get_answer_manual_prompt.
llm = ChatOpenAI(
    model_name='gpt-3.5-turbo', 
    temperature=0.1,
)

In [8]:
def get_references(query, radius=None, top_k=3):
    """Search the notebook-level ``db`` and return the matching passages.

    Args:
        query: Search text passed to ``db.search``.
        radius: When truthy, each reference is ``result.context(radius=radius)``;
            otherwise (``None`` or ``0``) it is ``result.text``.
        top_k: Number of results requested from ``db.search``. Defaults to 3,
            the value that was previously hardcoded.

    Returns:
        A list with one reference per search result, in the order returned by
        ``db.search``.
    """
    search_results = db.search(query, top_k=top_k)
    if radius:
        return [result.context(radius=radius) for result in search_results]
    return [result.text for result in search_results]

def get_answer(query, references):
    """Answer ``query`` with the default ``qa_prompt`` chain.

    Args:
        query: The question, passed to the chain as ``question``.
        references: Sequence of reference strings; at most the first five are
            used, joined with a blank line, as ``context``.

    Returns:
        The value returned by the chain's ``run`` call.
    """
    # Build the chain from the default qa_prompt and the notebook-level llm.
    chain = make_chain(prompt=qa_prompt, llm=llm)
    joined_context = '\n\n'.join(references[:5])
    return chain.run(
        question=query,
        context=joined_context,
        answer_length="abt 50 words",
    )

def get_answer_manual_prompt(query, references, prompt):
    """Answer ``query`` with a caller-supplied prompt.

    Args:
        query: The question, passed to the chain as ``question``.
        references: Sequence of reference strings; at most the first five are
            used, joined with a blank line, as ``context``.
        prompt: The manual prompt handed to ``make_chain``. Only ``question``
            and ``context`` are supplied when the chain is run, so any other
            input variable the prompt declares would also need to be passed
            to ``run`` here.

    Returns:
        The value returned by the chain's ``run`` call.
    """
    chain = make_chain(prompt=prompt, llm=llm)
    run_inputs = {
        "question": query,
        "context": '\n\n'.join(references[:5]),
    }
    return chain.run(**run_inputs)


In [10]:
# Example question answered with the default qa_prompt.
query = "What is the revenue of apple in year 2022"

# A truthy radius makes get_references return result.context(radius=1) for
# each hit instead of result.text.
references = get_references(query, radius=1)
print(references)
# get_answer joins at most the first five references into the prompt context.
answer = get_answer(query, references)

print(answer)

['Text: ® ® Apple Inc. | 2023 Form 10-K | 34 Net sales disaggregated by significant products and services for 2023, 2022 and 2021 were as follows (in millions): 2023 2022 2021 iPhone  $ 200,583  $ 205,489  $ 191,973  Mac  29,357  40,177  35,190  iPad  28,300  29,292  31,862  Wearables, Home and Accessories  39,845  41,241  38,367  Services  85,200  78,129  68,425  Total net sales $ 383,285  $ 394,328  $ 365,817  (1) Products net sales include amortization of the deferred value of unspecified software upgrade rights, which are bundled in the sales price of the respective product. (2) Services net sales include amortization of the deferred value of services bundled in the sales price of certain products. Total net sales include $8.2 billion of revenue recognized in 2023 that was included in deferred revenue as of September 24, 2022, $7.5 billion of revenue Text: recognized in 2022 that was included in deferred revenue as of September 25, 2021, and $6.7 billion of revenue recognized in 20

The revenue of Apple in the year 2022 was $394,328 million (Apple Inc. | 2023 Form 10-K | 34).


In [16]:
query = "What is the revenue of apple in year 2022"

# No radius given, so each reference is the plain result.text of a hit.
references = get_references(query)
print(references)

# design your own prompt here
# The template must use exactly the two variables {question} and {context}:
# those are the only values get_answer_manual_prompt passes to the chain.
# Each part sits on its own line so the question text does not run straight
# into the "Context:" label.
manual_prompt = PromptTemplate.from_template(
    "Answer the question based on the following context\n"
    "Question: {question}\n"
    "Context: {context}"
)
print(get_answer_manual_prompt(query, references, manual_prompt))

['Text: $\n162,560 \n$\n169,658 \n$\n153,306 \nOperating income\n$\n60,508 \n$\n62,683 \n$\n53,382 \nEurope:\nNet sales\n$\n94,294 \n$\n95,118 \n$\n89,307 \nOperating income\n$\n36,098 \n$\n35,233 \n$\n32,505 \nGreater China:\nNet sales\n$\n72,559 \n$\n74,200 \n$\n68,366 \nOperating income\n$\n30,328 \n$\n31,153 \n$\n28,504 \nJapan:\nNet sales\n$\n24,257 \n$\n25,977 \n$\n28,482 \nOperating income\n$\n11,888 \n$\n12,257 \n$\n12,798 \nRest of Asia Pacific:\nNet sales\n$\n29,615 \n$\n29,375 \n$\n26,356 \nOperating income\n$\n12,066 \n$\n11,569 \n$\n9,817 \nA reconciliation of the Company’s segment operating income to the Consolidated Statements of Operations for 2023, 2022 and 2021 is as follows (in millions):\n2023\n2022\n2021\nSegment operating income\n$\n150,888 \n$\n152,895 \n$\n137,006 \nResearch and development expense\n(29,915)\n(26,251)\n(21,914)\nOther corporate expenses, net \n(6,672)\n(7,207)\n(6,143)\nTotal operating income\n$\n114,301 \n$\n119,437 \n$\n108,949 \n(1)', 'Text: 