### 0. Setup

Run the cells in this section, then restart the runtime before continuing! 🙏🙏

In [None]:
# Authenticate the current Colab user so the Google Cloud commands and SDK calls below can run.
from google.colab import auth as google_auth
google_auth.authenticate_user()

In [None]:
# Copy the private-preview Vertex AI SDK archive from Cloud Storage into the working directory.
!gsutil cp gs://vertex_sdk_llm_private_releases/SDK/google-cloud-aiplatform-1.23.0.llm.alpha.5.zip .

# Install the downloaded archive together with a shapely release below 2.0.0.
!pip install google-cloud-aiplatform-1.23.0.llm.alpha.5.zip "shapely<2.0.0"

### 1. Data Ingestion

In [None]:
! pip install PyPDF2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
import PyPDF2
from tqdm import tqdm
from google.colab import drive
from google.colab import files
import requests
# Mount Google Drive at /content/drive/ so files stored in Drive are reachable from this runtime.
drive.mount('/content/drive/')

Mounted at /content/drive/


PDF URLs Ingestion

1. Download the PDFs from their URLs into a Google Drive folder.
2. Copy them from that folder to the local Colab disk.

In [None]:
import os

def importPDFs(src_folder_path, pdf_urls):
    """
    Downloads every PDF URL into a folder, creating the folder when needed.

    A URL that cannot be downloaded (network error, non-200 status, or no file
    name in its path) is reported with a printed message and skipped, so one
    bad URL does not stop the remaining downloads.

    Args:
    - src_folder_path (str): the folder the downloaded files are written to
    - pdf_urls (list): the URLs of the PDF files to download

    Returns:
    None
    """
    from urllib.parse import urlparse

    # exist_ok makes the call safe to repeat and avoids a separate existence check
    os.makedirs(src_folder_path, exist_ok=True)

    # Loop through the PDF URLs and download them
    for url in tqdm(pdf_urls):
        # Take the file name from the URL path only, so a query string or
        # fragment ("file.pdf?version=2") does not end up in the file name
        file_name = os.path.basename(urlparse(url).path)
        if not file_name:
            print(f'Error downloading {url}: URL does not end with a file name')
            continue

        try:
            # A timeout keeps an unresponsive server from hanging the notebook
            response = requests.get(url, timeout=60)
        except requests.RequestException as error:
            print(f'Error downloading {url}: {error}')
            continue

        if response.status_code == 200:
            # Save the PDF file to the specified folder
            file_path = f"{src_folder_path}/{file_name}"
            with open(file_path, 'wb') as f:
                f.write(response.content)
            print(f'{file_path} saved successfully.')
        else:
            print(f'Error downloading {url}: {response.status_code}')


def copyFilesToLocalDrive(src_folder_path, dest_folder_path):
    """
    Copies files from a source folder in Google Drive to a local folder in the Colab environment.

    The destination is recreated from scratch on every call. Only regular,
    non-hidden files directly inside the source folder are copied, which is
    what the previous shell form `cp src/* dest` copied. The names of the
    copied files are printed at the end.

    Args:
    - src_folder_path (str): the path of the source folder in Google Drive
    - dest_folder_path (str): the path of the destination folder in the Colab environment

    Returns:
    None

    Raises:
    - ValueError: if both paths point at the same folder
    - FileNotFoundError: if the source folder does not exist
    """
    import shutil

    # Wiping the destination would delete the source if they are the same folder
    if os.path.realpath(src_folder_path) == os.path.realpath(dest_folder_path):
        raise ValueError("source and destination folders must be different")

    # Read the source first so a missing source fails before anything is deleted
    file_names = sorted(os.listdir(src_folder_path))

    # Remove the destination if it exists (folder or plain file)
    if os.path.isdir(dest_folder_path):
        shutil.rmtree(dest_folder_path)
    elif os.path.exists(dest_folder_path):
        os.remove(dest_folder_path)

    # Create the destination folder
    os.makedirs(dest_folder_path)

    # Copy the files from the source folder to the destination folder.
    # Paths are passed as Python strings, so spaces, quotes or `$` in a path
    # are never interpreted by a shell.
    for file_name in file_names:
        source_path = os.path.join(src_folder_path, file_name)
        if file_name.startswith('.') or not os.path.isfile(source_path):
            continue
        shutil.copy(source_path, dest_folder_path)

    # List the files in the destination folder
    for file_name in sorted(os.listdir(dest_folder_path)):
        print(file_name)

HTML URLs Ingestion

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

def splitTextIntoChunks(text, url, chunk_size=400):
    """
    Splits a text into chunks of at most `chunk_size` whitespace-separated words.

    Args:
    - text (str): The text to be split.
    - url (str): The source of the text; repeated once per chunk in the result.
    - chunk_size (int): The maximum number of words in each chunk.

    Returns:
    - chunks (list): The text chunks, each containing up to `chunk_size` words.
    - urls (list): `url` repeated once for every chunk.
    - pages (list): The 1-based position of every chunk within the text.

    Raises:
    - ValueError: if `chunk_size` is smaller than 1.
    """
    # A negative size would otherwise produce no chunks at all without any error
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # Split the text into words
    words = text.split()

    # Join consecutive runs of `chunk_size` words back into strings
    chunks = [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]
    urls = [url] * len(chunks)
    pages = list(range(1, len(chunks) + 1))

    return chunks, urls, pages


def processURL(url, chunk_size=400):
    """
    Downloads an HTML page and splits its visible text into word chunks.

    Args:
    - url (str): The address of the HTML page.
    - chunk_size (int): The number of words per chunk, forwarded to splitTextIntoChunks.

    Returns:
    - chunks, urls, pages: the three lists returned by splitTextIntoChunks for
      the page text and `url`.
    """
    # Open the URL and read the HTML content; the context manager closes the
    # connection once the body has been read
    with urlopen(url) as response:
        html = response.read()

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html, features="html.parser")

    # Remove script and style elements so their contents are not treated as text
    for element in soup(["script", "style"]):
        element.extract()

    # Extract the text content from the HTML
    text = soup.get_text()

    # Split the text into chunks and return them with their source and positions
    return splitTextIntoChunks(text, url, chunk_size=chunk_size)

def HTMLsPreProcessing(html_urls):
    """
    Runs processURL on every URL and merges the per-page results.

    Args:
    - html_urls (list): The addresses of the HTML pages to process.

    Returns:
    - Three flat lists (chunks, urls, pages) holding the results of all pages
      in the order the URLs were given.
    """
    merged_chunks, merged_urls, merged_pages = [], [], []
    for page_url in tqdm(html_urls):
        page_chunks, page_urls, page_numbers = processURL(page_url)
        merged_chunks.extend(page_chunks)
        merged_urls.extend(page_urls)
        merged_pages.extend(page_numbers)
    return merged_chunks, merged_urls, merged_pages

In [None]:
# Set the path of the folder in your Google Drive where you want to save the PDF files
src_folder_path = '/content/drive/MyDrive/Colab Notebooks/data/llm/Insurance'
# Local folder the PDF files are copied into
dest_folder_path = "corpus"

# URLs of the PDF files to download; with an empty list nothing is downloaded
# and only the files already in src_folder_path are copied
pdf_urls = [
    # ADD YOUR PDF URLS HERE.
    ]

# Download the PDFs into the Drive folder, then copy that folder's files locally
importPDFs(src_folder_path, pdf_urls)
copyFilesToLocalDrive(src_folder_path, dest_folder_path)

0it [00:00, ?it/s]


Sample-InsurancePolicy.pdf


In [None]:
# URLs of the HTML pages to index; with an empty list the three results are empty lists
html_urls = [
# ADD YOUR HTML URLS HERE.
]

# Text chunks of all pages, with the source URL and chunk number of every chunk
all_chunks, all_urls, all_pages = HTMLsPreProcessing(html_urls)

0it [00:00, ?it/s]


### 2. Data Pre-processing
- Splitting PDFs into text chunks (one chunk per page).

In [None]:
def splitPDFsToString(file_paths):
    """
    Extracts the text of every page of every PDF file.

    Args:
    - file_paths (list): The paths of the PDF files to read.

    Returns:
    - text_list (list): The extracted text, one entry per page.
    - title_list (list): The path of the file each page came from.
    - page_list (list): The 1-based page number of each entry within its file,
      matching the 1-based chunk numbers produced for HTML pages.
    """
    # Three parallel lists: entry i of each one describes the same page
    text_list = []
    title_list = []
    page_list = []

    # Loop through each file path in the list
    for file_path in tqdm(file_paths):
        # Open the PDF file in read-binary mode
        with open(file_path, 'rb') as pdf_file:
            # Create a PdfReader object to read the PDF file
            pdf_reader = PyPDF2.PdfReader(pdf_file)

            # Number pages from 1 so the value shown as "Page" is the page a
            # reader sees in a PDF viewer, not a zero-based index
            for page_num, page in enumerate(pdf_reader.pages, start=1):
                page_list.append(page_num)
                # Extract the text from the current page and append it to the list
                text_list.append(page.extract_text())
                title_list.append(file_path)

    return text_list, title_list, page_list


In [None]:
# Define the file paths for the PDF files
import os

# List the local "corpus" folder directly instead of parsing `ls` output:
# os.listdir returns the exact file names, so names containing quotes or
# surrounding whitespace are not altered.
corpus_dir = os.path.abspath("corpus")
file_paths_raw = sorted(os.listdir(corpus_dir))

# Build the full path of every PDF file; other files in the folder are skipped
# because they cannot be read as PDFs
file_paths = [
    f'{corpus_dir}/{file_path}'
    for file_path in file_paths_raw
    if file_path.lower().endswith('.pdf')
]

# Extract the text, source path and page number of every page of every PDF
text_list, title_list, page_list = splitPDFsToString(file_paths)

100%|██████████| 1/1 [00:01<00:00,  1.03s/it]


### 3. Models Import and Initialization

In [None]:
# Same authentication call as in the Setup section, repeated here
# (presumably because the runtime is restarted after Setup — TODO confirm).
from google.colab import auth as google_auth
google_auth.authenticate_user()

In [None]:
# Google Cloud project and location passed to aiplatform.init; replace the placeholder before running
PROJECT_ID = "<provide your project id>"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

In [None]:
from google.cloud import aiplatform
from google.cloud.aiplatform.private_preview.language_models import TextGenerationModel, ChatModel
#from google.cloud.aiplatform.private_preview.language_models import _TextEmbeddingModel as TextEmbeddingModel

# Point the SDK at the configured project/location and load the text generation
# model that answers the questions in the rest of the notebook
aiplatform.init(project=PROJECT_ID, location=LOCATION)
model = TextGenerationModel.from_pretrained("text-bison-001")
# The embedding model below is disabled; embeddings come from the TF Hub encoder loaded in section 4
#embedder = TextEmbeddingModel.from_pretrained("embedding-gecko-001")

### 4. Index Generation
- Mapping each text chunk to its corresponding embedding and file title.

Load the Universal Sentence Encoder's TF Hub module.

In [None]:
import tensorflow_hub as hub

# `embed` is the module loaded from module_url; later cells call it with a list of strings
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4" #@param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"]
embed = hub.load(module_url)
print("module %s loaded" % module_url)

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [None]:
import pandas as pd

def createIndexDF(text_list, page_list, title_list, all_chunks=None, all_pages=None, all_urls=None):
    """
    Builds the search index: one row per text chunk with its page number,
    title and embedding.

    PDF rows come first, followed by the optional HTML rows. PDF titles are
    the file paths with the "/content/corpus/" prefix and ".pdf" suffix
    removed; HTML titles are the URLs, kept unchanged.

    Args:
    - text_list (list): The text of each PDF page.
    - page_list (list): The page number of each PDF page.
    - title_list (list): The file path of each PDF page.
    - all_chunks (list, optional): The text chunks of the HTML pages.
    - all_pages (list, optional): The chunk number of each HTML chunk.
    - all_urls (list, optional): The source URL of each HTML chunk.

    Returns:
    - index_df (pandas.DataFrame): Columns 'text_chunk', 'page_num', 'title'
      and 'embedding', where 'embedding' holds the result of `embed([text_chunk])`.
    """
    corpus_prefix = "/content/corpus/"
    pdf_suffix = ".pdf"

    def pdfTitle(path):
        # str.lstrip/rstrip remove any run of the given *characters*, not the
        # given substring, so they would also eat leading/trailing letters of
        # the file name itself; remove the exact prefix and suffix instead
        if path.startswith(corpus_prefix):
            path = path[len(corpus_prefix):]
        if path.endswith(pdf_suffix):
            path = path[:-len(pdf_suffix)]
        return path

    # Concatenate the PDF entries with the optional HTML entries; new lists
    # are built so the caller's lists are never modified
    text_chunks = list(text_list)
    if all_chunks is not None:
        text_chunks += all_chunks

    page_nums = list(page_list)
    if all_pages is not None:
        page_nums += all_pages

    # Only PDF paths are shortened; URLs are kept as they are
    titles = [pdfTitle(title) for title in title_list]
    if all_urls is not None:
        titles += all_urls

    # Create a new dataframe with the combined lists
    index_df = pd.DataFrame({'text_chunk': text_chunks,
                             'page_num': page_nums,
                             'title': titles})

    # Embed one chunk per call so every cell holds the result for a single text
    index_df['embedding'] = [embed([chunk]) for chunk in tqdm(index_df.text_chunk)]

    return index_df

In [None]:
# Build the index from the PDF pages and the HTML chunks, then show a few rows as a sanity check
index_df = createIndexDF(text_list, page_list, title_list, all_chunks, all_pages, all_urls)
index_df[5:10]

100%|██████████| 64/64 [00:01<00:00, 59.65it/s]


Unnamed: 0,text_chunk,page_num,title,embedding
5,1 TO OUR POLICYHOLDER \n \nThis Automobile C...,5,Sample-InsurancePolicy,"((tf.Tensor(-0.027879076, shape=(), dtype=floa..."
6,2 Auto – means a motor vehicle with four or ...,6,Sample-InsurancePolicy,"((tf.Tensor(-0.027661495, shape=(), dtype=floa..."
7,3 1. missiles or falling objects; \n \n2. fir...,7,Sample-InsurancePolicy,"((tf.Tensor(-0.021275755, shape=(), dtype=floa..."
8,4 you or a household member or any other per...,8,Sample-InsurancePolicy,"((tf.Tensor(-0.03853473, shape=(), dtype=float..."
9,5 Trailer – means a vehicle designed to be p...,9,Sample-InsurancePolicy,"((tf.Tensor(-0.059401114, shape=(), dtype=floa..."


### 5. Testing answers with hardcoded contexts

In [None]:
# Hard-coded context for this section: the text of a consolidated balance sheet
# followed by an income statement, inserted verbatim into the prompt below.
CONTEXT = """
Given the following balance sheet
CONSOLIDATED BALANCE SHEETS
As at
December 31, December 31,
In million of U.S. dollars 2021 2020
Assets
Current assets:
Cash and cash equivalents 3,225 3,006
Short-term deposits 291 581
Marketable securities — 133
Trade accounts receivable, net 1,759 1,465
Inventories 1,972 1,841
Other current assets 581 584
Total current assets 7,828 7,610
Goodwill 313 330
Other intangible assets, net 438 445
Property, plant and equipment, net 5,660 4,596
Non-current deferred tax assets 652 739
Long-term investments 10 10
Other non-current assets 639 724
7,712 6,844
Total assets 15,540 14,454
Liabilities and equity
Current liabilities:
Short-term debt 143 795
Trade accounts payable 1,582 1,166
Other payables and accrued liabilities 1,101 966
Dividends payable to stockholders 55 42
Accrued income tax 68 84
Total current liabilities 2,949 3,053
Long-term debt 2,396 1,826
Post-employment benefit obligations 442 506
Long-term deferred tax liabilities 64 75
Other long-term liabilities 416 488
3,318 2,895
Total liabilities 6,267 5,948
Commitment and contingencies
Equity
Parent company stockholders' equity
Common stock (preferred stock: 540,000,000 shares authorized, not issued;
 common stock: Euro 1.04 par value, 1,200,000,000 shares authorized,
 911,276,920 shares issued, 906,518,057 shares outstanding) 1,157 1,157
Additional paid-in capital 2,533 3,062
Retained earnings 5,223 3,599
Accumulated other comprehensive income 496 723
Treasury stock (200) (93)
Total parent company stockholders' equity 9,209 8,448
Noncontrolling interest 64 58
Total equity 9,273 8,506
Total liabilities and equity 15,540 14,454
The accompanying notes are an integral part of these audited consolidated financial statements

and income statement
Twelve months ended
December 31, December 31, December 31,
In million of U.S. dollars except per share amounts 2021 2020 2019
Net sales 12,729 10,181 9,529
Other revenues 32 38 27
Net revenues 12,761 10,219 9,556
Cost of sales (7,435) (6,430) (5,860)
Gross profit 5,326 3,789 3,696
Selling, general and administrative (1,323) (1,109) (1,093)
Research and development (1,723) (1,548) (1,498)
Other income and expenses, net 141 202 103
Impairment, restructuring charges and other related closure
 costs (2) (11) (5)
Operating income 2,419 1,323 1,203
Interest income (expense), net (29) (20) 1
Other components of pension benefit costs (10) (12) (16)
Income (loss) on equity-method investments — 2 1
Loss on financial instruments, net (43) (26) —
Income before income taxes and noncontrolling interest 2,337 1,267 1,189
Income tax expense (331) (159) (156)
Net income 2,006 1,108 1,033
Net income attributable to noncontrolling interest (6) (2) (1)
Net income attributable to parent company stockholders 2,000 1,106 1,032
Earnings per share (Basic) attributable to parent company
 stockholders 2.21 1.24 1.15
Earnings per share (Diluted) attributable to parent
 company stockholders 2.16 1.20 1.14
"""

In [None]:
# Question asked about the hard-coded CONTEXT; it is inserted into the prompt in the next cell
QUESTION = "Calculate the return on net assets using the following equation  Net Income/(Fixed Assets + Working Capital) for 2021"  # @param {type:"string"}

In [None]:
# Prompt = hard-coded context first, then the question with an instruction to
# answer only from that context
PROMPT = f"""
Based on the following context:
{CONTEXT}.


Please answer the question below only using information from the context provided:
{QUESTION}.

"""

# The call is the last expression of the cell, so its result is shown as the cell output
model.predict(
    PROMPT,
    max_output_tokens=1000,
    temperature=0,
    top_p=1,
    top_k=40,
)

The return on net assets for 2021 is 12.6%.

The return on net assets is calculated by dividing net income by the sum of fixed assets and working capital. In this case, net income is 2,006 million, fixed assets are 7,712 million, and working capital is 7,828 million. Therefore, the return on net assets is 2,006 / (7,712 + 7,828) = 12.6%.

### 6. Retrieving the closest n contexts for a given question

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def mostSimilarContexts(QUESTION, INDEX, AMOUNT_OF_CONTEXTS = 3):
    """
    Finds the index rows whose embeddings score highest against the question.

    Args:
    - QUESTION (str): The question to search for.
    - INDEX (pandas.DataFrame): The index with the columns 'embedding',
      'text_chunk', 'title' and 'page_num'.
    - AMOUNT_OF_CONTEXTS (int): How many rows to return.

    Returns:
    - The text chunks of the selected rows joined with single spaces, best
      match first.
    - The titles of the selected rows, in the same order.
    - The page numbers of the selected rows, in the same order.
    """
    # Embed the question once and score it against every stored embedding
    question_vector = embed([QUESTION])
    scores = np.array([
        cosine_similarity(question_vector, context_vector)[0][0]
        for context_vector in INDEX["embedding"].tolist()
    ])

    # argsort is ascending, so reverse it and keep the first AMOUNT_OF_CONTEXTS positions
    best_positions = scores.argsort()[::-1][:AMOUNT_OF_CONTEXTS]
    best_rows = INDEX.iloc[best_positions]

    joined_text = " ".join(best_rows["text_chunk"].tolist())
    return joined_text, best_rows["title"].tolist(), best_rows["page_num"].tolist()

In [None]:
# Retrieve the best-matching contexts for a sample question and print their joined text
QUESTION = "When and where the policy applies?"
top_contexts, titles, pagenums = mostSimilarContexts(QUESTION, index_df)
print(top_contexts)

 522. Under PARTS II, III, IV and V, until 30 days 
after written proof of loss is filed and the 
amount of loss is determined. 
 
No one has the right to bring us into a suit to 
determine the liability of an insured.  
TERMINATION 
 
1. Cancellation by You 
 
 You may cancel by: 
 a. returning this policy to us; or 
 b. giving us advance notice of the date 
cancellation is to take effect. 
 2. Cancellation by Us 
 
 We may cancel by mailing to you at the last 
address known by us: 
 a. at least 10 days notice: 
  (1) if cancellation is for nonpayment of 
premium; or 
  (2) if notice is mailed during the first 60 
days this policy is in effect and this is 
not a renewal or continuation policy; 
 b. at least 30 days notice in all other cases. 
 
 After this policy is in effect for 60 days, or if this 
is a renewal or continuation policy, we will 
cancel only: 
 a. for nonpayment of premium; or 
 b. if your  driver’s license has been 
suspended or revoked. 
  This must have occurred:   

### 7. Putting it all together :)


In [None]:
# Question for the end-to-end run; the trailing comment keeps an alternative example question
QUESTION = "When and where the policy applies" #"What regions does Infineon operate in?"  # @param {type:"string"}

In [None]:
# Retrieve the closest contexts for QUESTION and keep their titles and pages as the sources
CONTEXT, TITLES, PAGES = mostSimilarContexts(QUESTION, index_df)
SOURCES = {"Titles": TITLES, "Pages": PAGES}

# Prompt = retrieved context first, then the question and the answering rules
PROMPT = f"""

Based only on the following information:
{CONTEXT}.

Answer the question below:
{QUESTION}.

Don't assume anything, If you are not sure then answer that you don't have enough information.
Answer what's important. Be concise.
"""

#Surround by this '\033[32m \033[0m' the most important parts of the text to highlight them.
#For example: \033[32mIMPORTANT PART OF THE ANSWER\033[0m

# NOTE(review): this endpoint path is hard-coded to a specific project number and
# endpoint id, independent of PROJECT_ID — presumably it only resolves for users
# with access to that project; confirm before sharing the notebook.
endpoint = aiplatform.Endpoint('projects/801452371447/locations/us-central1/endpoints/5781438847032229888')

# NOTE(review): tuned_nj_model is created here but the predict call below uses
# `model`, not the tuned model — confirm which one is intended.
tuned_model_name=endpoint.name
tuned_nj_model = TextGenerationModel.get_tuned_model(tuned_model_name)


# temperature=0 with the same sampling settings as the hard-coded-context example
ANSWER = model.predict(
          PROMPT,
          max_output_tokens=1000,
          temperature=0,
          top_p=1,
          top_k=40,
      )

In [None]:
# Print the question, the sources and the answer; the \033[...m sequences are
# ANSI codes for bold/coloured text and \033[0m resets the formatting
print("\033[1m\033[38;2;70;130;180mQuestion:\033[0m")
print(QUESTION)
print("")
print("\033[1m\033[38;2;70;130;180mSources:\033[0m")
# One line per retrieved context: its title and page number
for title, page in zip(SOURCES["Titles"], SOURCES["Pages"]):
    print(f"\033[1mTitle:\033[0m \033[32m{title}\033[0m, \033[1mPage:\033[0m \033[32m{page}\033[0m")
print("")
print("\033[1m\033[38;2;70;130;180mAnswer:\033[0m")
# Last expression of the cell, so the answer is shown as the cell output
ANSWER

[1m[38;2;70;130;180mQuestion:[0m
When and where the policy applies

[1m[38;2;70;130;180mSources:[0m
[1mTitle:[0m [32mSample-InsurancePolicy[0m, [1mPage:[0m [32m56[0m
[1mTitle:[0m [32mSample-InsurancePolicy[0m, [1mPage:[0m [32m60[0m
[1mTitle:[0m [32mSample-InsurancePolicy[0m, [1mPage:[0m [32m58[0m

[1m[38;2;70;130;180mAnswer:[0m


The policy applies during the policy period, which is the period of time that the policy is in effect. The policy period begins on the effective date and ends on the expiration date. The effective date is the date that the policy is issued, and the expiration date is the date that the policy expires. The policy applies to the insured auto, which is the auto that is listed on the declarations page of the policy. The policy also applies to the insured person, who is the person who is listed on the declarations page of the policy.

### 8. Intuitive UI for Demo.

In [None]:
! pip install gradio

In [None]:
import gradio as gr
import warnings

def qa_interface(question, temperature, top_p, max_output_tokens, top_k):
    """
    Answers a question twice from the most similar indexed contexts.

    Args:
    - question (str): The question typed by the user.
    - temperature (float): Sampling temperature passed to the model.
    - top_p (float): Nucleus sampling value passed to the model.
    - max_output_tokens (int or float): Maximum answer length; rounded to an
      integer of at least 1.
    - top_k (int or float): Top-k sampling value; rounded to an integer of at
      least 1.

    Returns:
    - ANSWER: The result of the first model call.
    - ANSWER_2: The result of a second model call with the same prompt and settings.
    - SOURCES_DF (pandas.DataFrame): Columns 'Titles' and 'Pages' of the
      contexts used in the prompt.
    """
    QUESTION = question
    CONTEXT, TITLES, PAGES = mostSimilarContexts(QUESTION, index_df)
    SOURCES = {"Titles": TITLES, "Pages": PAGES}

    PROMPT = f"""

    Based only on the following information:
    {CONTEXT}.


    Answer the question below:
    {QUESTION}.

    Don't assume anything, If you are not sure then answer that you don't have enough information.
    Answer what's important. Be concise.
    """

    # UI sliders may deliver fractional values or 0, while a token count and
    # top_k are whole numbers of at least 1
    generation_params = dict(
        max_output_tokens=max(1, int(round(max_output_tokens))),
        temperature=temperature,
        top_p=top_p,
        top_k=max(1, int(round(top_k))),
    )

    # Two independent calls with identical settings give the two answers shown side by side
    ANSWER, ANSWER_2 = [model.predict(PROMPT, **generation_params) for _ in range(2)]
    SOURCES_DF = pd.DataFrame(SOURCES)

    return ANSWER, ANSWER_2, SOURCES_DF

# Web UI for qa_interface: the inputs are listed in the order of its parameters
# (question, temperature, top_p, max_output_tokens, top_k) and the outputs in the
# order of its return values.
iface = gr.Interface(fn=qa_interface,
                     inputs=[
                         # gr.Textbox replaces the deprecated gr.inputs.Textbox
                         gr.Textbox(label="Question"),
                         gr.Slider(minimum=0, maximum=1, value=0.3, label="Temperature"),
                         gr.Slider(minimum=0, maximum=1, value=1, label="Top-p"),
                         # step=1 and minimum=1 keep these two settings positive whole numbers
                         gr.Slider(minimum=1, maximum=1024, value=700, step=1, label="Max output tokens"),
                         gr.Slider(minimum=1, maximum=40, value=40, step=1, label="Top-k"),
                         ],
                     outputs=[
                         gr.Textbox(label="Answer 1"),
                         gr.Textbox(label="Answer 2"),
                         gr.Dataframe(headers = ['Title', 'Page'], label="Sources")
                          ],
                     title="Intra-Knowledge Q&A",
                     description="Enter a question and get two possible answers from the PDF and HTML files.",
                     # the string form replaces the deprecated boolean value
                     allow_flagging="never",
                     theme=gr.themes.Soft()
                     )

# share=True additionally serves the app on a temporary public URL
iface.launch(share=True)

  super().__init__(
  super().__init__(


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://8fb9b1fb85e84c10e5.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces


