In [None]:
# Import python packages
from PyPDF2 import PdfFileReader
from snowflake.snowpark.files import SnowflakeFile
from snowflake.snowpark.types import StringType
from io import BytesIO
import pandas as pd
from snowflake.snowpark.context import get_active_session
# Obtain the Snowpark session via get_active_session(); presumably this is the
# session of the hosting Snowflake notebook — TODO confirm for other runtimes.
session = get_active_session()


In [None]:
def readpdf(file_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file_path: Location handed to ``SnowflakeFile.open`` (opened in
            binary read mode).

    Returns:
        The text of all pages concatenated in page order, with no separator
        inserted between pages.
    """
    with SnowflakeFile.open(file_path, 'rb') as file:
        # Read the whole file into an in-memory buffer and hand that to the
        # PDF reader.
        pdf_reader = PdfFileReader(BytesIO(file.readall()))
        # Defensive: treat a page that yields no text (None) as empty instead
        # of failing the whole extraction with a TypeError on concatenation.
        # join() also avoids rebuilding the string once per page.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)

In [None]:
# Publish readpdf as the permanent function EXTRACT_PDF (string in, string out),
# with its artifacts uploaded to the SNOW_INVEST.BRONZE.UDF stage.
session.udf.register(
    func=readpdf,
    name='EXTRACT_PDF',
    input_types=[StringType()],
    return_type=StringType(),
    packages=['snowflake-snowpark-python', 'pypdf2'],
    stage_location='SNOW_INVEST.BRONZE.UDF',
    is_permanent=True,
    replace=True,
)

In [None]:
-- Raw-text table: one row per file listed in the DOCUMENTATION stage directory,
-- carrying the text EXTRACT_PDF returns for that file's scoped URL.
CREATE OR REPLACE TABLE SNOW_INVEST.BRONZE.RAW_DOCUMENTATION AS
SELECT
    relative_path,
    file_url,
    EXTRACT_PDF(
        BUILD_SCOPED_FILE_URL(@SNOW_INVEST.BRONZE.DOCUMENTATION, relative_path)
    ) AS raw_text
FROM DIRECTORY(@SNOW_INVEST.BRONZE.DOCUMENTATION);

In [None]:
#A class for chunking text and returning a table via UDTF
from snowflake.snowpark.types import StringType, StructField, StructType
from langchain.text_splitter import RecursiveCharacterTextSplitter

class text_chunker:
    """UDTF handler that splits one text value into overlapping chunks."""

    def process(self, text):
        """Yield one ``(chunk, meta)`` row per chunk of ``text``.

        Args:
            text: The text to split. ``None`` or an empty string yields no rows.

        Yields:
            2-tuples of strings: the chunk text, and the string form of the
            chunk's metadata dict (it carries ``start_index`` because
            ``add_start_index`` is enabled below).
        """
        # A NULL / empty input has nothing to split; emit no rows rather than
        # handing None to the splitter.
        if not text:
            return

        text_splitter = RecursiveCharacterTextSplitter(
            separators = ["\n"],   #Define an appropriate separator. New line
            chunk_size = 250,     #Adjust this as you see fit
            chunk_overlap  = 80,  #This lets text chunks have overlap. Useful for keeping chunks contextual
            length_function = len,
            add_start_index = True #Optional, useful if you want to feed the chunk before/after
        )

        # Read the two fields explicitly so each output column is a plain
        # string, independent of how many fields Document declares or how the
        # object iterates.
        for doc in text_splitter.create_documents([text]):
            yield (doc.page_content, str(doc.metadata))

In [None]:
#Register the UDTF - set the stage location
# Each output row has two string columns: "chunk" and "meta".
schema = StructType(
    [
        StructField("chunk", StringType()),
        StructField("meta", StringType()),
    ]
)

session.udtf.register(
    handler=text_chunker,
    name='CHUNK',
    input_types=[StringType()],
    output_schema=schema,
    packages=['pandas', 'langchain'],
    stage_location='SNOW_INVEST.BRONZE.UDF',
    is_permanent=True,
    replace=True,
)

In [None]:
-- Add a summary column to every raw document. Only the first 8000 characters
-- of raw_text are passed to the summarize function.
-- The function is called from the SNOWFLAKE.CORTEX schema, matching the
-- embedding call used to build the vector store in this notebook.
CREATE OR REPLACE TABLE SNOW_INVEST.SILVER.PDF_SUMMARIZED_TEXT AS
SELECT
    *,
    snowflake.cortex.summarize(left(raw_text,8000)) as summary
FROM SNOW_INVEST.BRONZE.RAW_DOCUMENTATION;

In [None]:
-- Chunked table: each RAW_DOCUMENTATION row is joined to the rows the CHUNK
-- table function returns for its raw_text, giving one row per (file, chunk).
CREATE OR REPLACE TABLE SNOW_INVEST.SILVER.PDF_CHUNK_TEXT AS
SELECT
    doc.relative_path,
    pieces.*
FROM SNOW_INVEST.BRONZE.RAW_DOCUMENTATION AS doc,
     TABLE(CHUNK(doc.raw_text)) AS pieces;

In [None]:
-- View over PDF_CHUNK_TEXT that keeps only chunks longer than 155 characters;
-- shorter chunks are filtered out.
CREATE OR REPLACE VIEW SNOW_INVEST.SILVER.PDF_CHUNK_TEXT_V AS
SELECT *
FROM SNOW_INVEST.SILVER.PDF_CHUNK_TEXT
WHERE LENGTH(chunk) > 155;

In [None]:
-- Vector store: embed every chunk exposed by the filtered view with the
-- 'e5-base-v2' model and keep it next to its source file and text.
CREATE OR REPLACE TABLE SNOW_INVEST.GOLD.PDF_VECTOR_STORE AS
SELECT
    relative_path AS report,
    chunk,
    SNOWFLAKE.CORTEX.EMBED_TEXT('e5-base-v2', chunk) AS chunk_embedding
FROM SNOW_INVEST.SILVER.PDF_CHUNK_TEXT_V;

In [None]:
-- Return the 10 stored chunks with the smallest L2 distance to the question.
-- The question is embedded with the same function (SNOWFLAKE.CORTEX.EMBED_TEXT)
-- and model ('e5-base-v2') that produced CHUNK_EMBEDDING, so the two vectors
-- are comparable.
SELECT 
    REPORT, 
    CHUNK,
    VECTOR_L2_DISTANCE(snowflake.cortex.embed_text('e5-base-v2', 'What are the key challenges in sustainable retail ?'), 
    CHUNK_EMBEDDING) as VECTOR_DISTANCE
FROM SNOW_INVEST.GOLD.PDF_VECTOR_STORE
ORDER BY VECTOR_DISTANCE ASC
LIMIT 10;

In [None]:
--Pass the chunks we need along with the prompt to get a structured answer from the LLM
-- The 10 chunks nearest to the embedded question (by L2 distance) are
-- aggregated into an array, rendered as text, and placed between the
-- instruction and the question. The question is embedded with the same
-- function and model used to build chunk_embedding.
select snowflake.cortex.complete(
        'llama2-70b-chat', 
        concat( 
        'Answer the question concisely based on the context. Context: ',
        (
            select to_varchar(array_agg(chunk)) 
            from (
            select chunk from SNOW_INVEST.GOLD.PDF_VECTOR_STORE
                order by vector_l2_distance(
                snowflake.cortex.embed_text('e5-base-v2', 
                'What are the key challenges in sustainable retail ?'
                ), chunk_embedding
                ) limit 10) 
        -- Label the question and separate it from the context so the two do
        -- not run together in the prompt.
        ),' Question: What are the key challenges in sustainable retail ? Answer: '
        )
    ) as response;