In [None]:
%%sql -r dataframe_1
-- Start from a clean slate: remove any previous copy of the PoC database,
-- recreate it, and make it the current database for the following cells.
DROP DATABASE IF EXISTS snowflake_llm_poc;
CREATE DATABASE snowflake_llm_poc;
USE DATABASE snowflake_llm_poc;

In [None]:
%%sql -r dataframe_4
-- External stage over the S3 prefix that holds the PDF knowledge base.
-- The directory table is enabled so the files can be listed and passed to
-- BUILD_SCOPED_FILE_URL / GET_PRESIGNED_URL in later cells.
--
-- The URL is a single-quoted string literal (double quotes denote identifiers
-- in Snowflake SQL).
--
-- SECURITY: fill in the AWS key pair locally only and never commit real
-- values to the notebook. Prefer a STORAGE_INTEGRATION over inline keys
-- for anything beyond a throw-away PoC.
CREATE OR REPLACE STAGE snowflake_llm_poc.PUBLIC.Snow_stage_directory_table_yt
    URL = 's3://snwoflakeragtest/pdf_knowledge_base/'
    CREDENTIALS = (AWS_KEY_ID = ''
                   AWS_SECRET_KEY = '')
    DIRECTORY = (ENABLE = TRUE);

In [None]:
%%sql -r dataframe_6
-- Re-sync the stage's directory table with the files currently in the bucket.
ALTER STAGE snowflake_llm_poc.PUBLIC.Snow_stage_directory_table_yt REFRESH;

In [None]:
%%sql -r dataframe_5
-- List every file the directory table knows about on the stage.
SELECT *
FROM DIRECTORY(@snowflake_llm_poc.PUBLIC.Snow_stage_directory_table_yt);

In [None]:
%%sql -r dataframe_2
-- Python UDF that reads one staged PDF and returns its text as an ARRAY of
-- {"chunk": <text>, "page_number": <1-based page>} objects.
CREATE OR REPLACE FUNCTION snowflake_llm_poc.PUBLIC.read_pdf_and_split(file_name STRING)
RETURNS ARRAY
LANGUAGE PYTHON
RUNTIME_VERSION = '3.9'
PACKAGES = ('snowflake-snowpark-python','PyPDF2','langchain')
HANDLER = 'main_fn'
AS
$$
from snowflake.snowpark.files import SnowflakeFile
import PyPDF2
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking parameters, measured in characters (length_function=len).
CHUNK_SIZE = 500
CHUNK_OVERLAP = 100

# Built with chr() on purpose: this body lives inside a dollar-quoted SQL
# string where backslash escapes are NOT processed, so writing '\\n' here
# would make Python look for a literal backslash followed by "n" instead of
# a real newline. chr() is immune to any escaping layer.
NEWLINE = chr(10)
NUL = chr(0)


def main_fn(file_name):
    """Extract the text of a staged PDF page by page and split it into chunks.

    Args:
        file_name: File URL passed in by the caller (the ingestion query
            passes the result of BUILD_SCOPED_FILE_URL).

    Returns:
        A list of dicts, one per chunk, with keys "chunk" (the text) and
        "page_number" (1-based page the chunk came from). Pages for which
        no text could be extracted are skipped.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        length_function=len
    )

    results = []

    # The context manager guarantees the staged file handle is released even
    # if PDF parsing raises.
    with SnowflakeFile.open(file_name, 'rb') as f:
        pdf_object = PyPDF2.PdfReader(f)

        # iterate page by page so every chunk keeps its source page number
        for page_number, page in enumerate(pdf_object.pages, start=1):
            text = page.extract_text()
            if not text:
                continue

            # Flatten line breaks and drop NUL characters before splitting.
            text = text.replace(NEWLINE, ' ').replace(NUL, ' ')

            for chunk in text_splitter.split_text(text):
                results.append({
                    "chunk": chunk,
                    "page_number": page_number
                })

    return results
$$;

In [None]:
%%sql -r dataframe_7
-- Sets the account-level parameter CORTEX_ENABLED_CROSS_REGION to 'ANY_REGION'.
-- NOTE(review): presumably required so the Cortex models used in later cells
-- can be served from another region when unavailable locally; this is an
-- account-wide change and likely needs elevated privileges - confirm.
ALTER ACCOUNT SET CORTEX_ENABLED_CROSS_REGION = 'ANY_REGION';

In [None]:
%%sql -r dataframe_8
-- Storage: one row per text chunk extracted from a staged PDF,
-- together with its embedding.
CREATE OR REPLACE TABLE snowflake_llm_poc.PUBLIC.DOCS_CHUNKS_TABLE (
    RELATIVE_PATH     VARCHAR(16777216),     -- path of the PDF relative to the stage
    SIZE              NUMBER(38,0),          -- size of the PDF
    INDEX             NUMBER(38,0),          -- index of the chunk
    CHUNK             VARCHAR(16777216),     -- the chunk text itself
    PAGE_NUMBER       NUMBER(38,0),          -- page the chunk was taken from
    EMBEDDING_VECTOR  VECTOR(FLOAT, 1024)    -- 1024-dimensional embedding of CHUNK
);

In [None]:
%%sql -r dataframe_3
-- Ingestion: split every staged file into chunks with the UDF, flatten the
-- returned array into one row per chunk, and embed each chunk.
--
-- The VARIANT fields are cast explicitly (::STRING / ::NUMBER) instead of
-- being passed through TRIM(..., '"'): TRIM relied on an implicit cast and
-- also stripped genuine double quotes at the start or end of a chunk.
-- NOTE: running this cell twice appends the same chunks again; re-create
-- the table first when re-ingesting.
INSERT INTO snowflake_llm_poc.PUBLIC.docs_chunks_table
    (relative_path, size, Index, chunk, Page_Number, Embedding_Vector)
WITH splitted_data AS (
    SELECT
        RELATIVE_PATH,
        SIZE,
        snowflake_llm_poc.PUBLIC.read_pdf_and_split(
            BUILD_SCOPED_FILE_URL(
                @snowflake_llm_poc.PUBLIC.Snow_stage_directory_table_yt,
                RELATIVE_PATH
            )
        ) AS pdf_text_split
    FROM DIRECTORY(@snowflake_llm_poc.PUBLIC.Snow_stage_directory_table_yt)
)
SELECT
    s.RELATIVE_PATH,
    s.SIZE,
    f.INDEX,
    f.VALUE['chunk']::STRING AS chunk,
    f.VALUE['page_number']::NUMBER(38,0) AS page_number,
    SNOWFLAKE.CORTEX.EMBED_TEXT_1024(
        'voyage-multilingual-2',
        f.VALUE['chunk']::STRING
    ) AS Embedding_Vector
FROM splitted_data s,
     LATERAL FLATTEN(INPUT => s.pdf_text_split) f;

In [None]:
%%sql -r dataframe_9
-- Inspect the ingested chunks and their embeddings.
SELECT *
FROM snowflake_llm_poc.PUBLIC.docs_chunks_table;

In [None]:
%%sql -r dataframe_10
-- Pure-SQL RAG: fetch the 3 chunks closest to the question (L2 distance on
-- the embeddings), join them into one context string and ask the LLM.
-- Every prompt segment starts with a space so the pieces do not run into
-- each other (previously "Be concise.Context:" and "...Aurora?Answer:").
-- The question text appears twice (retrieval and prompt); keep both in sync.
SELECT SNOWFLAKE.CORTEX.COMPLETE(
    'mistral-large',
    CONCAT(
        'Answer the question based on the context. Be concise.',
        ' Context: ',
        (
            SELECT LISTAGG(chunk, ', ')
            FROM (
                SELECT CHUNK
                FROM snowflake_llm_poc.PUBLIC.docs_chunks_table
                ORDER BY VECTOR_L2_DISTANCE(
                    SNOWFLAKE.CORTEX.EMBED_TEXT_1024(
                        'voyage-multilingual-2',
                        'How to start Aurora?'
                    ),
                    Embedding_Vector
                )
                LIMIT 3
            )
        ),
        ' Question: ',
        'How to start Aurora?',
        ' Answer: '
    )
) AS response;

In [None]:
from snowflake.snowpark.context import get_active_session
from IPython.display import display, Markdown
# Snowpark session used by the Python cells below to run SQL via session.sql(...).
session = get_active_session()

In [None]:
# Alternative example question:
#user_question = 'What security features does Amazon Bedrock use to keep my fine-tuning data private and secure?'
user_question = 'How can I migrate from MySQL to Aurora?'
num_chunks = 5

print("The user question is: ", user_question)
print()

# --- retrieval ---------------------------------------------------------------
# The question is passed as a bind parameter (?) rather than spliced into the
# SQL text, so apostrophes or backslashes in it cannot break or alter the query.
# num_chunks is forced through int() because it is interpolated into the
# statement text.
cmd = f"""
        SELECT CHUNK,RELATIVE_PATH,PAGE_Number,VECTOR_COSINE_SIMILARITY(
        SNOWFLAKE.CORTEX.EMBED_TEXT_1024('voyage-multilingual-2', ?),
        Embedding_Vector
        ) as cosine_similarity from snowflake_llm_poc.PUBLIC.docs_chunks_table
        ORDER BY cosine_similarity desc limit {int(num_chunks)}
        """

# Materialise once: calling show() and then to_pandas() would run the
# retrieval (and the question embedding) twice.
df_context = session.sql(cmd, params=[user_question]).to_pandas()
print("The context is: ")
display(df_context)

# Join whatever rows came back. The table may hold fewer than num_chunks
# chunks, so do not index by position. A blank line between chunks keeps the
# last word of one chunk from fusing with the first word of the next.
prompt_context = "\n\n".join(df_context["CHUNK"].astype(str))

# --- augmentation ------------------------------------------------------------
# Plain prompt text; no SQL quoting is needed because it is sent as a bind
# parameter below.
prompt = f"""You are an expert assistant extracting information from context provided.
Answer the question based on the context. Be concise and do not hallucinate.
If you do not have the information just say so.
Context: {prompt_context}
Question:
{user_question}
Answer: """

# --- generation --------------------------------------------------------------
cmd = "select SNOWFLAKE.CORTEX.COMPLETE('mistral-large', ?) as response"

df_response = session.sql(cmd, params=[prompt]).collect()
print("The output of RAG is: ", df_response[0]["RESPONSE"])


#for reference links
def _unique_in_order(values):
    """Return the distinct items of `values`, keeping first-seen order."""
    return list(dict.fromkeys(values))


# One row per source PDF with the list of pages its retrieved chunks came from.
_pages_by_file = df_context.groupby("RELATIVE_PATH")["PAGE_NUMBER"].apply(_unique_in_order)
grouped_df = _pages_by_file.reset_index(name="pages")

def generate_presigned_url(pdf_name, expiration_seconds=360):
    """Return a temporary pre-signed URL for a file on the PDF stage.

    Args:
        pdf_name: Path of the file relative to the stage. It is sent as a
            bind parameter, so quotes or backslashes in the name cannot
            break the statement.
        expiration_seconds: Lifetime of the URL in seconds (default 360,
            the value previously hard-coded).

    Returns:
        The pre-signed URL as a string.
    """
    query = f"""
        SELECT GET_PRESIGNED_URL(
            @snowflake_llm_poc.PUBLIC.Snow_stage_directory_table_yt,
            ?,
            {int(expiration_seconds)}
        ) AS presigned_url
    """
    return session.sql(query, params=[pdf_name]).collect()[0]["PRESIGNED_URL"]


# Resolve a download link for every source PDF referenced by the context.
grouped_df["presigned_url"] = [
    generate_presigned_url(relative_path)
    for relative_path in grouped_df["RELATIVE_PATH"]
]
print()
def format_sources_from_df(df):
    """Render the source documents as a Markdown bullet list.

    Args:
        df: DataFrame with the columns "RELATIVE_PATH" (file name shown as the
            link text), "presigned_url" (link target) and "pages" (pages the
            retrieved chunks came from).

    Returns:
        A Markdown string: a "### Sources" heading followed by one bullet per
        row. Only the heading is returned when `df` has no rows.
    """
    lines = ["### Sources"]
    for _, row in df.iterrows():
        pages = row["pages"]
        # En dash separator (the original literal was a mis-encoded "–").
        lines.append(
            f"- **[{row['RELATIVE_PATH']}]({row['presigned_url']})** \u2013 Pages: {pages}"
        )
    return "\n".join(lines)

# Render the list of source links as Markdown below the answer.
sources_markdown = format_sources_from_df(grouped_df)
display(Markdown(sources_markdown))
print()