In [None]:
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers  import StrOutputParser

from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents.base import Document 
from langchain_openai import AzureChatOpenAI
from langchain_text_splitters import TokenTextSplitter


In [None]:
import os
from dotenv import load_dotenv, find_dotenv


# Load variables from the nearest .env file into the process environment
# before reading the Azure OpenAI settings below.
load_dotenv(find_dotenv())

# Each setting is None when the corresponding environment variable is unset.
AZURE_OPENAI_CHAT_DEPLOYMENT_VERSION = os.environ.get(
    "AZURE_OPENAI_CHAT_DEPLOYMENT_VERSION"
)
AZURE_OPENAI_CHAT_DEPLOYMENT = os.environ.get("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME")
AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT = os.environ.get(
    "AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT"
)

# Echo the chat deployment name so a missing configuration is visible right away.
print(AZURE_OPENAI_CHAT_DEPLOYMENT)

In [None]:
# Candidate papers for this notebook. Previously these were five successive
# assignments to `url`, of which only the last one ever took effect; keeping
# them in a mapping makes the selection explicit.
PAPER_URLS = {
    "attention": "https://arxiv.org/pdf/1706.03762.pdf",  # attention is all you need
    "watson_crick": "https://dosequis.colorado.edu/Courses/MethodsLogic/papers/WatsonCrick1953.pdf",  # watson and crick
    "citeseerx": "https://citeseerx.ist.psu.edu/document?repid=rep1&type=pdf&doi=587222fc46037816093a897a01a17dd3c2e56a42",
    "voyager": "https://arxiv.org/pdf/2305.16291.pdf",  # voyager
    "unfair_evaluators": "https://arxiv.org/pdf/2305.17926.pdf",  # llms are not fair evaluators
}
SELECTED_PAPER = "unfair_evaluators"  # change this key to summarise a different paper

url = PAPER_URLS[SELECTED_PAPER]
loader = PyPDFLoader(url)
docs = loader.load()

In [None]:
# Sanity-check the load with a short preview instead of dumping every loaded
# document (i.e. the whole paper) into the cell output.
print(f"Loaded {len(docs)} document(s)")
if docs:
    print(docs[0].page_content[:500])

In [None]:
# Merge the text of every loaded document into one string. A newline separator
# keeps the last word of one document from being fused to the first word of the
# next (presumably these are consecutive PDF pages -- verify against the loader).
combined = "\n".join(d.page_content for d in docs)

In [None]:
# Show the total size and only the beginning of the text; printing the full
# paper floods the cell output.
print(f"{len(combined)} characters in total")
print(combined[:1000])

In [None]:
# Chat model client for the Azure OpenAI deployment configured from the
# environment; temperature 0.0 is used to minimise sampling randomness.
llm = AzureChatOpenAI(
    temperature=0.0,
    deployment_name=AZURE_OPENAI_CHAT_DEPLOYMENT,
    api_version=AZURE_OPENAI_CHAT_DEPLOYMENT_VERSION,
)

# Show which deployment the client was built for.
print(AZURE_OPENAI_CHAT_DEPLOYMENT)

In [None]:
# Summarisation prompt. It has two placeholders:
#   {level} - the target audience the summary should be pitched at
#   {text}  - the paper content to summarise
prompt_template = """Summarise each section of this academic paper for a {level} audience. 
                    Try to highlight the important themes and ideas in the paper. If there are
                    particularly important ideas, add a section discussing them. 
                    Start with a high-level summary of the paper and conclude with a bullet list of the important ideas.
                    Highlight the results and achivements in the paper.
                    This is the content:

                    {text}
                    """

# Declare every placeholder the template uses; "level" was previously missing
# even though the template references it and the chain is invoked with it.
PROMPT = PromptTemplate(template=prompt_template, input_variables=["text", "level"])

In [None]:
# Summarisation pipeline: fill the prompt, send it to the chat model, then
# reduce the model's reply to a plain string.
chain = PROMPT.pipe(llm, StrOutputParser())


In [None]:
# Split the paper into token-bounded chunks; only the first chunk is sent to
# the model, so anything beyond chunk_size tokens is dropped.
text_splitter = TokenTextSplitter(
    encoding_name="cl100k_base", chunk_size=15000, chunk_overlap=0
)
texts = text_splitter.split_text(combined)

# Fail with a clear message instead of a bare IndexError on texts[0] when the
# PDF yielded no extractable text.
if not texts:
    raise ValueError("No text could be extracted from the PDF; nothing to summarise.")

input_text = texts[0]
# With no overlap between chunks, more than one chunk means the first chunk
# does not cover the whole document.
was_truncated = len(texts) > 1

print(was_truncated)


In [None]:

# Pass the raw text to the prompt. Wrapping it in a Document made the template
# render the Document's string form (a "page_content='...'" wrapper) rather
# than the paper text itself.
model_output = chain.invoke({"text": input_text, "level": "practitioner"})
if was_truncated:
    # Flag summaries that were produced from only the first chunk of the paper.
    model_output = f"WARINING, THIS WAS BASED ON AN INCOMPLETE DOCUMENT. \n\n {model_output}"

print(model_output)

In [None]:
def the_whole_thing(url, summarize_chain, level="practitioner", chunk_size=15000)-> str:
    """Load a PDF, summarise its first token chunk and return the summary.

    Args:
        url: Location of the PDF, passed straight to ``PyPDFLoader``.
        summarize_chain: Object whose ``invoke`` accepts a dict with the keys
            ``"text"`` and ``"level"`` and returns the summary.
        level: Audience the summary is written for. Defaults to
            ``"practitioner"``, the value that was previously hard-coded.
        chunk_size: Maximum number of ``cl100k_base`` tokens sent to the
            chain. Defaults to 15000, the value that was previously hard-coded.

    Returns:
        The chain's output, prefixed with a warning line when the document did
        not fit into a single chunk and was therefore only partially summarised.

    Raises:
        ValueError: If no text could be extracted from the PDF.
    """
    loader = PyPDFLoader(url)
    docs = loader.load()
    # A newline separator keeps text from adjacent documents from being fused
    # into one word at the boundary.
    combined = "\n".join(d.page_content for d in docs)

    text_splitter = TokenTextSplitter(
        encoding_name="cl100k_base", chunk_size=chunk_size, chunk_overlap=0
    )
    texts = text_splitter.split_text(combined)
    if not texts:
        # Previously this surfaced as a bare IndexError on texts[0].
        raise ValueError(f"No text could be extracted from {url!r}; nothing to summarise.")

    input_text = texts[0]
    # With no overlap, more than one chunk means the first chunk is incomplete.
    was_truncated = len(texts) > 1

    # Pass the raw string: a Document would be rendered into the prompt via its
    # string form (a "page_content='...'" wrapper) instead of the text itself.
    model_output = summarize_chain.invoke({"text": input_text, "level": level})
    if was_truncated:
        model_output = f"WARINING, THIS WAS BASED ON AN INCOMPLETE DOCUMENT. \n\n {model_output}"

    return model_output

In [None]:
# Papers to summarise in one batch run.
urls = [
    # attention is all you need
    "https://arxiv.org/pdf/1706.03762.pdf",
    # voyager
    "https://arxiv.org/pdf/2305.16291.pdf",
    # llms are not fair evaluators
    "https://arxiv.org/pdf/2305.17926.pdf",
    "https://citeseerx.ist.psu.edu/document?repid=rep1&type=pdf&doi=587222fc46037816093a897a01a17dd3c2e56a42",
    # watson and crick
    "https://dosequis.colorado.edu/Courses/MethodsLogic/papers/WatsonCrick1953.pdf",
]

# Print a divider line, then the summary, for each paper in turn.
for url in urls:
    print("####################")
    summary = the_whole_thing(url, chain)
    print(summary)