In [42]:
#%pip install --upgrade --quiet tiktoken langchain langgraph beautifulsoup4

## Importing Dependencies

In [43]:
import os
import getpass
import textwrap
import dotenv

from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_openai import OpenAIEmbeddings
from langchain.chains.mapreduce import MapReduceChain
from langchain import OpenAI, PromptTemplate, LLMChain
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import DirectoryLoader
from langchain.chains.summarize import load_summarize_chain

## Natural Language Tool Kit

In [44]:
#  import nltk
#  nltk.download('punkt')

## API Keys

In [72]:
load_dotenv()
openai_api_key = os.getenv('OPENAI_API_KEY')
langsmith_api_key = os.getenv('LANGSMITH_API_KEY')

if openai_api_key:
    os.environ['OPENAI_API_KEY'] = openai_api_key
    

if langsmith_api_key:
    os.environ['LANGSMITH_API_KEY'] = langsmith_api_key
    
    
print(f"openai_api_key: {os.environ.get('OPENAI_API_KEY')}")
print(f"langsmith_api_key: {os.environ.get('LANGSMITH_API_KEY')}")
    

openai_api_key: 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
langsmith_api_key: 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


## Embedding Models

In [46]:
#os.environ["HUGGINGFACEHUB_API_TOKEN"]="HUGGINGFACEHUB_API_TOKEN"
embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002",  #response time is 9s  #infloat/e5-base-V2 has 3.53sec response time.
)
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x32782a690>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x332154fe0>, model='text-embedding-ada-002', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

## Web Loader

In [47]:
# loader = WebBaseLoader("https://collegerecon.com/")
# documents = loader.load()

## Documents Loader

In [48]:
directory = '/Users/user/Downloads/summarizer/directory'
def load_documents(directory):
    loader = DirectoryLoader(directory)
    documents = loader.load()
    return documents

documents = load_documents(directory)

In [49]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50
)
texts = text_splitter.split_documents(documents)

## Vector-database (Chroma)

In [50]:
from langchain.vectorstores import Chroma
persist_directory = "persist_directory"
vectordb = Chroma.from_documents(
    documents=documents, 
    embedding=embeddings,
    persist_directory=persist_directory
)
vectordb.persist()

## Prompt Template

In [51]:
from langchain.prompts import PromptTemplate
prompt_template = """Write a concise summary of the following: "{context}" CONCISE SUMMARY: """
prompt = PromptTemplate(
    template=prompt_template, 
    input_variables=["context", "question"]
)

## LLM Model Initiation

In [52]:
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
# Instantiate the model. Callbacks support token-wise streaming
callbacks = [StreamingStdOutCallbackHandler()]
llm = ChatOpenAI(
    model="gpt-4o-mini",
    #device='gpu',
    temperature=0.0)

## Enable debug and verbose mode 


In [53]:
from langchain.globals import set_verbose, set_debug
set_debug(True)
set_verbose(True)

## RAG with `RetrievalQA` 

In [54]:
from langchain.chains import RetrievalQA
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectordb.as_retriever(),
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt},
    return_source_documents=True
)

In [55]:
from langchain.chains import RetrievalQA
response = qa_chain("please summarize all the document ")

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "please summarize all the document "
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA > chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA > chain:StuffDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "context": "3 2 0 2\n\ng u A 2\n\n] L C . s c [\n\n7 v 2 6 7 3 0 . 6 0 7 1 : v i X r a\n\nProvided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works.\n\nAttention Is All You Need\n\nAshish Vaswani∗ Google Brain avaswani@google.com\n\nNoam Shazeer∗ Google Brain noam@google.com\n\nNiki Parmar∗ Google Research nikip@google.com\n\nJakob Uszkoreit∗ Google Research usz@google.com\n\nLlion Jones∗ Google Research llion@google.com\n\nAidan N. Gomez∗ † University of Toronto aidan@cs.

## Summarization 'Stuff'

In [56]:
import pprint
pp = pprint.PrettyPrinter(indent=0)
pp.pprint(response['result'])

('The paper "Attention Is All You Need" introduces the Transformer, a novel '
 'neural network architecture designed for sequence transduction tasks, '
 'particularly in machine translation. Unlike traditional models that rely on '
 'recurrent or convolutional layers, the Transformer is solely based on '
 'attention mechanisms, allowing for greater parallelization and significantly '
 'reduced training time. The authors demonstrate that the Transformer achieves '
 'state-of-the-art performance on the WMT 2014 English-to-German and '
 'English-to-French translation tasks, with BLEU scores of 28.4 and 41.0, '
 'respectively, surpassing previous models by notable margins. The '
 'architecture consists of stacked self-attention and feed-forward layers, '
 'with multi-head attention enabling the model to focus on different parts of '
 'the input sequence simultaneously. The paper highlights the advantages of '
 'self-attention in capturing long-range dependencies and improving '
 'computati

## Web Summarization

In [57]:
# from langchain.chains.combine_documents import create_stuff_documents_chain
# from langchain.chains.llm import LLMChain
# from langchain_core.prompts import ChatPromptTemplate

# # Define prompt
# prompt = ChatPromptTemplate.from_messages(
#     [("system", "Write a concise summary of the following:\\n\\n{context}")]
# )

# # Instantiate chain
# chain = create_stuff_documents_chain(llm, prompt)

# # Invoke chain
# result = chain.invoke({"context": docs})
# print(result)

## Summarization  'map_reduce' 


In [58]:
chain = load_summarize_chain(llm, 
                        chain_type="map_reduce",
                        verbose = True)

output_summary = chain.run(documents)
wrapped_text = textwrap.fill(output_summary, width=100)
print(wrapped_text)

[32;1m[1;3m[chain/start][0m [1m[chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:MapReduceDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "text": "support@vitafyhealth.com | 1-866-4-Vitafy\n\nActivity Logs Module\n\nSeptember, 2023\n\nOverview\n\nThe Activity Log serves as a comprehensive tool for monitoring all user actions within the platform. Gain insights into each action with detailed timestamps and user-specific activities for a thorough overview. \n\nFilter user activity\n\nUse the search bar to filter user activity by username, action, or time interval\n\nActivity log timestamps\n\nDetailed overview of user activities showcasing specific actions taken by each user along with corresponding timestamps.\n\nHow to\n\nFilter activity log\n\n\nTo filter activity log using search bar:\n\nSearch using user or activity\n\nClick the filter button to filter by interv

## Summarizing Main theme using 'map_reduce'

In [59]:
from langchain.chains.summarize import load_summarize_chain
from langchain_community.document_loaders import WebBaseLoader
from langchain_openai import ChatOpenAI

directory = '/Users/user/Downloads/summarizer/directory'
def load_documents(directory):
    loader = DirectoryLoader(directory)
    documents = loader.load()
    return documents

documents = load_documents(directory)

llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-1106")
chain = load_summarize_chain(llm, chain_type="stuff")

chain.run(documents)

[32;1m[1;3m[chain/start][0m [1m[chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:StuffDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "text": "support@vitafyhealth.com | 1-866-4-Vitafy\n\nActivity Logs Module\n\nSeptember, 2023\n\nOverview\n\nThe Activity Log serves as a comprehensive tool for monitoring all user actions within the platform. Gain insights into each action with detailed timestamps and user-specific activities for a thorough overview. \n\nFilter user activity\n\nUse the search bar to filter user activity by username, action, or time interval\n\nActivity log timestamps\n\nDetailed overview of user activities showcasing specific actions taken by each user along with corresponding timestamps.\n\nHow to\n\nFilter activity log\n\n\nTo filter activity log using search bar:\n\nSearch using user or activity\n\nClick the filter button to filter by interval\n\nUse the arrow button beside th

'The Vitafy Health platform offers an Activity Logs Module for monitoring user actions, a Calendar Module for scheduling and managing appointments, and a Billings Module for handling client billing and transactions. The Activity Log allows for filtering and detailed timestamps of user activities. The Calendar Module enables event scheduling, collaborative sharing, and filtering of calendar views. The Billings Module allows for integrated billing, managing transactions, and issuing refunds. It also provides options for viewing, filtering, and adding transactions, as well as managing upcoming and failed transactions. For further assistance, users can contact Vitafy support at support@vitafyhealth.com.'

In [60]:
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain
from langchain_text_splitters import CharacterTextSplitter

llm = ChatOpenAI(temperature=0)

# Map
template = """The following is a set of documents
{docs}
Based on this list of docs, please identify the main themes 
Helpful Answer:"""
prompt = PromptTemplate.from_template(template)
chain = LLMChain(llm=llm, prompt=prompt)

  chain = LLMChain(llm=llm, prompt=prompt)


In [61]:
from langchain import hub

map_prompt = hub.pull("rlm/map-prompt")
map_chain = LLMChain(llm=llm, prompt=map_prompt)



In [62]:
# Reduce
reduce_template = """The following is set of summaries:
{docs}
Take these and distill it into a final, consolidated summary of the main themes. 
Helpful Answer:"""
reduce_prompt = PromptTemplate.from_template(reduce_template)

In [73]:
# Note we can also get this from the prompt hub, as noted above
reduce_prompt = hub.pull("rlm/map-prompt")



In [64]:
reduce_prompt

ChatPromptTemplate(input_variables=['docs'], input_types={}, partial_variables={}, metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'map-prompt', 'lc_hub_commit_hash': 'de4fba345f211a462584fc25b7077e69c1ba6cdcf4e21b7ec9abe457ddb16c87'}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['docs'], input_types={}, partial_variables={}, template='The following is a set of documents:\n{docs}\nBased on this list of docs, please identify the main themes \nHelpful Answer:'), additional_kwargs={})])

In [65]:
from langchain.chains.combine_documents.stuff import StuffDocumentsChain

# Run chain
reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

# Takes a list of documents, combines them into a single string, and passes this to an LLMChain
combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_chain, document_variable_name="docs"
)

# Combines and iteratively reduces the mapped documents
reduce_documents_chain = ReduceDocumentsChain(
    # This is final chain that is called.
    combine_documents_chain=combine_documents_chain,
    # If documents exceed context for `StuffDocumentsChain`
    collapse_documents_chain=combine_documents_chain,
    # The maximum number of tokens to group documents into.
    token_max=4000,
)

  combine_documents_chain = StuffDocumentsChain(
  reduce_documents_chain = ReduceDocumentsChain(


In [66]:
# Combining documents by mapping a chain over them, then combining results
map_reduce_chain = MapReduceDocumentsChain(
    # Map chain
    llm_chain=map_chain,
    # Reduce chain
    reduce_documents_chain=reduce_documents_chain,
    # The variable name in the llm_chain to put the documents in
    document_variable_name="docs",
    # Return the results of the map steps in the output
    return_intermediate_steps=False,
)

text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1000, chunk_overlap=0
)
split_docs = text_splitter.split_documents(documents)


  map_reduce_chain = MapReduceDocumentsChain(


In [67]:
print(map_reduce_chain.run(split_docs))

[32;1m[1;3m[chain/start][0m [1m[chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:MapReduceDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "docs": "support@vitafyhealth.com | 1-866-4-Vitafy\n\nActivity Logs Module\n\nSeptember, 2023\n\nOverview\n\nThe Activity Log serves as a comprehensive tool for monitoring all user actions within the platform. Gain insights into each action with detailed timestamps and user-specific activities for a thorough overview. \n\nFilter user activity\n\nUse the search bar to filter user activity by username, action, or time interval\n\nActivity log timestamps\n\nDetailed overview of user activities showcasing specific actions taken by each user along with corresponding timestamps.\n\nHow to\n\nFilter activity log\n\n\nTo filter activity log using search bar:\n\nSearch using user or activity\n\nClick the filter button to filter by interv

## Summarizing Long Documents

The objective of this notebook is to demonstrate how to summarize large documents with a controllable level of detail.

