<a href="https://colab.research.google.com/github/SampathK/MyExperimentalNotebooks/blob/main/Large_Text_Summarization_Using_LangChain_AWS_BedRock_Mistral_Large.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%sh
pip install -Uq boto3 langchain langchain_community tiktoken

     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 139.2/139.2 kB 826.2 kB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 974.6/974.6 kB 24.9 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.2/2.2 MB 34.8 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.1/1.1 MB 25.6 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.3/12.3 MB 50.7 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 82.2/82.2 kB 7.2 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 321.8/321.8 kB 19.5 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 127.1/127.1 kB 9.7 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 49.2/49.2 kB 3.3 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 145.0/145.0 kB 5.2 MB/s eta 0:00:00


In [2]:
file_path = "./input/meeting_1.txt"

# Read the whole transcript into a single string. The encoding is passed
# explicitly so decoding does not depend on the platform's locale default.
# NOTE(review): assumes the transcript file is UTF-8 encoded — confirm.
with open(file_path, "r", encoding="utf-8") as file:
    call_transcripts = file.read()

In [3]:
import re

def preprocess_transcript(transcript):
    """Clean a raw meeting transcript.

    The clean-up steps run in this order:

    1. remove ``[hh:mm:ss]`` timestamps;
    2. remove ``<...>`` markup tags and ``[...]`` bracketed annotations;
    3. remove every character that is not a word character, whitespace or
       one of ``. , ? ! :``;
    4. collapse each run of whitespace (including newlines) into one space
       and trim both ends.

    Args:
        transcript: The raw transcript text.

    Returns:
        The cleaned transcript as a single-line string.
    """
    # Remove timestamps (assuming the format is [hh:mm:ss])
    transcript = re.sub(r'\[\d{2}:\d{2}:\d{2}\]', '', transcript)

    # Tags and bracketed annotations have to be removed *before* the
    # special-character pass below: that pass deletes '<', '>', '[' and ']',
    # after which these patterns could never match and only the delimiters
    # (not the enclosed text) would be dropped. The character classes keep each
    # match inside a single pair of delimiters instead of spanning from the
    # first opener to the last closer.
    transcript = re.sub(r'<[^<>]*>', '', transcript)
    transcript = re.sub(r'\[[^\[\]]*\]', '', transcript)

    # Remove any remaining special characters
    transcript = re.sub(r'[^\w\s.,?!:]', '', transcript)

    # Collapse whitespace last so gaps left by the removals above are merged too
    return re.sub(r'\s+', ' ', transcript).strip()

In [4]:
# Clean the raw transcript text that was read from `file_path`.
# NOTE(review): in the cells visible here `processed_transcript` is not
# referenced again — the summarization chain is fed from
# `get_document(file_path)`, i.e. the unprocessed file. Confirm this is intended.
processed_transcript = preprocess_transcript(call_transcripts)

In [10]:
from langchain_community.document_loaders import TextLoader
def get_document(file_path):
    """Load the text file at ``file_path`` with LangChain's ``TextLoader``.

    Args:
        file_path: Path of the text file to load.

    Returns:
        Whatever ``TextLoader(file_path).load()`` returns.
    """
    return TextLoader(file_path).load()

In [14]:
# Load the transcript file through the loader; `doc` is what gets split into
# chunks for the map-reduce summarization further down.
doc = get_document(file_path)

In [15]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_doc(doc, chunk_size, chunk_overlap):
    """Split loaded documents into overlapping chunks.

    Args:
        doc: The documents to split (passed to ``split_documents``).
        chunk_size: Maximum size of each chunk, forwarded to the splitter.
        chunk_overlap: Overlap between consecutive chunks, forwarded to the
            splitter.

    Returns:
        The chunks produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # The splitter tries separators in the order given, so they must go
        # from coarse to fine. With '\n' listed first, '\n\n' could never match
        # (no newline is left once the text has been split on '\n'), which made
        # the paragraph separator dead.
        separators=['\n\n', '\n', '.'],
        keep_separator=False,  # Do not keep the separators
    )
    return splitter.split_documents(doc)

In [9]:
import boto3
from langchain_community.chat_models import BedrockChat
from google.colab import userdata
def get_aws_bedrock_summarization_llm():
    """Build the Bedrock chat model used for summarization.

    A ``bedrock-runtime`` boto3 client is created for ``us-east-1`` with
    credentials read through ``userdata.get`` (keys ``AWS_SERVER_PUBLIC_KEY``
    and ``AWS_SERVER_SECRET_KEY``), and wrapped in a ``BedrockChat`` for the
    model id ``mistral.mistral-large-2402-v1:0``.

    Returns:
        The configured ``BedrockChat`` instance.
    """
    # Inference parameters forwarded to the model as-is.
    inference_params = {
        "temperature": 0.2,
        "max_tokens": 8192,
        "top_p": 1,
        "top_k": 200,
        "stop": ["</s>"],
    }
    bedrock_client = boto3.client(
        service_name="bedrock-runtime",
        region_name="us-east-1",
        aws_access_key_id=userdata.get("AWS_SERVER_PUBLIC_KEY"),
        aws_secret_access_key=userdata.get("AWS_SERVER_SECRET_KEY"),
    )
    return BedrockChat(
        client=bedrock_client,
        model_id="mistral.mistral-large-2402-v1:0",
        model_kwargs=inference_params,
    )


In [33]:
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain
from langchain_text_splitters import CharacterTextSplitter
from langchain_core.prompts import PromptTemplate
from langchain.chains.llm import LLMChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain



# Chat model shared by the map chain below.
llm = get_aws_bedrock_summarization_llm()

# Map-step prompt: applied to each transcript chunk on its own.
# `{docs}` is the only template variable; the doubled braces `{{` / `}}` are
# escapes that render as literal braces in the JSON example.
# NOTE(review): item 10 contains a typo ("Accuaretely calculately") — consider
# rewording it to "Accurately calculate the".
# NOTE(review): the template hard-codes Mistral's `<s>[INST] ... [/INST]`
# markers; confirm that BedrockChat does not add its own instruction wrapping
# for this model, otherwise the markers end up duplicated.
map_template = """<s>[INST]<text>{docs}</text>
Please summarize the above meeting transcript within text XML into a comprehensive summary, including all key details. The summary should include the following fields:

1. Participants: List all unique participants or human or owner names mentioned in the meeting.
2. Summary: Provide a brief overview of the main topics discussed during the meeting.
3. Key Points: Extract the most significant points discussed.
4. Decisions Made: List all decisions made during the meeting.
5. Action Items: Include detailed action items with descriptions, owners, and deadlines.
6. Strengths: Capture any positive aspects or strengths highlighted during the meeting.
7. Weaknesses: Capture any negative aspects or weaknesses highlighted during the meeting.
8. Progress Updates: Provide updates on previous action items and overall project progress.
9. Meeting Date and Time: Record the date and time when the meeting took place.
10. Meeting Duration: Accuaretely calculately duration of the meeting in Minutes, else do not include this field.
11. Meeting Type: Categorize the type of meeting (e.g., planning, review, status update).
12. Sentiment Scores: Provide numeric scores for positive and negative sentiments.
13. Tags: Include keywords or tags relevant to the meeting content.
14. Project Name: Associate the meeting with a specific project.
15. Location: Specify the location (physical or virtual) of the meeting.
The final summary should be in JSON format, avoid any other additional text structured as follows:
{{
  "participants": ["Participant 1", "Participant 2", "Participant 3", ...],
  "summary": "Brief overview of the meeting and main topics discussed.",
  "key_points": [
    "Key point 1",
    "Key point 2",
    "Key point 3",
    ...
  ],
  "decisions_made": [
    "Decision 1",
    "Decision 2",
    "Decision 3",
    ...
  ],
  "action_items": [
    {{"description": "Action item 1", "owner": "Person 1", "deadline": "YYYY-MM-DD"}},
    {{"description": "Action item 2", "owner": "Person 2", "deadline": "YYYY-MM-DD"}},
    ...
  ],
  "strengths": [
    "Positive aspect 1",
    "Positive aspect 2",
    ...
  ],
  "weaknesses": [
    "Negative aspect 1",
    "Negative aspect 2",
    ...
  ],
  "progress_updates": [
    "Update on previous action item 1",
    "Update on previous action item 2",
    ...
  ],
  "meeting_date": "YYYY-MM-DDTHH:MM:SSZ",
  "meeting_duration": "60 Min",
  "meeting_type": "Planning",
  "positive_sentiment_score": 0.8,
  "negative_sentiment_score": 0.1,
  "tags": ["tag1", "tag2", "tag3"],
  "project_name": "Project Name",
  "location": "Location"}} [/INST]"""
# Wrap the template in a prompt object and bind it to the model.
map_prompt = PromptTemplate.from_template(map_template)
map_chain = LLMChain(llm=llm, prompt=map_prompt)
# Reduce-step prompt: merges several chunk-level summaries into one summary.
# `{docs}` is the only template variable; the doubled braces `{{` / `}}` are
# escapes that render as literal braces in the JSON example.
# NOTE(review): item 10 contains a typo ("Accuaretely calculately"), and the
# sentence introducing the JSON layout lacks a full stop before "Avoid".
# NOTE(review): the template hard-codes Mistral's `<s>[INST] ... [/INST]`
# markers; confirm that BedrockChat does not add its own instruction wrapping
# for this model, otherwise the markers end up duplicated.
reduce_template = """<s>[INST]<summaries>{docs}</summaries>
Please summarize the above client meeting transcript summaries within summaries XML tag of a document into a concise, comprehensive consolidated summary. Make sure to include the key points, decisions made, action items, positive and negative sentiments, progress updates, and names of participants mentioned in the summaries.
If it is not possible to create a single summary without losing important details due to length constraints, provide the summaries as they are.
The summary should include:
1. Participants: List all unique participants mentioned in the meeting.
2. Summary: Provide a brief overview of the unique main topics discussed during the meeting.
3. Key Points: Extract the most significant unique points discussed.
4. Decisions Made: List all unique decisions made during the meeting.
5. Action Items: Include detailed unique action items with descriptions, owners, and deadlines.
6. Strengths: Capture any unique positive aspects or strengths highlighted during the meeting.
7. Weaknesses: Capture any unique negative aspects or weaknesses highlighted during the meeting.
8. Progress Updates: Provide unique updates on previous action items and overall project progress.
9. Meeting Date and Time: Record the date and time when the meeting took place.
10. Meeting Duration: Accuaretely calculately duration of the meeting in Minutes, else do not include this field.
11. Meeting Type: Categorize the type of meeting (e.g., planning, review, status update).
12. Sentiment Scores: Provide overall numeric scores for positive and negative sentiments.
13. Tags: Include unique keywords or tags relevant to the meeting content.
14. Project Name: Associate the meeting with a specific project.
15. Location: Specify the location (physical or virtual) of the meeting.
The final summary should be in only JSON format Avoid any other additional texts, structured as follows:
{{
  "participants": ["Participant 1", "Participant 2", "Participant 3", ...],
  "summary": "Brief overview of the meeting and main topics discussed.",
  "key_points": [
    "Key point 1",
    "Key point 2",
    "Key point 3",
    ...
  ],
  "decisions_made": [
    "Decision 1",
    "Decision 2",
    "Decision 3",
    ...
  ],
  "action_items": [
    {{"description": "Action item 1", "owner": "Person 1", "deadline": "YYYY-MM-DD"}},
    {{"description": "Action item 2", "owner": "Person 2", "deadline": "YYYY-MM-DD"}},
    ...
  ],
  "strengths": [
    "Positive aspect 1",
    "Positive aspect 2",
    ...
  ],
  "weaknesses": [
    "Negative aspect 1",
    "Negative aspect 2",
    ...
  ],
  "progress_updates": [
    "Update on previous action item 1",
    "Update on previous action item 2",
    ...
  ],
  "meeting_date": "YYYY-MM-DDTHH:MM:SSZ",
  "meeting_duration": "60 Min",
  "meeting_type": "Planning",
  "positive_sentiment_score": 0.8,
  "negative_sentiment_score": 0.1,
  "tags": ["tag1", "tag2", "tag3"],
  "project_name": "Project Name",
  "location": "Location"
  }}
  [/INST]"""
# Wrap the template in a prompt object and bind it to the model.
reduce_prompt = PromptTemplate.from_template(reduce_template)
reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

# Stuff step: joins a list of documents into one string, places it in the
# `docs` slot of the reduce prompt and runs `reduce_chain` on it.
combine_documents_chain = StuffDocumentsChain(
    document_variable_name="docs",
    llm_chain=reduce_chain,
)

# Reduce step: the same stuff chain is used both for the final combination and
# for collapsing intermediate groups when the mapped documents do not fit into
# a single call; `token_max` caps the size of each group.
reduce_documents_chain = ReduceDocumentsChain(
    token_max=8192,
    collapse_documents_chain=combine_documents_chain,
    combine_documents_chain=combine_documents_chain,
)
# Chunking parameters handed to `split_doc`.
chunk_size = 8192
chunk_overlap = 2048
split_docs = split_doc(doc, chunk_size=chunk_size, chunk_overlap=chunk_overlap)

# Full pipeline: `map_chain` runs on every chunk (each chunk is placed in the
# `docs` slot of the map prompt), then `reduce_documents_chain` merges the
# per-chunk outputs. Intermediate map results are not returned.
map_reduce_chain = MapReduceDocumentsChain(
    document_variable_name="docs",
    llm_chain=map_chain,
    reduce_documents_chain=reduce_documents_chain,
    return_intermediate_steps=False,
)

# Run the summarization over all chunks.
result = map_reduce_chain.invoke(split_docs)

In [34]:
# Show the consolidated summary text produced by the map-reduce chain.
print(result["output_text"])

 {
  "participants": ["Emily", "John", "Jane", "Michael", "Sarah"],
  "summary": "Discussion on marketing campaign strategies, including social media marketing, SEO, and email campaigns, as well as a weekly check-in on influencer collaborations, email campaigns, SEO improvements, and social media engagement.",
  "key_points": [
    "Increase online presence and drive more traffic to the website",
    "Focus on creating more interactive content for social media",
    "Segment audience for email campaigns and tailor content to their interests",
    "Optimize website content with relevant keywords and improve user experience for SEO",
    "Received responses from 3 out of 5 influencers, with 2 interested in collaborations",
    "Improved email campaign performance with increased open rates and click-through rates",
    "SEO improvements implemented, resulting in better site speed and increased organic search traffic",
    "Social media engagement has plateaued and needs adjustment"
  ],
 