In [3]:
from dotenv import load_dotenv
import os
from langchain_community.chat_models import ChatHuggingFace
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFaceEndpoint
from langchain_groq import ChatGroq

# Load env
load_dotenv()

# Tokens
os.environ["HUGGINGFACEHUB_API_TOKEN"] = os.getenv("HF_API_KEY")

# LangSmith (optional)
os.environ["LANGCHAIN_API_KEY"] = os.getenv("LANGCHAIN_API_KEY")
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = os.getenv("LANGCHAIN_PROJECT")

# Models

hf_llm = HuggingFaceEndpoint(
    repo_id="HuggingFaceH4/zephyr-7b-beta",
    temperature=0,
    max_new_tokens=512
)

embeddings = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2"
)


groq_api_key= os.getenv("GROQ_API_KEY")
model = ChatGroq(model= 'moonshotai/kimi-k2-instruct', groq_api_key =groq_api_key)

  hf_llm = HuggingFaceEndpoint(


In [7]:
from langchain_core.messages import AIMessage, HumanMessage , SystemMessage

speech = """Hemanth Kumar (born 16 February 1977), better known by his stage name Darshan Thoogudeepa or simply Darshan, is an Indian actor, film producer and distributor who works predominantly in Kannada cinema. In a career spanning over two decades, Darshan has worked in over 50 films. He is one of the highest paid actors and leading contemporary actors of Kannada films.[2][3][4] Darshan has a significant fan following and is referred to by his fans as Challenging Star and D Boss.[5][6][7]

Darshan established the production house Thoogudeepa Productions in 2006. Its first production was Jothe Jotheyali, with Darshan in a special appearance.[8][9] His performances in Anatharu (2007) and Krantiveera Sangolli Rayanna (2012) won him praise from critics; his performance in the latter as the 19th-century warrior Sangolli Rayanna won him the Karnataka State Film Award for Best Actor.[10]

He began his acting career in soap operas and small films in the mid-1990s. His first big-screen lead role was in the 2002 film Majestic. Darshan starred in commercially successful films such as Kariya (2003), Kalasipalya (2004), Gaja (2008), Navagraha (2008), Saarathi (2011), Krantiveera Sangolli Rayanna (2012), Bulbul (2013), Yajamana (2019), Roberrt (2021) and Kaatera (2023)"""

In [9]:
chat_message = [
    SystemMessage(content="your are the expert in summarizing speeches"),
    HumanMessage(content= f"Please provide a short and summary of the follow speech:\n Text:{speech}")
]

In [10]:
model.get_num_tokens(speech)

  return len(self.get_token_ids(text))


338

In [14]:
response = model.invoke(chat_message)
print(response.content)

Darshan Thoogudeepa, born 1977, is a top-tier Kannada actor-producer with 50+ films in 20+ years; nicknamed “Challenging Star,” he won a State Best Actor award for Krantiveera Sangolli Rayanna and heads Thoogudeepa Productions.


# Prompt Template Text Summarization

In [22]:
from langchain_core.prompts import ChatPromptTemplate

generic_text = """Please provide a short summary of the following speech:
Speech: {speech}

Then translate the precise summary to {language}.
"""

prompt = ChatPromptTemplate.from_template(generic_text)


In [25]:
op = prompt.format(speech=speech, language ='kannada')

In [28]:
model.get_num_tokens

<bound method BaseLanguageModel.get_num_tokens of ChatGroq(profile={'max_input_tokens': 131072, 'max_output_tokens': 16384, 'image_inputs': False, 'audio_inputs': False, 'video_inputs': False, 'image_outputs': False, 'audio_outputs': False, 'video_outputs': False, 'reasoning_output': False, 'tool_calling': True}, client=<groq.resources.chat.completions.Completions object at 0x000002222634ECF0>, async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x000002222634FA10>, model_name='moonshotai/kimi-k2-instruct', model_kwargs={}, groq_api_key=SecretStr('**********'))>

In [31]:
llm_chain = prompt | model

summary = llm_chain.invoke({
    "speech": speech,
    "language": "Kannada"
})


In [34]:
print(summary.content)

Summary (English)  
Darshan Thoogudeepa, born 1977, is a top Kannada-film actor-producer with 50+ films in 20 years. After TV roles, he debuted as hero in Majestic (2002), became a box-office draw with hits like Kariya, Gaja, Saarathi, KSR, Roberrt and Kaatera, and won the State Best-Actor award for Krantiveera Sangolli Rayanna (2012). Fans call him “Challenging Star” and “D Boss”; he owns Thoogudeepa Productions.

ಸಾರಾಂಶ (ಕನ್ನಡ)  
ದರ್ಶನ್ ತೂಗುದೀಪ, ೧೯೭೭ ಜನನ; ಇಪ್ಪತ್ತು ವರ್ಷಗಳಲ್ಲಿ ಐವತ್ತಕ್ಕೂ ಹೆಚ್ಚು ಚಿತ್ರಗಳಲ್ಲಿ ನಟಿಸಿದ ಕನ್ನಡ ಚಿತ್ರರಂಗದ ಅಗ್ರ ನಟ-ನಿರ್ಮಾಪಕ. ಟಿವಿ ಧಾರಾವಾಹಿಗಳಿಂದ ಆರಂಭಿಸಿ “ಮೆಜೆಸ್ಟಿಕ್” (೨೦೦೨) ನಾಯಕನಾಗಿ ಬಂದು “ಕಾರಿಯಾ”, “ಗಜ”, “ಸಾರಥಿ”, “ಕ್ರಾಂತಿವೀರ ಸಂಗೋಳಿ ರಾಯಣ್ಣ”, “ರಾಬರ್ಟ್”, “ಕಾತೇರಾ” ಮುಂತಾದ ಚಿತ್ರಗಳಿಂದ ಖ್ಯಾತಿ ಪಡೆದರು. “ಕ್ರಾಂತಿವೀರ…” ಗೆ ಕರ್ನಾಟಕ ರಾಜ್ಯ ಚಿತ್ರೋತ್ಸವದ ಅತ್ಯುತ್ತಮ ನಟ ಪ್ರಶಸ್ತಿ ಲಭಿಸಿದೆ. ಅವರ ಅಭಿಮಾನಿಗಳು ಇವರನ್ನು “ಚಾಲೆಂಜಿಂಗ್ ಸ್ಟಾರ್” ಮತ್ತು “ಡಿ ಬಾಸ್” ಎಂದು ಕರೆಯುತ್ತಾರೆ; ಥೂಗುದೀಪ ಪ್ರೊಡಕ್ಷನ್ಸ್ ಸಂಸ್ಥೆಯ ಅಧಿಪತಿ.


# Stuff Document Chain Text Summarization

In [None]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("source_pdf/JD - Fraud Strategy Analyst.pdf")

pages = loader.load_and_split()

pages

[Document(metadata={'producer': 'www.ilovepdf.com', 'creator': 'Microsoft® Word 2016', 'creationdate': '2026-02-07T13:39:50+00:00', 'author': 'Anubhav.Bairathi@straive.com', 'moddate': '2026-02-07T13:39:50+00:00', 'source': 'source_pdf/JD - Fraud Strategy Analyst.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1'}, page_content="Job Description \n \nAbout Company: Straive: \n \nStraive is a market leading Content and Data Technology company providing data services, subject \nmatter expertise, & technology solutions to multiple domains. \nData Analytics & Al Solutions, Data Al Powered Operations and Education & Learning form the core \npillars of the company’s long-term vision. The company is a specialized solutions provider to business \ninformation providers in finance, insurance, legal, real estate, life sciences and logistics. Straive continues to \nbe the leading content services provider to research and education publishers. \n \nData Analytics & Al Services: Our Data Solutions 

In [63]:
from langchain_core.runnables import RunnablePassthrough

# Format docs into one string
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# The "Stuff" Pipe
stuff_chain = (
    {"context": format_docs} 
    | prompt 
    | model 
)

result = stuff_chain.invoke(pages)

print(result.content)

**Straive – Fraud Strategy Analytics (Analyst / Sr. Analyst)**  
Global content & data-tech firm headquartered in Singapore; this role sits in the India Data-Analytics practice.

**Mission**  
Build and maintain data-driven fraud-detection capabilities (merchant, 1st-/3rd-party, digital) for international B2B clients.

**Core duties**  
- Mine transaction data, write Python/SQL models, craft fraud rules & reports.  
- Investigate anomalies, quantify risk, present findings to ops/risk/compliance teams.  
- Track regulatory & industry shifts; continuously tune prevention strategies.

**Must-have**  
- 3–10 yrs hands-on fraud analytics; solid Python & SQL.  
- Bachelor’s+ in CS, Stats, Math, Eng from top-tier institute.  
- Sharp storytelling, detail orientation, team collaboration.

**Logistics**  
Location: Gurugram / Bangalore / Hyderabad / Mumbai  
Work: Hybrid (2 office/3 home)  
Shift: 13:00–22:00 IST


In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Step A: The "Map" - Summarize each document individually
map_prompt = ChatPromptTemplate.from_template("Summarize this section: {section}")
map_chain = (
    {"section": RunnablePassthrough()} 
    | map_prompt 
    | model 
    | StrOutputParser()
)

# Step B: The "Reduce" - Combine all summaries
reduce_prompt = ChatPromptTemplate.from_template("Combine these summaries into one: {summaries}")
reduce_chain = (
    {"summaries": lambda x: "\n".join(x)} 
    | reduce_prompt 
    | model 
)

# Execution: Map over the list of pages, then Reduce
page_summaries = map_chain.batch([p.page_content for p in pages])
final_summary = reduce_chain.invoke(page_summaries)



content='Straive, a Singapore-headquartered content-and-data-technology company, is hiring an Analyst/Sr. Analyst (3-10 yrs) to expand its Data Analytics & AI fraud-mitigation practice. The role calls for a Python- and SQL-savvy fraud specialist who can design detection rules, statistical/machine-learning models and dashboards covering merchant, first-party and third-party fraud; investigate anomalies; monitor trends; and partner with risk, operations and compliance teams to deliver compliant, data-driven decisions. Ideal candidates bring a quantitative/technical degree (CS, Stats, Math, Eng) from a top-tier university, deep fraud-domain expertise, strong storytelling and cross-functional collaboration skills. Work is hybrid (2 days in office), 1 pm-10 pm IST, and can be based in Gurugram, Bangalore, Hyderabad or Mumbai.' additional_kwargs={} response_metadata={'token_usage': {'completion_tokens': 168, 'prompt_tokens': 245, 'total_tokens': 413, 'completion_time': 0.749597533, 'completi

In [65]:
print(final_summary.content)

Straive, a Singapore-headquartered content-and-data-technology company, is hiring an Analyst/Sr. Analyst (3-10 yrs) to expand its Data Analytics & AI fraud-mitigation practice. The role calls for a Python- and SQL-savvy fraud specialist who can design detection rules, statistical/machine-learning models and dashboards covering merchant, first-party and third-party fraud; investigate anomalies; monitor trends; and partner with risk, operations and compliance teams to deliver compliant, data-driven decisions. Ideal candidates bring a quantitative/technical degree (CS, Stats, Math, Eng) from a top-tier university, deep fraud-domain expertise, strong storytelling and cross-functional collaboration skills. Work is hybrid (2 days in office), 1 pm-10 pm IST, and can be based in Gurugram, Bangalore, Hyderabad or Mumbai.


# refine_prompt

In [66]:
from langchain_core.prompts import ChatPromptTemplate

# 1. Initial Summary Prompt (For the first page)
initial_prompt = ChatPromptTemplate.from_template(
    "Write a concise summary of this first section: {context}"
)

# 2. Refine Prompt (For all subsequent pages)
refine_prompt = ChatPromptTemplate.from_template(
    "Your job is to produce a final summary. "
    "We have an existing summary up to a certain point: {existing_answer}. "
    "We have more context below. Use it to refine the existing summary. "
    "If the new context isn't useful, return the existing summary.\n\n"
    "New Context: {context}"
)

# --- EXECUTION ---

# Start with the first page
current_summary = (initial_prompt | model | StrOutputParser()).invoke(
    {"context": pages[0].page_content}
)

# Loop through the rest of the pages to "Refine"
for i in range(1, len(pages)):
    current_summary = (refine_prompt | model | StrOutputParser()).invoke({
        "existing_answer": current_summary,
        "context": pages[i].page_content
    })

print(current_summary)

**Summary – Section 1: Job Description**  
Straive, a Singapore-headquartered content-and-data-technology leader, seeks an Analyst/Sr. Analyst – Fraud Strategy Analytics (3-10 yrs) for hybrid bases in Gurugram/Bangalore/Hyderabad/Mumbai. The role uses Python, SQL, AI and statistical modeling to detect, investigate and prevent merchant, consumer, first-party and digital fraud; build fraud rules/workflows; monitor trends; and report findings to management. Requires 3+ yrs’ proven fraud-prevention analytics, solid command of risk-assessment methodologies, regulatory compliance, fraud-detection tools, top-tier bachelor’s in CS, Stats, Math or Engineering, and availability 1 pm–10 pm IST.
