In [1]:
import asyncio
import getpass
import os
from typing import List

import bs4
from dotenv import load_dotenv
from langchain.document_loaders import AsyncHtmlLoader
from langchain.document_transformers import Html2TextTransformer
from langchain.schema import Document
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq

# Load environment variables from a local .env file, if one exists
# (generate_response later reads GROQ_API_KEY from the environment).
load_dotenv()

USER_AGENT environment variable not set, consider setting it to identify your requests.
USER_AGENT environment variable not set, consider setting it to identify your requests.


True

In [6]:
def generate_response(
    user_prompt: str,
    temperature: float = 0.7,
    tokens: int = 256,
) -> str:
    """Send ``user_prompt`` to a Groq-hosted chat model and return the reply text.

    The request is built as a ``prompt | llm | StrOutputParser()`` chain: a fixed
    system message that frames the model as a plain-text summarizer, followed by
    the caller's text as the human message.

    Args:
        user_prompt: Text passed as the human message.
        temperature: Sampling temperature forwarded to ``ChatGroq``.
        tokens: Forwarded to ``ChatGroq`` as ``max_tokens``.

    Returns:
        The chain's output after ``StrOutputParser`` (a plain string).

    Note:
        If ``GROQ_API_KEY`` is not set in the environment, the user is prompted
        for it and the value is stored in ``os.environ`` for the rest of the
        process.
    """
    if not os.getenv("GROQ_API_KEY"):
        # getpass does not echo the typed value, so the key never ends up in
        # the notebook's saved cell output the way it would with input().
        os.environ["GROQ_API_KEY"] = getpass.getpass("Enter API key for Groq: ")

    llm = ChatGroq(
        model="openai/gpt-oss-120b",
        temperature=temperature,
        max_tokens=tokens,
    )

    # NOTE(review): the backslash continuations below sit inside the string
    # literal, so the indentation of the continued lines is part of the system
    # prompt that is sent to the model. Left unchanged to keep the prompt as is.
    prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                "You are a professional summarizer assistant. You return just the summarized \
                    output without any formatting or anything, just plain english output. When you cannot summarize \
                        just say 'I am unable to summarize the given text' in a better way",
            ),
            (
                "human",
                "{user_prompt}",
            ),
        ]
    )

    # Template -> model -> plain-string parser.
    chain = prompt | llm | StrOutputParser()

    # The text is supplied as a template variable, so braces inside
    # user_prompt are not interpreted as template placeholders.
    return chain.invoke({"user_prompt": user_prompt})

In [7]:
# Quick check that the chain is wired up and the model responds.
generate_response("Are we ready?")

"Absolutely—I'm ready whenever you are! Let me know what you'd like to dive into."

In [7]:
def summarize_text(text: str) -> str:
    """Ask the model for a short, plain summary of ``text``.

    Wraps ``text`` in an instruction asking for an eighth-grade-level summary
    of at most 280 characters and forwards it to ``generate_response``.

    Args:
        text: The content to summarize. It is inserted into the prompt verbatim.

    Returns:
        Whatever ``generate_response`` returns for the built prompt.
    """
    # The f-string performs the only interpolation. The previous extra
    # ``.format(text)`` pass re-parsed the already-filled prompt, so any "{" or
    # "}" in the input text (common in scraped pages) raised
    # ValueError/KeyError/IndexError, and "{0}" silently duplicated the text.
    summary_prompt = f"Summarize the below text for an eight-grade student within 280 characters (no hashtags, no emojis, no links) \
        \nText: {text}"
    return generate_response(summary_prompt)

In [159]:
# Quick check of the summarization prompt on a trivial input.
summarize_text("Hello there")

'A simple, friendly greeting that says “Hello there.”'

In [2]:
# Source pages to fetch; this list is passed to load_documents and
# load_formatted_documents in later cells.
urls = [
    "https://getrecast.com/modern-media-mix-modeling/",
    "https://www.saxifrage.xyz/post/econometrics-gsheets",
    "https://blog.hurree.co/blog/marketing-mix-modeling",
    "https://www.thinkwithgoogle.com/marketing-strategies/data-and-measurement/art-of-marketing-mix-models/",
    "https://digiday.com/marketing/wtf-is-marketing-mix-modeling/",
    "https://towardsdatascience.com/market-mix-modeling-mmm-101-3d094df976f9",
    "https://en.wikipedia.org/wiki/Marketing_mix_modeling",
]

In [None]:
async def load_documents(urls: list[str]):
    """Load every URL through ``WebBaseLoader`` and gather the documents.

    Args:
        urls: Page addresses handed to ``WebBaseLoader``.

    Returns:
        A list with each document yielded by ``alazy_load()``. If anything
        raises along the way, the error is printed and ``None`` is returned.
    """
    try:
        page_loader = WebBaseLoader(urls)
        # Drain the async generator into a list in one expression.
        return [page async for page in page_loader.alazy_load()]
    except Exception as exc:
        print(f"Error while loading documents: {exc}")
        return None

In [162]:
# Fetch all pages; load_documents returns None (after printing the error) if loading fails.
docs = await load_documents(urls)



In [None]:
# Summarize the "description" metadata entry of the second loaded document.
# NOTE(review): .get("description") yields None when the page exposes no such
# entry, and docs itself is None if load_documents failed — confirm both
# before relying on this cell.
text = docs[1].metadata.get("description")
summarize_text(text)

In [3]:
async def load_formatted_documents(urls: list[str]):
    """Fetch ``urls`` with ``AsyncHtmlLoader`` and convert the HTML to text.

    Args:
        urls: Page addresses handed to ``AsyncHtmlLoader``.

    Returns:
        The result of ``Html2TextTransformer().transform_documents`` applied to
        the loaded documents. If anything raises, the error is printed and
        ``None`` is returned.
    """
    try:
        loader = AsyncHtmlLoader(urls)
        # loader.load() is a synchronous call; running it in a worker thread
        # lets this coroutine actually yield instead of blocking the
        # notebook's event loop while pages download.
        docs = await asyncio.to_thread(loader.load)
        # One transformer instance is enough; transform the whole batch at once.
        transformer = Html2TextTransformer()
        return transformer.transform_documents(docs)
    except Exception as error:
        print(f"Error while loading documents: {error}")
        return None

In [4]:
# Fetch all pages and convert them to text; the result is None if loading failed.
formatted_docs = await load_formatted_documents(urls)

Fetching pages: 100%|##########| 7/7 [00:02<00:00,  2.88it/s]


In [9]:
# Show and summarize the "description" metadata entry of the first document.
# NOTE(review): .get("description") yields None when the entry is missing —
# confirm the page provides one before summarizing.
formatted_text = formatted_docs[0].metadata.get("description")
print(f"Formatted Docs: {formatted_text}")
summarize_text(formatted_text)

Formatted Docs: How can your organization shift from legacy to modern media mix modeling? We explain how to operationalize modern solutions in your business.


"Your company can move from old ways of measuring ads to newer, smarter methods. We'll show how to set up these modern tools in your business."