In [1]:
import os
from dotenv import load_dotenv
load_dotenv()

os.environ["GROQ_API_KEY"]  = os.getenv("GROQ_API_KEY")
os.environ["TAVILY_API_KEY"] = os.getenv("TAVILY_API_KEY")

In [18]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_transformers import BeautifulSoupTransformer
from langchain_community.vectorstores import FAISS
from langchain_ollama import OllamaEmbeddings

embd = OllamaEmbeddings(model="llama3:latest", base_url="http://localhost:11434")



urls = ["https://www.goal.com/en",
        "https://www.fourfourtwo.com/",
        "https://footballinsides.com/"
    ]

# Load
docs = [WebBaseLoader(url).load() for url in urls]
docs_list = [item for sublist in docs for item in sublist]

# Split
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1000, chunk_overlap=100
)
doc_splits = text_splitter.split_documents(docs_list)

# Add to vectorstore
vectorstore=FAISS.from_documents(
    documents=doc_splits,
    embedding=embd
)


retriever=vectorstore.as_retriever()


In [19]:
retriever.invoke("Current market news")

[Document(id='6d97eed6-7b29-4b6f-95fb-834b97645d46', metadata={'source': 'https://www.goal.com/en', 'title': 'Football News, Live Scores, Results & Transfers | Goal.com', 'description': 'The latest football news, live scores, results, rumours, transfers, fixtures, tables and player profiles from around the world, including EURO U21.', 'language': 'en'}, page_content='PSG as "some of the best football" he has seen in years. Vincent Kompany\'s side are in imperious form to start the season, winning a record-breaking 16 straight games to open the 2025/26 season. Getty Images SportP. FodenBorussia DortmundFoden\'s \'smooth control and finish\' held up as example for BVB star Niko Kovac held up Manchester City star Phil Foden\'s \'smooth control and finish\' as an example for Dortmund forward Karim Adeyemi after yet another disappointing performance on the pitch. Foden starred with a brace, while Erling Haaland and Rayan Cherki also found the back of the net as City thrashed the Bundesliga 

### retrived finished

In [21]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq

from pydantic import BaseModel, Field

class GradeDocuments(BaseModel):
    """Binary score for relevance check on retrieved documents."""
    binary_score: str = Field(
        descripton="Documents are relevant to the question, 'yes' or 'no'"
    )

llm=ChatGroq(model="qwen/qwen3-32b")
structured_llm_grader = llm.with_structured_output(GradeDocuments)

system = """You are a grader assessing relevance of a retrieved document to a user question. \n 
    If the document contains keyword(s) or semantic meaning related to the question, grade it as relevant. \n
    Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question."""

grade_prompt = ChatPromptTemplate.from_messages(
    [
         ("system", system),
         ("human", "Retrieved document: \n\n {document} \n\n User question: {question}"),
    ]
)

retrieval_grader = grade_prompt | structured_llm_grader

# Testing
question = "agent memory"
docs = retriever.invoke(question)
doc_txt = docs[1].page_content
print(retrieval_grader.invoke({"document": doc_txt, "question": question}))

question = "latest football news"
docs = retriever.invoke(question)
doc_txt = docs[1].page_content
print(retrieval_grader.invoke({"document": doc_txt, "question": question}))


binary_score='no'
binary_score='yes'


In [23]:
from langchain import hub
from langchain_core.output_parsers import StrOutputParser

promt = hub.pull("rlm/rag-prompt")
rag_chain = promt | llm | StrOutputParser()

generation = rag_chain.invoke({"context": docs, "question": question})
print(generation)



<think>
Okay, let's tackle this query for the latest football news. The user provided several documents from Goal.com and FourFourTwo. First, I'll scan through the content to find the most recent and significant updates.

From the first Goal.com document, there's mention of Bayern Munich's Vincent Kompany's team winning 16 straight games, which is a big achievement. Also, Phil Foden's performance in a 4-1 win over Dortmund is highlighted. Another part talks about Barcelona's Deco dismissing criticism of Hansi Flick. Then there's a transfer rumor about Bayern and Liverpool's Konate. The second Goal.com document adds more transfers: Wolves appointing Rob Edwards, Kobbie Mainoo's situation at Man Utd, and Konate's potential move to Bayern. The FourFourTwo page has quizzes and some Premier League-related content but isn't as newsworthy. The third Goal.com doc mentions Joey Barton's legal issues and Mary Earps' controversy, plus youth talents like Kevin Filling and Rayan Vitor.

Now, to det

In [24]:
system = """You a question re-writer that converts an input question to a better version that is optimized \n 
     for web search. Look at the input and try to reason about the underlying semantic intent / meaning."""
rewriter_prompt = ChatPromptTemplate.from_messages(
    [
         ("system", system),
         ("human", "Here is the initial question: \n\n {question} \n Formulate an improved version of the question."),
    ]
)

question_rewriter = rewriter_prompt | llm | StrOutputParser()
question_rewriter.invoke({"question": question})


'<think>\nOkay, the user wants to rephrase "latest football news" into a better version for web search. Let me think. The original is pretty straightforward, but maybe adding keywords like "updates" or "headlines" could help. Also, specifying a country might make it more targeted, but since it\'s not mentioned, I should keep it general. Maybe "top" or "most recent" instead of "latest"? Also, considering different regions, but again, without context. So, possible options: "Latest football news updates" or "Most recent football headlines". Wait, "headlines" might be better for search. Also, including "global" if it\'s international. Hmm, but the user didn\'t specify. Let me check common search terms. People often search for "latest football news updates" or "most recent football headlines". Maybe combining both: "Latest football news updates and headlines"? Or "Top recent football news updates". Alternatively, "Most recent global football news updates". That might be too long. Let me pic

In [28]:
from langchain_community.tools.tavily_search import TavilySearchResults

web_search_tool = TavilySearchResults(k=3)

In [29]:
from typing import List
from typing_extensions import TypedDict

class GraphState(TypedDict):
    """
    Represents the state of the graph.

    Attributes:
        question: question
        generation: LLM generation
        web_search: whether to add search
        documents: list of documents
    """

    question: str
    generation: str
    web_search: bool
    documents: List[str]