In [None]:
# Install required packages
!pip install requests beautifulsoup4 chromadb tqdm groq langchain-groq
!pip install sentence_transformers



In [None]:
import requests
from bs4 import BeautifulSoup
import chromadb
from chromadb.utils import embedding_functions
import re
import concurrent.futures
from tqdm import tqdm
from urllib.parse import urljoin, urlparse
from typing import List, Dict
from langchain_groq import ChatGroq
import os

In [None]:

# Initialize ChromaDB
chroma_client = chromadb.Client()
chroma_collection = chroma_client.get_or_create_collection(
    name="web_content",
    embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="all-MiniLM-L6-v2"
    )
)

# Thomas Enter you API key here

In [None]:
# Initialize Groq LLM
os.environ["GROQ_API_KEY"] = "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"  # Replace with your Groq API key
llm = ChatGroq(
    model_name="deepseek-r1-distill-llama-70b",
    temperature=0.7,
    max_tokens=2500
)

In [None]:
def crawl_urls(homepage: str, max_pages: int = 100) -> List[str]:
    """
    Crawl URLs starting from the homepage up to max_pages
    """
    print(f"Starting crawl from: {homepage}")
    visited = set()
    to_visit = [homepage]
    all_urls = set()

    with tqdm(total=max_pages, desc="Crawling URLs") as pbar:
        while to_visit and len(all_urls) < max_pages:
            current_url = to_visit.pop(0)
            if current_url in visited:
                continue

            visited.add(current_url)
            try:
                response = requests.get(current_url, timeout=10, headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                })
                response.raise_for_status()

                soup = BeautifulSoup(response.text, 'html.parser')

                for link in soup.find_all('a', href=True):
                    href = urljoin(current_url, link['href'])
                    parsed_href = urlparse(href)

                    if parsed_href.netloc == urlparse(homepage).netloc:
                        if href not in visited:
                            to_visit.append(href)
                            all_urls.add(href)
                            pbar.update(1)
                            if len(all_urls) >= max_pages:
                                break

            except Exception as e:
                print(f"Error crawling {current_url}: {e}")

    return list(all_urls)

# Usage example:
homepage_url = input("Enter the website URL to crawl: ")
max_pages = int(input("Enter maximum number of pages to crawl: "))
discovered_urls = crawl_urls(homepage_url, max_pages)
print(f"\nDiscovered {len(discovered_urls)} URLs")

Enter the website URL to crawl: https://google.com
Enter maximum number of pages to crawl: 1
Starting crawl from: https://google.com


Crawling URLs: 100%|ˆˆˆˆˆˆˆˆˆˆ| 1/1 [00:00<00:00,  1.66it/s]


Discovered 1 URLs





In [None]:
def scrape_urls(urls: List[str]) -> List[Dict]:
    """
    Scrape content from the discovered URLs
    """
    def scrape_single_url(url: str) -> Dict:
        try:
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            # Remove script and style elements
            for script in soup(["script", "style"]):
                script.decompose()

            # Extract and clean text
            text = soup.get_text()
            text = re.sub(r'\s+', ' ', text).strip()
            text = re.sub(r'[^\w\s.,?!-]', '', text)

            return {"url": url, "content": text, "status": "success"}
        except Exception as e:
            return {"url": url, "content": "", "status": f"error: {str(e)}"}

    print("Starting content scraping...")
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        results = list(tqdm(
            executor.map(scrape_single_url, urls),
            total=len(urls),
            desc="Scraping URLs"
        ))

    # Print summary
    success_count = sum(1 for r in results if r["status"] == "success")
    print(f"\nSuccessfully scraped {success_count} out of {len(urls)} URLs")

    return results

# Usage example:
scraped_data = scrape_urls(discovered_urls)

Starting content scraping...


Scraping URLs: 100%|ˆˆˆˆˆˆˆˆˆˆ| 1/1 [00:00<00:00,  1.66it/s]


Successfully scraped 1 out of 1 URLs





In [None]:
scraped_data


[{'url': 'https://google.com/advanced_search?hl=en&authuser=0',
  'content': 'Google Advanced Search Sign in Advanced Search Find pages with... To do this in the search box all these words Type the important words tricolor rat terrier this exact word or phrase Put exact words in quotes rat terrier any of these words Type OR between all the words you want miniature OR standard none of these words Put a minus sign just before words you dont want -rodent, -Jack Russell numbers ranging from to Put 2 periods between the numbers and add a unit of measure 10..35 lb, 300..500, 2010..2011 Then narrow your results by... language any languageAfrikaansArabicArmenianBelarusianBulgarianCatalanChinese SimplifiedChinese TraditionalCroatianCzechDanishDutchEnglishEsperantoEstonianFilipinoFinnishFrenchGermanGreekHebrewHindiHungarianIcelandicIndonesianItalianJapaneseKoreanLatvianLithuanianNorwegianPersianPolishPortugueseRomanianRussianSerbianSlovakSlovenianSpanishSwahiliSwedishThaiTurkishUkrainianVietname

In [None]:
def chunk_text(text: str, chunk_size: int = 1000) -> List[str]:
    """
    Split text into chunks of approximately chunk_size characters
    """
    words = text.split()
    chunks, current_chunk, current_length = [], [], 0

    for word in words:
        current_length += len(word) + 1
        if current_length > chunk_size:
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = len(word)
        else:
            current_chunk.append(word)

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

def process_and_store(scraped_data: List[Dict]):
    """
    Process scraped content and store in ChromaDB
    """
    print("Processing and storing content...")
    chroma_docs, chroma_meta, chroma_ids = [], [], []
    doc_counter = 0

    for item in scraped_data:
        if item["status"] == "success" and item["content"]:
            chunks = chunk_text(item["content"])
            for chunk in chunks:
                chroma_docs.append(chunk)
                chroma_meta.append({"url": item["url"]})
                chroma_ids.append(f"doc_{doc_counter}")
                doc_counter += 1

    if chroma_docs:
        chroma_collection.add(
            documents=chroma_docs,
            metadatas=chroma_meta,
            ids=chroma_ids
        )
        print(f"Stored {len(chroma_docs)} chunks in ChromaDB")

# Usage example:
process_and_store(scraped_data)


Processing and storing content...




Stored 5 chunks in ChromaDB


In [30]:
\
def query_and_respond(query: str) -> Dict:
    """
    Query the database and generate response using Groq LLM
    """
    try:
        # Get relevant contexts from ChromaDB
        results = chroma_collection.query(
            query_texts=[query],
            n_results=1
        )

        contexts = [doc for doc in results['documents'][0]]

        # Prepare prompt for Groq LLM
        system_prompt = """You are a helpful AI assistant that answers questions based on the provided context.
        Your answers should be accurate, informative, and directly related to the context provided."""

        user_prompt = f"""Context information is below.
        ---------------------
        {' '.join(contexts)}
        ---------------------
        Given the context information, please answer this question: {query}

        If the context doesn't contain relevant information, please say so instead of making up an answer."""

        # Generate response using Groq
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]

        response = llm.invoke(messages).content

        return {
            "query": query,
            "response": response,
            "contexts": contexts
        }
    except Exception as e:
        return {
            "query": query,
            "response": f"Error processing query: {e}",
            "contexts": []
        }

def chat_interface():
    """
    Interactive chat interface
    """
    print("\nWelcome to the website chatbot! Type 'exit' to quit.")
    print("Using Groq's Mixtral-8x7b model for responses...")

    while True:
        query = input("\nEnter your question: ")
        if query.lower() == 'exit':
            break

        print("\nProcessing your question...")
        result = query_and_respond(query)

        print("\nResponse:", result["response"])
        print("\nSources used:")
        for i, context in enumerate(result["contexts"], 1):
            print(f"\n{i}. {context[:200]}...")

# Usage example:
chat_interface()


Welcome to the website chatbot! Type 'exit' to quit.
Using Groq's Mixtral-8x7b model for responses...

Processing your question...

Response: <think>
Okay, so the user has given me a context with a bunch of information about different wines, producers, regions, and some other details like typology, year, denomination, region, size, and whether they're on sale. Then, they asked me to answer a question, but the question they provided is just a URL: https://google.com.

Hmm, that's a bit confusing. I mean, the context is all about wine-related data, with things like producer names, regions, sizes, and so on. But the question isn't a specific question about wine; it's just a link to Google's homepage. So, I'm not sure what the user is asking for here.

Maybe they meant to ask something else but accidentally pasted the wrong thing. Or perhaps they're testing me to see if I can handle such inputs. Either way, the context provided doesn't contain any information related to the URL they've gi

KeyboardInterrupt: Interrupted by user