In [3]:

from dotenv import load_dotenv

# Load key/value pairs from a local .env file into the process environment.
# NOTE(review): presumably this supplies API keys for later cells — confirm which variables are required.
load_dotenv() 

# Web Search

In [1]:
from ddg import Duckduckgo

# Instantiate the DuckDuckGo search wrapper used for the demo query below.
ddg_api = Duckduckgo()

# Run a sample query. As the recorded output shows, the result is a dict with
# a 'success' flag and a 'data' list of {'title', 'url', 'description'} entries.
ddg_api.search("What is the latest video posted by Learn with Vichu youtube channel")

{'success': True,
 'data': [{'title': 'Learn with Vichu - YouTube',
   'url': 'https://www.youtube.com/@learnwithvichu',
   'description': 'Do you want tolearnmachine learning and deep learning in a fun way?. then thischannelisfor you. you can findvideosrelated to python, machine learning,...'},
  {'title': 'vichu - YouTube',
   'url': 'https://www.youtube.com/channel/UChuQBUSLJfx4G3JfE2kvHYA/videos',
   'description': 'Learndriving in 5 hours at your door step'},
  {'title': 'Vichu - YouTube',
   'url': 'https://www.youtube.com/channel/UCMPTINKbL7T16AUPJXS5AHg/playlists',
   'description': 'Support ourYouTubechannel....VichuHD-MMR100- 200- 300- 400- 500- 600- 700- 800- 900- 1000/1K- - Waiting for it.... First preference.... ️'},
  {'title': 'How to Start a Youtube Channel for Beginners: 7 Pro Tips',
   'url': 'https://www.skillshare.com/en/blog/how-to-start-a-youtube-channel-for-beginners-7-pro-tips/',
   'description': 'Click on your profile picture to reveal a dropdown menu. 3. Choo

In [33]:
import asyncio
from ddg import Duckduckgo
from crawl4ai import AsyncWebCrawler

class WebSearch:
    """Search DuckDuckGo for a query and scrape the top result pages.

    :param top_n_urls: Maximum number of result URLs to scrape per query.
    """

    def __init__(self,top_n_urls=5):

        self.top_n_urls=top_n_urls

        self.ddg_api = Duckduckgo()

    def get_urls(self,searc_query):
        """Collect up to ``top_n_urls`` result URLs using the DuckDuckGo API.

        :param searc_query: Free-text search query.
        :return: List of URLs in ranking order (best hit first). Empty when the
            search reports failure or returns no usable results.
        """
        results = self.ddg_api.search(searc_query)

        # A failed search, or a response without a usable 'data' list, yields
        # no URLs instead of a KeyError/TypeError further down.
        if not isinstance(results, dict) or not results.get('success', True):
            return []

        hits = results.get('data') or []

        # Skip malformed entries that carry no URL.
        urls = [hit['url'] for hit in hits if isinstance(hit, dict) and hit.get('url')]

        # Slice from index 0 so the top-ranked hit is included; exactly
        # ``top_n_urls`` entries are kept when that many are available.
        return urls[:max(self.top_n_urls, 0)]

    async def scrap_url(self,url):
        """Scrape a single URL.

        :param url: Page address to crawl.
        :return: Tuple ``(url, markdown)`` where ``markdown`` is the crawler
            result's ``markdown`` attribute.
        """
        # Each call opens (and closes) its own crawler context.
        async with AsyncWebCrawler(verbose=True) as crawler:

            result = await crawler.arun(url=url)

            return (url,result.markdown)

    async def search_online(self,search_query):
        """Search for ``search_query`` and scrape every collected URL concurrently.

        :param search_query: Free-text search query.
        :return: List of ``(url, markdown)`` tuples, in the same order as the
            URLs returned by :meth:`get_urls`.
        """
        urls = self.get_urls(search_query)

        tasks = [self.scrap_url(url) for url in urls]

        # gather preserves task order; an exception in any scrape propagates.
        results = await asyncio.gather(*tasks)

        return results

# Smoke test: search for a sample question, scrape the hits and print the
# list of (url, markdown) tuples between two separator lines.
if __name__ == "__main__":

    search_Engine = WebSearch()

    # NOTE(review): asyncio.run() raises RuntimeError when an event loop is
    # already running in the current thread, which is typically the case inside
    # a Jupyter kernel; there `await search_Engine.search_online(...)` would be
    # needed instead — confirm how this cell is meant to be executed.
    content = asyncio.run(search_Engine.search_online("Who is the CEO of OpenAI"))

    print("-----------------------------------")
    print(content)
    print("-----------------------------------")


# Vector Store

In [1]:

import sys
# Ask the interpreter not to write .pyc files for modules imported from here on.
sys.dont_write_bytecode =True

# ------------------------------------------------- Qdrant Vector Store -------------------------------- #

import uuid
import hashlib
from tqdm import tqdm
from qdrant_client import QdrantClient

class QdrantVectorStore:
    """Wrapper around a local on-disk Qdrant collection named ``default_schema``.

    Embedding is delegated to the Qdrant client's FastEmbed helpers
    (``set_model`` / ``set_sparse_model`` / ``add`` / ``query``).

    :param db_location: Sub-directory of ``vector_stores/`` holding the database.
    :param dense_model: Name of the dense embedding model.
    :param sparse_model: Name of the sparse embedding model (used when ``hybird``).
    :param hybird: When True, configure sparse vectors in addition to dense ones.
    """

    def __init__(self,db_location="qdrant",dense_model="sentence-transformers/all-MiniLM-L6-v2",sparse_model = "prithivida/Splade_PP_en_v1",hybird=True) -> None:

        self.client = QdrantClient(path=f"vector_stores/{db_location}")

        self.client.set_model(dense_model)

        # Sparse vectors are only configured in hybrid mode; None means dense only.
        sparse_vectors_config = None
        if hybird:
            self.client.set_sparse_model(sparse_model)
            sparse_vectors_config = self.client.get_fastembed_sparse_vector_params()

        # Start from an empty collection on every construction. Done as an
        # explicit drop + create because ``recreate_collection`` is deprecated
        # in the client.
        if self.client.collection_exists("default_schema"):
            self.client.delete_collection("default_schema")

        self.client.create_collection(
            collection_name="default_schema",
            vectors_config=self.client.get_fastembed_vector_params(),
            sparse_vectors_config=sparse_vectors_config,
        )

    def add_documents(self,documents,ids=None,metadata=None,collection_name="default_schema"):
        """Embed and store ``documents`` in ``collection_name``.

        :param documents: Texts to index.
        :param ids: Point ids, one per document. When None or empty, a
            deterministic UUID5 is derived from each document's text.
        :param metadata: Optional list of payload dicts, one per document. An
            empty list is treated like None (no metadata).
        :param collection_name: Target collection.
        """
        if ids is None or len(ids) == 0:

            ids = [self.generate_uuid_based_id(doc) for doc in documents]

        # Forward "no metadata" as None (the client's default) rather than as
        # an empty list.
        # NOTE(review): an empty list is presumed to pair with zero documents
        # inside the client — verify against the qdrant-client version in use.
        if not metadata:
            metadata = None

        self.client.add(
            collection_name=collection_name,
            documents=documents,
            metadata=metadata,
            # tqdm wraps the ids so that consuming them shows a progress bar.
            ids=tqdm(ids))

    def get_relavant_documents(self, text: str,collection_name:str="default_schema",top_n_similar_docs=6):
        """Return the stored documents most similar to ``text``.

        :param text: Query text.
        :param collection_name: Collection to search.
        :param top_n_similar_docs: Maximum number of hits to return.
        :return: List of dicts with keys ``id``, ``document`` and ``url``.
            ``url`` is None for documents stored without a ``url`` metadata key.
        """
        search_result = self.client.query(
            collection_name=collection_name,
            query_text=text,
            limit=top_n_similar_docs,
        )

        # 'url' is caller-supplied metadata and may be absent; use .get so such
        # hits are still returned instead of raising KeyError.
        return [
            {"id": hit.id, "document": hit.metadata['document'], "url": hit.metadata.get('url')}
            for hit in search_result
        ]

    def generate_uuid_based_id(self, text):
        """
        Generate a full UUID based on the given text using UUID5.

        The same text always maps to the same id.

        :param text: Input text to generate the unique ID from.
        :return: A UUID string.
        """
        return str(uuid.uuid5(uuid.NAMESPACE_DNS, text))

    def delete_existing_data(self):
        """Delete the ``default_schema`` collection and return the client's result."""
        return self.client.delete_collection("default_schema")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Build the store with default settings. The constructor (re)creates the
# "default_schema" collection, so the store starts out empty.
vdb = QdrantVectorStore()

paragraphs = [
    # Topic 1: Artificial Intelligence
    "Artificial intelligence (AI) is transforming the world in unprecedented ways. With machine learning algorithms becoming more sophisticated, AI is now capable of performing tasks that once required human intelligence. From natural language processing to self-driving cars, the applications are vast and varied.",
    
    "The ethical implications of AI are a major concern for researchers. While AI can enhance efficiency and decision-making, it also raises questions about privacy, bias, and job displacement. Balancing innovation with responsibility is critical to the future of AI development.",
    
    "AI in healthcare is proving to be a game-changer. With AI-powered diagnostic tools, doctors can detect diseases earlier and with greater accuracy. AI algorithms can also help in drug discovery and personalized treatment plans, making healthcare more precise and accessible.",
    
    "Deep learning, a subset of AI, is driving advancements in image and speech recognition. These technologies are being integrated into everyday devices such as smartphones and virtual assistants, making interactions with technology more seamless and intuitive.",
    
    "As AI continues to evolve, the debate over AI regulation grows louder. Policymakers are grappling with how to create frameworks that promote innovation while ensuring that AI is used ethically and safely. International cooperation will be key in shaping the future of AI regulation.",
    
    # Topic 2: Climate Change
    "Climate change is one of the most pressing challenges facing humanity today. Rising global temperatures, melting ice caps, and extreme weather events are all symptoms of a warming planet, driven by human activities such as deforestation and burning fossil fuels.",
    
    "The Paris Agreement, signed by nearly 200 countries, aims to limit global warming to below 2 degrees Celsius. Despite these efforts, many experts warn that current commitments are not enough to meet these goals, and more aggressive action is needed.",
    
    "Renewable energy sources like solar, wind, and hydropower are critical in the fight against climate change. By transitioning to clean energy, we can reduce greenhouse gas emissions and slow the rate of global warming. However, this shift requires significant investment and global cooperation.",
    
    "Deforestation is a major contributor to climate change, as trees play a vital role in absorbing carbon dioxide from the atmosphere. Efforts to combat deforestation, such as reforestation projects and sustainable land use practices, are essential to curbing the effects of climate change.",
    
    "Climate change disproportionately affects vulnerable communities, particularly in developing countries. Rising sea levels, droughts, and extreme weather events threaten livelihoods, food security, and access to clean water. Addressing climate change requires a focus on both mitigation and adaptation strategies.",
    
    # Topic 3: Space Exploration
    "Space exploration has fascinated humanity for centuries. From the first moon landing to the Mars rover missions, each step into the cosmos brings new discoveries and insights into the nature of our universe. Space agencies like NASA and private companies like SpaceX are leading the charge in the next era of space exploration.",
    
    "Mars has been a key focus for space exploration in recent years. With multiple missions from various space agencies, scientists are searching for signs of past or present life on the red planet. The ultimate goal is to one day establish a human presence on Mars.",
    
    "The development of reusable rockets by companies like SpaceX has revolutionized space travel. By significantly reducing the cost of launching payloads into orbit, reusable rockets are making space more accessible and opening the door to new commercial opportunities.",
    
    "The search for extraterrestrial life is one of the most intriguing aspects of space exploration. Scientists are using advanced telescopes and probes to search for habitable planets and signs of life beyond our solar system. This quest could answer fundamental questions about our place in the universe.",
    
    "International collaboration is key to the future of space exploration. Agencies like NASA, ESA, and Roscosmos often work together on major projects, sharing resources and expertise. These collaborations are essential to the success of large-scale missions, such as building the next generation of space stations.",
    
    # Topic 4: Mental Health
    "Mental health is an essential part of overall well-being, yet it is often overlooked. Conditions such as depression, anxiety, and bipolar disorder affect millions of people worldwide, and access to effective treatment remains a challenge in many regions.",
    
    "The stigma surrounding mental health issues prevents many people from seeking help. Education and awareness campaigns are critical in changing public perception and encouraging individuals to prioritize their mental health and seek professional support when needed.",
    
    "Technology is playing an increasingly important role in mental health care. Apps and online platforms provide users with tools for managing stress, anxiety, and other mental health conditions. Teletherapy services have also gained popularity, offering convenient and accessible support.",
    
    "Workplace mental health is a growing area of concern. Stress, burnout, and work-life imbalance are common issues that can negatively impact employees' well-being. Companies are starting to recognize the importance of creating supportive environments that promote mental health.",
    
    "Mental health services are often underfunded, particularly in low- and middle-income countries. Expanding access to affordable, quality mental health care is crucial to addressing the global mental health crisis and ensuring that everyone can receive the support they need."
]


Fetching 5 files: 100%|██████████| 5/5 [00:00<?, ?it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<?, ?it/s]
  self.client.recreate_collection(


In [3]:
# Index the sample paragraphs; ids=[] lets the store derive an id from each text.
# One metadata dict is built per paragraph, sized from the data rather than a
# hard-coded count, and without aliasing a single dict across all entries.
vdb.add_documents(paragraphs,ids=[],metadata=[{"sample":"Sample"} for _ in paragraphs])

100%|██████████| 20/20 [00:01<00:00, 12.64it/s]


In [30]:
# Sanity check that the collection created by the constructor is present.
vdb.client.collection_exists("default_schema")

True

In [4]:
# Query the indexed sample paragraphs.
# NOTE(review): the sample documents were indexed with only a "sample" metadata
# key (no "url"), so the "url" field of each hit depends on how
# get_relavant_documents handles a missing key; no output was recorded — verify.
vdb.get_relavant_documents("Artificial Intelligence")

In [5]:
# Drop the demo collection; the recorded output shows the client returned True.
vdb.client.delete_collection("default_schema")

True

In [12]:
# Fetch the collection description for inspection.
# NOTE(review): by execution count this cell ran after the delete_collection
# cell above, so the lookup presumably fails unless the collection was
# re-created in between — confirm the intended cell order.
collection = vdb.client.get_collection("default_schema")

# Text Splitter

In [22]:
from semantic_text_splitter import TextSplitter

class CustomTextSplitter:
    """Split scraped pages into chunks and keep track of each chunk's source URL.

    :param max_chunk_length: Chunk capacity passed to the splitter (presumably
        measured in tokens of the given tiktoken model — TODO confirm).
    :param model_name: Model name used to select the tiktoken tokenizer.
    """

    def __init__(self,max_chunk_length=300,model_name="gpt-4o"):

        self.splitter = TextSplitter.from_tiktoken_model(model_name, max_chunk_length)

    def get_chunks(self,documents):
        """Chunk every document and build one metadata dict per chunk.

        :param documents: Iterable of ``(url, text)`` pairs; only indexes 0 and
            1 of each item are read.
        :return: Tuple ``(all_chunks, metadata)`` of equal length, where
            ``metadata[i]`` is ``{"url": <source of all_chunks[i]>}``.
        """
        all_chunks = []

        metadata = []

        for doc in documents:

            url, text = doc[0], doc[1]

            # A failed scrape can yield None or empty text; there is nothing to
            # split, so skip it instead of passing it on to the splitter.
            if not text:
                continue

            chunks = self.splitter.chunks(text)

            all_chunks.extend(chunks)

            # One independent dict per chunk (no aliasing between entries).
            metadata.extend({"url": url} for _ in chunks)

        return all_chunks,metadata
    
# textsplitter =CustomTextSplitter()

# chunks = textsplitter.get_chunks(paragraphs)

In [24]:
# Create the splitter in this cell so it runs on a fresh kernel; previously the
# only definition of `textsplitter` was in a commented-out line.
textsplitter = CustomTextSplitter()

# get_chunks expects (url, text) pairs; the paragraph index stands in for the URL.
chunks,meta = textsplitter.get_chunks(list(enumerate(paragraphs)))


In [28]:
# Number of chunks produced from the sample paragraphs.
len(chunks)

67

In [27]:
# Number of metadata entries; get_chunks builds one per chunk, so this should equal len(chunks).
len(meta)

67

# Search Tool

In [None]:
import asyncio
from webscraper import WebSearch
from textsplitter import CustomTextSplitter
from vectorestores import QdrantVectorStore

class SearhTool:
    """Answer-context builder: search the web, chunk the pages, index them and
    return the chunks most relevant to the question as formatted text.

    :param top_n_urls: Number of search-result URLs handed to the web search.
    :param top_n_chunks: Number of chunks retrieved from the vector store.
    """

    def __init__(self,top_n_urls=2,top_n_chunks=5):
        self.top_n_urls = top_n_urls
        self.top_n_chunks = top_n_chunks

        # Collaborators: web search + scraper, text chunker, vector store.
        self.search_Engine = WebSearch(top_n_urls=self.top_n_urls)
        self.textsplitter = CustomTextSplitter()
        self.vectore_store = QdrantVectorStore()

    async def get_online_details(self,user_question):
        """Run the whole pipeline for ``user_question``.

        :param user_question: Question used both as the search query and as the
            retrieval query against the vector store.
        :return: One string with a "Source URL / Data" section per retrieved
            chunk (empty string when nothing is retrieved).
        """
        # Step 1: search and scrape.
        print("Collecting URLs....")
        pages = await self.search_Engine.search_online(user_question)

        # Step 2: split the scraped pages into chunks plus per-chunk metadata.
        print("Scrapping Data....")
        chunk_texts, chunk_meta = self.textsplitter.get_chunks(pages)

        # Step 3: index the chunks; the empty id list is passed through as-is.
        print("Adding Data to Vectore Store....")
        self.vectore_store.add_documents(documents=chunk_texts,ids=[],metadata=chunk_meta)

        # Step 4: retrieve the chunks closest to the question.
        print("Getting Data From Vectore Store....")
        hits = self.vectore_store.get_relavant_documents(
            user_question,
            top_n_similar_docs=self.top_n_chunks,
        )

        # Step 5: render every hit as its own section and concatenate them.
        sections = [
            f"\n\nSource URL : {hit['url']}\nData : {hit['document']}\n\n"
            for hit in hits
        ]
        return "".join(sections)
    

# Smoke test for the full pipeline: build the tool, run one question and print
# the formatted context string.
if __name__ == "__main__":

    tool = SearhTool()

    print("Started......")

    # NOTE(review): asyncio.run() raises RuntimeError when an event loop is
    # already running in the current thread (typically the case in a Jupyter
    # kernel); there `await tool.get_online_details(...)` would be needed
    # instead — confirm how this cell is meant to be executed.
    data = asyncio.run(tool.get_online_details("Who is the CEO of Factspan"))

    print("Data: ",data)