<a href="https://colab.research.google.com/github/Ranamoeed/CodeBankVC/blob/main/Untitled.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import copy
import logging
import os
import random
import threading
from functools import lru_cache
from urllib.parse import parse_qs, quote_plus, urljoin, urlsplit

import requests
from bs4 import BeautifulSoup

# Configure the root logger at import time so the INFO/WARNING/ERROR messages
# emitted through the module-level ``logging`` calls below are shown.
logging.basicConfig(level=logging.INFO)

class WebDataFetcherProcessor:
    """Search DuckDuckGo for a query and scrape the resulting pages.

    For every result page the paragraph text, title/description metadata and
    image URLs are extracted, and the images are downloaded into
    ``image_download_folder``.

    Successful page fetches and searches are memoised per instance in small
    bounded dictionaries. Failures are never cached, so a transient network
    error can be retried, and callers always receive copies so they may freely
    mutate what they get back.
    """

    # Upper bound on the number of entries kept in each per-instance cache.
    _CACHE_MAXSIZE = 128
    # Base URL used to make links scraped from the results page absolute.
    _SEARCH_BASE_URL = "https://duckduckgo.com/"

    def __init__(self, user_agents=None, timeout=30):
        """Create a fetcher.

        Args:
            user_agents: Optional list of User-Agent strings to rotate
                through; a built-in list of desktop browsers is used when
                omitted or empty.
            timeout: Timeout in seconds passed to every HTTP request.
        """
        self.user_agents = user_agents or [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
        ]
        self.timeout = timeout
        self.image_download_folder = "images"
        # Per-instance caches. A class-level ``lru_cache`` on the methods
        # would key on ``self`` and keep every instance alive indefinitely.
        self._page_cache = {}
        self._search_cache = {}

    def _remember(self, cache, key, value):
        """Store ``value`` under ``key``, evicting the oldest entry when full."""
        if key not in cache and len(cache) >= self._CACHE_MAXSIZE:
            # Dicts iterate in insertion order, so the first key is the oldest.
            del cache[next(iter(cache))]
        cache[key] = value

    def _make_request(self, url):
        """GET ``url`` and return the response body as text, or ``None`` on failure."""
        try:
            headers = self._get_random_user_agent()
            response = requests.get(url, headers=headers, timeout=self.timeout)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            logging.error("Request failed for URL '%s': %s", url, e)
            return None

    def _get_random_user_agent(self):
        """Return request headers carrying a randomly chosen User-Agent."""
        return {'User-Agent': random.choice(self.user_agents)}

    @staticmethod
    def _resolve_result_link(href):
        """Turn a link scraped from the results page into a fetchable URL.

        Scheme-relative and relative links are made absolute. Links that point
        at the ``/l/`` redirect endpoint on duckduckgo.com are replaced by the
        target carried in their ``uddg`` query parameter.
        NOTE(review): the redirect format is based on observed DuckDuckGo HTML
        responses; confirm against a live response.
        """
        absolute = urljoin(WebDataFetcherProcessor._SEARCH_BASE_URL, href)
        parts = urlsplit(absolute)
        if parts.netloc.endswith("duckduckgo.com") and parts.path.rstrip("/") == "/l":
            targets = parse_qs(parts.query).get("uddg")
            if targets:
                return targets[0]
        return absolute

    @staticmethod
    def _image_filename(url):
        """Derive a local filename from the path component of ``url``.

        The query string and fragment are ignored; ``"image"`` is used when
        the path has no final component (e.g. the URL ends with ``/``).
        """
        return os.path.basename(urlsplit(url).path) or "image"

    def fetch_website_content(self, url):
        """Download ``url`` and extract its text, metadata and image URLs.

        Returns:
            A dict with keys ``"text_content"`` (non-empty ``<p>`` texts
            joined by newlines), ``"meta_data"`` (see ``extract_metadata``)
            and ``"image_urls"`` (absolute URLs), or ``None`` when the page
            could not be fetched or parsed.
        """
        cached = self._page_cache.get(url)
        if cached is not None:
            return copy.deepcopy(cached)
        try:
            html = self._make_request(url)
            if not html:
                return None
            soup = BeautifulSoup(html, 'html.parser')
            paragraphs = (p.text.strip() for p in soup.find_all('p'))
            text_content = "\n".join(text for text in paragraphs if text)
            meta_data = self.extract_metadata(soup)
            # Resolve relative ``src`` values against the page URL so that
            # download_image receives something requests can fetch.
            image_urls = [
                urljoin(url, img['src'])
                for img in soup.find_all('img')
                if img.get('src')
            ]
            page = {"text_content": text_content, "meta_data": meta_data, "image_urls": image_urls}
        except Exception as e:
            logging.error("Error parsing website content from %s: %s", url, e)
            return None
        self._remember(self._page_cache, url, page)
        # Hand out a copy so caller-side mutation cannot corrupt the cache.
        return copy.deepcopy(page)

    def extract_metadata(self, soup):
        """Return ``{"title": ..., "description": ...}`` for a parsed page.

        Missing pieces (no ``<title>``, a title without a single string, no
        description ``<meta>`` tag, or one without ``content``) yield ``""``.
        """
        title_tag = soup.title
        # ``.string`` is None when the tag is empty or has several children.
        title = (title_tag.string or "").strip() if title_tag is not None else ""
        description_tag = soup.find("meta", {"name": "description"})
        if description_tag is not None:
            description = (description_tag.get("content") or "").strip()
        else:
            description = ""
        return {"title": title, "description": description}

    def download_image(self, url):
        """Download ``url`` into ``image_download_folder``.

        Returns:
            The path of the written file, or ``None`` if anything failed
            (the error is logged).
        """
        try:
            headers = self._get_random_user_agent()
            # Same timeout as _make_request so a stalled server cannot hang us.
            response = requests.get(url, headers=headers, timeout=self.timeout)
            response.raise_for_status()
            # The folder is not created anywhere else; make sure it exists.
            os.makedirs(self.image_download_folder, exist_ok=True)
            image_path = os.path.join(self.image_download_folder, self._image_filename(url))
            with open(image_path, "wb") as f:
                f.write(response.content)
            return image_path
        except Exception as e:
            logging.error("Error downloading image from %s: %s", url, e)
            return None

    def search(self, query, num_results=10):
        """Return up to ``num_results`` result URLs for ``query``.

        An empty list is returned when the request fails, nothing is found,
        or the results page cannot be parsed.
        """
        cache_key = (query, num_results)
        cached = self._search_cache.get(cache_key)
        if cached is not None:
            return list(cached)
        try:
            # quote_plus escapes '&', '#', '+' etc., not just spaces.
            search_url = f"https://duckduckgo.com/html/?q={quote_plus(query)}"
            html = self._make_request(search_url)
            if not html:
                return []
            soup = BeautifulSoup(html, 'html.parser')
            links = [
                self._resolve_result_link(a['href'])
                for a in soup.find_all('a', class_='result__a')
                if a.get('href')
            ][:num_results]
        except Exception as e:
            logging.error("Error searching for '%s': %s", query, e)
            return []
        if links:
            # Stored as a tuple so the cached value itself is immutable.
            self._remember(self._search_cache, cache_key, tuple(links))
        return links

    def process_query(self, query, num_websites=10):
        """Search for ``query`` and scrape up to ``num_websites`` result pages.

        Returns:
            A list with one dict per successfully fetched page: the dict from
            ``fetch_website_content`` plus ``"image_paths"``, holding the
            result of ``download_image`` for each image URL (``None`` entries
            mark failed downloads). An empty list is returned when there are
            no results or an unexpected error occurs.
        """
        try:
            search_results = self.search(query, num_websites)
            if not search_results:
                logging.warning("No search results found.")
                return []
            websites_data = []
            for link in search_results:
                website_data = self.fetch_website_content(link)
                if not website_data:
                    continue
                website_data["image_paths"] = [
                    self.download_image(img_url) for img_url in website_data["image_urls"]
                ]
                websites_data.append(website_data)
            return websites_data
        except Exception as e:
            logging.error("Error processing query '%s': %s", query, e)
            return []

def process_query_on_thread(query, num_websites=10):
    """Process one query with a fresh ``WebDataFetcherProcessor``.

    Intended as a thread target: each call builds its own processor, so no
    processor instance is shared between threads.

    Args:
        query: Search query string.
        num_websites: Maximum number of result pages to scrape. Defaults to
            10, matching ``WebDataFetcherProcessor.process_query``, so the
            function can be called with the query alone.

    Returns:
        Whatever ``WebDataFetcherProcessor.process_query`` returns for the
        query.
    """
    return WebDataFetcherProcessor().process_query(query, num_websites)

def main(queries, num_websites=10):
    """Process every query on its own thread and collect the results.

    Args:
        queries: Iterable of search query strings.
        num_websites: Maximum number of result pages to scrape per query.

    Returns:
        A list aligned with ``queries``: entry ``i`` is the value returned by
        ``process_query_on_thread`` for the i-th query, or ``[]`` if that
        worker raised before producing a result.
    """
    queries = list(queries)
    results = [[] for _ in queries]

    def worker(index, query):
        # Pass num_websites explicitly: the thread target takes two arguments.
        # Each worker writes only to its own slot, so no lock is needed.
        results[index] = process_query_on_thread(query, num_websites)

    threads = [
        threading.Thread(target=worker, args=(index, query))
        for index, query in enumerate(queries)
    ]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    return results