In [5]:
import os
import requests
from bs4 import BeautifulSoup
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import SeleniumURLLoader, UnstructuredURLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Define the path where you want to save the Faiss index
DB_FAISS_PATH = r'vector_space\faiss'

# Function to load data from URLs
def load_url_data(base_url):
    try:
        # Send a request to the base URL
        reqs = requests.get(base_url)
        soup = BeautifulSoup(reqs.text, 'html.parser')

        # Extract URLs from the base page
        urls = [link.get('href') for link in soup.find_all('a') if link.get('href')]

        # List to store secondary URLs
        secondary_urls = []

        # Extract URLs from the linked pages
        for url in urls:
            try:
                reqs = requests.get(url)
                if reqs.status_code == 200:
                    soup = BeautifulSoup(reqs.text, 'html.parser')
                    secondary_urls.extend([link.get('href') for link in soup.find_all('a') if link.get('href')])
                else:
                    print(f"Access Denied for URL: {url}")
            except Exception as e:
                print(f"Error occurred for URL: {url} - {str(e)}")

        # Combine both lists of URLs
        all_urls = urls + secondary_urls

        # Log the number of URLs found
        print(f"Total URLs found: {len(all_urls)}")
        print(all_urls)

        # Use SeleniumURLLoader to load data from the URLs
        data_loader = SeleniumURLLoader(urls=all_urls)
        print("Loading data from URLs...")

        # Load the documents from URLs
        documents = data_loader.load()
        print("Data loaded successfully from URLs.")

        return documents
    except Exception as e:
        print(f"Error occurred while loading URL data: {str(e)}")
        return []

# Function to create embeddings and save the Faiss index
def vector(documents):
    try:
        # Step 2: Split the text content of documents into smaller chunks
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
        print("Splitting text into chunks...")

        # Split documents into chunks of text
        texts = text_splitter.split_documents(documents)
        print(f"Total text chunks created: {len(texts)}")

        # Step 3: Create embeddings for the text using Hugging Face's Sentence Transformers
        embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2', model_kwargs={'device': 'cpu'})
        print("Creating text embeddings...")

        # Step 4: Use Faiss to create an index for the text data
        db = FAISS.from_documents(texts, embeddings)
        print("Text embeddings created and indexed.")

        return db
    except Exception as e:
        print(f"Error occurred while creating embeddings: {str(e)}")
        return None

# Example usage
if __name__ == "__main__":
    base_url = "https://mydukaan.io/"  # Replace with your starting URL
    documents = load_url_data(base_url)
    if documents:
        db = vector(documents)
        if db:
            # Save the FAISS index to disk
            os.makedirs(DB_FAISS_PATH, exist_ok=True)
            db.save_local(DB_FAISS_PATH)
            print(f"FAISS index saved at {DB_FAISS_PATH}")
        else:
            print("Failed to create FAISS index.")
    else:
        print("No documents loaded.")


Error occurred for URL: / - Invalid URL '/': No scheme supplied. Perhaps you meant https:///?
Error occurred for URL: / - Invalid URL '/': No scheme supplied. Perhaps you meant https:///?
Error occurred for URL: / - Invalid URL '/': No scheme supplied. Perhaps you meant https:///?
Error occurred for URL: / - Invalid URL '/': No scheme supplied. Perhaps you meant https:///?
Error occurred for URL: /themes - Invalid URL '/themes': No scheme supplied. Perhaps you meant https:///themes?
Access Denied for URL: https://angel.co/company/dukaan-app/jobs
Error occurred for URL: / - Invalid URL '/': No scheme supplied. Perhaps you meant https:///?
Error occurred for URL: /tools - Invalid URL '/tools': No scheme supplied. Perhaps you meant https:///tools?
Error occurred for URL: /dukaan-for-pc - Invalid URL '/dukaan-for-pc': No scheme supplied. Perhaps you meant https:///dukaan-for-pc?
Error occurred for URL: /delivery - Invalid URL '/delivery': No scheme supplied. Perhaps you meant https:///deli

Error fetching or processing /, exception: Message: invalid argument
  (Session info: chrome-headless-shell=125.0.6422.78)
Stacktrace:
	GetHandleVerifier [0x00007FF7761A1F22+60322]
	(No symbol) [0x00007FF77611CE99]
	(No symbol) [0x00007FF775FD7CE9]
	(No symbol) [0x00007FF775FBFCCE]
	(No symbol) [0x00007FF775FBE1E5]
	(No symbol) [0x00007FF775FBE9DC]
	(No symbol) [0x00007FF775FDAC91]
	(No symbol) [0x00007FF77606C5FE]
	(No symbol) [0x00007FF77604C21A]
	(No symbol) [0x00007FF77606BC80]
	(No symbol) [0x00007FF77604BFC3]
	(No symbol) [0x00007FF776019617]
	(No symbol) [0x00007FF77601A211]
	GetHandleVerifier [0x00007FF7764B946D+3301613]
	GetHandleVerifier [0x00007FF776503693+3605267]
	GetHandleVerifier [0x00007FF7764F9410+3563664]
	GetHandleVerifier [0x00007FF7762542F6+790390]
	(No symbol) [0x00007FF7761274DF]
	(No symbol) [0x00007FF7761233D4]
	(No symbol) [0x00007FF776123562]
	(No symbol) [0x00007FF776112F6F]
	BaseThreadInitThunk [0x00007FFA38C4257D+29]
	RtlUserThreadStart [0x00007FFA39BEAA48

Data loaded successfully from URLs.
Splitting text into chunks...
Total text chunks created: 15148
Creating text embeddings...
Text embeddings created and indexed.
FAISS index saved at vector_space\faiss


In [16]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader
from bs4 import BeautifulSoup as Soup
import os
import threading
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2', model_kwargs={'device': 'cpu'})
        # Load the FAISS database
faiss_db = DB_FAISS_PATH
db = FAISS.load_local(faiss_db, embeddings)

query="what is dukaan"
retriever = db.as_retriever()
docs = retriever.invoke(query)
docs

[Document(page_content='Dukaan is an eCommerce website builder that comes with an array of features like advanced analytics, optimized check-outs, responsive themes, etc., which are essential to skyrocket the growth of your business. It is designed keeping the convenience of its users in mind. With its hassle-free design approach, you can set up your store in just 30 seconds, reducing your time-to-market and allowing you to focus on scaling your business.', metadata={'source': 'https://mydukaan.io/blog/ecommerce-website-builder-for-small-business/', 'title': 'Best eCommerce Website Builder for Small Business in 2024', 'description': 'Take a look at our compilation of the BEST eCommerce website builder for small business to find the perfect fit to take your business online.', 'language': 'en-US'}),
 Document(page_content='Dukaan is an eCommerce website builder that comes with an array of features like advanced analytics, optimized check-outs, responsive themes, etc., which are essential

In [17]:
docs = db.similarity_search(query)
docs[0]

Document(page_content='Dukaan is an eCommerce website builder that comes with an array of features like advanced analytics, optimized check-outs, responsive themes, etc., which are essential to skyrocket the growth of your business. It is designed keeping the convenience of its users in mind. With its hassle-free design approach, you can set up your store in just 30 seconds, reducing your time-to-market and allowing you to focus on scaling your business.', metadata={'source': 'https://mydukaan.io/blog/ecommerce-website-builder-for-small-business/', 'title': 'Best eCommerce Website Builder for Small Business in 2024', 'description': 'Take a look at our compilation of the BEST eCommerce website builder for small business to find the perfect fit to take your business online.', 'language': 'en-US'})

In [18]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader
from bs4 import BeautifulSoup as Soup
import os
import threading
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2', model_kwargs={'device': 'cpu'})
        # Load the FAISS database
faiss_db = DB_FAISS_PATH
db = FAISS.load_local(faiss_db, embeddings)
query="how it boost sales"
docs_and_scores = db.similarity_search_with_score(query)
docs_and_scores
for item in docs_and_scores[:1]:
    document, score = item
    page_content = document.page_content
    print("Page Content:", page_content)
    print("Score:", score)

Page Content: 1. SEO (Search Engine Optimization)

You can increase your sales by making your store and product listings more visible to users looking for similar or related things. Your websiteâ€™s ranking on search engine results pages can be improved significantly if you use some SEO marketing best practices including meta descriptions, relevant images, and tags.

2. Paid ads
Score: 0.63286674
