### Importing Libraries

In [95]:
import scrapy
from scrapy.crawler import CrawlerProcess
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pandas as pd


###  Creating Summarization of Documents

In [2]:
# Load the summarization model
summarizer = pipeline("summarization", model="google/flan-t5-base")

Device set to use cpu


In [3]:
# List to store extracted data
scraped_data = []

class LinksSpider(scrapy.Spider):
    name = "link_spider"
    start_urls = ["https://www.langchain.com"]  # Replace with your target link

    def parse(self, response):
        # Extract all links from the main page
        links = response.css("a::attr(href)").getall()
        for link in links:
            if link.startswith("http"):  # Ensure it's a valid absolute URL
                yield response.follow(link, callback=self.parse_link)

    def parse_link(self, response):
        # Extract text content from the page
        text_content = " ".join(response.css("p::text").getall())  # Extract all paragraph text

        if text_content:
            # Generate a summary (max length 100 words)
            summary = summarizer(text_content, max_length=100, min_length=30, do_sample=False)[0]["summary_text"]
        else:
            summary = "No summary available"

        # Store extracted data in a list
        scraped_data.append({"url": response.url,
            "title": response.css("title::text").get(),"summary": summary})

In [4]:
# Run the spider in Jupyter Notebook
process = CrawlerProcess()
process.crawl(LinksSpider)
process.start()


2025-02-11 12:23:19 [scrapy.utils.log] INFO: Scrapy 2.12.0 started (bot: scrapybot)
2025-02-11 12:23:19 [scrapy.utils.log] INFO: Versions: lxml 5.3.0.0, libxml2 2.11.7, cssselect 1.2.0, parsel 1.10.0, w3lib 2.3.1, Twisted 24.11.0, Python 3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 13:17:27) [MSC v.1929 64 bit (AMD64)], pyOpenSSL 25.0.0 (OpenSSL 3.4.0 22 Oct 2024), cryptography 44.0.0, Platform Windows-11-10.0.22631-SP0
2025-02-11 12:23:19 [scrapy.addons] INFO: Enabled addons:
[]
2025-02-11 12:23:19 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2025-02-11 12:23:19 [scrapy.extensions.telnet] INFO: Telnet Password: 2606bb6bba98daed
2025-02-11 12:23:19 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.logstats.LogStats']
2025-02-11 12:23:19 [scrapy.crawler] INFO: Overridden settings:
{}
2025-02-11 12:23:19 [scrapy.middleware] INFO: Enabl

### Converting The Documents Infromation To Dataframe

In [19]:
lists=[]
for a in scraped_data:
    lists.append(a['summary'])

In [21]:
lists

['A publicly-traded financial technology platform is helping large companies like Meta, Uber, H&M, and Microsoft achieve their ambitions faster by providing end-to-end payments capabilities, data-driven insights, and financial products in a single global solution. With more merchants signing on and with increased transaction volume comes increased pressure on support teams and a team at Adyen that immediately sought out leveraged solutions.',
 'Elastic, a leading search analytics company, serving over 20k customers worldwide, enables organizations to securely harness search-powered AI so anyone can find the answers they need in real-time using all their data, at scale.',
 'Ally Financial, the largest digital-only bank in the US and a leading auto lender, has recently collaborated with LangChain to release the first initial coding module that addresses a significant challenge for AI developers working with personal identifiable information (PII) in highly regulated, consumer-focused ind

In [22]:
summary_lists=[]
for g in lists:
    if g not in summary_lists:
        summary_lists.append(g)

In [23]:
summary_lists

['A publicly-traded financial technology platform is helping large companies like Meta, Uber, H&M, and Microsoft achieve their ambitions faster by providing end-to-end payments capabilities, data-driven insights, and financial products in a single global solution. With more merchants signing on and with increased transaction volume comes increased pressure on support teams and a team at Adyen that immediately sought out leveraged solutions.',
 'Elastic, a leading search analytics company, serving over 20k customers worldwide, enables organizations to securely harness search-powered AI so anyone can find the answers they need in real-time using all their data, at scale.',
 'Ally Financial, the largest digital-only bank in the US and a leading auto lender, has recently collaborated with LangChain to release the first initial coding module that addresses a significant challenge for AI developers working with personal identifiable information (PII) in highly regulated, consumer-focused ind

In [25]:
data_frame=pd.DataFrame(summary_lists,columns=['DOCUMENTS'])

In [26]:
data_frame

Unnamed: 0,DOCUMENTS
0,A publicly-traded financial technology platfor...
1,"Elastic, a leading search analytics company, s..."
2,"Ally Financial, the largest digital-only bank ..."
3,Create a new folder for the project. Run the c...
4,Exploring at the Edge of AI Agents Software En...
5,We'll build a chatbot in LangGraph that can: ...
6,Unresolved incident: Intermittent delays on ru...
7,Build context-aware reasoning applications wit...
8,See if you can write your own integrations . B...
9,No summary available


In [27]:
data_frame['doc_ID']=data_frame.index+1

In [29]:
data_frame=data_frame[['doc_ID','DOCUMENTS']]

In [30]:
data_frame

Unnamed: 0,doc_ID,DOCUMENTS
0,1,A publicly-traded financial technology platfor...
1,2,"Elastic, a leading search analytics company, s..."
2,3,"Ally Financial, the largest digital-only bank ..."
3,4,Create a new folder for the project. Run the c...
4,5,Exploring at the Edge of AI Agents Software En...
5,6,We'll build a chatbot in LangGraph that can: ...
6,7,Unresolved incident: Intermittent delays on ru...
7,8,Build context-aware reasoning applications wit...
8,9,See if you can write your own integrations . B...
9,10,No summary available


### Storing Embending In Fiass Index And Getting Similarity Distance Based On Query Provided

In [31]:
! pip install faiss-cpu sentence-transformers 

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp312-cp312-win_amd64.whl.metadata (4.5 kB)
Downloading faiss_cpu-1.10.0-cp312-cp312-win_amd64.whl (13.7 MB)
   ---------------------------------------- 0.0/13.7 MB ? eta -:--:--
   ------- -------------------------------- 2.6/13.7 MB 16.9 MB/s eta 0:00:01
   --------- ------------------------------ 3.1/13.7 MB 18.5 MB/s eta 0:00:01
   --------- ------------------------------ 3.1/13.7 MB 18.5 MB/s eta 0:00:01
   --------- ------------------------------ 3.1/13.7 MB 18.5 MB/s eta 0:00:01
   -------------- ------------------------- 5.0/13.7 MB 4.9 MB/s eta 0:00:02
   ---------------- ----------------------- 5.8/13.7 MB 5.4 MB/s eta 0:00:02
   ----------------- ---------------------- 6.0/13.7 MB 4.2 MB/s eta 0:00:02
   ----------------------- ---------------- 8.1/13.7 MB 5.1 MB/s eta 0:00:02
   ----------------------- ---------------- 8.1/13.7 MB 5.1 MB/s eta 0:00:02
   -------------------------- ------------- 9.2/13.7 MB 4.6 MB/s eta 0:0

In [33]:
# 2. Load the pre-trained model
model = SentenceTransformer('all-MiniLM-L6-v2')

2025-02-11 13:16:24 [sentence_transformers.SentenceTransformer] INFO: Use pytorch device_name: cpu
2025-02-11 13:16:24 [sentence_transformers.SentenceTransformer] INFO: Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2025-02-11 13:16:24 [urllib3.connectionpool] DEBUG: Resetting dropped connection: huggingface.co
2025-02-11 13:16:24 [urllib3.connectionpool] DEBUG: https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/modules.json HTTP/1.1" 200 0
2025-02-11 13:16:25 [urllib3.connectionpool] DEBUG: https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/config_sentence_transformers.json HTTP/1.1" 200 0
2025-02-11 13:16:25 [urllib3.connectionpool] DEBUG: https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/README.md HTTP/1.1" 200 0
2025-02-11 13:16:25 [urllib3.connectionpool] DEBUG: https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/modules.json HTTP/1.1" 

In [35]:
# 3. Convert documents to embeddings
document_embeddings = model.encode(data_frame['DOCUMENTS'].tolist())

In [59]:
dimension = document_embeddings.shape[1]
dimension

384

In [45]:
# Initialize FAISS index
index = faiss.IndexFlatL2(dimension)

In [46]:
# Store embeddings in the FAISS index
index.add(np.array(document_embeddings))

In [88]:
def search_query(query, top_k=3):
    # Convert query into embedding
    query_embedding = model.encode([query])
    
    # Search the FAISS index for relevant documents
    distances, indices = index.search(np.array(query_embedding), top_k)
    
    # Return matching documents and corresponding IDs
    results = []
    for i in range(top_k):
        doc_index = indices[0][i]  # Get index from FAISS results
        results.append({
            'ID': int(data_frame.iloc[doc_index]['doc_ID']),  # Use index to get the corresponding ID
            'document': data_frame.iloc[doc_index]['DOCUMENTS'],  # Use index to get the document
            'similarity_score': float(distances[0][i]) # Similarity score from FAISS
        })
    
    return results,indices

In [96]:
results = search_query('what is langchain')

In [97]:
results

[{'ID': 9,
  'document': 'See if you can write your own integrations . Browse the LangChain integrations list to see if there are any providers you want to integrate with.',
  'similarity_score': 0.5289301872253418},
 {'ID': 14,
  'document': "Learn the basics of LangChain's LLM platform. Learn how to build agentic and multi-agent applications. Get started with LangChiain, LangSmith, and LangGraph.",
  'similarity_score': 0.43096399307250977},
 {'ID': 3,
  'document': 'Ally Financial, the largest digital-only bank in the US and a leading auto lender, has recently collaborated with LangChain to release the first initial coding module that addresses a significant challenge for AI developers working with personal identifiable information (PII) in highly regulated, consumer-focused industries.',
  'similarity_score': 0.3278048634529114}]

### Performing Cosine Similarity Search 

In [79]:
#  Normalize embeddings before adding them to FAISS (cosine similarity)
faiss.normalize_L2(document_embeddings)

In [80]:
# Create FAISS index
index = faiss.IndexFlatIP(dimension)  # IP = Inner Product (used for cosine similarity)
index.add(np.array(document_embeddings))

In [81]:
# modifying search function

In [92]:
def search_query(query, top_k=3):
    query_embedding = model.encode([query])
    faiss.normalize_L2(query_embedding)  # Normalize query for cosine similarity

    distances, indices = index.search(np.array(query_embedding), top_k)

    results = []
    for i in range(top_k):
        doc_index = indices[0][i]
        results.append({
            'ID': int(data_frame.iloc[doc_index]['doc_ID']),
            'document': data_frame.iloc[doc_index]['DOCUMENTS'],
            'similarity_score': float(distances[0][i])  # Now it's cosine similarity
        })
    
    return results

In [98]:
answer=search_query('what is langchain')

In [99]:
answer

[{'ID': 9,
  'document': 'See if you can write your own integrations . Browse the LangChain integrations list to see if there are any providers you want to integrate with.',
  'similarity_score': 0.5289301872253418},
 {'ID': 14,
  'document': "Learn the basics of LangChain's LLM platform. Learn how to build agentic and multi-agent applications. Get started with LangChiain, LangSmith, and LangGraph.",
  'similarity_score': 0.43096399307250977},
 {'ID': 3,
  'document': 'Ally Financial, the largest digital-only bank in the US and a leading auto lender, has recently collaborated with LangChain to release the first initial coding module that addresses a significant challenge for AI developers working with personal identifiable information (PII) in highly regulated, consumer-focused industries.',
  'similarity_score': 0.3278048634529114}]