### Importing Nessesery Libraries

In [2]:
import requests
from bs4 import BeautifulSoup
import csv  
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import re
from nltk.stem import WordNetLemmatizer

In [3]:
url = 'https://www.langchain.com'

### Extracting Hyperlinks

In [3]:
def get_links(url):
    """Fetches all valid links from the given URL."""
    try:
        response = requests.get(url) 
        response.raise_for_status()  # Raise error for bad responses (404, 500, etc.)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return set()  # Return an empty set on failure

    soup = BeautifulSoup(response.text, 'html.parser')
    links = {a.get('href') for a in soup.find_all('a', href=True)}
    # Only keep valid links (absolute URLs starting with http)
    valid_links = {link for link in links if link.startswith('http')}
    
    return valid_links

In [4]:
def extract_links_depth_2(start_url):
    """Extracts links up to depth 2."""
    all_links = set()  # Stores all links found
    
    # Depth 1: Get links from the start URL
    level_1_links = get_links(start_url)
    all_links.update(level_1_links)  # Store these links

    # Depth 2: Get links from each link found at depth 1
    for link in level_1_links:
        level_2_links = get_links(link)  # Extract links from each level 1 link
        all_links.update(level_2_links)  # Store these links

    return all_links

In [5]:
start_url = "https://python.langchain.com/docs/"  # Replace with any valid website
links = extract_links_depth_2(start_url)

In [6]:
len(set(links))

266

In [7]:
list_link=list(links)
print(list_link)

['https://github.com/baskaryan', 'https://academy.langchain.com/', 'https://github.com/langchain-ai/langgraph/edit/main/docs/docs/concepts/index.md', 'https://github.com/langchain-ai/langgraph/blob/main/CONTRIBUTING.md', 'https://langchain-ai.github.io/langgraph/concepts/high_level/', 'https://python.langchain.com/docs/concepts/messages/', 'https://js.langchain.com/docs/', 'https://python.langchain.com/docs/introduction', 'https://research.google/pubs/pub37252/', 'https://github.com/langchain-ai/langchain/releases', 'https://www.smith.langchain.com', 'https://github.com/langchain-ai/langchainjs/blob/main/CONTRIBUTING.md', 'https://langchain-ai.github.io/langgraph/reference/types/#langgraph.types.Command', 'https://github.com/langchain-ai/langgraph/edit/main/docs/docs/tutorials/index.md', 'https://docs.smith.langchain.com/reference/js', 'https://github.com/features/actions', 'https://www.youtube.com/about/copyright/', 'https://github.com/features/discussions', 'https://python.langchain.

### Trying To Remove Invalid Links By Validating 

In [8]:
def valid_links(url):
    """Check if a link is valid and reachable."""
    try:
        response = requests.get(url)
        return response.status_code == 200  # Valid if status code is 200
    except requests.exceptions.RequestException:
        return False  # Invalid link
    

In [9]:
validlink = [link for link in list_link if valid_links(link)]

In [13]:
len(set(validlink))

261

### Saving It As CSV file

In [20]:
# Write list to a CSV file (as a single row)
with open("recent_links.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(validlink)  # Save as a single row with values separated by commas

print("CSV file saved successfully!")

CSV file saved successfully!


In [34]:
validlink=set(validlink)
len(validlink)

269

### Extracting Text

In [85]:
! pip install lxml

^C





[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [14]:
def text_extractor(url):
    try:
        response=requests.get(url,timeout=5)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")

    response=requests.get(url)
    soup=BeautifulSoup(response.text,'html.parser')
    text=soup.get_text(separator=' ',strip=True)
    return text

In [15]:
info_list=[]
for i in validlink:
    fultext=text_extractor(i)
    info_list.append(fultext)
    

  k = self.parse_starttag(i)


In [19]:
validlink=(list(validlink))
print(len(validlink))
print(len(info_list))

261
261


## Normalization

In [20]:
data_frame=pd.DataFrame({'URL':validlink,'documents':info_list})
data_frame

Unnamed: 0,URL,documents
0,https://github.com/baskaryan,baskaryan (Bagatur) · GitHub Skip to content N...
1,https://academy.langchain.com/,LangChain Academy Docs Python LangChain LangSm...
2,https://github.com/langchain-ai/langgraph/edit...,Sign in to GitHub · GitHub Skip to content You...
3,https://github.com/langchain-ai/langgraph/blob...,langgraph/CONTRIBUTING.md at main · langchain-...
4,https://langchain-ai.github.io/langgraph/conce...,Why LangGraph? Skip to content Join us at Inte...
...,...,...
256,https://langchain-ai.github.io/langgraph/conce...,Concepts Skip to content Join us at Interrupt:...
257,https://opensource.org/licenses/MIT,The MIT License – Open Source Initiative Skip ...
258,https://github.com/nfcampos,nfcampos (Nuno Campos) · GitHub Skip to conten...
259,https://www.youtube.com/about/policies/,YouTube Community Guidelines and policies - Ho...


### Case Normalization

In [21]:
data_frame['documents']=data_frame['documents'].str.lower()
data_frame

Unnamed: 0,URL,documents
0,https://github.com/baskaryan,baskaryan (bagatur) · github skip to content n...
1,https://academy.langchain.com/,langchain academy docs python langchain langsm...
2,https://github.com/langchain-ai/langgraph/edit...,sign in to github · github skip to content you...
3,https://github.com/langchain-ai/langgraph/blob...,langgraph/contributing.md at main · langchain-...
4,https://langchain-ai.github.io/langgraph/conce...,why langgraph? skip to content join us at inte...
...,...,...
256,https://langchain-ai.github.io/langgraph/conce...,concepts skip to content join us at interrupt:...
257,https://opensource.org/licenses/MIT,the mit license – open source initiative skip ...
258,https://github.com/nfcampos,nfcampos (nuno campos) · github skip to conten...
259,https://www.youtube.com/about/policies/,youtube community guidelines and policies - ho...


### Tokenization

In [22]:
# Make sure to download the punkt tokenizer models if not already done
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SHASHANK_\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [23]:
data_frame['tokens']= data_frame['documents'].apply(word_tokenize)

In [24]:
# data_frame

### Removing Puctuations

In [25]:
def remove_punctuation(tokens):
    cleaned_tokens = []  # Initialize an empty list
    for word in tokens:
        cleaned_word = re.sub(r'[^\w\s]', '', word)  # Remove punctuation
        if cleaned_word:  # Only add non-empty words
            cleaned_tokens.append(cleaned_word)
    return cleaned_tokens

In [26]:
data_frame['cleaned_tokens']=data_frame['tokens'].apply(remove_punctuation)

## Lematization

In [27]:
# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

In [28]:
# Function to perform lemmatization
def lemmatize_tokens(tokens):
    lemmatized_words = []
    for word in tokens:
        lemma = lemmatizer.lemmatize(word)  # Convert to lemma
        lemmatized_words.append(lemma)
    return lemmatized_words

In [29]:
data_frame['lemmatized']=data_frame['cleaned_tokens'].apply(lemmatize_tokens)

In [30]:
i_list=data_frame['lemmatized'].to_list()


In [31]:
u_link=data_frame['URL'].to_list()
# data_frame=data_frame.drop(0, axis=0).reset_index(drop=True)

### Trying To Do Striding And Padding And Storing Data In Dataframe

In [37]:
stride=40
token_size=128
data_list=[]
data_frame

Unnamed: 0,URL,documents,tokens,cleaned_tokens,lemmatized
0,https://github.com/baskaryan,baskaryan (bagatur) · github skip to content n...,"[baskaryan, (, bagatur, ), ·, github, skip, to...","[baskaryan, bagatur, github, skip, to, content...","[baskaryan, bagatur, github, skip, to, content..."
1,https://academy.langchain.com/,langchain academy docs python langchain langsm...,"[langchain, academy, docs, python, langchain, ...","[langchain, academy, docs, python, langchain, ...","[langchain, academy, doc, python, langchain, l..."
2,https://github.com/langchain-ai/langgraph/edit...,sign in to github · github skip to content you...,"[sign, in, to, github, ·, github, skip, to, co...","[sign, in, to, github, github, skip, to, conte...","[sign, in, to, github, github, skip, to, conte..."
3,https://github.com/langchain-ai/langgraph/blob...,langgraph/contributing.md at main · langchain-...,"[langgraph/contributing.md, at, main, ·, langc...","[langgraphcontributingmd, at, main, langchaina...","[langgraphcontributingmd, at, main, langchaina..."
4,https://langchain-ai.github.io/langgraph/conce...,why langgraph? skip to content join us at inte...,"[why, langgraph, ?, skip, to, content, join, u...","[why, langgraph, skip, to, content, join, us, ...","[why, langgraph, skip, to, content, join, u, a..."
...,...,...,...,...,...
256,https://langchain-ai.github.io/langgraph/conce...,concepts skip to content join us at interrupt:...,"[concepts, skip, to, content, join, us, at, in...","[concepts, skip, to, content, join, us, at, in...","[concept, skip, to, content, join, u, at, inte..."
257,https://opensource.org/licenses/MIT,the mit license – open source initiative skip ...,"[the, mit, license, –, open, source, initiativ...","[the, mit, license, open, source, initiative, ...","[the, mit, license, open, source, initiative, ..."
258,https://github.com/nfcampos,nfcampos (nuno campos) · github skip to conten...,"[nfcampos, (, nuno, campos, ), ·, github, skip...","[nfcampos, nuno, campos, github, skip, to, con...","[nfcampos, nuno, campos, github, skip, to, con..."
259,https://www.youtube.com/about/policies/,youtube community guidelines and policies - ho...,"[youtube, community, guidelines, and, policies...","[youtube, community, guidelines, and, policies...","[youtube, community, guideline, and, policy, h..."


In [39]:
# Function to split document into overlapping chunks
def chunk_document(text, link):
    words = text  # Tokenize the document into words
    data_list = []  # Store chunks along with their link
    i = 0

    while i < len(words):
            chunk = words[i:i + token_size]  # Extract chunk
            i += stride  # Move by stride

            # If the last chunk is shorter, pad it with '0'
            while len(chunk) < token_size:
                chunk.append('0')

            data_list.append((' '.join(chunk), link)) 
    return data_list

In [40]:
processed_data = []
for index, row in data_frame.iterrows():
    processed_data.extend(chunk_document(row['lemmatized'], row['URL']))

In [49]:
len(processed_data)

11077

In [50]:
# Convert processed data into a DataFrame
df_chunks = pd.DataFrame(processed_data, columns=['tokens', 'link'])

In [54]:
df_chunks

Unnamed: 0,tokens,link
0,baskaryan bagatur github skip to content navig...,https://github.com/baskaryan
1,manage code change discussion collaborate outs...,https://github.com/baskaryan
2,use case by industry healthcare financial serv...,https://github.com/baskaryan
3,fund open source developer the readme project ...,https://github.com/baskaryan
4,jump to search code repository user issue pull...,https://github.com/baskaryan
...,...,...
11072,different langchainjs module together you can ...,https://github.com/langchain-ai/langchain-next...
11073,out the doc here http jslangchaincomdocs deplo...,https://github.com/langchain-ai/langchain-next...
11074,comment reach out to u on twitter langchainai ...,https://github.com/langchain-ai/langchain-next...
11075,repository release no release published packag...,https://github.com/langchain-ai/langchain-next...


### Converting The Textual Data To Embeddings And Storing It In Faiss

In [55]:
import faiss
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [61]:
import numpy as np 

In [63]:
df_chunks

Unnamed: 0,tokens,link
0,baskaryan bagatur github skip to content navig...,https://github.com/baskaryan
1,manage code change discussion collaborate outs...,https://github.com/baskaryan
2,use case by industry healthcare financial serv...,https://github.com/baskaryan
3,fund open source developer the readme project ...,https://github.com/baskaryan
4,jump to search code repository user issue pull...,https://github.com/baskaryan
...,...,...
11072,different langchainjs module together you can ...,https://github.com/langchain-ai/langchain-next...
11073,out the doc here http jslangchaincomdocs deplo...,https://github.com/langchain-ai/langchain-next...
11074,comment reach out to u on twitter langchainai ...,https://github.com/langchain-ai/langchain-next...
11075,repository release no release published packag...,https://github.com/langchain-ai/langchain-next...


In [57]:
# 2. Load the pre-trained model
model = SentenceTransformer('all-MiniLM-L6-v2')

In [64]:
# 3. Convert documents to embeddings
document_embeddings = model.encode(df_chunks['tokens'].tolist())

In [66]:
dimension = document_embeddings.shape[1]
dimension

384

### Implementing Cosine Similarity

In [72]:
#  Normalize embeddings before adding them to FAISS (cosine similarity)
faiss.normalize_L2(document_embeddings)

In [73]:
# Create FAISS index
index = faiss.IndexFlatIP(dimension)  # IP = Inner Product (used for cosine similarity)
index.add(np.array(document_embeddings))

In [58]:
import cohere

In [59]:
# Initialize the Cohere client with your API key
co = cohere.Client("zBcppvYKxb70AMbo7HVKBwcBrNIVVLuEZoTgHrbh")

### Running In Playground 

In [9]:
# ! pip install gradio

In [81]:
import gradio as gr

In [83]:
df_chunks

Unnamed: 0,tokens,link
0,baskaryan bagatur github skip to content navig...,https://github.com/baskaryan
1,manage code change discussion collaborate outs...,https://github.com/baskaryan
2,use case by industry healthcare financial serv...,https://github.com/baskaryan
3,fund open source developer the readme project ...,https://github.com/baskaryan
4,jump to search code repository user issue pull...,https://github.com/baskaryan
...,...,...
11072,different langchainjs module together you can ...,https://github.com/langchain-ai/langchain-next...
11073,out the doc here http jslangchaincomdocs deplo...,https://github.com/langchain-ai/langchain-next...
11074,comment reach out to u on twitter langchainai ...,https://github.com/langchain-ai/langchain-next...
11075,repository release no release published packag...,https://github.com/langchain-ai/langchain-next...


In [95]:
def final_function(query):
    query_embedding = model.encode([query])
    faiss.normalize_L2(query_embedding)  # Normalize query for cosine similarity

    distances, indices = index.search(np.array(query_embedding),3)

    results = []
    for i in range(3):
        doc_index = indices[0][i]
        results.append({
            'document': df_chunks.iloc[doc_index]['tokens'],
            'similarity_score': float(distances[0][i]),
            'links': df_chunks.iloc[doc_index]['link']  # Now it's cosine similarity
        })
    new=''
    for j in results:
        new=new+j['document']

    urls=''
    for k in results:
        urls=urls+k['links']+','+' '


    

    prompt  = f"""
You are an AI assistant with deep knowledge of various topics. 
Use the following context to answer the user's question accurately in a summary form  covering all answer. 
If the context does not provide enough information, say "I don't know".
If the context is not able to have information regarding query say"I DONT KNOW" 

    Context:
   {new}

   Question:
   {query}

   Answer (Be concise and informative):
    """

    # Call Cohere’s API to generate text
    response = co.generate(
        model="command-xlarge-nightly",  # Choose a Cohere model
        prompt=prompt, 
        max_tokens=1000,  # Limit response length
        temperature=0.5,  # Lower temp = more factual, higher = more creative
        stop_sequences=["\n"]
    )
    answer=[]
    answer.append(response.generations[0].text.strip())
    answer.append(f' ,links:({urls})')
    return ' '.join(answer)

    
    

In [88]:
final_function('what is the difference between langraph and langchain')

'Langchain and Langgraph are both tools within the Langchain ecosystem, but they serve different purposes. Langchain provides a standard interface to interact with models and components, making it useful for straightforward retrieval and chain-of-thought processes. On the other hand, Langgraph is an orchestration framework designed for more complex, company-specific tasks, offering a low-level and controllable environment without the restrictions of a single black-box cognitive architecture. Langgraph is particularly beneficial for building and deploying agents with long-term memory capabilities, ensuring efficient performance without adding overhead to your code.  ,links:(https://academy.langchain.com/courses/intro-to-langgraph https://langchain-ai.github.io/langgraph/tutorials/introduction/ https://academy.langchain.com/courses/intro-to-langgraph )'

In [98]:
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🌈 AI Chatbot")  # Styled heading
    gr.Markdown("💡 **Ask me anything!** I'm here to assist you.")
    
    with gr.Row():  # Arrange elements in a row
        input_box = gr.Textbox(placeholder="Type your message...", label="Your Query")
        submit_btn = gr.Button("🚀 Ask")

    output_box = gr.Textbox(label="Bot's Response", lines=20)

    submit_btn.click(final_function, inputs=input_box, outputs=output_box)

demo.launch()

* Running on local URL:  http://127.0.0.1:7866

To create a public link, set `share=True` in `launch()`.


