# Get content

In [14]:
from bs4 import BeautifulSoup
import requests
from googlesearch import search

def get_content_from_url(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        # Remove script and style elements
        for script in soup(["script", "style"]):
            script.decompose()
        # Get text content
        text = soup.get_text()
        # Clean up whitespace
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = ' '.join(chunk for chunk in chunks if chunk)
        return text
    except:
        return ""

topic = input("Enter a topic to search: ")
content = ""

# Get first 3 search results
for url in search(topic, num_results=3):
    print(url)
    content += get_content_from_url(url) + "\n\n"

content

Trying URL: https://computer.com/


'HOME - Computer.Com Skip to content Products Edge Network Infrastructure as a Service Platform as a Service Virtual & Dedicated Servers Video Streaming Platform Security AI & Machine Learning Cloud for Mobile Custom Services Edge Network CDN Next-gen CDN for dynamic and static content delivery Image Stack Dynamic image optimization: WebP, AVIF, cropping, resizing DNS Hosting Managed DNS service for mission-critical availability Cloud Storage S3-compatible storage for cloud-native environment Public DNS Fast, secure, and free DNS resolver that protects your privacy. Network Map Reliable global infrastructure for utmost service availability Edge Network All-in-one edge platform for better application delivery Pricing Choose the billing plan that suits your business needs Cloud Products Cloud Edge Services All-in-one edge platform for better application delivery Virtual Instances Virtual machines with pay-as-you-go billing and customizable configurations Computer Basic Shared virtual mac

# Cleaning/Preprocessing

In [2]:
import re

def clean_text(text):
    text = re.sub(r"\[.*?\]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

cleaned_data = clean_text(content)
print("Cleaned text:\n", cleaned_data[:500])


Cleaned text:
 What Is NLP (Natural Language Processing)? | IBM Home Topics Natural language processing What is NLP (natural language processing)? Learn about IBM's NLP solutions Sign up for AI updates Updated: 11 August 2024 Contributor: Cole Stryker, Jim Holdsworth What is NLP? Natural language processing (NLP) is a subfield of computer science and artificial intelligence (AI) that uses machine learning to enable computers to understand and communicate with human language. NLP enables computers and digital d


In [3]:
len(cleaned_data)

109961

# Knowledge base using sentence embenddings

In [6]:
from sentence_transformers import SentenceTransformer
import faiss # Facebook AI Similarity Seacrh
import numpy as np

model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
chunks = [cleaned_data[i:i+500] for i in range(0, len(cleaned_data), 500)] # chunking
embeddings = model.encode(chunks) # embeddings for each chunk
# embeddings array of the form (N,D) where N = no. of chunks and D = dim of embeddings vec


dimension = embeddings.shape[1] # extract D
index = faiss.IndexFlatL2(dimension) # initialize L2 eucidean distance for similarity b/w vecs; saying it has dimension number of dimensions for the vectors
index.add(np.array(embeddings)) # add embeddings to FAISS index; FAISS build internal struct for optimal search
print(f"Indexed {len(chunks)} chunks.")


  from .autonotebook import tqdm as notebook_tqdm


Indexed 62 chunks.


# Basic ver

In [9]:
from transformers import pipeline
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

In [10]:
def find_closest_chunk(question):
    question_embedding = model.encode([question])
    _, closest_index = index.search(np.array(question_embedding), 1)
    return chunks[closest_index[0][0]]

def answer_question(question):
    context = find_closest_chunk(question)
    result = qa_pipeline(question=question, context=context)
    return result['answer']

question = "what is natural language processing?"
answer = answer_question(question)
print(f"Question: {question}\nAnswer: {answer}")


Question: what is natural language processing?
Answer: providing computers with the ability to process data encoded in natural language


# proper RAG

## Query processing

In [11]:
user_query = "What is Natural Language Processing?"

query_embedding = model.encode([user_query])
D, I = index.search(query_embedding, k=10)  # top 5

relevant_passages = [chunks[i] for i in I[0]]

In [12]:
relevant_passages

['Natural language processing (NLP) is a subfield of computer science and especially artificial intelligence. It is primarily concerned with providing computers with the ability to process data encoded in natural language and is thus closely related to information retrieval, knowledge representation and computational linguistics, a subfield of linguistics. Typically data is collected in text corpora, using either rule-based, statistical or neural-based approaches in machine learning and deep learn',
 ' can be derived from a natural language expression which usually takes the form of organized notations of natural language concepts. Introduction and creation of language metamodel and ontology are efficient however empirical solutions. An explicit formalization of natural language semantics without confusions with implicit assumptions such as closed-world assumption (CWA) vs. open-world assumption, or subjective Yes/No vs. objective True/False is expected for the construction of a basis 

In [13]:
relevant_passages_str = ''.join(relevant_passages)

In [14]:
relevant_passages_str

'Natural language processing (NLP) is a subfield of computer science and especially artificial intelligence. It is primarily concerned with providing computers with the ability to process data encoded in natural language and is thus closely related to information retrieval, knowledge representation and computational linguistics, a subfield of linguistics. Typically data is collected in text corpora, using either rule-based, statistical or neural-based approaches in machine learning and deep learn can be derived from a natural language expression which usually takes the form of organized notations of natural language concepts. Introduction and creation of language metamodel and ontology are efficient however empirical solutions. An explicit formalization of natural language semantics without confusions with implicit assumptions such as closed-world assumption (CWA) vs. open-world assumption, or subjective Yes/No vs. objective True/False is expected for the construction of a basis of sem

## Contextual generation

In [18]:
# too heavy

from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="mps"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)


messages = [
    {"role": "system", "content": relevant_passages_str},
    {"role": "user", "content": user_query}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

ImportError: Using `low_cpu_mem_usage=True` or a `device_map` requires Accelerate: `pip install 'accelerate>=0.26.0'`

In [None]:
response

In [None]:
# Faster on colab:
# https://colab.research.google.com/drive/1D9i2py0A1Q09b8QzDymus_vWfL5HpW_o?usp=sharing