# **1. Web Crawling with Scrapy**

In [None]:
# Install Scrapy and other necessary libraries
# NOTE(review): scrapy-crawlera is installed here but is not referenced by any code
# visible in this notebook — confirm it is still needed.
!pip install scrapy
!pip install scrapy-crawlera

# Create a Scrapy project
# NOTE(review): the spider in the next cell runs in-process through CrawlerProcess, so the
# generated project files do not appear to be used — confirm before relying on them.
!scrapy startproject cuda_docs

# Navigate to the project directory
# Relative paths used by later cells (e.g. cuda_docs.json) resolve from here on.
%cd cuda_docs

In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess

class CudaSpider(scrapy.Spider):
    """Crawl the NVIDIA CUDA documentation and emit one item per HTML page.

    Each yielded item is a dict with the page ``url`` and its text ``content``.
    """

    name = "cuda"
    start_urls = ['https://docs.nvidia.com/cuda/']
    # Only links that resolve under this prefix are followed.
    allowed_prefix = 'https://docs.nvidia.com/cuda/'

    def parse(self, response):
        """Follow in-scope links, then yield the current page's URL and text.

        Args:
            response: The downloaded page.

        Yields:
            A request for every link that resolves under ``allowed_prefix``,
            followed by a ``{'url': ..., 'content': ...}`` dict for this page.
        """
        # Non-text downloads (PDFs, archives, images) have no selector support;
        # calling .css()/.xpath() on them raises, so skip them.
        if not isinstance(response, scrapy.http.TextResponse):
            return

        # Extract text content and links
        for link in response.css('a::attr(href)').getall():
            if not link:
                continue
            # Resolve first: hrefs may be relative ("guide/index.html"),
            # root-relative ("/cuda/...") or absolute. Testing the raw href for a
            # "/cuda/" prefix only ever matches the root-relative form.
            absolute_url = response.urljoin(link)
            if absolute_url.startswith(self.allowed_prefix):
                yield response.follow(absolute_url, self.parse)

        # Leave out <script>/<style> bodies so the stored text is page prose, and
        # drop whitespace-only nodes so sentence splitting sees clean input.
        text_nodes = response.xpath(
            '//text()[not(ancestor::script) and not(ancestor::style)]'
        ).getall()
        fragments = (node.strip() for node in text_nodes)

        yield {
            'url': response.url,
            'content': ' '.join(fragment for fragment in fragments if fragment)
        }

# Configure and run the crawler
# "overwrite": True makes every run replace cuda_docs.json. Without it Scrapy appends
# to an existing local file, producing two JSON arrays back to back that json.load()
# can no longer parse.
process = CrawlerProcess(settings={
    "FEEDS": {
        "cuda_docs.json": {"format": "json", "overwrite": True},
    },
})

process.crawl(CudaSpider)
# Blocks until the crawl has finished. The underlying Twisted reactor cannot be started
# twice in one process, so restart the kernel before re-running this cell.
process.start()


# **2. Data Chunking and Embedding Creation**

In [None]:
# Install the necessary libraries
!pip install sentence-transformers
!pip install nltk

import json
import nltk
from sentence_transformers import SentenceTransformer, util

# sent_tokenize needs the Punkt sentence models. Older NLTK releases look them up as
# 'punkt', newer ones as 'punkt_tab', so fetch both; an id the installed NLTK does not
# know is reported by the downloader rather than raised.
nltk.download('punkt')
nltk.download('punkt_tab')

# Load the scraped data
with open('cuda_docs.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Initialize the sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Smallest group of similar sentences that community_detection reports as a cluster.
MIN_COMMUNITY_SIZE = 2

# Chunk the data based on sentence similarity
chunks = []
for entry in data:
    sentences = nltk.sent_tokenize(entry['content'])
    if not sentences:
        # Page with no text: nothing to embed or cluster.
        continue

    if len(sentences) < MIN_COMMUNITY_SIZE:
        # Too short to form a cluster; keep the page as one chunk instead of losing it.
        chunks.append({'url': entry['url'], 'content': ' '.join(sentences)})
        continue

    embeddings = model.encode(sentences, convert_to_tensor=True)
    clusters = util.community_detection(
        embeddings, min_community_size=MIN_COMMUNITY_SIZE, threshold=0.75
    )

    # NOTE(review): sentences that end up in no cluster are not stored anywhere.
    for cluster in clusters:
        chunk = {
            'url': entry['url'],
            # Join in document order so the chunk reads like the source page.
            'content': ' '.join(sentences[i] for i in sorted(cluster))
        }
        chunks.append(chunk)


In [None]:
# Installs the latest pymilvus release (no version pin).
# NOTE(review): the cells that follow uninstall pymilvus and reinstall it pinned to
# 2.2.11, which supersedes this install.
!pip install pymilvus


# **3. Vector Database Creation with Milvus**

In [None]:
# Remove any installed gRPC / pymilvus packages (-y skips the confirmation prompt)
# ahead of the pinned reinstall in the next cell.
!pip uninstall grpcio grpcio-tools pymilvus -y


In [None]:
# Install gRPC and pymilvus at fixed versions.
!pip install grpcio==1.53.0 grpcio-tools==1.53.0
!pip install pymilvus==2.2.11
# sentence-transformers is also installed (unpinned) by the chunking cell; this repeats it.
!pip install sentence-transformers


In [None]:
 # Import required modules
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility
import numpy as np
from sentence_transformers import SentenceTransformer

# Connect to Milvus
try:
    connections.connect("default", host="localhost", port="19530")
    print("Connected to Milvus")
except Exception as e:
    print(f"Error connecting to Milvus: {e}")
    # Every statement below needs a live connection; stop here with the real cause
    # instead of failing later with a less helpful error.
    raise

# Define the schema
# NOTE(review): no field stores the chunk text, although later cells read a "content"
# field from search hits — confirm whether the schema should carry it.
fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=384),
    FieldSchema(name="url", dtype=DataType.VARCHAR, max_length=500)
]

schema = CollectionSchema(fields, description="CUDA documentation chunks")

# Create a collection if it doesn't exist
collection_name = "cuda_docs"
if utility.has_collection(collection_name):
    collection = Collection(collection_name)
    print(f"Collection '{collection_name}' already exists.")
else:
    collection = Collection(name=collection_name, schema=schema)
    print(f"Collection '{collection_name}' created.")

# Prepare the data
model = SentenceTransformer('all-MiniLM-L6-v2')

# NOTE(review): this sample list replaces the `chunks` built by the chunking cell.
chunks = [
    {"content": "CUDA is a parallel computing platform and programming model.", "url": "https://docs.nvidia.com/cuda/cuda-introduction/index.html"},
    # Add more chunks as needed
]

embeddings = [model.encode(chunk['content']).tolist() for chunk in chunks]
urls = [chunk['url'] for chunk in chunks]

# Insert the data
try:
    # "id" is declared auto_id=True, so Milvus generates the primary keys. Supply only
    # the remaining fields, in schema order; passing an id column as well makes the
    # column count disagree with the schema and the insert is rejected.
    data = [embeddings, urls]
    mr = collection.insert(data)
    # Flush so the freshly inserted rows are counted by num_entities below.
    collection.flush()
    print("Data inserted successfully")
except Exception as e:
    print(f"Error inserting data into Milvus: {e}")

# Check the number of entities
print(f"Number of entities in collection: {collection.num_entities}")


In [None]:
# Repeats the pinned pymilvus / grpcio installs from the earlier cell in one command
# (grpcio-tools is not listed here).
!pip install pymilvus==2.2.11 sentence-transformers grpcio==1.53.0


In [None]:
# Import required modules
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility
from sentence_transformers import SentenceTransformer

# Connect to Milvus
try:
    connections.connect("default", host="localhost", port="19530")
    print("Connected to Milvus")
except Exception as e:
    print(f"Error connecting to Milvus: {e}")
    # The collection lookup below cannot work without a connection; surface the
    # original failure instead of continuing.
    raise

# Define the schema
# Three fields: an auto-generated INT64 primary key, a 384-dimensional float vector,
# and the source URL (up to 500 characters).
fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=384),
    FieldSchema(name="url", dtype=DataType.VARCHAR, max_length=500)
]

schema = CollectionSchema(fields, description="CUDA documentation chunks")

# Create a collection if it doesn't exist
# When the collection already exists it is reused as is, and `schema` is not applied.
collection_name = "cuda_docs"
if utility.has_collection(collection_name):
    collection = Collection(collection_name)
    print(f"Collection '{collection_name}' already exists.")
else:
    collection = Collection(name=collection_name, schema=schema)
    print(f"Collection '{collection_name}' created.")


In [None]:
# Load the SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Example data
chunks = [
    {"content": "CUDA is a parallel computing platform and programming model.", "url": "https://docs.nvidia.com/cuda/cuda-introduction/index.html"},
    # Add more chunks as needed
]

# Prepare the data: one embedding (as a plain list of floats) and one URL per chunk
embeddings = [model.encode(chunk['content']).tolist() for chunk in chunks]
urls = [chunk['url'] for chunk in chunks]

# Insert the data into Milvus
try:
    # The primary key is auto-generated (auto_id=True in the schema), so only the
    # embedding and url columns are supplied, in schema order. Sending an explicit id
    # column as well is rejected because the column count no longer matches.
    data = [
        embeddings,
        urls
    ]
    mr = collection.insert(data)
    # Flush so that num_entities below includes the rows just inserted.
    collection.flush()
    print("Data inserted successfully")
except Exception as e:
    print(f"Error inserting data into Milvus: {e}")

# Check the number of entities
if collection:
    print(f"Number of entities in collection: {collection.num_entities}")
else:
    print("Collection object is not defined.")


In [None]:
from pymilvus import utility

# Load the collection
# Milvus 2.2+ refuses to load a collection whose vector field has no index, and no
# earlier cell creates one. Build an IVF_FLAT / L2 index on first use; it matches the
# metric ("L2") and the "nprobe" parameter used by retrieve().
if not collection.has_index():
    collection.create_index(
        field_name="embedding",
        index_params={
            "index_type": "IVF_FLAT",
            "metric_type": "L2",
            "params": {"nlist": 128},
        },
    )
collection.load()

def retrieve(query, top_k=10):
    """Embed ``query`` and search the collection for its nearest neighbours.

    Args:
        query: Text to search for.
        top_k: Maximum number of hits to return.

    Returns:
        Whatever ``collection.search`` returns for this single query vector; each
        hit is asked to carry the ``url`` field.
    """
    return collection.search(
        data=[model.encode(query)],
        anns_field="embedding",
        param={"metric_type": "L2", "params": {"nprobe": 10}},
        limit=top_k,
        expr=None,
        output_fields=["url"],
    )


# Run one sample search; `results` feeds the re-ranking step.
query = "What is CUDA?"
results = retrieve(query)

# Re-rank based on similarity
def re_rank(results, query):
    """Order the hits of the first search result from most to least relevant.

    Args:
        results: Return value of ``collection.search``; only ``results[0]`` is used.
        query: The query text the search was run with.

    Returns:
        list: The hits of ``results[0]``, best match first.
    """
    query_embedding = model.encode(query, convert_to_tensor=True)
    scores = []

    for result in results[0]:
        # retrieve() only requests the "url" field, so the stored vector is normally
        # not part of the hit. Depending on the pymilvus version a missing field is
        # reported as None or as an exception; treat both as "not available".
        try:
            embedding = result.entity.get("embedding")
        except Exception:
            embedding = None

        if embedding is not None:
            score = float(util.pytorch_cos_sim(query_embedding, embedding))
        else:
            # Fall back to the search distance. The search metric is L2, where a
            # smaller distance is a better match, hence the negation.
            score = -float(result.distance)
        scores.append((result, score))

    # Plain floats keep the sort key unambiguous (the original compared tensors).
    scores.sort(key=lambda x: x[1], reverse=True)
    return [result[0] for result in scores]

re_ranked_results = re_rank(results, query)


# **4. Retrieval and Re-ranking**

In [None]:
!pip install transformers

from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")
# Bound to its own name: rebinding `model` here would replace the SentenceTransformer
# whose .encode() retrieve() and re_rank() call.
qa_model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased-distilled-squad")

def answer_question(question, context):
    """Extract the span of ``context`` that best answers ``question``.

    Args:
        question: The question text.
        context: The passage to search for the answer.

    Returns:
        str: The decoded answer span; empty when the predicted end precedes the start.
    """
    # Truncate only the context (second segment) so an over-long passage is cut to
    # the model's input limit instead of failing inside the model.
    inputs = tokenizer.encode_plus(
        question,
        context,
        add_special_tokens=True,
        return_tensors="pt",
        truncation="only_second",
    )
    input_ids = inputs["input_ids"].tolist()[0]

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = qa_model(**inputs)
    answer_start = torch.argmax(outputs.start_logits)  # Get the most likely beginning of answer
    answer_end = torch.argmax(outputs.end_logits) + 1  # Get the most likely end of answer

    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
    return answer

# Get the context from the re-ranked results
# The collection schema in this notebook defines only id / embedding / url, so the
# "content" field may be missing from a hit; an empty result list is possible too.
# Handle both instead of crashing inside the tokenizer.
context = None
if re_ranked_results:
    try:
        context = re_ranked_results[0].entity.get("content")
    except Exception as e:
        print(f"Could not read 'content' from the top result: {e}")

# Get the answer
if context:
    answer = answer_question(query, context)
    print(f"Answer: {answer}")
else:
    print("No context text available for the query; cannot answer.")


# **5. Question Answering with an LLM**

In [None]:
# Install transformers package if not already installed
!pip install transformers

from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

# Load the pretrained tokenizer and model for question answering
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")
# Bound to its own name: rebinding `model` here would replace the SentenceTransformer
# whose .encode() retrieve() and re_rank() call.
qa_model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased-distilled-squad")

# Define a function to answer questions given a context
def answer_question(question, context):
    """Extract the span of ``context`` that best answers ``question``.

    Args:
        question: The question text.
        context: The passage to search for the answer.

    Returns:
        str: The decoded answer span; empty when the predicted end precedes the start.
    """
    # Truncate only the context (second segment) so an over-long passage is cut to
    # the model's input limit instead of failing inside the model.
    inputs = tokenizer.encode_plus(
        question,
        context,
        add_special_tokens=True,
        return_tensors="pt",
        truncation="only_second",
    )
    input_ids = inputs["input_ids"].tolist()[0]

    # Get the model's predictions
    with torch.no_grad():
        outputs = qa_model(**inputs)

    # Get the most likely start and end positions of the answer
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1

    # Convert the token ids of the predicted span back to a string answer
    answer = tokenizer.decode(input_ids[answer_start:answer_end])

    return answer

# Stand-in for the retrieval step (replace with actual retrieved content):
# a single hand-written result shaped like {"entity": {"content": ...}}.
query = "What is CUDA?"
re_ranked_results = [
    {
        "entity": {
            "content": "CUDA is a parallel computing platform and programming model.",
        },
    },
]

# The top-ranked entry supplies the context passage.
context = re_ranked_results[0]["entity"]["content"]

# Run extractive QA on that passage and show the result.
answer = answer_question(query, context)
print(f"Answer: {answer}")


In [None]:
# Launch the Streamlit UI defined in app.py.
# NOTE(review): app.py is written by the %%writefile cell that comes after this one, so
# on a fresh top-to-bottom run the file does not exist yet when this line executes.
# NOTE(review): no cell visible in this notebook installs streamlit — confirm it is available.
!streamlit run app.py


# **6. (Optional) User Interface with Streamlit**

In [None]:


# Create a Streamlit app
%%writefile app.py
import streamlit as st
import json
import nltk
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection

# Models: a sentence encoder for retrieval plus an extractive QA model and its tokenizer.
QA_CHECKPOINT = "distilbert-base-uncased-distilled-squad"
model = SentenceTransformer('all-MiniLM-L6-v2')
tokenizer = AutoTokenizer.from_pretrained(QA_CHECKPOINT)
qa_model = AutoModelForQuestionAnswering.from_pretrained(QA_CHECKPOINT)

# Milvus: open the default connection, then load the collection so it can be searched.
connections.connect(alias="default", host="localhost", port="19530")
collection = Collection(name="cuda_docs")
collection.load()

def retrieve(query, top_k=10):
    """Search the collection for the entries closest to ``query``.

    Args:
        query: Text to search for.
        top_k: Maximum number of hits to return.

    Returns:
        The value returned by ``collection.search`` for the single query vector;
        hits are asked to carry the ``url`` field.
    """
    vector = model.encode(query)
    search_kwargs = dict(
        data=[vector],
        anns_field="embedding",
        param={"metric_type": "L2", "params": {"nprobe": 10}},
        limit=top_k,
        expr=None,
        output_fields=["url"],
    )
    hits = collection.search(**search_kwargs)
    return hits

def re_rank(results, query):
    """Order the hits of the first search result from most to least relevant.

    Args:
        results: Return value of ``collection.search``; only ``results[0]`` is used.
        query: The query text the search was run with.

    Returns:
        list: The hits of ``results[0]``, best match first.
    """
    query_embedding = model.encode(query, convert_to_tensor=True)
    scores = []

    for result in results[0]:
        # retrieve() only requests the "url" field, so the stored vector is normally
        # not part of the hit. Depending on the pymilvus version a missing field is
        # reported as None or as an exception; treat both as "not available".
        try:
            embedding = result.entity.get("embedding")
        except Exception:
            embedding = None

        if embedding is not None:
            score = float(util.pytorch_cos_sim(query_embedding, embedding))
        else:
            # Fall back to the search distance. The search metric is L2, where a
            # smaller distance is a better match, hence the negation.
            score = -float(result.distance)
        scores.append((result, score))

    # Plain floats keep the sort key unambiguous (the original compared tensors).
    scores.sort(key=lambda x: x[1], reverse=True)
    return [result[0] for result in scores]

def answer_question(question, context):
    """Extract the span of ``context`` that best answers ``question``.

    Args:
        question: The question text.
        context: The passage to search for the answer.

    Returns:
        str: The decoded answer span; empty when the predicted end precedes the start.
    """
    # Truncate only the context (second segment) so an over-long passage is cut to
    # the model's input limit instead of failing inside the model.
    inputs = tokenizer.encode_plus(
        question,
        context,
        add_special_tokens=True,
        return_tensors="pt",
        truncation="only_second",
    )
    input_ids = inputs["input_ids"].tolist()[0]

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = qa_model(**inputs)
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1

    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
    return answer

# Streamlit app
st.title("NVIDIA CUDA Documentation QA System")

query = st.text_input("Enter your query:")
# Ignore input that is empty or only whitespace.
if query and query.strip():
    results = retrieve(query)
    re_ranked_results = re_rank(results, query)

    # retrieve() requests only the "url" field, and a search can come back empty, so
    # the context text may be unavailable. Show a message instead of a traceback.
    context = None
    if re_ranked_results:
        try:
            context = re_ranked_results[0].entity.get("content")
        except Exception:
            context = None

    if context:
        answer = answer_question(query, context)
        st.write(f"Answer: {answer}")
        st.write(f"Context: {context}")
    else:
        st.warning("No documentation text was found for this query.")
