In [1]:
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient

In [2]:
# MongoDB Atlas setup
client = MongoClient("mongodb+srv://swapnilsingh:ganpati@cluster0.ydftg.mongodb.net/")
db = client['vt_chatbot']
pages_collection = db['pages']

In [3]:
# Function to fetch sitemap and parse URLs
def get_urls_from_sitemap(sitemap_url):
    response = requests.get(sitemap_url)
    # Make sure 'lxml' parser is installed
    soup = BeautifulSoup(response.content, 'lxml')  # Use 'lxml' instead of 'xml'
    urls = [loc.text for loc in soup.find_all('loc')]
    return urls

In [4]:
# Function to scrape the content of each page
def scrape_page_content(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    page_title = soup.title.string if soup.title else 'No Title'
    page_content = " ".join([p.text for p in soup.find_all('p')])
    return {"url": url, "title": page_title, "content": page_content}

In [5]:
# Store VT pages in MongoDB
def store_pages_in_mongo(sitemap_url):
    urls = get_urls_from_sitemap(sitemap_url)
    for url in urls:
        page_data = scrape_page_content(url)
        pages_collection.insert_one(page_data)
        print(f"Stored {page_data['title']}")

In [6]:
# Example: Use the VT sitemap to scrape and store pages
sitemap_url = 'https://www.vt.edu/content/dam/vt_edu/sitemap.xml'
store_pages_in_mongo(sitemap_url)



Stored Home | Virginia Tech
Stored Rankings | Virginia Tech
Stored Current Students | Virginia Tech
Stored Beyond Boundaries | Virginia Tech
Stored Libraries | Virginia Tech
Stored Privacy Statement | Virginia Tech
Stored Virginia Tech Admissions Site Map | Virginia Tech
Stored Research | Virginia Tech
Stored Colleges | Virginia Tech
Stored Faculty & Staff | Virginia Tech
Stored University Status | Virginia Tech
Stored Facts About Virginia Tech | Virginia Tech
Stored 404 Resource at '/content/vtnews_vt_edu/en/tags.html' not found: No resource found
Stored Online Learning | Virginia Tech
Stored Health and Wellness | Virginia Tech
Stored Cultural Centers | Virginia Tech
Stored Discover Blacksburg | Virginia Tech
Stored 404 Error - Page Not Found | Virginia Tech
Stored Living-Learning Programs | Virginia Tech
Stored Arts@VirginiaTech | Virginia Tech
Stored Campus Locations | Virginia Tech
Stored Transfer Student Admissions | Virginia Tech
Stored Academics | Virginia Tech
Stored Maps and D

In [9]:
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

In [11]:
# Load the embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# FAISS index for storing embeddings
embedding_dim = 384  # For the 'all-MiniLM-L6-v2' model
index = faiss.IndexFlatL2(embedding_dim)

# Retrieve pages from MongoDB
def get_pages_from_mongo():
    return list(pages_collection.find({}, {'_id': 0, 'content': 1, 'title': 1, 'url': 1}))

# Create embeddings and store them in FAISS
def create_embeddings_and_store():
    pages = get_pages_from_mongo()
    for page in pages:
        content_embedding = model.encode(page['content'])
        index.add(np.array([content_embedding]))
        # You can also store a mapping between embeddings and pages in MongoDB or another structure
        pages_collection.update_one({'url': page['url']}, {'$set': {'embedding': content_embedding.tolist()}})
        print(f"Processed {page['title']}")

# Create embeddings for all scraped pages
create_embeddings_and_store()

Processed Home | Virginia Tech
Processed Rankings | Virginia Tech
Processed Current Students | Virginia Tech
Processed Beyond Boundaries | Virginia Tech
Processed Libraries | Virginia Tech
Processed Privacy Statement | Virginia Tech
Processed Virginia Tech Admissions Site Map | Virginia Tech
Processed Research | Virginia Tech
Processed Colleges | Virginia Tech
Processed Faculty & Staff | Virginia Tech
Processed University Status | Virginia Tech
Processed Facts About Virginia Tech | Virginia Tech
Processed 404 Resource at '/content/vtnews_vt_edu/en/tags.html' not found: No resource found
Processed Online Learning | Virginia Tech
Processed Health and Wellness | Virginia Tech
Processed Cultural Centers | Virginia Tech
Processed Discover Blacksburg | Virginia Tech
Processed 404 Error - Page Not Found | Virginia Tech
Processed Living-Learning Programs | Virginia Tech
Processed Arts@VirginiaTech | Virginia Tech
Processed Campus Locations | Virginia Tech
Processed Transfer Student Admissions 

In [12]:
from fastapi import FastAPI
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

In [13]:
app = FastAPI()

# Load the generative model (GPT)
gpt_model = GPT2LMHeadModel.from_pretrained('gpt2')
gpt_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Function to perform query and retrieve relevant documents using FAISS
def get_relevant_docs(query, k=3):
    query_embedding = model.encode(query)
    distances, indices = index.search(np.array([query_embedding]), k)
    relevant_docs = []
    for i in indices[0]:
        doc = pages_collection.find_one({'embedding': {'$exists': True}})
        relevant_docs.append(doc['content'])
    return relevant_docs

# Function to generate a response using GPT-2
def generate_response(retrieved_docs, query):
    context = "\n".join(retrieved_docs)
    input_text = f"Query: {query}\nContext: {context}\nAnswer:"
    input_ids = gpt_tokenizer.encode(input_text, return_tensors='pt')
    output = gpt_model.generate(input_ids, max_length=200)
    return gpt_tokenizer.decode(output, skip_special_tokens=True)

@app.post("/query")
async def query_vt_chatbot(query: str):
    # Step 1: Retrieve relevant documents
    relevant_docs = get_relevant_docs(query)
    
    # Step 2: Generate response
    response = generate_response(relevant_docs, query)
    return {"response": response}

# To run the FastAPI app, execute the following command:
# uvicorn filename:app --reload

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [15]:
from fastapi import FastAPI, Request
from pydantic import BaseModel

In [16]:
app = FastAPI()

# Example of request body for querying
class QueryRequest(BaseModel):
    query: str

# Example of endpoint for querying
@app.post("/query")
async def query_rag(request: QueryRequest):
    query = request.query
    # Perform RAG operations (retrieval + generation)
    # Example response (this should be replaced with actual RAG logic)
    response = {
        "query": query,
        "retrieved_docs": ["Document 1", "Document 2", "Document 3"],
        "generated_answer": "This is the generated answer based on the query and retrieved documents."
    }
    return response

In [17]:
import requests
import json

# URL of the FastAPI RAG system
URL = "http://localhost:8000/query"

# Function to test the RAG model
def test_rag_model(query_text):
    # Prepare the payload with the query
    payload = {
        "query": query_text
    }
    
    # Send POST request to the FastAPI server
    response = requests.post(URL, json=payload)
    
    # Check if the response is valid
    if response.status_code == 200:
        response_data = response.json()
        # Display the query, retrieved documents, and generated answer
        print("Query:", response_data.get("query"))
        print("\nRetrieved Documents:")
        for doc in response_data.get("retrieved_docs", []):
            print(f"- {doc}")
        print("\nGenerated Answer:")
        print(response_data.get("generated_answer"))
    else:
        print(f"Failed to query the RAG system. Status code: {response.status_code}")

if __name__ == "__main__":
    # Test with a sample query
    test_query = "What are the available courses at VT?"
    test_rag_model(test_query)


ConnectTimeout: HTTPConnectionPool(host='localhost', port=8000): Max retries exceeded with url: /query (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x44be69790>, 'Connection to localhost timed out. (connect timeout=None)'))