In [20]:
# Install the notebook's dependencies into the environment of the running kernel.
# The explicit %pip magic does not depend on IPython's automagic setting.
%pip install requests beautifulsoup4 transformers faiss-cpu sentence-transformers




In [22]:
import requests
from bs4 import BeautifulSoup

def scrape_website(url, timeout=10):
    """Download a web page and return the text of all its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``, so a ``(connect, read)`` tuple is
            accepted as well.

    Returns:
        The paragraph texts joined by single spaces, or ``None`` when the
        response status is not 200 or any error occurs while fetching or
        parsing. A message is printed in each failure case.
    """
    # Set a user-agent to simulate a browser request
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    try:
        # requests never times out unless told to, so an unresponsive server
        # would otherwise block this cell indefinitely.
        response = requests.get(url, headers=headers, timeout=timeout)

        if response.status_code != 200:
            print(f"Failed to retrieve content from {url}. Status code: {response.status_code}")
            return None

        # Parse HTML content
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract all textual content from paragraphs
        return " ".join(p.get_text() for p in soup.find_all('p'))
    except Exception as e:
        # Deliberately best-effort: callers rely on None (not an exception)
        # to signal that nothing could be scraped.
        print(f"An error occurred while scraping {url}: {e}")
        return None

# Demo: scrape the UChicago home page and preview what came back
url = "https://www.uchicago.edu/"
website_text = scrape_website(url)

# Show the first 500 characters, or a failure notice when nothing was scraped
print(website_text[:500] if website_text else "Failed to retrieve website content.")


A diversity of people and ideas, coupled with free and open discourse, lays the foundation for students and scholars to bring forth original ideas that define fields and enrich human life. UChicago students develop the habits of mind and intellectual skills needed to confront complex challenges. UChicago researchers have contributed to some of the world’s greatest discoveries, advancements, and bodies of knowledge. Faculty have a free and challenging environment in which to pursue the most origi


In [23]:
def chunk_text(text, chunk_size=500):
    """Split ``text`` into consecutive pieces of at most ``chunk_size`` characters.

    The split is purely positional, so a chunk may begin or end in the middle
    of a word.

    Args:
        text: The string to split. ``None`` or an empty string yields no chunks.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of substrings that, concatenated in order, reproduce ``text``.
        Only the last chunk can be shorter than ``chunk_size``.

    Raises:
        ValueError: If ``chunk_size`` is not a positive number.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    # A failed scrape hands us None; treat that like empty text rather than
    # crashing on len(None).
    if not text:
        return []

    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

# Demo: split the scraped page text and preview the two leading chunks
chunks = chunk_text(website_text)
print(chunks[0:2])


['A diversity of people and ideas, coupled with free and open discourse, lays the foundation for students and scholars to bring forth original ideas that define fields and enrich human life. UChicago students develop the habits of mind and intellectual skills needed to confront complex challenges. UChicago researchers have contributed to some of the world’s greatest discoveries, advancements, and bodies of knowledge. Faculty have a free and challenging environment in which to pursue the most origi', "nal research. As a community partner, we invest in Chicago’s South Side across such areas as health, education,  economic growth, and the arts. We are an international community of scholars working to solve the world's most pressing issues, with initiatives and programs on all seven continents. Chicago is not only in our name, it’s woven into the fabric of this institution. Located in the Hyde Park neighborhood, we benefit from the diversity, arts, and vibrant culture of our South Side com

In [24]:
from sentence_transformers import SentenceTransformer

# Name of the pre-trained sentence-embedding model used throughout the notebook
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'

# Load the model once; the embedding helpers below reuse this instance
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

def get_embeddings(chunks):
    """Encode text chunks with the notebook-level ``model``.

    Args:
        chunks: The texts to embed; handed unchanged to ``model.encode``.

    Returns:
        Whatever ``model.encode`` produces for ``chunks``.
    """
    return model.encode(chunks)

# Demo: embed every chunk, then peek at the first two vectors
chunk_embeddings = get_embeddings(chunks)
print(chunk_embeddings[0:2])


[[ 4.58043404e-02 -2.07584933e-03  1.43037653e-02  6.92893099e-03
  -6.77575693e-02 -4.95091677e-02 -1.57416775e-03 -6.64268658e-02
   7.62342736e-02  1.08552225e-01  3.59665826e-02 -3.63682248e-02
  -9.58061814e-02 -3.11354231e-02  4.31821187e-04  4.20903563e-02
  -5.89733683e-02 -6.06777221e-02 -3.97578292e-02  1.49847399e-02
   1.14547636e-03  3.90778705e-02  6.21520728e-03  6.71921521e-02
  -8.15694258e-02  6.93417341e-02  4.62126620e-02 -7.11729154e-02
   4.35531959e-02 -2.39560939e-02  8.99683964e-03  6.08324744e-02
   4.08646315e-02  2.50846576e-02  6.22564647e-03  6.25678226e-02
  -8.05600174e-03  8.27684626e-02  2.90900525e-02 -1.16504813e-02
  -9.16371718e-02 -4.74752523e-02  3.15322205e-02  5.01023531e-02
  -4.38381732e-02 -1.38991728e-01 -1.26548614e-02 -2.64236461e-02
   2.88277157e-02 -6.09268527e-03 -1.40049886e-02 -8.54715481e-02
  -1.31495576e-02 -1.08932450e-01 -5.12540266e-02  1.48048969e-02
  -3.02180517e-02  7.60205183e-03 -2.34153066e-02 -6.50347769e-02
   7.83972

In [25]:
import faiss
import numpy as np

def create_faiss_index(embeddings):
    """Build a flat FAISS index (L2 distance) over a set of embedding vectors.

    Args:
        embeddings: 2-D array-like with one vector per row. It is converted
            to a C-contiguous float32 NumPy array before indexing.

    Returns:
        A ``faiss.IndexFlatL2`` whose dimensionality equals the row width of
        ``embeddings`` and which contains every row.

    Raises:
        ValueError: If ``embeddings`` is not 2-D (for example an empty list
            or a single flat vector) or its rows have zero width.
    """
    # FAISS works on row-major float32 matrices; ascontiguousarray enforces
    # both the dtype and the memory layout in one step.
    embeddings_np = np.ascontiguousarray(embeddings, dtype=np.float32)

    # Fail with a clear message instead of an IndexError on shape[1].
    if embeddings_np.ndim != 2 or embeddings_np.shape[1] == 0:
        raise ValueError(
            "embeddings must be a 2-D array of shape (n_vectors, dim) with dim > 0, "
            f"got shape {embeddings_np.shape}"
        )

    # Create FAISS index
    index = faiss.IndexFlatL2(embeddings_np.shape[1])  # L2 distance metric
    index.add(embeddings_np)  # Add embeddings to the index
    return index

# Demo: index all chunk embeddings so they can be searched by similarity
index = create_faiss_index(embeddings=chunk_embeddings)


In [26]:
def query_to_embedding(query):
    """Embed a single user query with the notebook-level ``model``.

    Args:
        query: The query to embed.

    Returns:
        The result of ``model.encode`` for a one-item list holding ``query``.
    """
    # Wrap the query in a list so it is encoded as a batch of one
    return model.encode([query])

def retrieve_relevant_chunks(query_embedding, index, k=3, source_chunks=None):
    """Return the text chunks whose embeddings are closest to a query.

    Args:
        query_embedding: Query vector(s) as a 2-D array-like; only the search
            results for the first row are used.
        index: Index exposing ``ntotal`` and ``search(vectors, k)`` (e.g. a
            FAISS index) whose row ids line up with the chunk list.
        k: Maximum number of chunks to return.
        source_chunks: The texts that were indexed, in index order. When
            omitted, the notebook-level ``chunks`` list is used.

    Returns:
        A list of at most ``k`` chunks in the order the index reports them.
        The list is empty when the index holds no vectors or ``k`` is not
        positive.
    """
    corpus = chunks if source_chunks is None else source_chunks

    # Never ask for more neighbours than the index holds.
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    # Perform similarity search (find top k closest chunks)
    _, indices = index.search(np.array(query_embedding).astype(np.float32), k)

    # FAISS fills missing results with -1; without this filter corpus[-1]
    # would silently return the last chunk as a bogus match.
    return [corpus[i] for i in indices[0] if i >= 0]

# Demo: embed a sample question and look up the chunks closest to it
query = "What is the latest research at the University of Chicago?"
query_embedding = query_to_embedding(query)
relevant_chunks = retrieve_relevant_chunks(query_embedding, index, k=3)
print(relevant_chunks)


["nal research. As a community partner, we invest in Chicago’s South Side across such areas as health, education,  economic growth, and the arts. We are an international community of scholars working to solve the world's most pressing issues, with initiatives and programs on all seven continents. Chicago is not only in our name, it’s woven into the fabric of this institution. Located in the Hyde Park neighborhood, we benefit from the diversity, arts, and vibrant culture of our South Side community", 'A diversity of people and ideas, coupled with free and open discourse, lays the foundation for students and scholars to bring forth original ideas that define fields and enrich human life. UChicago students develop the habits of mind and intellectual skills needed to confront complex challenges. UChicago researchers have contributed to some of the world’s greatest discoveries, advancements, and bodies of knowledge. Faculty have a free and challenging environment in which to pursue the most

In [27]:
def generate_response(query, relevant_chunks):
    """Assemble a prompt string from a query and its retrieved chunks.

    No model is called here; the function only formats text.

    Args:
        query: The user's question.
        relevant_chunks: Iterable of text chunks to use as context.

    Returns:
        A three-line string: ``Query: ...``, ``Context: ...`` (the chunks
        joined by single spaces) and a trailing ``Answer:`` line.
    """
    joined_context = " ".join(relevant_chunks)
    prompt_lines = (
        f"Query: {query}",
        f"Context: {joined_context}",
        "Answer:",
    )
    return "\n".join(prompt_lines)

# Demo: build the prompt for the sample query and display it
response = generate_response(query=query, relevant_chunks=relevant_chunks)
print(response)


Query: What is the latest research at the University of Chicago?
Context: nal research. As a community partner, we invest in Chicago’s South Side across such areas as health, education,  economic growth, and the arts. We are an international community of scholars working to solve the world's most pressing issues, with initiatives and programs on all seven continents. Chicago is not only in our name, it’s woven into the fabric of this institution. Located in the Hyde Park neighborhood, we benefit from the diversity, arts, and vibrant culture of our South Side community A diversity of people and ideas, coupled with free and open discourse, lays the foundation for students and scholars to bring forth original ideas that define fields and enrich human life. UChicago students develop the habits of mind and intellectual skills needed to confront complex challenges. UChicago researchers have contributed to some of the world’s greatest discoveries, advancements, and bodies of knowledge. Facult