In [1]:
!pip install transformers==4.47.0

Collecting transformers==4.47.0
  Downloading transformers-4.47.0-py3-none-any.whl (10.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m57.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers<0.22,>=0.21
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m56.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.4.1
  Downloading safetensors-0.4.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (436 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m436.1/436.1 kB[0m [31m67.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.24.0
  Downloading huggingface_hub-0.27.0-py3-none-any.whl (450 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m450.5/450.5 kB[0m [31m35.3 MB/s[0m eta [36m0:00:00[0m
Collecting pyyaml>=5.1
  Downloading PyYAML-

In [2]:
# The error indicates that the 'transformers' library is not installed.
# To resolve this, we need to ensure the 'transformers' library is installed in the environment.
# I'll add the installation step for the 'transformers' package.

# Install transformers
!pip install transformers


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m40.7 MB/s[0m eta [36m0:00:00[0m
Collecting numpy<3.0,>=1.25.0
  Downloading numpy-2.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (19.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.5/19.5 MB[0m [31m61.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: numpy, faiss-cpu
  Attempting uninstall: numpy
    Found existing installation: numpy 1.23.4
    Not uninstalling numpy at /shared-libs/python3.9/py/lib/python3.9/site-packages, outside environment /root/venv
    Can't uninstall 'numpy'. No files were found to uninstall.
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
scipy 1.9.3 requires numpy<1

In [None]:
import requests
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, AutoModel
import torch
import faiss
from logging import getLogger
import cachetools

# Module-level logger for this notebook
logger = getLogger(__name__)

# Configuration: Hugging Face model id used for both tokenizer and model,
# and the maximum number of entries kept in the embedding cache.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
EMBEDDING_CACHE_SIZE = 100

# LRU cache of computed embeddings (least recently used entries are evicted
# once EMBEDDING_CACHE_SIZE entries are stored)
embedding_cache = cachetools.LRUCache(maxsize=EMBEDDING_CACHE_SIZE)

# Load pre-trained model and tokenizer from the same checkpoint so they agree
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

def fetch_data_from_website(url, timeout=10):
    """Download a web page and return the text extracted from its HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The string produced by BeautifulSoup's ``get_text()`` for the page.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error responses instead of returning the text of an
    # error page as if it were the requested content.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.get_text()

def generate_embeddings(text_chunk):
    """Embed text with the module-level ``tokenizer`` and ``model``.

    The token embeddings from ``last_hidden_state`` are averaged per input
    using the tokenizer's attention mask, so padding positions (present when
    several texts of different length are passed at once) do not contribute
    to the mean. For a single text, where no padding is added, this equals a
    plain mean over the token axis.

    Args:
        text_chunk: Text (or list of texts) passed to the tokenizer. Input
            longer than the tokenizer's limit is truncated
            (``truncation=True``).

    Returns:
        A NumPy array holding one pooled embedding row per input text.
    """
    inputs = tokenizer(text_chunk, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)

    token_embeddings = outputs.last_hidden_state
    # Broadcast the mask over the hidden dimension: 1 for real tokens, 0 for padding.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    # Clamp to avoid division by zero if a row has no unmasked tokens.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return (summed / token_counts).numpy()

def process_query(query):
    """
    Turn a user query into its embedding.

    Args:
        query: Query text entered by the user.

    Returns:
        The result of ``generate_embeddings`` for ``query``.
    """
    query_embedding = generate_embeddings(query)
    return query_embedding

def store_embeddings(index, texts, embedding_fn):
    """Embed ``texts`` and add the resulting vectors to ``index``.

    Embeddings are looked up in the module-level ``embedding_cache`` first;
    ``embedding_fn`` is only called for texts that are not cached yet, and
    its result is stored in the cache under the text itself.

    Args:
        index: Object with an ``add(array)`` method (e.g. a FAISS index).
        texts: Iterable of strings to embed. May be empty, in which case
            nothing is added to the index.
        embedding_fn: Callable mapping one text to its embedding array.

    Returns:
        None. The vectors are added to ``index`` in the order of ``texts``.
    """
    import numpy as np  # local import: numpy is not imported at file level

    rows = []
    for text in texts:
        if text not in embedding_cache:
            embedding = embedding_fn(text)
            embedding_cache[text] = embedding
        else:
            embedding = embedding_cache[text]
        # Treat a flat vector as a single row so every entry is 2-D.
        rows.append(np.atleast_2d(np.asarray(embedding)))

    if not rows:
        # Nothing to index; concatenating an empty list would raise.
        return

    # FAISS expects a contiguous float32 matrix with one vector per row.
    matrix = np.ascontiguousarray(np.concatenate(rows, axis=0), dtype=np.float32)
    index.add(matrix)
# URLs to process
urls = [
    "https://www.uchicago.edu/",
    "https://www.washington.edu/",
    "https://www.stanford.edu/",
    "https://und.edu/"
]

# Fetch text data from websites; texts[i] is the page text for urls[i]
texts = [fetch_data_from_website(url) for url in urls]

# Size the index from the loaded model's hidden size instead of a hard-coded
# number, so the index stays correct if a different checkpoint is loaded
# (384 for sentence-transformers/all-MiniLM-L6-v2).
index = faiss.IndexFlatL2(model.config.hidden_size)

store_embeddings(index, texts, generate_embeddings)

# The query loop maps index positions back to `texts`, so both must line up.
assert index.ntotal == len(texts), "index and texts are out of sync"

# Maximum number of results to show per query
TOP_K = 5

# Main loop to handle user queries
while True:
    user_input = input("Enter your query: ")

    if user_input.strip().lower() in ['exit', 'quit']:
        print("Exiting...")
        break

    # Nothing to search for; ask again instead of embedding an empty string.
    if not user_input.strip():
        continue

    # Process the user input into an embedding
    query_embedding = process_query(user_input)

    # Perform similarity search. Never request more neighbours than the index
    # holds: FAISS pads missing results with the label -1.
    k = max(1, min(TOP_K, index.ntotal))
    distances, indices = index.search(query_embedding, k=k)

    # Retrieve and print the most relevant texts. Negative labels mean
    # "no result"; used as a list index they would wrongly select texts[-1].
    retrieved_texts = [texts[i] for i in indices[0] if i >= 0]
    for text in retrieved_texts:
        print(f"Relevant text: {text[:500]}...")  # Print the first 500 characters for brevity


Relevant text:  UW Homepage &lt;iframe src="https://www.googletagmanager.com/ns.html?id=GTM-KQ6QQBT" height="0" width="0" style="display:none;visibility:hidden" aria-hidden="true"&gt;&lt;/iframe&gt; Skip to main content MyUWCalendarDirectoriesLibrariesUW MedicineMapsUW NewsHelpful Links Computing/ITWorkday HCMHusky CardUW BothellUW TacomaUW FacebookUW TwitterUniversity of WashingtonUniversity of Washington StudentsParentsFaculty & StaffAlumniQuick LinksAbout About the UWDiversityGlobal ImpactInnovationLeadersh...
Relevant text: 























 
UND | Grand Forks, ND | University of North Dakota











 Skip to main content



Open Menu

Close Menu

University of North Dakota

Open Search

Close Search




University of North Dakota

Info For

Admitted Students
Current Students
Families of Current Students
Faculty & Staff
Alumni



Logins

Email
Blackboard
Campus Connection
Employee Self-Service
Hawk Central
Degree Map
Zoom



Directory
Search
Search
Submit











Acade

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=8af59b1f-0f11-4005-baa5-3b1b335e1721' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>