## Set up your environment

Install the LangChain libraries required for this notebook:

In [2]:
!pip install -qU \
    langchain-pinecone==0.1.3 \
    langchain-openai==0.1.8 \
    langchain-text-splitters==0.2.0 \
    langchain==0.2.1 \
    pinecone-notebooks==0.1.1

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m973.5/973.5 kB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m397.0/397.0 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.6/294.6 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.5/383.5 kB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.8/244.8 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Set environment variables for your Pinecone and OpenAI API keys:

In [3]:
import os

# initialize connection to pinecone (orget API key at app.pinecone.io)
if not os.environ.get("SET_PINECONE_API_KEY"):
    from pinecone_notebooks.colab import Authenticate
    Authenticate()

In [4]:
api_key = os.environ.get("PINECONE_API_KEY")

# available at platform.openai.com/api-keys
os.environ['OPENAI_API_KEY'] = os.environ.get('OPENAI_API_KEY') or "Your open AI key"

## Store knowledge in Pinecone


In [14]:

import requests
from bs4 import BeautifulSoup
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_pinecone import PineconeEmbeddings
import pinecone
import os

# URL of the Special:AllPages page on the wiki
all_pages_url = 'https://wiki.umiacs.umd.edu/umiacs/index.php/Special:AllPages'
response = requests.get(all_pages_url)
soup = BeautifulSoup(response.content, 'html.parser')

# Create a set of unique pages on the wiki
pageset = set()
for link in soup.find_all('a'):
    href = link.get('href')
    # Ensure the link is valid and points to an internal page
    if href and href.startswith('/umiacs/index.php'):
        # Build the full URL for the wiki page
        full_url = 'https://wiki.umiacs.umd.edu' + href
        # Add the unique URL to the set, following redirects
        response = requests.get(full_url, allow_redirects=True)
        response_url = response.url
        pageset.add(response_url)

# List of URLs for HTML pages
html_links = list(pageset)

# Function to fetch HTML content from a URL
def fetch_html_from_url(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Ensure the request was successful
        return response.text  # Return the HTML content as a string
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch {url}: {e}")
        return None

# Function to extract text and headers from HTML using BeautifulSoup
def extract_text_and_headers_from_html(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract text from HTML
    page_text = soup.get_text(separator=' ', strip=True)

    # Extract the first header (e.g., h2 or h1)
    header = soup.find('h2') or soup.find('h1')
    header_text = header.get_text(strip=True) if header else "No Header"

    return page_text, header_text

text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=0)

# Parse and split the HTML content from each link, converting to Documents
parsed_pages = []
for url in html_links:
    html_content = fetch_html_from_url(url)
    if html_content:  # Only proceed if the HTML was fetched successfully
        extracted_text, header_text = extract_text_and_headers_from_html(html_content)

        # Split the extracted text into chunks
        chunks = text_splitter.split_text(extracted_text)

        # Create a Document object for each chunk and add to parsed_pages
        for chunk in chunks:
            document = Document(
                metadata={'Header 2': header_text, 'url': url},
                page_content=chunk
            )
            parsed_pages.append(document)





Failed to fetch https://wiki.umiacs.umd.edu/umiacs/index.php/UMIACS:General_disclaimer: 404 Client Error: Not Found for url: https://wiki.umiacs.umd.edu/umiacs/index.php/UMIACS:General_disclaimer




Failed to fetch https://wiki.umiacs.umd.edu/umiacs/index.php/UMIACS:Privacy_policy: 404 Client Error: Not Found for url: https://wiki.umiacs.umd.edu/umiacs/index.php/UMIACS:Privacy_policy




In [15]:
parsed_pages[0]



Initialize a LangChain embedding object. Note that this step uses a Pinecone API key you set as an environment variable earlier.

In [16]:
from langchain_pinecone import PineconeEmbeddings

model_name = 'multilingual-e5-large'
embeddings = PineconeEmbeddings(
    model=model_name,
    pinecone_api_key=os.environ.get('PINECONE_API_KEY')
)

We initialize a new client instance for Pinecone:

In [17]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

Now we setup our index specification, this allows us to define the cloud provider and region where we want to deploy our index.

In [18]:
from pinecone import ServerlessSpec

cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
region = os.environ.get('PINECONE_REGION') or 'us-east-1'

spec = ServerlessSpec(cloud=cloud, region=region)

Define our index name:

In [19]:
index_name = "umiacs-wiki"

Now create a serverless index in Pinecone for storing the embeddings of your document, setting the index dimensions and distance metric to match those of the Pinecone `multilingual-e5-large` model you'll use to create the embeddings:



In [20]:
import time

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=embeddings.dimension,
        metric="cosine",
        spec=spec
    )
    # wait for index to be ready
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

Embed and upsert each chunk as a distinct record in a namespace. Namespaces let you partition records within an index and are essential for [implementing multitenancy](https://docs.pinecone.io/guides/indexes/implement-multitenancy) when you need to isolate the data of each customer/user.

In [21]:
import time
import json
import uuid  # For generating unique IDs, if not already present in documents

from langchain_pinecone import PineconeVectorStore

# Namespace and index settings
namespace = "umiacswiki-vector"

# List to store vector IDs
vector_ids = []

# Function to generate or extract vector IDs from documents
def generate_vector_id(document):
    if 'url' in document.metadata:
        return document.metadata['url']

    # Option 2: Generate a UUID for each document
    return str(uuid.uuid4())

# Attach unique vector IDs to the documents and store them
for doc in parsed_pages:
    vector_id = generate_vector_id(doc)
    vector_ids.append(vector_id)
    # Add the vector ID to the document metadata (if necessary for tracking)
    doc.metadata['vector_id'] = vector_id

# Insert the documents into Pinecone using PineconeVectorStore
docsearch = PineconeVectorStore.from_documents(
    documents=parsed_pages,  # Documents now have vector IDs in metadata
    index_name=index_name,
    embedding=embeddings,
    namespace=namespace
)

# Simulate a delay (optional)
time.sleep(1)

# Store vector IDs externally (e.g., in a JSON file)
with open('vector_ids.json', 'w') as f:
    json.dump(vector_ids, f, indent=4)

print(f"Inserted {len(vector_ids)} vectors and stored their IDs in 'vector_ids.json'")


Inserted 402 vectors and stored their IDs in 'vector_ids.json'


Use Pinecone's `list` and `query` operations to look at one of the records:

In [22]:
index = pc.Index(index_name)

for ids in index.list(namespace=namespace):
    query = index.query(
        id=ids[0],
        namespace=namespace,
        top_k=1,
        include_values=False,
        include_metadata=True
    )
    print(query)

{'matches': [{'id': '015e8080-83de-4faf-bd1b-289e993bec57',
              'metadata': {'Header 2': 'Contents',
                           'text': 'python your_file.py Storage There are 3 '
                                   'types of user storage available to users '
                                   'in GAMMA: Home directories Project '
                                   'directories Scratch directories There is '
                                   'also read-only storage available for '
                                   'Dataset directories. GAMMA users can also '
                                   'request Nexus project allocations . Home '
                                   'Directories You have 30GB of home '
                                   'directory storage available at '
                                   '/nfshomes/<username> .  It has both '
                                   'Snapshots and Backups enabled. Home '
                                   'directories are inten

In [None]:
pc.delete_index(index_name)

After you delete an index, you cannot use it again or recover it.