In [1]:
## Installing libraries
!pip install langchain pinecone huggingface tiktoken pypdf langchain_huggingface langchain_community -q


[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
## Importing libraries
from langchain_huggingface import HuggingFaceEndpointEmbeddings
from langchain.vectorstores import Pinecone
from langchain_core.documents import Document

In [3]:
# Build the five LangChain documents on Machine Learning concepts.
# Every (text, category) pair below becomes one Document: the text is the
# page content and the category is stored in the metadata under "type".

doc1, doc2, doc3, doc4, doc5 = (
    Document(page_content=text, metadata={"type": category})
    for text, category in [
        (
            "Supervised learning is a type of machine learning where the model is trained on labeled data. It is used in tasks like classification and regression.",
            "Learning Paradigm",
        ),
        (
            "Unsupervised learning involves training a model on data without labeled responses. It is commonly used for clustering and dimensionality reduction.",
            "Learning Paradigm",
        ),
        (
            "Decision Trees are interpretable machine learning models used for classification and regression. They split the data into branches based on feature values.",
            "Algorithm",
        ),
        (
            "Support Vector Machines (SVMs) are powerful classifiers that work well on both linear and non-linear problems by using kernel tricks.",
            "Algorithm",
        ),
        (
            "Overfitting is a common problem in machine learning where a model learns the training data too well, including noise, leading to poor generalization on new data.",
            "Model Issue",
        ),
    ]
)


In [4]:
## Gather the individual documents into one list, in creation order,
## so they can be handed to the vector store in a single call.
docs = [
    doc1,
    doc2,
    doc3,
    doc4,
    doc5,
]

In [10]:
## Initializing vector store with specific properties

# Name of the Pinecone index to connect to.
# TODO(review): placeholder value; replace with the name of your own index.
PINECONE_INDEX_NAME = "pinecone-db"

vector_store = Pinecone(
    # `get_pinecone_index` has to be *called* with an index name: the
    # constructor needs an index object, not the classmethod itself.
    # NOTE(review): presumably this reads the Pinecone API key from the
    # environment (PINECONE_API_KEY); verify before running.
    index=Pinecone.get_pinecone_index(PINECONE_INDEX_NAME),
    # Embedding model used for both stored documents and queries.
    embedding=HuggingFaceEndpointEmbeddings(),
    # Key passed as the store's `text_key`.
    # NOTE(review): 'pinecone_db' reads like an index name; confirm this is
    # really the intended text key.
    text_key='pinecone_db',
)

AttributeError: module 'pinecone' has no attribute 'Index'

In [8]:
## Adding docs in vector store (all at once)

# Explicit IDs, one per entry of `docs` and in the same order. Later cells
# refer to 'b06237e2-9a45-42fc-99b6-e07f15e53f0e' by value, so the IDs must be
# stable across runs; without `ids` they appear to be auto-generated
# (UUID-style) and would differ after every kernel restart.
DOC_IDS = [
    'd0780ad9-3cee-4c1d-b578-b15be3d8956a',
    'cc8286d5-efcc-466a-a944-3521498f5f25',
    'd1a5e62a-3e9c-4184-88fd-bb2734949ca1',
    '662655b1-5d6d-4db9-be9b-f6ee5643dd2a',
    'b06237e2-9a45-42fc-99b6-e07f15e53f0e',
]

# The call returns the list of IDs under which the documents were stored.
vector_store.add_documents(documents=docs, ids=DOC_IDS)

['d0780ad9-3cee-4c1d-b578-b15be3d8956a',
 'cc8286d5-efcc-466a-a944-3521498f5f25',
 'd1a5e62a-3e9c-4184-88fd-bb2734949ca1',
 '662655b1-5d6d-4db9-be9b-f6ee5643dd2a',
 'b06237e2-9a45-42fc-99b6-e07f15e53f0e']

In [9]:
## Inspect the stored records: ask for vectors, texts and metadata
## NOTE(review): confirm that `get(include=...)` is available on the vector
## store class constructed in this notebook.
vector_store.get(
    include=["embeddings", "documents", "metadatas"],
)

{'ids': ['d0780ad9-3cee-4c1d-b578-b15be3d8956a',
  'cc8286d5-efcc-466a-a944-3521498f5f25',
  'd1a5e62a-3e9c-4184-88fd-bb2734949ca1',
  '662655b1-5d6d-4db9-be9b-f6ee5643dd2a',
  'b06237e2-9a45-42fc-99b6-e07f15e53f0e'],
 'embeddings': array([[-0.00349487, -0.06937262, -0.03674874, ...,  0.01894872,
          0.03802383,  0.03368926],
        [-0.03784934,  0.00225465, -0.00951576, ..., -0.00423702,
         -0.00273802,  0.01476773],
        [-0.02975775, -0.07652662, -0.03067826, ...,  0.00419427,
          0.05259743, -0.00317904],
        [-0.01568775, -0.01699524, -0.06006106, ..., -0.00444106,
          0.00874989, -0.00119687],
        [-0.05773736, -0.01843516,  0.00245588, ...,  0.01997842,
          0.06201811, -0.01657497]], shape=(5, 768)),
 'documents': ['Supervised learning is a type of machine learning where the model is trained on labeled data. It is used in tasks like classification and regression.',
  'Unsupervised learning involves training a model on data without label

In [10]:
## Retrieve the 4 stored documents closest to a free-text query
vector_store.similarity_search('Machine Learning algorithmns ?', k=4)

[Document(metadata={'type': 'Algorithm'}, page_content='Support Vector Machines (SVMs) are powerful classifiers that work well on both linear and non-linear problems by using kernel tricks.'),
 Document(metadata={'type': 'Learning Paradigm'}, page_content='Supervised learning is a type of machine learning where the model is trained on labeled data. It is used in tasks like classification and regression.'),
 Document(metadata={'type': 'Model Issue'}, page_content='Overfitting is a common problem in machine learning where a model learns the training data too well, including noise, leading to poor generalization on new data.'),
 Document(metadata={'type': 'Learning Paradigm'}, page_content='Unsupervised learning involves training a model on data without labeled responses. It is commonly used for clustering and dimensionality reduction.')]

In [11]:
## Same query as before, but every hit comes back as a (document, score) pair
vector_store.similarity_search_with_score(k=4, query='Machine Learning algorithmns ?')

[(Document(metadata={'type': 'Algorithm'}, page_content='Support Vector Machines (SVMs) are powerful classifiers that work well on both linear and non-linear problems by using kernel tricks.'),
  1.0240678787231445),
 (Document(metadata={'type': 'Learning Paradigm'}, page_content='Supervised learning is a type of machine learning where the model is trained on labeled data. It is used in tasks like classification and regression.'),
  1.1079070568084717),
 (Document(metadata={'type': 'Model Issue'}, page_content='Overfitting is a common problem in machine learning where a model learns the training data too well, including noise, leading to poor generalization on new data.'),
  1.2296489477157593),
 (Document(metadata={'type': 'Learning Paradigm'}, page_content='Unsupervised learning involves training a model on data without labeled responses. It is commonly used for clustering and dimensionality reduction.'),
  1.255737543106079)]

In [12]:
## Metadata filter: only documents whose "type" is "Algorithm" are returned.
## The query text is deliberately empty, so the filter does the selecting.
vector_store.similarity_search_with_score('', filter={'type': 'Algorithm'})

[(Document(metadata={'type': 'Algorithm'}, page_content='Support Vector Machines (SVMs) are powerful classifiers that work well on both linear and non-linear problems by using kernel tricks.'),
  1.8860936164855957),
 (Document(metadata={'type': 'Algorithm'}, page_content='Decision Trees are interpretable machine learning models used for classification and regression. They split the data into branches based on feature values.'),
  2.0888729095458984)]

In [16]:
## Replacement for the fifth document (same "Model Issue" category, new text)
updated_doc5 = Document(
    metadata={"type": "Model Issue"},
    page_content="Underfitting occurs when a machine learning model is too simple to capture the underlying patterns in the data. It results in poor performance on both training and test datasets.",
)

## Swap the stored document for the new one.
## The ID is hard-coded and must match an ID returned when the documents
## were added to the store.
vector_store.update_document(
    document=updated_doc5,
    document_id='b06237e2-9a45-42fc-99b6-e07f15e53f0e',
)

## Show the store contents again to verify the update
vector_store.get(
    include=["embeddings", "documents", "metadatas"],
)

{'ids': ['d0780ad9-3cee-4c1d-b578-b15be3d8956a',
  'cc8286d5-efcc-466a-a944-3521498f5f25',
  'd1a5e62a-3e9c-4184-88fd-bb2734949ca1',
  '662655b1-5d6d-4db9-be9b-f6ee5643dd2a',
  'b06237e2-9a45-42fc-99b6-e07f15e53f0e'],
 'embeddings': array([[-0.00349487, -0.06937262, -0.03674874, ...,  0.01894872,
          0.03802383,  0.03368926],
        [-0.03784934,  0.00225465, -0.00951576, ..., -0.00423702,
         -0.00273802,  0.01476773],
        [-0.02975775, -0.07652662, -0.03067826, ...,  0.00419427,
          0.05259743, -0.00317904],
        [-0.01568775, -0.01699524, -0.06006106, ..., -0.00444106,
          0.00874989, -0.00119687],
        [-0.06818321, -0.06726655, -0.00853008, ..., -0.0204084 ,
          0.0782131 , -0.00220448]], shape=(5, 768)),
 'documents': ['Supervised learning is a type of machine learning where the model is trained on labeled data. It is used in tasks like classification and regression.',
  'Unsupervised learning involves training a model on data without label

In [17]:
## Delete Document
## `ids` takes a list of IDs, so the single ID is wrapped in a list rather
## than passed as a bare string.
vector_store.delete(ids=['b06237e2-9a45-42fc-99b6-e07f15e53f0e'])

## Checking the store contents after the deletion
vector_store.get(include=['embeddings', 'documents', 'metadatas'])

{'ids': ['d0780ad9-3cee-4c1d-b578-b15be3d8956a',
  'cc8286d5-efcc-466a-a944-3521498f5f25',
  'd1a5e62a-3e9c-4184-88fd-bb2734949ca1',
  '662655b1-5d6d-4db9-be9b-f6ee5643dd2a'],
 'embeddings': array([[-0.00349487, -0.06937262, -0.03674874, ...,  0.01894872,
          0.03802383,  0.03368926],
        [-0.03784934,  0.00225465, -0.00951576, ..., -0.00423702,
         -0.00273802,  0.01476773],
        [-0.02975775, -0.07652662, -0.03067826, ...,  0.00419427,
          0.05259743, -0.00317904],
        [-0.01568775, -0.01699524, -0.06006106, ..., -0.00444106,
          0.00874989, -0.00119687]], shape=(4, 768)),
 'documents': ['Supervised learning is a type of machine learning where the model is trained on labeled data. It is used in tasks like classification and regression.',
  'Unsupervised learning involves training a model on data without labeled responses. It is commonly used for clustering and dimensionality reduction.',
  'Decision Trees are interpretable machine learning models use