In [99]:
# %pip install langchain-pinecone

In [7]:
# %pip install pinecone

In [23]:
import bs4
from langchain_community.document_loaders import WebBaseLoader

# ELI address of Regulation 2016/679 (GDPR) on EUR-Lex.
eur_lex = "https://eur-lex.europa.eu/eli/reg/2016/679/oj/"

# WebBaseLoader does not raise on HTTP error statuses unless asked to, so an
# error page would otherwise flow into the splitter and the vector store as if
# it were the regulation text.
eur_lex_loader = WebBaseLoader(web_paths=[eur_lex], raise_for_status=True)
eur_lex_docs = eur_lex_loader.load()

# Fail fast here instead of in a later cell that indexes into the result.
assert eur_lex_docs and eur_lex_docs[0].page_content.strip(), (
    f"No text was loaded from {eur_lex}"
)

In [95]:
# Sanity check: show a 250-character window from the middle of the scraped text.
gdpr_text = eur_lex_docs[0].page_content
print(gdpr_text[10400:10650])

is Regulation is intended to contribute to the accomplishment of an area of freedom, security and justice and of an economic union, to economic and social progress, to the strengthening and the convergence of the economies within the internal market,


In [36]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking configuration: chunk_size=1000 with a chunk_overlap of 200 between
# consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1000)

# Split the loaded page into the chunks that will be embedded.
splits = text_splitter.split_documents(eur_lex_docs)

In [37]:
# Report how many chunks the splitter produced.
chunk_count = len(splits)
print(chunk_count)

517


In [75]:
# Separate each chunk into its raw text and its metadata dict, keeping the two
# lists index-aligned.
texts = [chunk.page_content for chunk in splits]
metadatas = [chunk.metadata for chunk in splits]

In [79]:
# Inspect the metadata dict attached to the chunk at index 5.
metadatas[5]

{'source': 'https://eur-lex.europa.eu/eli/reg/2016/679/oj/',
 'title': 'Regulation - 2016/679 - EN - gdpr - EUR-Lex',
 'language': 'en'}

In [81]:
# from pinecone import Pinecone, ServerlessSpec
# from langchain.vectorstores import Pinecone as LangPinecone
# from langchain.embeddings import HuggingFaceEmbeddings

# import os

# # 1️⃣ Create Pinecone client instance
# pc = Pinecone(
#     api_key=os.environ["PINECONE_API_KEY"]
# )

# # 2️⃣ Create index if it doesn't exist
# if "cra-index" not in pc.list_indexes().names():
#     pc.create_index(
#         name="cra-index",
#         dimension=384,          # must match the embedding size (384 for all-MiniLM-L6-v2)
#         metric="cosine",
#         spec=ServerlessSpec(
#             cloud="aws",
#             region="us-east-1"   # pick your cloud region
#         )
#     )

In [59]:
import os
from pinecone import Pinecone, ServerlessSpec

# Pinecone client used by the cells that list, describe and query the index.
# The API key is read from the PINECONE_API_KEY environment variable (KeyError if
# it is unset) so the secret never appears in the notebook.
# The legacy `environment=` argument was dropped: for a serverless index the
# cloud/region is part of the index spec, not of the client.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

In [60]:
# Show the names of the indexes visible to this API key.
index_names = pc.list_indexes().names()
print(index_names)

['cra-index']


In [64]:
# Fetch and print the description of the index this notebook works with.
target_index = "cra-index"
index_info = pc.describe_index(target_index)
print(index_info)

{'deletion_protection': 'disabled',
 'dimension': 384,
 'host': 'cra-index-z61yr43.svc.aped-4627-b74a.pinecone.io',
 'metric': 'cosine',
 'name': 'cra-index',
 'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
 'status': {'ready': True, 'state': 'Ready'},
 'tags': None,
 'vector_type': 'dense'}


In [80]:
from langchain_pinecone import Pinecone as LangPinecone
# Import the class from langchain_community directly; the `langchain.embeddings`
# path is a deprecated re-export of it.
from langchain_community.embeddings import HuggingFaceEmbeddings


# Embedding model used both for ingestion and for embedding queries. Its output
# size has to match the index dimension (384 per the describe_index output).
embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# NOTE(review): the ingestion call is left commented out, presumably so that
# re-running this cell does not upsert the same chunks a second time — confirm.
# The "Documents stored" output shown under this cell comes from an earlier run.
# vectorstore = LangPinecone.from_texts(
#     texts=texts,
#     embedding=embedding,
#     index_name="cra-index",
#     metadatas=metadatas  # optional
# )
# print("✅ Documents stored in Pinecone")

✅ Documents stored in Pinecone


In [85]:
# Name of the Pinecone index this notebook reads from.
index_name = "cra-index"

# Handle used for calls against that index (stats here, queries later).
dense_index = pc.Index(index_name)

# Print the index statistics as a check on what is stored.
stats = dense_index.describe_index_stats()
print(stats)

{'dimension': 384,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 517}},
 'total_vector_count': 517,
 'vector_type': 'dense'}


In [91]:
# Natural-language question to look up in the indexed regulation text.
query = "What is the fundamental right protected under GDPR?"

# Embed the question with the notebook's `embedding` model object.
query_vector = embedding.embed_query(query)

# Earlier attempt, kept for reference:
# # Search the dense index
# results = dense_index.search(
#     namespace='__default__',
#     query={
#         "top_k": 3,
#         "inputs": {
#             'text': query_vector
#         }
#     }
# )

# Query the index with the precomputed vector for the top 3 matches; metadata
# is requested so each hit carries its stored fields.
results = dense_index.query(
    vector=query_vector,
    top_k=3,
    namespace='__default__',
    include_metadata=True,
)

# One line per match: id, similarity score rounded to two decimals, metadata.
matches = results['matches']
for hit in matches:
    print(f"id: {hit['id']:<5} | score: {round(hit['score'], 2):<5} | metadata: {hit['metadata']}")

id: ab425280-fb09-432c-a6fc-14d2c5b22660 | score: 0.65  | metadata: {'language': 'en', 'source': 'https://eur-lex.europa.eu/eli/reg/2016/679/oj/', 'text': 'Having regard to the opinion of the European Economic and Social Committee\xa0(1),\n\n\nHaving regard to the opinion of the Committee of the Regions\xa0(2),\n\n\nActing in accordance with the ordinary legislative procedure\xa0(3),\n\nWhereas:\n\n\n\n\n\n\n\n(1)\n\n\nThe protection of natural persons in relation to the processing of personal data is a fundamental right. Article\xa08(1) of the Charter of Fundamental Rights of the European Union (the ‘Charter’) and Article 16(1) of the Treaty on the Functioning of the European Union (TFEU) provide that everyone has the right to the protection of personal data concerning him or her.\n\n\n\n\n\n\n\n\n\n\n\n\n(2)', 'title': 'Regulation - 2016/679 - EN - gdpr - EUR-Lex'}
id: 18389c35-dc3a-4407-8948-d540a1d2af92 | score: 0.63  | metadata: {'language': 'en', 'source': 'https://eur-lex.europa

In [98]:
# !pip list