In [1]:
import cohere
from pinecone import Pinecone, ServerlessSpec
import time
import os
from dotenv import load_dotenv
from src.utils import load_and_chunk_documents

  from .autonotebook import tqdm as notebook_tqdm


In [2]:


# Pull settings from the local .env file into the process environment.
load_dotenv()

# API credentials; each one is None when its variable is not set.
cohere_api_key = os.environ.get("COHERE_API_KEY")
pinecone_api_key = os.environ.get("PINECONE_API_KEY")

# Run the project helper over everything under ./data.
chunked_docs = load_and_chunk_documents("./data")


Processing file: long-doc.txt...
Loaded 1 documents from long-doc.txt
Processing file: pdf.pdf...
Loaded 660 documents from pdf.pdf


In [3]:
INDEX_NAME = "ragbot"

# Initialize Cohere Client
co = cohere.Client(cohere_api_key)

# Initialize Pinecone Client
pc = Pinecone(api_key=pinecone_api_key)

# Create the index only when it does not exist yet.
existing_index_names = [index_info.name for index_info in pc.list_indexes()]
if INDEX_NAME not in existing_index_names:
    pc.create_index(
        name=INDEX_NAME,
        dimension=1024,  # must equal the embedding model's output size
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    # A freshly created index is not usable immediately; poll until Pinecone
    # reports it as ready before connecting to it.
    while not pc.describe_index(INDEX_NAME).status["ready"]:
        time.sleep(1)

# Connect to the index
index = pc.Index(INDEX_NAME)

In [4]:

def get_cohere_embedding(text, input_type="search_document"):
    """Embed a single string with Cohere's ``embed-english-v3.0`` model.

    Args:
        text: The string to embed.
        input_type: Purpose of the embedding, forwarded to ``co.embed``.
            The v3 embedding models require this field. Use
            ``"search_document"`` for text that is being indexed and
            ``"search_query"`` for text used to query the index.

    Returns:
        The first (and only) entry of ``response.embeddings``.
    """
    response = co.embed(
        texts=[text],
        model="embed-english-v3.0",
        input_type=input_type,
    )
    return response.embeddings[0]


In [5]:
# Smoke-test the Cohere embedding helper on an arbitrary throwaway string.
text = "gdhwmj,bbvcn.kqwbqc chjkjdwnb sqmnhgavb"
embeddings = get_cohere_embedding(text)

ConnectError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1007)

In [6]:
import huggingface_hub

In [8]:
# Interactive login: no token is passed here, and the recorded output of this
# cell is a notebook widget rather than a completed login.
huggingface_hub.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [11]:
from huggingface_hub import login
import os
import time
import torch
from dotenv import load_dotenv
from langchain_community.document_loaders import Docx2txtLoader, TextLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pinecone import Pinecone, ServerlessSpec
from transformers import AutoModel, AutoTokenizer
from huggingface_hub import hf_hub_download
from src.utils import load_and_chunk_documents
import requests

# NOTE(review): these two variables are an attempt to switch off TLS
# certificate verification. They are set after huggingface_hub and requests
# were imported, and the recorded output of this cell is still an SSLError,
# so confirm whether any library actually honors them. Prefer pointing
# REQUESTS_CA_BUNDLE / SSL_CERT_FILE at the proxy's CA certificate instead of
# disabling verification.
os.environ["HF_HUB_DISABLE_SSL_VERIFY"] = "1"
os.environ["CURL_CA_BUNDLE"] = ""

# Load environment variables from .env file
load_dotenv()

PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")

# Silence urllib3 warnings before the first request is made. Assigning
# `verify = False` on a newly constructed requests.Session() would only touch
# that temporary object, so no such statement is used here.
requests.packages.urllib3.disable_warnings()

# Authenticate with the Hugging Face Hub using the token from the environment.
login(token=HUGGINGFACEHUB_API_TOKEN)




SSLError: (MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /api/whoami-v2 (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1007)')))"), '(Request ID: d5140174-f195-48f7-9944-ccdc257f8b4a)')

In [None]:
# Load the embedding model from the Hugging Face Hub
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Download model files (optional, for offline access)
hf_hub_download(repo_id=MODEL_NAME, filename="config.json")

# Load tokenizer and model. trust_remote_code is left at its default (False):
# it does not affect SSL, it permits Python code shipped in the model repo to
# run locally, and this checkpoint uses an architecture built into
# transformers, so it is not needed.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Function to generate embeddings
def get_embedding(text):
    """Embed ``text`` with the module-level ``tokenizer`` and ``model``.

    The sentence embedding is the attention-masked mean of the token
    embeddings, which is the pooling the all-MiniLM-L6-v2 model card
    prescribes; the CLS vector alone is not what this model was trained to
    expose as a sentence representation.

    Args:
        text: The string to embed.

    Returns:
        The pooled embedding converted with ``.squeeze().tolist()``; for a
        single input string this is a flat list of floats.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)

    token_embeddings = outputs.last_hidden_state
    # Broadcast the mask over the hidden dimension so padding tokens add nothing.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    # Clamp guards against division by zero if a row had no attended tokens.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return (summed / token_counts).squeeze().tolist()

# Initialize the Pinecone client
pc = Pinecone(api_key=PINECONE_API_KEY)
INDEX_NAME = "rag"

# Ensure index exists
existing_index_names = [index_info.name for index_info in pc.list_indexes()]
if INDEX_NAME not in existing_index_names:
    pc.create_index(
        name=INDEX_NAME,
        dimension=384,  # all-MiniLM-L6-v2 embedding size
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    # A freshly created index is not usable immediately; poll until Pinecone
    # reports it as ready before connecting to it.
    while not pc.describe_index(INDEX_NAME).status["ready"]:
        time.sleep(1)

# Connect to the Pinecone index
index = pc.Index(INDEX_NAME)

chunk_documents = load_and_chunk_documents(r"./data")

# Build one record per chunk: a positional id, the chunk's embedding, and the
# chunk text as metadata so search results can display it.
vector_data = [
    {
        "id": f"doc_{i}",
        "values": get_embedding(chunk.page_content),  # Generate embedding
        "metadata": {"text": chunk.page_content}
    }
    for i, chunk in enumerate(chunk_documents)
]

# Upsert in fixed-size batches: a single request carrying every vector plus
# its text metadata can exceed Pinecone's per-request size limit.
UPSERT_BATCH_SIZE = 100
for batch_start in range(0, len(vector_data), UPSERT_BATCH_SIZE):
    index.upsert(vectors=vector_data[batch_start:batch_start + UPSERT_BATCH_SIZE])
time.sleep(2)  # Allow indexing time

# Search function
def search_pinecone(query, top_k=5):
    """Query the module-level Pinecone ``index`` and print the matches.

    Args:
        query: Text to embed with ``get_embedding`` and search for.
        top_k: Number of matches requested from the index.

    Returns:
        None. Each match's score and its ``text`` metadata are printed.
    """
    response = index.query(
        vector=get_embedding(query),
        top_k=top_k,
        include_metadata=True,
    )

    print("\n🔍 Search Results:")
    for hit in response.matches:
        print(f"\nScore: {hit.score}")
        print(f"Text: {hit.metadata['text']}")

# Example search: run one sample question through search_pinecone, asking for
# the top 5 matches, which it prints.
query_text = "what is Nationalism and Imperialism?"
search_pinecone(query_text, top_k=5)
