In [18]:
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec

In [19]:
with open("C:\\Users\\hp\\Documents\\GitHub\\A guide to brain tumor.txt","r",encoding="UTF-8")as f:
    data=f.read()
data



In [20]:
data=re.sub(r"\[\d+\]", "", data)
data



In [21]:
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", " "],  
    chunk_size=1000,                    
    chunk_overlap=200                
)

In [22]:
docs=text_splitter.create_documents([data])
print(len(docs))

55


In [23]:
print(docs)



In [24]:
texts=[doc.page_content for doc in docs]
texts

['A brain tumor (sometimes referred more commonly as brain cancer) occurs when a group of cells within the brain turn cancerous and grow out of control, creating a mass. There are two main types of tumors: malignant (cancerous) tumors and benign (non-cancerous) tumors. These can be further classified as primary tumors, which start within the brain, and secondary tumors, which most commonly have spread from tumors located outside the brain, known as brain metastasis tumors. All types of brain tumors may produce symptoms that vary depending on the size of the tumor and the part of the brain that is involved. Where symptoms exist, they may include headaches, seizures, problems with vision, vomiting and mental changes. Other symptoms may include difficulty walking, speaking, with sensations, or unconsciousness.',
 'The cause of most brain tumors is unknown, though up to 4% of brain cancers may be caused by CT scan radiation. Uncommon risk factors include exposure to vinyl chloride, Epstein

In [26]:
model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
embeddings=model.encode(texts)
print(embeddings.shape)

(55, 384)


In [27]:
embeddings

array([[ 0.05783342, -0.0067555 ,  0.00924895, ...,  0.03892153,
         0.03880731, -0.00074746],
       [ 0.0287189 ,  0.04840526, -0.00646243, ...,  0.00070438,
         0.0254956 ,  0.0346994 ],
       [ 0.03403351,  0.02955316, -0.02757662, ..., -0.05471791,
         0.03902707,  0.01566894],
       ...,
       [-0.02979328,  0.05026311, -0.09490243, ..., -0.04724129,
         0.03374846, -0.01869216],
       [-0.11466787, -0.01736349, -0.04318532, ...,  0.0057146 ,
        -0.00452529, -0.04098954],
       [-0.00773815,  0.00548788, -0.0356007 , ...,  0.00827882,
         0.0719254 , -0.0371856 ]], dtype=float32)

In [None]:
import os
from dotenv import load_dotenv
load_dotenv()
API_KEY=os.getenv("PINECONE_API_KEY")
pc=Pinecone(api_key=API_KEY)

In [None]:
index_name="brain-tumor"
pc.create_index(
    name=index_name,
    dimension=384,
    metric="cosine",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)

{
    "name": "brain-tumor",
    "metric": "cosine",
    "host": "brain-tumor-9opds0h.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [39]:
index=pc.Index(index_name)

In [28]:
ids=[f"chunk{i}" for i in range(len(embeddings))]

In [43]:
meta=[{"texts":texts[i]} for i in range(len(texts))]

In [44]:
to_upsert=[(ids[i],embeddings[i],meta[i]) for i in range(len(embeddings))]

In [45]:
to_upsert

[('chunk0',
  array([ 5.78334220e-02, -6.75550103e-03,  9.24894866e-03,  1.11683728e-02,
         -2.28429120e-02, -2.51556318e-02,  1.00358576e-01,  8.16885475e-03,
          7.16971755e-02, -4.43182178e-02, -9.32135992e-03, -2.75103264e-02,
         -9.21983737e-03, -5.85495569e-02, -5.45555763e-02, -7.85108283e-03,
         -4.30995449e-02, -3.87843139e-02,  4.33577038e-02,  1.43388122e-01,
         -7.75262713e-02, -6.52564690e-03, -4.70346212e-02,  4.47301753e-02,
         -1.35414312e-02,  3.94888073e-02,  4.08106744e-02,  1.41096059e-02,
         -9.67098549e-02,  2.59236228e-02,  4.51406650e-02, -2.46923100e-02,
         -1.10275596e-01,  7.86185488e-02,  6.10525571e-02,  1.32895606e-02,
         -5.99063337e-02,  1.05157867e-01, -4.55460884e-02, -1.79561991e-02,
         -4.02424037e-02, -4.72517274e-02, -2.07573595e-03,  5.16345128e-02,
          1.85528304e-02, -3.80276218e-02, -9.98834074e-02, -1.13133475e-01,
         -1.46053731e-02, -6.13535568e-02,  7.81967305e-03,  1.5

In [46]:
index.upsert(vectors=to_upsert)

{'upserted_count': 55}