!nvidia-smi

In [36]:
# Install necessary libraries
!pip install transformers sentence-transformers faiss-cpu PyMuPDF



In [51]:
!pip install torch



In [38]:
!pip install PyMuPDF



In [52]:
import os
import faiss
import json
from transformers import AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer, models
import fitz  # Import fitz from PyMuPDF

In [54]:
# Check CUDA availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Model Name
model_name = "Alibaba-NLP/gte-large-en-v1.5"

# Load the Hugging Face model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
model.to(device)  # Move model to GPU if available

# Create a SentenceTransformer model using the loaded AutoModel and AutoTokenizer
word_embedding_model = models.Transformer(model_name)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
sentence_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
sentence_model.to(device)  # Move SentenceTransformer model to GPU if available

# Display the number of dimensions the model supports for the vector database
embedding_dim = sentence_model.get_sentence_embedding_dimension()
print(f"The model supports {embedding_dim} dimensions for the vector database.")

Using device: cuda
The repository for Alibaba-NLP/gte-large-en-v1.5 contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/Alibaba-NLP/gte-large-en-v1.5.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y
The repository for Alibaba-NLP/gte-large-en-v1.5 contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/Alibaba-NLP/gte-large-en-v1.5.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y
The model supports 1024 dimensions for the vector database.


In [55]:
# Define the data directory
data_dir = "data"

In [56]:
if os.path.exists(data_dir):
    # Iterate over files in the directory
    for filename in os.listdir(data_dir):
        # Print each filename
        print(filename)
else:
    print(f"Directory '{data_dir}' does not exist.")

Diagnostic and statistical manual of mental disorders _ DSM-5 ( PDFDrive.com ).pdf
guideline-170-en.pdf
FirstAid-manual.pdf
Current Essentials of Medicine(1)(1).pdf
essentials-of-human-nutrition1.pdf
disease-handbook-complete.pdf
LN_Pediatrics_final.pdf
Gerontological Nursing.pdf


In [57]:
def read_pdfs(directory):
    documents = []
    for filename in os.listdir(directory):
        if filename.endswith(".pdf"):
            doc_path = os.path.join(directory, filename)
            doc = fitz.open(doc_path)
            num_pages = doc.page_count
            text = [doc[i].get_text() for i in range(num_pages)]
            documents.append({"source": filename, "text": text})
            doc.close()
    return documents

In [58]:
# Load PDFs
documents = read_pdfs(data_dir)

In [59]:
# Define the text splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100, length_function=len, is_separator_regex=False)

# Split documents into chunks
all_chunks = []
metadata = []

In [60]:
for doc in documents:
    doc_name = doc["source"]
    for page_num, page_content in enumerate(doc["text"]):
        chunks = text_splitter.split_text(page_content)
        for chunk_index, chunk in enumerate(chunks):
            all_chunks.append(chunk)
            metadata.append({"pdf_name": doc_name, "pdf_page": page_num, "chunk_index": chunk_index})

In [61]:
# Number of chunks
total_chunks = len(all_chunks)
print(f"Total chunks: {total_chunks}")

# Create embeddings and build the FAISS index
index = faiss.IndexFlatL2(embedding_dim)

Total chunks: 12718


In [62]:
for i, chunk in enumerate(all_chunks):
    # Tokenize the chunk
    inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True)
    inputs = {key: value.to(device) for key, value in inputs.items()}  # Move inputs to GPU

    # Pass the inputs through the model to get embeddings
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pool the embeddings
        embeddings = embeddings.cpu().numpy()  # Move embeddings back to CPU for FAISS

    # Add the vector to the index
    index.add(embeddings)

    # Print progress
    print(f"Processing chunk {i + 1} out of {total_chunks}", end="\r")



In [63]:
# Save the index
faiss.write_index(index, "vector_index.faiss")

# Save the metadata
with open("metadata.json", "w") as f:
    json.dump(metadata, f)

print("\nVector database creation complete.")


Vector database creation complete.


# This is a segregator

In [6]:
!pip install langchain_google_genai PyMuPDF faiss-cpu transformers sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transform

In [7]:
!python --version

Python 3.10.12


In [8]:
import faiss
import json
import torch
from sentence_transformers import SentenceTransformer, models
from transformers import AutoTokenizer
from langchain_google_genai import ChatGoogleGenerativeAI

  from tqdm.autonotebook import tqdm, trange


In [11]:
# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the FAISS index (keeping it on CPU)
index = faiss.read_index("vector_index.faiss")

# Load the metadata
with open("metadata.json", "r") as f:
    metadata = json.load(f)

Using device: cpu


In [12]:
# Load the SentenceTransformer model
model_name = "Alibaba-NLP/gte-large-en-v1.5"  # Change to your specific model if different
tokenizer = AutoTokenizer.from_pretrained(model_name)
word_embedding_model = models.Transformer(model_name)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
sentence_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
sentence_model.to(device)  # Move model to GPU if available

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

The repository for Alibaba-NLP/gte-large-en-v1.5 contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/Alibaba-NLP/gte-large-en-v1.5.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


configuration.py:   0%|          | 0.00/7.13k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- configuration.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


The repository for Alibaba-NLP/gte-large-en-v1.5 contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/Alibaba-NLP/gte-large-en-v1.5.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


modeling.py:   0%|          | 0.00/57.5k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- modeling.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

SentenceTransformer(
  (0): Transformer({'max_seq_length': 8192, 'do_lower_case': False}) with Transformer model: NewModel 
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [13]:
import fitz
data_dir="data"

In [14]:
# Function to perform similarity search
def similarity_search(query, top_k=3):
    # Tokenize the query
    inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True)
    inputs = {key: value.to(device) for key, value in inputs.items()}  # Move inputs to GPU

    # Pass the inputs through the model to get embeddings
    with torch.no_grad():
        query_embedding = sentence_model.encode(query, convert_to_tensor=True)
        query_embedding = query_embedding.unsqueeze(0)  # Ensure it's a 2D array with shape (1, embedding_dim)
        query_embedding = query_embedding.to("cpu").numpy()  # Move embedding back to CPU for FAISS

    # Search the index for the top_k most similar vectors
    distances, indices = index.search(query_embedding, top_k)

    # Retrieve the corresponding chunks and their metadata
    results = []
    for i, idx in enumerate(indices[0]):
        # Fetch the chunk metadata
        chunk_metadata = metadata[idx]
        pdf_name = chunk_metadata["pdf_name"]
        pdf_page = chunk_metadata["pdf_page"]
        chunk_index = chunk_metadata["chunk_index"]
        result = {
            "chunk": {
                "pdf_name": pdf_name,
                "pdf_page": pdf_page,
                "chunk_index": chunk_index
            },
            "distance": distances[0][i]
        }
        results.append(result)

    return results

In [15]:
# Function to extract text from a PDF page
def extract_text_from_page(pdf_path, page_num):
    doc = fitz.open(pdf_path)
    text = doc[page_num].get_text()
    doc.close()
    return text

In [25]:
# Function to generate a response using the ChatGoogleGenerativeAI model
def generate_response(context, query):
    # Initialize the ChatGoogleGenerativeAI model
    google_model = ChatGoogleGenerativeAI(
        model="gemini-1.5-flash",
        google_api_key="your_gemini_api_key",
        temperature=0.6
    )

    # Combine context and query into the message history
    messages = [
        {"role": "user", "content": context},
        {"role": "user", "content": query}
    ]

    # Generate the response
    response = google_model.invoke(messages)
    return response.content  # Correctly extract the response content

In [26]:
# Define your query
query = "What is Schizophrenia?"

In [27]:
!curl ipinfo.io

{
  "ip": "34.125.70.200",
  "hostname": "200.70.125.34.bc.googleusercontent.com",
  "city": "Las Vegas",
  "region": "Nevada",
  "country": "US",
  "loc": "36.1750,-115.1372",
  "org": "AS396982 Google LLC",
  "postal": "89111",
  "timezone": "America/Los_Angeles",
  "readme": "https://ipinfo.io/missingauth"
}

In [28]:
# Perform similarity search to get the top 3 relevant chunks
results = similarity_search(query, top_k=3)

# Prepare the context from the search results
context = ""
visited_pages = set()  # To keep track of visited pages and avoid duplication

for res in results:
    chunk = res["chunk"]
    pdf_name = chunk["pdf_name"]
    pdf_page = chunk["pdf_page"]
    page_key = (pdf_name, pdf_page)

    if page_key not in visited_pages:
        visited_pages.add(page_key)
        pdf_path = f"{data_dir}/{pdf_name}"
        page_text = extract_text_from_page(pdf_path, pdf_page)

        context += f"Document: {pdf_name}, Page: {pdf_page}\n"
        context += f"Text:\n{page_text}\n\n\n"

# Show the contexts fetched
print("Contexts Fetched:\n")
print(context)

# Generate the response
response = generate_response(context, query)

# Print the response
print("\nResponse:\n", response)

Contexts Fetched:

Document: Diagnostic and statistical manual of mental disorders _ DSM-5 ( PDFDrive.com ).pdf, Page: 131
Text:
87
Schizophrenia Spectrum and
Other Psychotic Disorders
Schizophrenia spectrum and other psychotic disorders include schizophrenia,
other psychotic disorders, and schizotypal (personality) disorder. They are defined by ab-
normalities in one or more of the following five domains: delusions, hallucinations, disor-
ganized thinking (speech), grossly disorganized or abnormal motor behavior (including
catatonia), and negative symptoms.
Key Features That Define the Psychotic Disorders
Delusions
Delusions are fixed beliefs that are not amenable to change in light of conflicting evidence.
Their content may include a variety of themes (e.g., persecutory, referential, somatic, reli-
gious, grandiose). Persecutory delusions (i.e., belief that one is going to be harmed, harassed,
and so forth by an individual, organization, or other group) are most common. Referential
d