In [3]:
import google.generativeai as genai

  from .autonotebook import tqdm as notebook_tqdm


Key Configuration

In [4]:
from dotenv import load_dotenv
import os

In [9]:
# Load variables from a local .env file (if present) into the environment,
# then hand the Gemini API key to the SDK.
load_dotenv()
google_key = os.getenv('GOOGLE_API_KEY')
if not google_key:
    # Fail here with a clear message rather than at the first API call.
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; add it to the environment or a .env file."
    )
genai.configure(api_key=google_key)


Reading and Storing the Document

In [15]:
import fitz

In [16]:
# Function to merge multiple PDFs into one
def merge_pdfs_from_folder(folder_path, output_pdf_path):
    """
    Merges all PDFs in the specified folder into a single PDF.

    Files are merged in sorted file-name order so the result is the same on
    every run (``os.listdir`` returns entries in arbitrary order). The
    extension check is case-insensitive, and the output file itself is
    skipped when it lives inside ``folder_path`` so that re-running the cell
    does not merge a previous result into the new one.

    Args:
        folder_path (str): Path to the folder containing PDFs.
        output_pdf_path (str): Path to save the merged PDF.

    Returns:
        str: Path to the merged PDF.

    Raises:
        ValueError: If the folder contains no PDF files to merge.
    """
    output_abs_path = os.path.abspath(output_pdf_path)

    # Collect the inputs first so an empty folder is reported clearly
    # before any document is created.
    pdf_paths = []
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.lower().endswith(".pdf"):
            continue  # Only consider PDF files
        file_path = os.path.join(folder_path, file_name)
        if not os.path.isfile(file_path):
            continue  # e.g. a directory whose name ends in .pdf
        if os.path.abspath(file_path) == output_abs_path:
            continue  # Never merge a previous output into itself
        pdf_paths.append(file_path)

    if not pdf_paths:
        raise ValueError(f"No PDF files found in {folder_path!r}")

    # Context managers close every document even if a merge step fails.
    with fitz.open() as output_pdf:
        for file_path in pdf_paths:
            with fitz.open(file_path) as input_pdf:
                output_pdf.insert_pdf(input_pdf)
        output_pdf.save(output_pdf_path)

    return output_pdf_path

In [31]:
# Combine every source PDF into one file so it can be read in a single pass.
final_pdf = merge_pdfs_from_folder(
    "Empath-AI/Data/Mental_Health-PDFS",
    "Empath-AI/Data/Final.pdf",
)

In [32]:
# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    """
    Extracts the text of every page of a PDF as one string.

    Args:
        pdf_path (str): Path to the PDF file to read.

    Returns:
        str: The text of all pages concatenated in page order, with no
        separator added between pages.
    """
    # The context manager closes the document once the text has been read,
    # so the file handle is not left open for the rest of the session.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)

In [34]:
# Read back the merged file via the path returned by merge_pdfs_from_folder,
# so the location is written in only one place and cannot drift.
extracted_text = extract_text_from_pdf(final_pdf)

In [39]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

def split_text_into_chunks(text, chunk_size=500, chunk_overlap=50):
    """
    Splits text into overlapping chunks with RecursiveCharacterTextSplitter.

    Args:
        text (str): The text to split.
        chunk_size (int): Value passed to the splitter as ``chunk_size``.
            Defaults to 500.
        chunk_overlap (int): Value passed to the splitter as
            ``chunk_overlap``. Defaults to 50.

    Returns:
        list[str]: The chunks produced by the splitter, in order.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(text)



In [40]:
# Break the full document text into overlapping chunks ready for embedding.
extracted_chunks = split_text_into_chunks(
    extracted_text,
)

In [42]:
# Sanity check: show the first 200 characters of (at most) the first five chunks.
for i, chunk in zip(range(5), extracted_chunks):
    print(f"Chunk {i}: {chunk[:200]}")

Chunk 0: MENTAL 
HEALTH CARE
 in Settings Where Mental Health 
Resources Are Limited
An Easy-Reference 
GUIDEBOOK 
for Healthcare Providers
 in Developed and Developing Countries
PA M E L A  S M I T H ,  M D
 
Chunk 1: An Easy-Reference Guidebook for Healthcare Providers 
in Developed and Developing Countries
 Copyright © 2014 Pamela Smith.
All rights reserved. No part of this book may be used or reproduced by any m
Chunk 2: Archway Publishing books may be ordered through booksellers or by contacting:
Archway Publishing
1663 Liberty Drive
Bloomington, IN 47403
www.archwaypublishing.com
1-(888)-242-5904
Because of the dyna
Chunk 3: views of the publisher, and the publisher hereby disclaims any responsibility for them.
The  eld guide is not a substitute for comprehensive psychiatry, psychology, or 
other related mental health te
Chunk 4: describe generally accepted practices. Application of this information in a particular 
situation remains the responsibility of the practitioner or hea

In [43]:
def generate_gemini_embeddings(chunks, model="models/text-embedding-004"):
    """
    Embeds each non-blank chunk with ``genai.embed_content``.

    Blank (whitespace-only) chunks are skipped. A chunk whose response is not
    a dict containing an ``'embedding'`` key is also skipped, but a
    ``RuntimeWarning`` is emitted so the drop is visible. Because of these
    skips the returned list can be shorter than ``chunks``; do not assume
    the two lists line up index-for-index.

    Args:
        chunks (list[str]): Text chunks to embed.
        model (str): Embedding model name passed to ``genai.embed_content``.
            Defaults to "models/text-embedding-004".

    Returns:
        list: One ``response['embedding']`` value per successfully embedded
        chunk, in input order.
    """
    import warnings  # local import keeps this cell self-contained

    embeddings = []
    for position, chunk in enumerate(chunks):
        if not chunk.strip():  # Ensure the chunk is not empty
            continue

        response = genai.embed_content(model=model, content=chunk)

        # The vector is read directly from the response's 'embedding' key.
        if isinstance(response, dict) and 'embedding' in response:
            embeddings.append(response['embedding'])
        else:
            # Keep going (best effort) but make the dropped chunk visible.
            warnings.warn(
                f"No embedding returned for chunk {position}; it was skipped.",
                RuntimeWarning,
                stacklevel=2,
            )
    return embeddings

In [44]:
# Embed every chunk; this makes one API request per chunk, so it can be slow.
embedded_data = generate_gemini_embeddings(
    extracted_chunks,
)

In [None]:
# Dimensionality of the embedding vectors. The first vector is used so the
# check also works when fewer than five chunks were embedded.
len(embedded_data[0])

In [None]:
load_dotenv()
pinecone_key = os.getenv("PINECONE_API_KEY")
# Never print the key itself: cell output is saved with the notebook and
# would leak the secret to anyone the file is shared with.
print("PINECONE_API_KEY loaded" if pinecone_key else "PINECONE_API_KEY is not set")

In [None]:
from pinecone import Pinecone
# Create the Pinecone client, then get a handle to the "empath-ai" index.
pc = Pinecone(
    api_key=pinecone_key,
)
index = pc.Index("empath-ai")