In [5]:
# Source documents to ingest, as (url, human-readable title) pairs.
# Each URL appears exactly once: every successful download is embedded and
# upserted under fresh random ids, so a repeated URL would store the same
# content in the index twice.
# NOTE(review): four entries below do not end in ".pdf" and look like HTML
# pages rather than PDF files; PDF parsing will presumably fail for them —
# TODO confirm and either drop them or ingest them with an HTML loader.
pdfs = [
    ("https://www.ug.edu.gh/sites/default/files/documents/APPROVED_COLLEGE_OF_BASIC_AND_APPLIED_SCIENCES-2023-2024-September_1_2023.pdf", "Approved College of Basic and Applied Sciences 2023-2024"),
    ("https://www.ug.edu.gh/sites/default/files/documents/APPROVED_COLLEGE_OF_EDUCATION-2023-2024-September_1_2023.pdf", "Approved College of Education 2023-2024"),
    ("https://www.ug.edu.gh/sites/default/files/documents/Sandwich%202022-2023.pdf", "Sandwich Programs 2022-2023"),
    ("https://www.ug.edu.gh/sites/default/files/documents/UGEL_HOSTELS.pdf", "UGEL Hostels"),
    ("https://www.ug.edu.gh/sites/default/files/documents/RESIDENTIAL_FEES_FOR_UNIVERSITY_HALLS_120124.pdf", "Residential Fees for University Halls"),
    ("https://www.ug.edu.gh/freshers/sites/freshers/files/documents/PAYMENT%20INFO%20-%20GENERAL%20-%20GHANAIANS.pdf", "Payment Info - General - Ghanaians"),
    ("https://www.ug.edu.gh/freshers/sites/freshers/files/documents/CBAS%20-%20GHANAIANS.pdf", "CBAS - Ghanaians"),
    ("https://www.ug.edu.gh/freshers/sites/freshers/files/documents/COE%20-%20GHANAIANS.pdf", "COE - Ghanaians"),
    ("https://www.ug.edu.gh/freshers/sites/freshers/files/documents/CHS%20-%20GHANAIANS.pdf", "CHS - Ghanaians"),
    ("https://www.ug.edu.gh/freshers/sites/freshers/files/documents/COH%20-%20GHANAIANS.pdf", "COH - Ghanaians"),
    ("https://www.ug.edu.gh/freshers/sites/freshers/files/documents/PAYMENT%20INFO%20-%20GENERAL%20-%20INTERNATIONALS.pdf", "Payment Info - General - Internationals"),
    ("https://www.ug.edu.gh/freshers/sites/freshers/files/documents/COE%20-%20INTRENATIONAL%20STUDENTS.pdf", "COE - International Students"),
    ("https://www.ug.edu.gh/freshers/sites/freshers/files/documents/CBAS%20-%20INTRENATIONAL%20STUDENTS.pdf", "CBAS - International Students"),
    ("https://www.ug.edu.gh/freshers/sites/freshers/files/documents/CHS%20%20INTRENATIONAL%20STUDENTS.pdf", "CHS - International Students"),
    ("https://www.ug.edu.gh/freshers/sites/freshers/files/documents/COH%20-%20INTERNATIONAL.pdf", "COH - International Students"),
    ("https://www.ug.edu.gh/freshers/payment-fees/ma-epm", "Economic Policy Management Program"),
    ("https://www.ug.edu.gh/sites/default/files/aad/Humanities%20handbook%202017%20%28website%29-min.pdf", "Humanities Handbook 2017"),
    ("https://www.ug.edu.gh/sites/default/files/aad/humanities%20handbook%202015.pdf", "Humanities Handbook 2015"),
    ("https://www.ug.edu.gh/sites/default/files/aad/chs%20handbbook%202017%20%28website%29-min.pdf", "CHS Handbook 2017"),
    ("https://www.ug.edu.gh/sites/default/files/aad/science%20handbook%202017%20%28website%29-min.pdf", "Science Handbook 2017"),
    ("https://ipo.ug.edu.gh/help/vital-information-for-fresh-international-undergraduate-students", "Vital Information for Fresh International Students"),
    ("https://www.ug.edu.gh/freshers/steps-online-registration-0", "Online Registration Steps"),
    ("https://www.ug.edu.gh/aad/fees", "Fees"),
    ("https://www.ug.edu.gh/sites/default/files/documents/APPROVED_UGBS_SPECIAL_PROGRAMS-2023-2024-September_1_2023.pdf", "Approved UGBS Special Programs 2023-2024"),
    ("https://www.ug.edu.gh/sites/default/files/documents/APPROVED_COLLEGE_OF_HUMANITIES-2023-2024-September_1_2023.pdf", "Approved College of Humanities 2023-2024"),
    ("https://www.ug.edu.gh/sites/default/files/documents/APPROVED_COLLEGES_OF_HEALTH_SCIENCES.pdf", "Approved Colleges of Health Sciences"),
]

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import tiktoken
from pinecone import Pinecone, ServerlessSpec
from langchain_openai import OpenAIEmbeddings
import os
from dotenv import load_dotenv, find_dotenv
from uuid import uuid4
from io import BytesIO
from PyPDF2 import PdfReader
import requests

class CreateEmbedding:
    """Creates OpenAI embeddings for PDF text and stores them in a Pinecone index.

    Credentials and index settings are read from environment variables (after
    loading a ``.env`` file, if one is found) when the class body is executed,
    and are exposed as class attributes. Constructing an instance connects to
    Pinecone, creates the configured index if it is missing, and prepares a
    token-aware text splitter and an embedding client.
    """

    # Executed once, at class-definition time, so the attributes below can be
    # populated from a local .env file as well as from the real environment.
    load_dotenv(find_dotenv())    
    OPENAI_API_KEY:str = os.environ.get("OPENAI_API_KEY")
    PINECONE_API_KEY:str = os.environ.get("PINECONE_API_KEY")
    PINECONE_INDEX_NAME:str = os.environ.get("PINECONE_INDEX_NAME")
    PINECONE_CLOUD_PROVIDER:str = os.environ.get("PINECONE_CLOUD_PROVIDER")
    PINECONE_CLOUD_REGION:str = os.environ.get("PINECONE_CLOUD_REGION")

    def __init__(self, batch_limit:int = 100, vector_dimension:int=1536, splitter_chunk_size:int = 400, splitter_chunk_overlap:int = 100, vector_query_metric:str='dotproduct', model_being_trained_for:str = 'gpt-3.5-turbo', token_encoding_name:str = 'cl100k_base', embedding_model_name:str = 'text-embedding-ada-002', separators:list = ["\n\n", "\n", " ", ""]):
        """Initializes the Pinecone index handle, tokenizer, splitter and embedder.

        Args:
            batch_limit: Number of accumulated chunks that triggers an upsert.
            vector_dimension: Dimension used if the Pinecone index must be created.
            splitter_chunk_size: Maximum chunk length, measured in tokens.
            splitter_chunk_overlap: Overlap between consecutive chunks, in tokens.
            vector_query_metric: Similarity metric used if the index must be created.
            model_being_trained_for: Model name passed to ``tiktoken.encoding_for_model``.
            token_encoding_name: Name of the tiktoken encoding used to count tokens.
            embedding_model_name: OpenAI embedding model used to embed the chunks.
            separators: Separators handed to the recursive text splitter.
        """
        self.batch_limit = batch_limit
        self.model_being_trained_for = model_being_trained_for
        self.token_encoding_name = token_encoding_name
        self.embedding_model_name = embedding_model_name
        self.splitter_chunk_size = splitter_chunk_size
        self.splitter_chunk_overlap = splitter_chunk_overlap
        self.vector_query_metric = vector_query_metric
        self.vector_dimension = vector_dimension
        # Copy the argument so the list object used as the signature default is
        # never shared between instances or mutated through one of them.
        self.separators = list(separators) if separators is not None else None
        # Only the index name is logged. The API key is a secret and must never
        # be printed: notebook outputs are saved and shared with the file.
        print(self.PINECONE_INDEX_NAME)
        self.pinecone = Pinecone(api_key=self.PINECONE_API_KEY)
        # The return value is unused; the call is kept so that a model name
        # tiktoken does not recognise still fails here, as it did before.
        tiktoken.encoding_for_model(self.model_being_trained_for)
        self.tokenizer = tiktoken.get_encoding(self.token_encoding_name)

        # Create the index on first use so later upserts have a target.
        if self.PINECONE_INDEX_NAME not in self.pinecone.list_indexes().names():
            self.pinecone.create_index(
                name=self.PINECONE_INDEX_NAME,
                dimension=self.vector_dimension,
                metric=self.vector_query_metric,
                spec=ServerlessSpec(
                    cloud=self.PINECONE_CLOUD_PROVIDER,
                    region=self.PINECONE_CLOUD_REGION
                )
            )

        self.index = self.pinecone.Index(self.PINECONE_INDEX_NAME)

        # Chunk sizes are measured in tokens (via tiktoken_len), not characters.
        self.text_splitter_recursive = RecursiveCharacterTextSplitter(
            chunk_size=self.splitter_chunk_size,
            chunk_overlap=self.splitter_chunk_overlap,
            length_function=self.tiktoken_len,
            separators=self.separators
        )
        self.embeddings = self.create_embeddings()
        print("Initialized CreateEmbedding class")

    def tiktoken_len(self, text):
        """Returns the number of tokens ``text`` encodes to with the configured tokenizer."""
        # disallowed_special=() lets text that happens to contain special-token
        # strings be encoded as ordinary text instead of raising.
        tokens = self.tokenizer.encode(text, disallowed_special=())
        return len(tokens)

    def create_embeddings(self):
        """Builds the OpenAI embedding client for the configured embedding model.

        Returns:
            An ``OpenAIEmbeddings`` instance using ``self.embedding_model_name``.
        """
        # The model is passed explicitly; previously embedding_model_name was
        # stored but never used, so changing it had no effect.
        embeddings = OpenAIEmbeddings(
            openai_api_key=self.OPENAI_API_KEY,
            model=self.embedding_model_name,
        )
        return embeddings

    def extract_text_from_pdf(self, pdf_data):
        """Extracts the text of every page of a PDF.

        Args:
            pdf_data: Anything ``PdfReader`` accepts (the caller in this
                notebook passes a ``BytesIO`` of the downloaded file).

        Returns:
            A list with one string per page, in page order. Pages that yield no
            text contribute an empty string.
        """
        pdf_reader = PdfReader(pdf_data)
        # Defensive: normalise a missing extraction result to "" so the text
        # splitter downstream always receives a string.
        return [(page.extract_text() or "") for page in pdf_reader.pages]

    def _upsert_batch(self, texts, metadatas):
        """Embeds ``texts`` and upserts them with ``metadatas`` under fresh UUID ids."""
        ids = [str(uuid4()) for _ in texts]
        embeds = self.embeddings.embed_documents(texts)
        # Materialise the triples as a list rather than handing the client a
        # one-shot zip iterator.
        self.index.upsert(vectors=list(zip(ids, embeds, metadatas)))
        print(f"Upserted {len(ids)} vectors into Pinecone index")

    def create_embeddings_with_meta(self, pdf_data, pdf_link, pdf_title):
        """Splits a PDF into chunks, embeds them and upserts them with metadata.

        Each stored vector carries the 1-based page number, the chunk text, the
        document title and the source link as metadata.

        Args:
            pdf_data: PDF content readable by ``extract_text_from_pdf``.
            pdf_link: Source URL recorded in each vector's metadata.
            pdf_title: Document title recorded in each vector's metadata.
        """
        text_by_page = self.extract_text_from_pdf(pdf_data)
        print(f"Got text from: ({pdf_title})")
        texts = []
        metadatas = []
        for page_num, page_text in enumerate(text_by_page, start=1):
            record_texts = self.text_splitter_recursive.split_text(page_text)
            texts.extend(record_texts)
            metadatas.extend({
                "page_number": page_num,
                "text": text,
                "title": pdf_title,
                "link": pdf_link
            } for text in record_texts)

            # Flush whenever enough chunks have accumulated (checked per page).
            if len(texts) >= self.batch_limit:
                self._upsert_batch(texts, metadatas)
                texts = []
                metadatas = []

        # Flush whatever is left after the last page.
        if texts:
            self._upsert_batch(texts, metadatas)
        print(f"Finished processing PDF: ({pdf_title})")

In [7]:
# Download every listed document and index it; a failure on one document is
# reported and does not stop the remaining ones from being processed.
create_embedding = CreateEmbedding()

# Upper bound, in seconds, for connecting to / reading from the server.
# Without a timeout a stalled connection would hang the whole run.
REQUEST_TIMEOUT_SECONDS = 30

for pdf in pdfs:
    pdf_link, pdf_title = pdf
    try:
        response = requests.get(pdf_link, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()  # Raise an exception for HTTP errors
        pdf_data = BytesIO(response.content)
        create_embedding.create_embeddings_with_meta(pdf_data, pdf_link, pdf_title)
        print(f"Processed PDF: {pdf_link}")
    except requests.exceptions.RequestException as e:
        # Covers connection errors, HTTP error statuses and timeouts.
        print(f"Failed to download PDF: {pdf}. Error: {e}")
    except Exception as e:
        # Anything raised while parsing, embedding or upserting the document.
        print(f"An error occurred while processing PDF: {pdf}. Error: {e}")
print("Finished processing all PDFs")

ugbuddy-knowledge-base
[REDACTED: Pinecone API key]
Initialized CreateEmbedding class
Got text from: (Approved College of Basic and Applied Sciences 2023-2024)
Upserted 12 vectors into Pinecone index
Finished processing PDF: (Approved College of Basic and Applied Sciences 2023-2024)
Processed PDF: https://www.ug.edu.gh/sites/default/files/documents/APPROVED_COLLEGE_OF_BASIC_AND_APPLIED_SCIENCES-2023-2024-September_1_2023.pdf
Got text from: (Approved College of Education 2023-2024)
Upserted 9 vectors into Pinecone index
Finished processing PDF: (Approved College of Education 2023-2024)
Processed PDF: https://www.ug.edu.gh/sites/default/files/documents/APPROVED_COLLEGE_OF_EDUCATION-2023-2024-September_1_2023.pdf
Got text from: (Sandwich Programs 2022-2023)
Upserted 5 vectors into Pinecone index
Finished processing PDF: (Sandwich Programs 2022-2023)
Processed PDF: https://www.ug.edu.gh/sites/default/files/documents/Sandwich%202022-2023.pdf
Got text from: (UGEL Hostels)
Upserted 2 