In [None]:
# Import libraries
import os
import gc

import torch
from dotenv import load_dotenv
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM  
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings

from langchain_astradb import AstraDBVectorStore
from langchain.indexes.vectorstore import VectorStoreIndexWrapper


def main(dataset_path=r"D:\Intelligent QA AI\research_docs"):
    """Run the ingestion pipeline: read PDFs, chunk them, store embeddings.

    Args:
        dataset_path: Directory containing the PDF files to ingest. The
            default is the location this script was originally written for.

    Returns:
        None. If any required Astra DB setting is missing, a message is
        printed and the pipeline is not started.
    """
    # Bound at module scope rather than as locals, so that code outside
    # main() can resolve these names.
    global ASTRA_DB_APPLICATION_TOKEN, ASTRA_DB_ID, ASTRA_DB_API_ENDPOINT

    # Load your API secret keys
    load_dotenv()
    required = (
        "ASTRA_DB_APPLICATION_TOKEN",
        "ASTRA_DB_ID",
        "ASTRA_DB_API_ENDPOINT",
    )
    missing = [name for name in required if not os.environ.get(name)]
    if missing:
        # Stop here: continuing without credentials can only fail later,
        # after the expensive PDF parsing has already been done.
        print("Mention your API keys")
        print("Missing environment variables: " + ", ".join(missing))
        return

    ASTRA_DB_APPLICATION_TOKEN = os.environ["ASTRA_DB_APPLICATION_TOKEN"]
    ASTRA_DB_ID = os.environ["ASTRA_DB_ID"]
    ASTRA_DB_API_ENDPOINT = os.environ["ASTRA_DB_API_ENDPOINT"]

    # make Hub downloads resilient on slower links
    # NOTE(review): huggingface_hub appears to read its environment settings
    # when it is first imported, and the imports at the top of this file may
    # already have loaded it -- confirm these assignments take effect.
    os.environ["HF_HUB_DOWNLOAD_TIMEOUT"] = "180"
    os.environ["HF_HUB_DOWNLOAD_RETRY"] = "20"

    all_docs = read_pdfs(dataset_path)

    chunks = generate_chunks(all_docs)

    store_embeddings(chunks)

# Function to read the contents of PDFs
def read_pdfs(dataset_path):
    """Load every PDF file found directly inside *dataset_path*.

    Args:
        dataset_path: Directory to scan. Sub-directories are not descended
            into.

    Returns:
        A list with the documents produced by ``PyPDFLoader`` for each PDF,
        in file-name order. Empty when the directory holds no PDF files.
    """
    all_docs = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # resulting document order reproducible between runs.
    for file_name in sorted(os.listdir(dataset_path)):
        file_path = os.path.join(dataset_path, file_name)

        # Compare the extension case-insensitively so "REPORT.PDF" is not
        # skipped, and ignore directories that merely end in ".pdf".
        if not file_name.lower().endswith(".pdf"):
            continue
        if not os.path.isfile(file_path):
            continue

        loader = PyPDFLoader(file_path, mode="single")

        # mode="single" presumably yields one document per file; extend()
        # keeps whatever is returned and cannot raise IndexError when the
        # loader returns nothing.
        all_docs.extend(loader.load())

    return all_docs

# Function to divide the extracted text into chunks
def generate_chunks(all_docs):
    """Split the loaded documents into overlapping text chunks.

    The splitter is configured with a newline separator, ``chunk_size`` 900,
    ``chunk_overlap`` 100, and ``len`` as the length function.

    Args:
        all_docs: Documents to split.

    Returns:
        The result of ``CharacterTextSplitter.split_documents`` for
        *all_docs*.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 900,
        "chunk_overlap": 100,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_documents(all_docs)
    
# Function to convert the extracted chunks into embeddings and store them in a vector database
def store_embeddings(chunks, api_endpoint=None, token=None):
    """Embed *chunks* and add them to the Astra DB vector store.

    Args:
        chunks: Documents to embed and store.
        api_endpoint: Astra DB API endpoint. When omitted, it is read from
            the ``ASTRA_DB_API_ENDPOINT`` environment variable.
        token: Astra DB application token. When omitted, it is read from the
            ``ASTRA_DB_APPLICATION_TOKEN`` environment variable.

    Returns:
        The ``AstraDBVectorStore`` the chunks were added to.

    Raises:
        KeyError: If a credential is neither passed in nor present in the
            environment.
    """
    # Resolve credentials first, so a missing setting fails immediately
    # instead of after the embedding model has been loaded.
    if api_endpoint is None:
        api_endpoint = os.environ["ASTRA_DB_API_ENDPOINT"]
    if token is None:
        token = os.environ["ASTRA_DB_APPLICATION_TOKEN"]

    embedding = HuggingFaceEmbeddings(model_name="NeuML/pubmedbert-base-embeddings")

    # Setting up vector store
    vstore = AstraDBVectorStore(
        embedding=embedding,
        collection_name="langchain_pdf_query",
        api_endpoint=api_endpoint,
        token=token,
    )

    vstore.add_documents(chunks)

    return vstore

    
# Start the pipeline only when this file is executed as a script.
if "__main__" == __name__:
    main()