In [None]:
!pip install -qU pypdf langchain_community PyPDF2 nltk pymupdf transformers pinecone-client chromadb bitsandbytes


In [None]:
!pip install --upgrade transformers bitsandbytes

In [None]:
from langchain_community.document_loaders import PyPDFLoader
import PyPDF2

Read PDF texts

In [None]:
import fitz
import os

# Function to extract text from a single PDF
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        str: The text of all pages joined together; an empty string when the
        document has no pages.
    """
    # The context manager closes the document (and its underlying file handle)
    # even if extraction fails part-way through; previously it was never closed.
    with fitz.open(pdf_path) as pdf_document:
        # join() builds the result in one pass instead of repeated `+=`.
        return "".join(
            pdf_document.load_page(page_num).get_text("text")
            for page_num in range(pdf_document.page_count)
        )

# Folder containing the PDF files
pdf_folder = 'pdf'

# List to store the text from each PDF file
pdf_texts = []

# Loop through all PDF files in the folder.
# sorted(): os.listdir() returns entries in arbitrary order, which would make
# the "first PDF" preview below differ between runs and machines.
for filename in sorted(os.listdir(pdf_folder)):
    # Case-insensitive match so files such as "REPORT.PDF" are not skipped.
    if filename.lower().endswith('.pdf'):
        # Construct full file path
        pdf_path = os.path.join(pdf_folder, filename)

        # Extract text from the current PDF file
        extracted_text = extract_text_from_pdf(pdf_path)

        # Append the extracted text to the list
        pdf_texts.append(extracted_text)

# Sanity check: preview the first 1000 characters of the first document
if pdf_texts:
    print("Text from first PDF file:")
    print(pdf_texts[0][:1000])
    print("\n--- End of text preview ---\n")


Vectorize

In [None]:
import os
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize
from nltk.tokenize import sent_tokenize
from transformers import BertTokenizer, BertModel
from langchain.embeddings.base import Embeddings
import torch

nltk.download('punkt')

def split_text_into_chunks(text, chunk_size=150):
    """Split ``text`` into chunks of at most ``chunk_size`` sentences each.

    The text is split into sentences with NLTK's ``sent_tokenize``; consecutive
    sentences are then grouped and joined with a single space.

    Args:
        text: The text to split.
        chunk_size: Maximum number of sentences per chunk (default 150).

    Returns:
        list[str]: The chunks in document order; empty when no sentences are
        found.

    Raises:
        ValueError: If ``chunk_size`` is not positive. A negative value used to
            yield an empty list, silently dropping the whole text.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    sentences = sent_tokenize(text)
    return [
        ' '.join(sentences[start:start + chunk_size])
        for start in range(0, len(sentences), chunk_size)
    ]

class CustomEmbeddings(Embeddings):
    """Embeddings adapter that mean-pools a transformer's last hidden state.

    Note: vectors are returned as NumPy arrays, not plain Python lists, so
    callers that need lists must convert them (e.g. with ``.tolist()``).
    """

    def __init__(self, model, tokenizer):
        """Keep references to the encoder ``model`` and its ``tokenizer``."""
        self.tokenizer = tokenizer
        self.model = model

    def embed_documents(self, texts):
        """Embed each text in ``texts`` one at a time via ``embed_query``."""
        vectors = []
        for text in texts:
            vectors.append(self.embed_query(text))
        return vectors

    def embed_query(self, text):
        """Embed a single text.

        The input is tokenized with truncation to at most 512 tokens, run
        through the model without gradient tracking, and the last hidden state
        is averaged over dim 1 before being squeezed and converted to NumPy.
        """
        encoded = self.tokenizer(
            text,
            return_tensors="pt",
            max_length=512,
            truncation=True,
            padding=True,
        )
        with torch.no_grad():
            hidden_states = self.model(**encoded).last_hidden_state
            pooled = hidden_states.mean(dim=1)
        return pooled.squeeze().numpy()

# Load the pretrained BERT encoder together with its matching tokenizer
model = BertModel.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Guard: register a [PAD] token only when the tokenizer does not define one
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens(dict(pad_token='[PAD]'))

# Wrap the BERT model/tokenizer pair in the custom embedding adapter
embedding_function = CustomEmbeddings(tokenizer=tokenizer, model=model)

# Directory containing the text files
directory = '/content/sample_data/text'

# Per-file embeddings: one list of chunk vectors for each text file
text_lst = []

# Flat, index-aligned lists covering ALL files. `chunks` and `vectors` used to
# be overwritten on every iteration, so any code reading them after this loop
# (the Pinecone upsert reads both) only ever saw the last file processed.
chunks = []
vectors = []

# Loop over the text files in a deterministic order (os.listdir() is unordered)
for filename in sorted(os.listdir(directory)):
    if not filename.endswith('.txt'):
        continue

    # Read the contents of the file
    file_path = os.path.join(directory, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        pdf_text = file.read()

    # Split the text into chunks
    file_chunks = split_text_into_chunks(pdf_text)

    # Vectorize each chunk using the custom embedding function
    file_vectors = [embedding_function.embed_query(chunk) for chunk in file_chunks]

    text_lst.append(file_vectors)
    chunks.extend(file_chunks)
    vectors.extend(file_vectors)

    # Print a one-line summary rather than dumping every full embedding vector,
    # which floods the cell output.
    print(f"Vectors for {filename}: {len(file_vectors)} chunk(s) embedded")



Pinecone initialization and upsert

In [None]:
from pinecone import Pinecone, ServerlessSpec
import pinecone

import os
from getpass import getpass

# SECURITY: the Pinecone API key was previously hard-coded in this cell. Any
# key that has been committed should be treated as leaked and rotated. The key
# is now read from the PINECONE_API_KEY environment variable, falling back to
# an interactive prompt so it never ends up in the notebook source.

# One-time index creation, kept for reference:
# pinecone_client = Pinecone(api_key=<your API key>)
# pinecone_client.create_index(
#     name="stadtlabor",
#     dimension=768,  # Set this to the dimensionality of your vectors
#     metric="cosine",  # Or "euclidean", depending on your use case
#     spec=ServerlessSpec(
#         cloud='aws',
#         region='us-east-1'  # Choose your region
#     )
# )


from pinecone import Pinecone

pinecone_api_key = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")

# Connect to the existing "hackathon" index
pc = Pinecone(api_key=pinecone_api_key)
index = pc.Index("hackathon")

In [None]:
# Upsert the embeddings in batches instead of issuing one network request per
# vector. Each record keeps its chunk text as metadata under the 'text' key.
UPSERT_BATCH_SIZE = 100

records = [
    {"id": str(i), 'values': vector.tolist(), 'metadata': {'text': chunk}}
    for i, (vector, chunk) in enumerate(zip(vectors, chunks))
]

for start in range(0, len(records), UPSERT_BATCH_SIZE):
    index.upsert(records[start:start + UPSERT_BATCH_SIZE])



Llama initialization

TODO: This part is still a work in progress.

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaTokenizer, LlamaModel, BitsAndBytesConfig
from torch import cuda, bfloat16

import os
from getpass import getpass

# Use the current CUDA device when one is available, otherwise the CPU
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# SECURITY: the Hugging Face access token was previously hard-coded here. Any
# token that has been committed should be treated as leaked and revoked. It is
# now read from the HF_TOKEN environment variable, with an interactive prompt
# as fallback.
token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

# NOTE: this rebinds the names `tokenizer` and `model`, which earlier held the
# BERT tokenizer/model. `embedding_function` keeps its own references to the
# BERT objects, but re-running earlier cells after this one rebinds them again.
# `token=` replaces the deprecated `use_auth_token=` argument.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B", token=token)

# 4-bit NF4 quantization with double quantization; compute in bfloat16.
# (Alternative previously noted here: bnb_4bit_quant_type="fp4".)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

# Place the model with `device_map` at load time instead of calling
# `model.to(device)` afterwards: bitsandbytes-quantized models are put on
# their device during loading, and `.to()` on them is rejected by a number of
# transformers/bitsandbytes version combinations.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B",
    token=token,
    quantization_config=bnb_config,
    device_map=device,
)


In [None]:
# # Save the model
# model_save_path = 'quantized_llama_model.pt'  # specify your desired file path

# # Save the state_dict of the model
# torch.save(model.state_dict(), model_save_path)
# print(f"Model saved to {model_save_path}")

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from langchain.chains import RetrievalQA, LLMChain
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from transformers import pipeline

# Upper bound on generated tokens; without it the pipeline falls back to the
# model's default generation length, which a long RAG prompt can exceed.
MAX_NEW_TOKENS = 256

# Vector store over the existing Pinecone index; chunk text lives under the
# 'text' metadata key written during upsert.
vector_store = Pinecone(index=index, embedding=embedding_function, text_key='text')

prompt_template = PromptTemplate(
    template="Based on the following context, answer the question: {context}\n\nQuestion: {question}\n\nAnswer:",
    input_variables=["context", "question"]
)

# Wrap the Hugging Face text-generation pipeline as a LangChain LLM
generation_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=MAX_NEW_TOKENS,
)
llm = HuggingFacePipeline(pipeline=generation_pipeline)

# `from_chain_type` expects a language model for `llm`, not an LLMChain; the
# custom prompt is supplied to the underlying "stuff" chain through
# `chain_type_kwargs` (previously the prompt was never wired into the chain).
rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt_template},
)

# Ask the question once, via the `query` variable (it used to be defined but
# unused while the same string was repeated in the call).
query = "What is this document about?"
results = rag_chain.invoke({"query": query})

print("Answer:", results['result'])

# for doc in results['source_documents']:
#     print(f"Source Document: {doc.metadata['source']}, Score: {doc.score}")