**Installing and importing dependencies**




In [10]:
!pip install -q transformers sentence_transformers faiss-cpu torch PyPDF2 nltk

In [11]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pandas as pd
import PyPDF2
import os
import nltk
nltk.download('punkt')
nltk.download('punkt_tab') # Add this line to download the missing resource
from nltk.tokenize import sent_tokenize
from google.colab import userdata

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


**Setting up the model and tokenizer**

In [12]:
from google.colab import userdata
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

HUGGING_FACE_ACCESS_TOKEN = userdata.get('HF_TOKEN')

model_name = 'google/gemma-2-2b-it'

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    token=HUGGING_FACE_ACCESS_TOKEN
    ).to('cuda')

tokenizer = AutoTokenizer.from_pretrained(model_name, token=HUGGING_FACE_ACCESS_TOKEN)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

**Extracting and tokenizing info from the PDF files**

In [13]:
def extract_text_from_pdf(pdf_path):
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            text = "".join([page.extract_text() for page in reader.pages])
        return text
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return ""

def split_text_into_chunks(text, max_chunk_size=1000):
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = ""

    for sentence in sentences:
        if len(current_chunk) + len(sentence) <= max_chunk_size:
            current_chunk += sentence + " "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + " "

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks

**Extracting info from the PDFs**

In [14]:
encoder = SentenceTransformer('all-MiniLM-L6-v2')

import nltk
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

# Process PDF files
pdf_file_path = "/content/Enterprise RAG.pdf" # Changed variable name to reflect it's a file path
df_documents = pd.DataFrame(columns=['path', 'text_chunks', 'embeddings'])

# Directly process the single PDF file
# Removed the os.listdir loop as it's for a single file now

text = extract_text_from_pdf(pdf_file_path)
chunks = split_text_into_chunks(text)
document_embeddings = encoder.encode(chunks)
new_row = pd.DataFrame({'path': [pdf_file_path], 'text_chunks': [chunks], 'embeddings': [document_embeddings]})
df_documents = pd.concat([df_documents, new_row], ignore_index=True)

df_documents

Unnamed: 0,path,text_chunks,embeddings
0,/content/Enterprise RAG.pdf,"[2022 Annual ReportDear Fellow Shareholder, \n...","[[-0.060755942, -0.04281306, -0.036390726, -0...."


Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.
Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.
Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.


**Creating a FAISS index from all document embeddings**

In [15]:
all_embeddings = np.vstack(df_documents['embeddings'].tolist())
dimension = all_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(all_embeddings)

**Calculating the embedding distance and generating an answer**

In [16]:
def find_most_similar_chunks(query, top_k=3):
    query_embedding = encoder.encode([query])
    distances, indices = index.search(query_embedding, top_k)
    results = []
    total_chunks = sum(len(chunks) for chunks in df_documents['text_chunks'])
    for i, idx in enumerate(indices[0]):
        if idx < total_chunks:
            doc_idx = 0
            chunk_idx = idx
            while chunk_idx >= len(df_documents['text_chunks'].iloc[doc_idx]):
                chunk_idx -= len(df_documents['text_chunks'].iloc[doc_idx])
                doc_idx += 1
            results.append({
                'document': df_documents['path'].iloc[doc_idx],
                'chunk': df_documents['text_chunks'].iloc[doc_idx][chunk_idx],
                'distance': distances[0][i]
            })
    return results

def generate_response(query, context, max_length=1000):
    prompt = f"Context: {context}\n\nQuestion: {query}\n\nAnswer:"
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to('cuda')

    with torch.no_grad():
        output = model.generate(input_ids, max_new_tokens=max_length, num_return_sequences=1)

    decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)

    # Extracting the answer part by removing the prompt portion
    answer_start = decoded_output.find("Answer:") + len("Answer:")
    answer = decoded_output[answer_start:].strip()

    return answer

def query_documents(query):
    similar_chunks = find_most_similar_chunks(query)
    context = " ".join([result['chunk'].replace("\n", "") for result in similar_chunks])
    response = generate_response(query, context)
    return response, similar_chunks

**Looking for info in the PDFs**

In [17]:
query = 'summarize this PDF?'
answer, relevant_chunks = query_documents(query)

print(f"Query: {query}\n\n-----\n")
print(f"Generated answer: {answer}\n\n-----\n")
print("Relevant chunks:")
for chunk in relevant_chunks:
    print(f"Document: {chunk['document']}")
    print(f"Chunk: {chunk['chunk']}".replace("\n", ""))
    print(f"Distance: {chunk['distance']}")
    print()

Query: summarize this PDF?

-----

Generated answer: This PDF is a section of a company's annual report, likely for a real estate investment trust (REIT). It provides information about the company's stock repurchase program, performance, and financial condition. 

Here's a breakdown of the key points:

**Stock Repurchases:**
* The company has been actively repurchasing its own stock throughout the year.
* The total number of shares purchased, average price paid, and approximate value of shares remaining to be purchased are detailed.

**Performance:**
* A line graph compares the company's common stock performance with the Russell 2000 Index and the FTSE NAREIT All REIT Index from 2017 to 2022.
* This data is based on an investment of $100 in each of the three indices on December 31, 2017, with reinvestment of dividends.

**Financial Condition:**
* The company has declared distributions for its Series C and D Preferred Stock.
* The table shows the distribution dates, total distributions 