# Notebook for Databricks integration: semantic search over a PDF with FAISS and HuggingFace embeddings

In [None]:
#Install dependencies

!pip install faiss-cpu transformers pymupdf

## Import required libraries

In [5]:
import fitz  # PyMuPDF
import faiss
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np

## Define functions

In [None]:
def extract_text_from_pdf(pdf_path):
    '''
    Extract the text of every page of a PDF using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A list with one string per page, in page order, each being the
        result of ``page.get_text()``.
    '''
    # Open the document as a context manager so it is closed (and its file
    # handle released) even if extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        return [page.get_text() for page in doc]

In [None]:
def embed_texts(texts, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    '''
    Turn each text into one embedding vector with a HuggingFace transformer.

    The tokenizer and model named by ``model_name`` are loaded on every call.
    Each text is tokenized on its own (truncated to at most 512 tokens) and
    its embedding is the mean of the model's last hidden state over dim 1.

    Args:
        texts: Iterable of strings to embed.
        model_name: HuggingFace model identifier used for both the tokenizer
            and the model.

    Returns:
        A list of NumPy arrays, one per input text, in input order.
    '''
    tok = AutoTokenizer.from_pretrained(model_name)
    encoder = AutoModel.from_pretrained(model_name)

    def _embed_one(text):
        # Encode a single text and mean-pool its hidden states.
        encoded = tok(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        # Inference only: no gradients are needed.
        with torch.no_grad():
            hidden = encoder(**encoded).last_hidden_state
            pooled = hidden.mean(dim=1)
        return pooled.squeeze().numpy()

    return [_embed_one(text) for text in texts]

In [None]:
def build_faiss_index(embeddings):
    '''
    Build a flat L2 FAISS index over the given embedding vectors.

    Args:
        embeddings: Non-empty sequence of equal-length 1-D vectors (for
            example the list returned by ``embed_texts``) or a 2-D array
            with one vector per row.

    Returns:
        A ``faiss.IndexFlatL2`` holding all vectors in input order, so the
        ids returned by a search are positions in ``embeddings``.

    Raises:
        IndexError: If ``embeddings`` is empty.
    '''
    if len(embeddings) == 0:
        raise IndexError("cannot build a FAISS index from an empty list of embeddings")

    # FAISS operates on C-contiguous float32 matrices; convert explicitly so
    # float64 or non-contiguous input is accepted as well.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    dim = vectors.shape[1]

    index = faiss.IndexFlatL2(dim)
    index.add(vectors)
    return index

In [None]:
import numpy as np

def search_index(query, index, texts, model_name="sentence-transformers/all-MiniLM-L6-v2", top_k=3):
    '''
    Return the texts whose embeddings are closest to the query.

    The query is embedded the same way as in ``embed_texts`` (mean of the
    last hidden state over dim 1) so it is comparable with the indexed
    vectors. The tokenizer and model are loaded on every call.

    Args:
        query: Query string.
        index: FAISS index to search; ids it returns are used as positions
            in ``texts``.
        texts: Sequence of texts aligned with the vectors in ``index``.
        model_name: HuggingFace model identifier; should match the model
            used to build the index.
        top_k: Maximum number of results to return.

    Returns:
        A list of at most ``top_k`` entries of ``texts``, in the order the
        index returned them. Fewer entries are returned when the index
        holds fewer than ``top_k`` vectors.
    '''
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    inputs = tokenizer(query, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        query_embedding = model(**inputs).last_hidden_state.mean(dim=1).squeeze().numpy()

    # FAISS expects a 2-D batch of queries: shape (1, dim) for one query.
    query_embedding = np.expand_dims(query_embedding, axis=0)
    _, indices = index.search(query_embedding, top_k)

    # When the index holds fewer than top_k vectors FAISS pads the result
    # with -1. Without this filter texts[-1] would silently return the last
    # text as a bogus match.
    return [texts[i] for i in indices[0] if i >= 0]


## Execution script

In [9]:
# End-to-end example: index one PDF page by page and query it.

# 1. Read the PDF into a list of per-page strings.
pdf_path = "docs/company_policy.pdf"
texts = extract_text_from_pdf(pdf_path)

# 2. Compute one embedding vector per page.
embeddings = embed_texts(texts)

# 3. Put the page embeddings into a FAISS index.
index = build_faiss_index(embeddings)

# 4. Look up the pages closest to a natural-language question.
query = "What is this document about?"
results = search_index(query, index, texts)

# 5. Show each hit followed by a separator line.
for i, res in enumerate(results):
    print(f"Result {i+1}:", res, "-" * 50, sep="\n")


Result 1:
 20
Business Conduct
February 2025
Introduction
Behaviors
Protecting Apple
Accountability
Integrity
Resources
Resources
Apple Antitrust and Competition Law Policy Statement
Anti-Corruption Policy
Apple Customer Privacy Policy
Apple Global Whistleblowing Policy
Apple Human Rights Policy
Apple Supplier Code of Conduct
Apple Third Party Code of Conduct
Apple Public Policy Advocacy Website
Intellectual Property
Investor Relations
Legal Department Contacts
Trademark and Copyright Information
Trademark List
Business Conduct Helpline
(web form and telephone options)

--------------------------------------------------
Result 2:
Contents
4	
Your Responsibilities and Obligation to Take Action
4	
Reporting a Concern
4	
No Retaliation
5	
Your Rights as an Employee
5	
Human Rights
6 	 Drugs and Alcohol
6 	 Environment, Health, and Safety (EH&S)
6 	 Harassment and Discrimination
6 	 Respect
7	
Protecting Apple’s Assets and Confidential Information
7	
Accuracy of Business Records and Fraud
