In [1]:
from dotenv import load_dotenv
import os
# Import necessary libraries for document processing, vector embeddings, and interaction with Pinecone
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain_community.vectorstores import Pinecone

from pinecone import Pinecone, PodSpec
from langchain_core.documents.base import Document


In [2]:
# Pull variables from a local .env file into the process environment, then
# read the Pinecone key from it (the value is None when the variable is absent).
load_dotenv()
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")

In [3]:
# Load every PDF found in the policy directory and collect the documents
# produced by PyPDFLoader.load_and_split() into one list.
pdf_directory='Policies/'

# Sort the listing so the PDFs are processed in the same order on every run
# (os.listdir returns entries in arbitrary order, and later cells derive
# chunk IDs from that order).
files = sorted(os.listdir(pdf_directory))

# Match the extension case-insensitively so files such as "Policy.PDF"
# are not silently skipped.
pdf_files = [file for file in files if file.lower().endswith('.pdf')]

# Accumulates the documents from all PDFs
all_pages = []

for pdf_file in pdf_files:
    pdf_path = os.path.join(pdf_directory, pdf_file)
    pdf_loader = PyPDFLoader(pdf_path)
    pages = pdf_loader.load_and_split()
    all_pages.extend(pages)  # Append this PDF's documents to the combined list

# Summary of what was loaded
print(f'Total PDFs loaded: {len(pdf_files)}')
print(f'Total pages loaded: {len(all_pages)}')
# Guard the sample lookup: indexing an empty list would raise IndexError
print(type(all_pages[0]) if all_pages else 'No PDF pages were loaded')



Total PDFs loaded: 4
Total pages loaded: 43
<class 'langchain_core.documents.base.Document'>


In [4]:
# Merge the text of every loaded document into one newline-separated string
context = "\n".join(map(lambda page: str(page.page_content), all_pages))

In [5]:
# Cut the merged text into overlapping character-based chunks; each chunk is
# embedded separately further down.
# NOTE(review): the embedding model used later (all-MiniLM-L6-v2) reportedly
# truncates inputs beyond 256 word pieces — confirm that 3200-character chunks
# are fully represented by their embeddings.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=400,
    chunk_size=3200,
)
texts = text_splitter.split_text(context)

In [6]:
from sentence_transformers import SentenceTransformer
import numpy as np



# Lazily created SentenceTransformer instance shared by all calls
_embedding_model = None


def _get_embedding_model():
    """Return the shared SentenceTransformer, creating it on first use."""
    global _embedding_model
    if _embedding_model is None:
        _embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    return _embedding_model


def generate_embeddings(texts):
    """
    Generate embeddings for a list of texts using the SentenceTransformer model.

    The model is instantiated once and reused, so repeated calls (for example
    one call per chunk) do not reload it each time.

    Parameters:
    texts (list of str): A list of sentences for which to generate embeddings.

    Returns:
    The value returned by the model's ``encode`` method for ``texts``
    (one embedding per input text).
    """
    return _get_embedding_model().encode(texts)


In [7]:
# Create the Pinecone client from the API key and open a handle to the "demo" index
pinecone_client = Pinecone(
    api_key=PINECONE_API_KEY,
)
pinecone_index = pinecone_client.Index("demo")

In [8]:
# Embed every chunk and upsert the vectors into Pinecone, keeping a local
# ID -> text map so query matches can be turned back into their chunk text.
UPSERT_BATCH_SIZE = 100  # vectors per upsert request; keeps each request small

# IDs are the chunk positions rendered as strings: "0", "1", ...
document_ids = [str(i) for i in range(len(texts))]
id_to_text = dict(zip(document_ids, texts))  # Dictionary to map IDs to texts

if texts:  # nothing to embed or upsert when there are no chunks
    # Encode all chunks in one call instead of one call per chunk
    embeddings = generate_embeddings(texts)
    vectors = [
        (document_id, embedding.tolist())
        for document_id, embedding in zip(document_ids, embeddings)
    ]
    # Send the vectors in batches rather than one network request per chunk
    for start in range(0, len(vectors), UPSERT_BATCH_SIZE):
        pinecone_index.upsert(vectors=vectors[start:start + UPSERT_BATCH_SIZE])

In [9]:
# The question to answer, and its embedding as a plain Python list
query_text = "What are the exceptions to Work from home policy?"
# A single input yields a single row; unpack that row from the nested list
(query_embedding,) = generate_embeddings([query_text]).tolist()

In [10]:


# Ask the index for the top 4 matches for the query embedding
query_results = pinecone_index.query(
    top_k=4,
    vector=query_embedding,
)

In [11]:
# Turn each match whose ID is known locally into a Document carrying the chunk
# text; matches with IDs missing from id_to_text are dropped.
# Note: the "page" metadata key holds the match ID.
documents = [
    Document(
        page_content=id_to_text[hit['id']],
        metadata={"score": hit['score'], "page": hit['id']},
    )
    for hit in query_results["matches"]
    if hit['id'] in id_to_text
]


In [12]:
# Join the text of the retrieved documents into one newline-separated string.
# This overwrites the earlier `context`, which held the text of all loaded pages.
context = "\n".join(map(lambda doc: str(doc.page_content), documents))

In [13]:
# Show the first retrieved document as a sanity check.
# Raises IndexError if `documents` is empty (no match had a locally known ID).
print(documents[0])

page_content='Policy Elements  \n● All employees will be required to work for a period of 3 day s/ Week  from the office pr emises or a Total \nof at least 13 working days  from office  premises  in a month.   \n● Additionally,  an employee can avail an extra pre -approved 1 -day WFH  in a week  by the  approval  of \nRespective  Manager and it complies with the 13 day / month condition above . \n● Any exception to the rules  above will have to be approved by the Department Heads.  \nRequesting Work from Home Procedure  \nWhen employees plan to work from home, this procedure must be followed:  \n● Employees file a request through email  to Respective Manager with CC to HR, at least two da ys in \nadvance . The Managers must approve their request considering all elements we mentioned above.  \nDisclaimer  \nThe Company may periodically monitor, review and evaluate the working and efficacy of this  policy, and modify / \nmake improvements in its working, as may be considered appropriate 

In [14]:
from transformers import AutoTokenizer, BartForConditionalGeneration

# Load the tokenizer and the conditional-generation model from the same
# "facebook/bart-large-cnn" checkpoint
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")



In [16]:
# Tokenize the retrieved context as a batch of one. Truncation caps the input
# at the model's configured number of positions, so an over-long context is
# cut off rather than passed to the model at a length it was not built for.
inputs = tokenizer(
    [context],
    return_tensors="pt",
    truncation=True,
    max_length=model.config.max_position_embeddings,
)

# Generate Summary
# The attention mask is forwarded explicitly alongside the token IDs.
summary_ids = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    num_beams=2,
)
# Decode the single generated sequence; as the last expression it is the cell's output
tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

'Employees will be required to work for a period of 3 day s/ Week from the office pr emises. Any exception to the rules  above will have to be approved by the Department Heads. The Company may periodically monitor, review and evaluate the working and efficacy of this policy.'