In [1]:
from dotenv import load_dotenv
import os
# Import necessary libraries for document processing, vector embeddings, and interaction with Pinecone
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain_community.vectorstores import Pinecone

from pinecone import Pinecone


In [2]:
# Let python-dotenv populate the process environment, then read the Pinecone
# key from it (the value is None when the variable is not set).
load_dotenv()
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")

In [3]:
# Load every PDF found in the policy directory into one flat list of documents.
pdf_directory='Policies/'
# os.listdir returns entries in arbitrary order. Sorting keeps the document
# order stable between runs, which matters because later cells derive chunk IDs
# from position.
files = sorted(os.listdir(pdf_directory))

pdf_files = [file for file in files if file.endswith('.pdf')]

all_pages = []

for pdf_file in pdf_files:
    pdf_path = os.path.join(pdf_directory, pdf_file)
    pdf_loader = PyPDFLoader(pdf_path)
    pages = pdf_loader.load_and_split()
    all_pages.extend(pages)


print(f'Total PDFs loaded: {len(pdf_files)}')
print(f'Total pages loaded: {len(all_pages)}')
# Only peek at the first element when there is one; indexing an empty list
# would raise IndexError if the directory contains no PDFs.
if all_pages:
    print(type(all_pages[0]))



Total PDFs loaded: 4
Total pages loaded: 43
<class 'langchain_core.documents.base.Document'>


In [4]:
# Flatten the loaded documents into one newline-separated string for chunking.
page_texts = [str(page.page_content) for page in all_pages]
context = "\n".join(page_texts)

In [5]:
# Report the size of the combined text in characters.
print(len(context))

62906


In [6]:
# Cut the combined text into overlapping chunks that will each get one embedding.
CHUNK_SIZE = 3200     # chunk_size handed to the splitter
CHUNK_OVERLAP = 400   # chunk_overlap handed to the splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
texts = text_splitter.split_text(context)

In [7]:
from sentence_transformers import SentenceTransformer



# Checkpoint used for every embedding produced in this notebook.
_EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
# Lazily created model instance, shared by all calls to generate_embeddings.
_embedding_model = None


def generate_embeddings(texts):
    """
    Generate embeddings for a list of texts using the SentenceTransformer model.

    The model is instantiated on the first call and reused afterwards, so
    callers that embed one text at a time do not reload it on every call.

    Parameters:
    texts (list of str): A list of sentences for which to generate embeddings.

    Returns:
    The value returned by `model.encode(texts)`, one embedding per input text
    (a NumPy array of shape (n_texts, embedding_size) with the library defaults).
    """
    global _embedding_model
    if _embedding_model is None:
        _embedding_model = SentenceTransformer(_EMBEDDING_MODEL_NAME)
    return _embedding_model.encode(texts)


In [3]:
# `Pinecone` is imported twice at the top of the notebook; the later
# `from pinecone import Pinecone` is the binding in effect here.
PINECONE_INDEX_NAME = "demo"
pinecone_client = Pinecone(api_key=PINECONE_API_KEY)
pinecone_index = pinecone_client.Index(PINECONE_INDEX_NAME)

In [9]:
# Local mapping from document ID to chunk text. It starts empty here; the
# indexing cell below fills it while upserting to Pinecone and inserting into MySQL.
id_to_text = {}  # Dictionary to map IDs to texts

In [234]:
import mysql.connector
from mysql.connector import Error

# Index every chunk: its embedding goes to Pinecone and its text goes to the
# MySQL `documents` table, both keyed by the chunk's position in `texts`.
# NOTE(review): re-running this cell issues the same INSERTs again; whether that
# duplicates rows depends on the table's keys — verify against the schema.
id_to_text = {}

# Connection settings are read from the environment so that no credential is
# stored in the notebook. MYSQL_PASSWORD deliberately has no default.
mysql_settings = {
    "host": os.getenv("MYSQL_HOST", "localhost"),
    "user": os.getenv("MYSQL_USER", "root"),
    "password": os.getenv("MYSQL_PASSWORD"),
    "database": os.getenv("MYSQL_DATABASE", "policy"),
}

# Bind both handles up front so the `finally` clause cannot hit a NameError
# (and hide the real error) when connect() itself raises.
connection = None
cursor = None
try:
    connection = mysql.connector.connect(**mysql_settings)

    if connection.is_connected():
        cursor = connection.cursor()
        insert_query = "INSERT INTO documents (document_id, text) VALUES (%s, %s)"

        for i, text in enumerate(texts):
            # Embed the chunk and use its position as the shared document ID.
            embedding_list = generate_embeddings([text])[0].tolist()
            document_id = str(i)
            pinecone_index.upsert(vectors=[(document_id, embedding_list)])
            # Keep a local copy of the mapping for use within this session.
            id_to_text[document_id] = text

            # Persist the same ID -> text pair in the database.
            cursor.execute(insert_query, (document_id, text))

        # A single commit after the loop, as before.
        connection.commit()

except Error as e:
    print("Error while connecting to MySQL", e)
finally:
    if connection is not None and connection.is_connected():
        if cursor is not None:
            cursor.close()
        connection.close()
        print("MySQL connection is closed")


MySQL connection is closed


In [235]:
# Embed the question with the same helper used for the document chunks.
query_text="What is the total number of leaves employee can avail in a year?"
query_vectors = generate_embeddings([query_text])
query_embedding = query_vectors[0].tolist()

In [236]:


# Query the index with the question embedding, asking for one match (top_k=1),
# and show the raw response.
query_results = pinecone_index.query(vector=query_embedding, top_k=1)
print(query_results)

{'matches': [{'id': '11', 'score': 0.50785476, 'values': []}],
 'namespace': '',
 'usage': {'read_units': 5}}


In [237]:
# Collect the 'id' of every entry under query_results["matches"], in order.
ids_to_lookup = []
for hit in query_results["matches"]:
    ids_to_lookup.append(hit['id'])

import mysql.connector

def fetch_texts_for_ids(db_connection, ids):
    """
    Look up the stored text for each document ID in the `documents` table.

    Parameters:
    db_connection: An open database connection; only its `.cursor()` method is used.
    ids (iterable of str): Document IDs to fetch.

    Returns:
    dict: Maps each `document_id` returned by the query to its `text`. IDs with
    no matching row are simply absent. An empty `ids` returns an empty dict
    without touching the database.
    """
    ids = tuple(ids)
    if not ids:
        # "IN ()" is not valid SQL, so there is nothing to ask the server.
        return {}

    # One %s placeholder per ID: the values stay parameterized and are never
    # interpolated into the SQL text themselves.
    placeholders = ','.join(['%s'] * len(ids))
    query = f"SELECT document_id, text FROM documents WHERE document_id IN ({placeholders})"

    cursor = db_connection.cursor()
    try:
        cursor.execute(query, ids)
        rows = cursor.fetchall()
    finally:
        # Release the cursor even if the query fails.
        cursor.close()

    # Convert the rows to a dictionary for easy access.
    return {document_id: text for document_id, text in rows}

# Fetch the stored text for the matched IDs.
# Bind the handle first so `finally` is safe even when connect() raises.
connection = None
try:
    connection = mysql.connector.connect(
        # Settings come from the environment so no credential lives in the notebook.
        host=os.getenv("MYSQL_HOST", "localhost"),
        user=os.getenv("MYSQL_USER", "root"),
        password=os.getenv("MYSQL_PASSWORD"),
        # Lowercase, matching the database name the indexing cell writes to.
        database=os.getenv("MYSQL_DATABASE", "policy"),
    )

    if connection.is_connected():
        id_to_text = fetch_texts_for_ids(connection, ids_to_lookup)
finally:
    if connection is not None and connection.is_connected():
        connection.close()


In [238]:
# Rebuild `context` from the retrieved chunks, one per looked-up ID; IDs with no
# stored text contribute a placeholder. This rebinds the `context` name that an
# earlier cell used for the full combined text.
retrieved_texts = [
    id_to_text.get(doc_id, "Content not found")
    for doc_id in ids_to_lookup
]
context = "\n".join(retrieved_texts)


In [239]:
# Show the retrieved text that will be handed to the summarizer.
print(context)

• Total  31 Leaves  Annually  
 
 
21 PL 
Kind  of Leave  
5 SL 
 5 CL
5  
  
Casual  Leaves - 5 in a Year  
 
 
CL leave  may  be used  : 
• For personal  work/family  engagements  
• CL is not carried  forward  for the next  Leave  Year 
• CL is not en-cashable  
• CL can be taken  for a minimum  period  of half day to a maximum  of 1.5 days  in a month.
6  
  
 
Sick Leaves  – 5 in a year  
 
SL leave  may  be used  and intimated :  
• For medical  issues  
• If any employee  avails  more  than  3 sick leaves,  in continuity  he/she  needs  to present  Medical  
certificate  by an Authorized  Doctor.  
• SL is not carried  forward  for the next  Leave  Year 
• SL is not en -cashable
7  
  
Privileged  Leaves  – 21 in a year  
 
• Employees  can earn  21 PL in a year  w.e.f.  Date  of Joining  
• The entitlement  is accrued  at the rate of 1.75  days  per completed  month  of service.  
• The approval  and scheduling  of such  time  off will be subject  to prior Reporting  Manager  A

In [66]:
# Build `context` straight from the query matches, keeping only IDs present in
# id_to_text. Because of that filter the "Content not found" default of .get()
# is never actually used.
matched_texts = [
    id_to_text.get(hit['id'], "Content not found")
    for hit in query_results["matches"]
    if hit['id'] in id_to_text
]
context = "\n".join(matched_texts)
print(context)

• Total  31 Leaves  Annually  
 
 
21 PL 
Kind  of Leave  
5 SL 
 5 CL
5  
  
Casual  Leaves - 5 in a Year  
 
 
CL leave  may  be used  : 
• For personal  work/family  engagements  
• CL is not carried  forward  for the next  Leave  Year 
• CL is not en-cashable  
• CL can be taken  for a minimum  period  of half day to a maximum  of 1.5 days  in a month.
6  
  
 
Sick Leaves  – 5 in a year  
 
SL leave  may  be used  and intimated :  
• For medical  issues  
• If any employee  avails  more  than  3 sick leaves,  in continuity  he/she  needs  to present  Medical  
certificate  by an Authorized  Doctor.  
• SL is not carried  forward  for the next  Leave  Year 
• SL is not en -cashable
7  
  
Privileged  Leaves  – 21 in a year  
 
• Employees  can earn  21 PL in a year  w.e.f.  Date  of Joining  
• The entitlement  is accrued  at the rate of 1.75  days  per completed  month  of service.  
• The approval  and scheduling  of such  time  off will be subject  to prior Reporting  Manager  A

In [69]:
from transformers import AutoTokenizer, BartForConditionalGeneration

# Load the facebook/bart-large-cnn checkpoint and the tokenizer that goes with it.
SUMMARIZER_CHECKPOINT = "facebook/bart-large-cnn"
model = BartForConditionalGeneration.from_pretrained(SUMMARIZER_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(SUMMARIZER_CHECKPOINT)



In [240]:
# Report the size of the current `context` in characters.
print(len(context))

3110


In [241]:
def chunk_text(text, max_length):
    """
    Split `text` into pieces of at most `max_length` whitespace-separated words.

    Parameters:
    text (str): Text to split; runs of whitespace are collapsed by `str.split()`.
    max_length (int): Maximum number of words per piece.

    Returns:
    list of str: The pieces in order, each with its words joined by single spaces.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), max_length):
        window = tokens[start:start + max_length]
        pieces.append(' '.join(window))
    return pieces

def summarize_chunks(chunks):
    """
    Summarize each chunk independently using the notebook-level `tokenizer` and `model`.

    Parameters:
    chunks (list of str): Text pieces to summarize, one generation call per piece.

    Returns:
    list of str: One decoded summary per input chunk, in the same order.
    """
    def _summarize_one(piece):
        # Tokenize a single-item batch; input beyond max_length tokens is truncated.
        encoded = tokenizer([piece], return_tensors="pt", max_length=1024, truncation=True)
        generated = model.generate(
            encoded["input_ids"],
            num_beams=2,
            max_length=1024,
            min_length=5,
        )
        decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
        return decoded[0]

    return [_summarize_one(piece) for piece in chunks]



# Prefix the retrieved text with the question, split it into word-based chunks,
# and summarize each chunk.
# NOTE(review): the chunk size counts words, while summarize_chunks truncates at
# 1024 tokens; a 1024-word chunk can exceed that and lose its tail — confirm.
WORDS_PER_CHUNK = 1024
input_text = f"Question to be Answered: {query_text} document to be summarized: {context}"
chunks = chunk_text(input_text, WORDS_PER_CHUNK)
chunk_summaries = summarize_chunks(chunks)

# Join the per-chunk summaries into a single string.
final_summary = ' '.join(chunk_summaries)
print("Final Summary:", final_summary)


Final Summary: Employees can earn 21 PL in a year w.e.f. Date of Joining. The entitlement is accrued at the rate of 1.75 days per completed month of service. Female Employees are entitled to maternity leave for childbirth and post -natal care.


In [4]:
# Destructive cleanup: asks Pinecone to delete all vectors (delete_all=True).
# NOTE(review): no namespace is passed, so this presumably affects only the
# default namespace — confirm. Rows inserted into the MySQL `documents` table
# are not removed by this call.
pinecone_index.delete(delete_all=True)



{}