In [3]:
import requests
from bs4 import BeautifulSoup
import faiss
import os
import openai
from dotenv import load_dotenv
import pdfplumber
from io import BytesIO
from langchain_openai import OpenAI
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pickle
import numpy as np

# Load environment variables.
# Order matters: the ".config" dotenv file must be loaded before the key is
# read, otherwise the lookup only sees variables that were already set in the
# process environment.
load_dotenv(".config")  # Load the environment variables from the .config file
# Passed as `api_key` to the OpenAIEmbeddings / OpenAI objects created below;
# os.getenv returns None when OPENAI_API_KEY is not defined.
openai_api_key = os.getenv("OPENAI_API_KEY")

# Fetch content
def fetch_text_content(url, timeout=30):
    """Download ``url`` and return its content as plain text.

    PDF responses (detected through the ``Content-Type`` header) are read
    with pdfplumber, one page at a time; any other response is parsed as
    HTML and reduced to its visible text.

    Args:
        url: Address of the PDF document or web page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives
            up and raises a timeout error.

    Returns:
        The extracted text. When the server answers with a status other
        than 200, a human-readable failure message is returned instead
        (see the note below).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
    }

    # Without a timeout an unresponsive server would block the caller forever.
    response = requests.get(url, headers=headers, timeout=timeout)

    if response.status_code != 200:
        # NOTE(review): the failure is reported through the return value, not
        # an exception, so callers must inspect the text before indexing it.
        return f"Failed to fetch the content from URL: {url}, Status Code: {response.status_code}"

    if 'application/pdf' in response.headers.get('Content-Type', ''):
        with pdfplumber.open(BytesIO(response.content)) as pdf:
            # extract_text() can yield None for pages without a text layer
            # (e.g. scanned images); treat those pages as empty.
            page_texts = [page.extract_text() or "" for page in pdf.pages]
        # Separate pages with a newline so the last word of one page does not
        # fuse with the first word of the next.
        return "\n".join(page_texts)

    # Anything that is not a PDF is treated as a web page.
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.get_text()
    
# Preprocess text into chunks
def preprocess_text(text, chunk_size=500, chunk_overlap=50):
    """Split ``text`` into overlapping chunks for embedding.

    Args:
        text: The full document text.
        chunk_size: Maximum size of each chunk, in characters.
        chunk_overlap: Number of characters shared between consecutive
            chunks.

    Returns:
        The list of chunk strings produced by
        ``RecursiveCharacterTextSplitter.split_text``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(text)
    
# Store both text chunks and associated metadata
def store_text_and_metadata(text_chunks, url):
    """Pair every chunk with a metadata record.

    Args:
        text_chunks: Iterable of chunk strings, in document order.
        url: Source address recorded on every record.

    Returns:
        A list with one dict per chunk holding ``chunk_id`` (position in
        ``text_chunks``), ``source``, ``page`` and the chunk ``text``.
    """
    # NOTE(review): "page" is derived from the chunk position (a new value
    # every 10 chunks, starting at 1), not read from the document itself --
    # confirm this approximation is what consumers expect.
    return [
        {
            "chunk_id": position,
            "source": url,
            "page": position // 10 + 1,
            "text": piece,
        }
        for position, piece in enumerate(text_chunks)
    ]

# Generating a FAISS index
def generate_faiss_index(text_chunks_with_metadata):
    """Embed the chunk texts and build a FAISS vector store from them.

    Args:
        text_chunks_with_metadata: Sequence of dicts, each carrying the
            chunk under the ``"text"`` key.

    Returns:
        A ``(faiss_index, metadata)`` tuple: the store returned by
        ``FAISS.from_texts`` and a dict mapping each record's position in
        the input to the record itself.
    """
    # Only the raw text goes to the embedder; the full records are kept on
    # the side, keyed by position.
    chunk_texts = []
    for record in text_chunks_with_metadata:
        chunk_texts.append(record["text"])

    embedder = OpenAIEmbeddings(model="text-embedding-ada-002", api_key=openai_api_key)
    vector_store = FAISS.from_texts(chunk_texts, embedder)

    positional_metadata = dict(enumerate(text_chunks_with_metadata))
    return vector_store, positional_metadata

# Save FAISS index using FAISS's write_index
def save_faiss_index(faiss_index, metadata, file_path, metadata_file_path):
    """Persist the index and its metadata as two separate files.

    Args:
        faiss_index: Object exposing the raw FAISS index as ``.index``.
        metadata: Picklable metadata to store next to the index.
        file_path: Destination of the FAISS index file.
        metadata_file_path: Destination of the pickled metadata.
    """
    # The raw index is written with FAISS's own serializer.
    raw_index = faiss_index.index
    faiss.write_index(raw_index, file_path)

    # The metadata travels separately, as a pickle.
    with open(metadata_file_path, "wb") as sink:
        pickle.dump(metadata, sink)

# Load FAISS index using FAISS's read_index
def load_faiss_index(file_path, metadata_file_path):
    """Read back an index and the metadata saved alongside it.

    Args:
        file_path: Location of the FAISS index file.
        metadata_file_path: Location of the pickled metadata.

    Returns:
        A ``(index, metadata)`` tuple: the object returned by
        ``faiss.read_index`` and the unpickled metadata.
    """
    raw_index = faiss.read_index(file_path)

    # SECURITY: unpickling executes arbitrary code embedded in the file, so
    # only point this at metadata files this application wrote itself.
    with open(metadata_file_path, "rb") as source:
        stored_metadata = pickle.load(source)

    return raw_index, stored_metadata
      
# Initialize a LangChain `OpenAI` LLM wrapper for "gpt-4o" with the API key
# read from the environment.
# NOTE(review): `llm` is not referenced by any function visible in this file;
# summarize_text and query_system call `openai.chat.completions.create`
# directly. Confirm whether this object is still needed.
# NOTE(review): presumably `langchain_openai.OpenAI` targets the completions
# endpoint while "gpt-4o" is a chat model -- verify (ChatOpenAI may be the
# intended class).
llm = OpenAI(model="gpt-4o", api_key=openai_api_key)

# Summarization
def summarize_text(text):
    """Ask the chat model for four focused summaries of ``text``.

    One request is sent per section; each request consists of the section's
    instruction followed by the complete ``text``.

    Args:
        text: The document text to summarize.

    Returns:
        A dict with the keys ``benefits``, ``application_process``,
        ``eligibility`` and ``documents``, each mapped to the content of the
        model's first choice for that section.
    """
    section_prompts = (
        ("benefits", "Extract and summarize the benefits of the scheme:"),
        ("application_process", "Describe the application process for the scheme:"),
        ("eligibility", "Who is eligible for the scheme:"),
        ("documents", "List the documents required for the scheme:"),
    )

    summaries = {}
    for section, instruction in section_prompts:
        user_message = {"role": "user", "content": f"{instruction}\n{text}"}
        completion = openai.chat.completions.create(
            model="gpt-4o",
            messages=[user_message],
        )
        summaries[section] = completion.choices[0].message.content
    return summaries

# Query System 
def query_system(query, faiss_index, metadata, k=1):
    """Answer ``query`` from the chunks most similar to it.

    The query is embedded, the ``k`` nearest vectors are looked up in the
    index, and the chat model is asked to answer the query once per
    retrieved chunk, using that chunk as context.

    Args:
        query: The user's question.
        faiss_index: Index exposing ``search(vectors, k=...)`` and returning
            ``(distances, indices)``.
        metadata: Mapping from vector position to a record with ``"text"``
            and ``"source"`` entries.
        k: Number of nearest chunks to retrieve.

    Returns:
        ``(answers, sources)``: two parallel lists with one model answer and
        one source per retrieved chunk.
    """
    embeddings = OpenAIEmbeddings(model="text-embedding-ada-002", api_key=openai_api_key)

    # FAISS works on float32 row vectors of shape (n_queries, dim).
    query_vector = np.asarray(embeddings.embed_query(query), dtype="float32").reshape(1, -1)

    _, indices = faiss_index.search(query_vector, k=k)

    # FAISS pads the result with -1 when fewer than k neighbours exist;
    # those slots have no metadata and must be skipped.
    hits = [metadata[int(idx)] for idx in indices[0] if idx >= 0]

    answers = []
    for hit in hits:
        # The question has to be sent together with the retrieved chunk;
        # sending the chunk alone gives the model nothing to answer.
        prompt = (
            "Answer the question using only the context below. "
            "If the context does not contain the answer, say so.\n\n"
            f"Context:\n{hit['text']}\n\n"
            f"Question: {query}"
        )
        response = openai.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
        )
        answers.append(response.choices[0].message.content)

    sources = [hit["source"] for hit in hits]
    return answers, sources

PDF content fetched successfully.
Text preprocessed into 47 chunks.
Starting summarization process...
Summarization completed.

Summary for benefits:
The PM Street Vendor’s AtmaNirbhar Nidhi (PM SVANidhi) scheme is a central initiative designed to support and empower street vendors who have been adversely impacted by the COVID-19 pandemic. Here is a summary of its key benefits and features:

1. **Objective and Scope**: 
   - The scheme aims to provide financial assistance by facilitating a working capital loan of up to ₹10,000 to street vendors. This helps them restart their businesses which have been affected by the pandemic.
   - It is intended to incentivize regular loan repayment and promote digital transactions to enhance creditworthiness.

2. **Eligibility**:
   - Targeted at street vendors operating in urban areas since or before March 24, 2020.
   - Eligible beneficiaries should possess a Certificate of Vending or Identity Card issued by Urban Local Bodies (ULBs).
   - Vendors 

In [None]:
# Example of how to use the entire workflow
def run_workflow(
    url="https://mohua.gov.in/upload/uploadfiles/files/PMSVANidhi%20Guideline_English.pdf",
    query="Implementation timeline of the scheme?",
):
    """Run the whole pipeline: fetch, chunk, index, persist, summarize, query.

    Progress and results are printed; nothing is returned. The index and its
    metadata are written to ``faiss_index.index`` and ``faiss_metadata.pkl``
    in the current working directory and read back before querying.

    Args:
        url: Document (PDF or web page) to process.
        query: Question used to check the retrieval step.
    """
    # Step 1: Fetch PDF content
    try:
        text = fetch_text_content(url)
    except Exception as e:
        print(f"Error fetching PDF content: {e}")
        return

    # fetch_text_content signals an HTTP failure by returning a message, not
    # by raising; without this check the message itself would be chunked,
    # embedded and summarized as if it were the document.
    if text.startswith("Failed to fetch the content from URL:"):
        print(f"Error fetching PDF content: {text}")
        return
    if not text.strip():
        print("Error fetching PDF content: no text could be extracted.")
        return
    print("PDF content fetched successfully.")

    # Step 2: Preprocess the text into chunks
    text_chunks = preprocess_text(text)
    print(f"Text preprocessed into {len(text_chunks)} chunks.")

    # Step 3: Store text and metadata
    text_and_metadata = store_text_and_metadata(text_chunks, url)

    # Step 4: Generate FAISS index with both text and metadata
    faiss_index, metadata = generate_faiss_index(text_and_metadata)

    # Step 5: Save the FAISS index and metadata
    save_faiss_index(faiss_index, metadata, "faiss_index.index", "faiss_metadata.pkl")

    # Step 6: Load them back so the query below exercises the persisted copy
    loaded_index, loaded_metadata = load_faiss_index("faiss_index.index", "faiss_metadata.pkl")

    # Step 7: Summarize the text. A failure here is reported but does not
    # stop the query check that follows.
    print("Starting summarization process...")
    try:
        full_text = " ".join(chunk["text"] for chunk in text_and_metadata)
        summaries = summarize_text(full_text)
        print("Summarization completed.")
        for key, summary in summaries.items():
            print(f"\nSummary for {key}:\n{summary}\n")
    except Exception as e:
        print(f"Error during summarization: {e}")

    # Step 8: Query the system to verify that it returns the correct responses
    answers, sources = query_system(query, loaded_index, loaded_metadata)

    print("\nQuery Results:")
    for answer, source in zip(answers, sources):
        print(f"Answer: {answer}")
        print(f"Source: {source}")
    print("\nQuery system verified successfully!")

# Function to inspect FAISS index and associated metadata
def inspect_faiss_index_with_metadata(faiss_index, metadata, max_entries=1):
    """Print the size of the index and a sample of its vectors and metadata.

    Args:
        faiss_index: Index exposing ``ntotal`` and ``reconstruct(i)``.
        metadata: Mapping from vector position to its metadata record;
            positions without a record are printed as ``None``.
        max_entries: Upper bound on how many leading entries are printed.
    """
    num_vectors = faiss_index.ntotal
    print(f"Number of vectors in FAISS index: {num_vectors}")

    # Never ask the index for more vectors than it actually holds.
    for i in range(min(max_entries, num_vectors)):
        # Access vector embeddings
        vector = faiss_index.reconstruct(i)
        print(f"Vector {i}: {vector}")

        # Access associated metadata (stored separately)
        meta_data = metadata.get(i)
        print(f"Metadata for vector {i}: {meta_data}")

# Run the workflow as soon as this cell/script executes.
# Side effects: performs HTTP and OpenAI requests and writes
# "faiss_index.index" and "faiss_metadata.pkl" to the working directory.
run_workflow()