In [32]:
def chunk_text(text, chunk_size=500, overlap=50):
    """
    Splits text into fixed-size character chunks with optional overlap.

    (Intended to live in chunk_controller.py.)

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so each
    chunk repeats the last ``overlap`` characters of the previous one. Chunking
    stops as soon as a chunk reaches the end of the text, so the result never
    ends with a fragment that is already fully contained in the previous chunk.

    Args:
        text (str): The input text to split.
        chunk_size (int): The maximum size of each chunk. Must be positive.
        overlap (int): The number of overlapping characters between chunks.
            Must satisfy ``0 <= overlap < chunk_size``.
    Returns:
        list: A list of text chunks (empty when ``text`` is empty).
    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is outside
            ``[0, chunk_size)``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # overlap >= chunk_size would make the step zero or negative and the loop
    # below would never terminate; a negative overlap would silently skip text.
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    text_length = len(text)
    chunks = []
    start = 0
    while start < text_length:
        end = min(start + chunk_size, text_length)
        chunks.append(text[start:end])
        if end == text_length:
            # The tail of the text is covered; another iteration would only
            # re-emit characters already present in this chunk.
            break
        start += step
    return chunks


In [33]:
from flask import Flask, request, jsonify
import os
# upload_controller.py
# Directory that uploaded files are written to; created when this cell runs
# if it does not exist yet.
UPLOAD_FOLDER = 'data/uploads'
os.makedirs(UPLOAD_FOLDER, exist_ok=True)

# Flask application object used to register the HTTP routes in this cell.
app = Flask(__name__)

@app.route('/upload', methods=['POST'])
def upload_file():
    """
    Handles a multipart PDF upload posted to /upload.

    Expects the uploaded file under the form field ``file``. The file is
    stored in UPLOAD_FOLDER under the final path component of the
    client-supplied name.

    Returns:
        tuple: ``(JSON response, HTTP status)`` - 200 with the saved path on
        success, 400 with an ``error`` message when the part is missing, no
        file was selected, or the file is not a PDF.
    """
    if 'file' not in request.files:
        return jsonify({"error": "No file part"}), 400
    file = request.files['file']
    if file.filename == '':
        return jsonify({"error": "No selected file"}), 400

    # The filename is controlled by the client. Keep only its last path
    # component (treating backslashes as separators too) so names such as
    # "../../app.pdf" cannot escape UPLOAD_FOLDER.
    raw_name = file.filename or ''
    filename = os.path.basename(raw_name.replace('\\', '/'))

    # Extension check is case-insensitive so "REPORT.PDF" is accepted as well.
    if not filename.lower().endswith('.pdf'):
        return jsonify({"error": "Invalid file type. Only PDFs are allowed."}), 400

    filepath = os.path.join(UPLOAD_FOLDER, filename)
    file.save(filepath)
    return jsonify({"message": "File uploaded successfully", "path": filepath}), 200


In [34]:
!pip install PyPDF2
#ingestion_controller.py
from PyPDF2 import PdfReader

def extract_text_from_pdf(filepath):
    """
    Reads a PDF and returns the text of all its pages as one string.

    The per-page results of ``extract_text()`` are concatenated in page
    order with no separator between pages.

    Args:
        filepath (str): Path to the PDF file.
    Returns:
        str: Extracted text from the PDF.
    """
    pdf = PdfReader(filepath)
    return "".join(pdf_page.extract_text() for pdf_page in pdf.pages)




In [35]:
from sentence_transformers import SentenceTransformer
# embedding_controller.py
# Sentence-embedding model instantiated once when this cell runs and reused by
# generate_embeddings.
model = SentenceTransformer('all-MiniLM-L6-v2')  # You can replace this with another Sentence Transformers model

def generate_embeddings(chunks):
    """
    Encodes text chunks with the module-level ``model``.

    Args:
        chunks (list): List of text chunks.
    Returns:
        The result of ``model.encode(chunks, convert_to_tensor=True)``, i.e.
        the embeddings for the given chunks in tensor form.
    """
    return model.encode(chunks, convert_to_tensor=True)


In [36]:
!pip install faiss-cpu
import faiss
import os
#vector_controller.py
def create_faiss_index(embeddings):
    """
    Creates a flat L2 FAISS index from embeddings.

    Args:
        embeddings: A 2-D collection of embedding vectors, one row per
            vector. May be a PyTorch tensor (it is detached and moved to the
            CPU) or anything NumPy can convert to a 2-D array.
    Returns:
        faiss.Index: The created FAISS index containing all rows.
    Raises:
        ValueError: If ``embeddings`` is empty or not two-dimensional.
    """
    import numpy as np  # local import: NumPy is not imported at the top of this cell

    # Tensors expose detach(); detach first so tensors that track gradients
    # can be converted, then move to host memory.
    if hasattr(embeddings, "detach"):
        embeddings = embeddings.detach().cpu().numpy()

    # Hand FAISS a C-contiguous float32 matrix regardless of the input dtype.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2 or vectors.shape[0] == 0:
        raise ValueError(
            "embeddings must be a non-empty 2-D collection of vectors, "
            f"got shape {vectors.shape}"
        )

    dim = vectors.shape[1]  # Dimension of embeddings
    index = faiss.IndexFlatL2(dim)
    index.add(vectors)
    return index

def save_faiss_index(index, path):
    """
    Saves a FAISS index to a file, creating the target directory if needed.

    Args:
        index (faiss.Index): The FAISS index to save.
        path (str | os.PathLike): File path to save the index.
    """
    path = os.fspath(path)  # accept pathlib.Path as well as str
    directory = os.path.dirname(path)
    if directory:
        # Writing fails when the parent directory is missing, so create it here
        # instead of relying on callers to do so beforehand.
        os.makedirs(directory, exist_ok=True)
    faiss.write_index(index, path)

def load_faiss_index(path):
    """
    Reads a previously saved FAISS index from disk.

    Args:
        path (str): File path to load the index.
    Returns:
        faiss.Index: The loaded FAISS index.
    Raises:
        FileNotFoundError: If nothing exists at ``path``.
    """
    # Guard clause: bail out early when the path is missing.
    if not os.path.exists(path):
        raise FileNotFoundError(f"Index file not found at {path}")
    return faiss.read_index(path)




In [41]:
import os
from pinecone import Pinecone, ServerlessSpec

# Load Pinecone API key from environment variables.
# SECURITY: a literal API key used to be hardcoded on this line. Treat that key
# as compromised and rotate it; never paste credentials into the notebook.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")

# Ensure the required directory exists
os.makedirs("data/vector_store", exist_ok=True)

# Call the function with your PDF
# NOTE(review): process_pdf_pipeline is defined in a later cell, so this call
# only succeeds if that cell has already been run in the current kernel; on a
# fresh "Restart & Run All" it raises NameError. Consider moving it below the
# definition.
process_pdf_pipeline("/content/drive/MyDrive/214M1A3108.pdf")

# Initialize the Pinecone client (fail with a clear message if the key is missing)
if not PINECONE_API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY environment variable is not set; "
        "export it before initializing the Pinecone client."
    )
pc = Pinecone(api_key=PINECONE_API_KEY)

def upsert_to_pinecone(index_name, embeddings, ids, texts):
    """
    Upserts vectors to a Pinecone index.

    Args:
        index_name (str): The name of the Pinecone index.
        embeddings (list): A list of embeddings.
        ids (list): A list of IDs corresponding to the embeddings.
        texts (list): A list of metadata text fields for each embedding.

    Returns:
        dict: A dictionary indicating success or failure.
    """
    try:
        # Check if the index exists
        if index_name not in pc.list_indexes().names():
            # Create the index
            pc.create_index(
                name=index_name,
                dimension=len(embeddings[0]),
                metric="euclidean",  # Adjust metric as needed (e.g., cosine)
                spec=ServerlessSpec(
                    cloud="aws",
                    region="us-west-2"  # Adjust the region as needed
                )
            )

        # Get the index instance
        index = pc.index(index_name)

        # Prepare the vectors for upsert
        vectors = [{"id": str(id_), "vector": embedding, "metadata": {"text_field": text}}
                   for id_, embedding, text in zip(ids, embeddings, texts)]

        # Upsert the vectors into Pinecone
        index.upsert(vectors)

        return {"message": f"Upserted {len(embeddings)} vectors to Pinecone index '{index_name}'"}
    except Exception as e:
        print(f"Error upserting to Pinecone: {str(e)}")
        return {"error": str(e)}



--- Starting PDF Processing Pipeline ---

[1/4] Extracting text from PDF...
Text extraction completed.
[2/4] Splitting text into chunks...
Text split into 7 chunks.
[3/4] Generating embeddings for chunks...
Embeddings generation completed.
[4/4] Storing embeddings in a FAISS index...
FAISS index saved locally.

--- PDF Processing Pipeline Completed ---



In [42]:
"""from src.models.controllers.ingestion_controller import extract_text_from_pdf
from src.models.controllers.chunk_controller import chunk_text
from src.models.controllers.embedding_controller import generate_embeddings
from src.models.controllers.vector_controller import create_faiss_index, save_faiss_index
from src.models.controllers.pinecone_controller import upsert_to_pinecone"""

def process_pdf_pipeline(filepath, use_pinecone=False):
    """
    Orchestrates the PDF processing pipeline.

    Steps: extract text from the PDF, split it into chunks, embed the chunks,
    then store the embeddings either in Pinecone or in a local FAISS index at
    ``data/vector_store/index.faiss``. Progress is printed for each step.

    Args:
        filepath (str): Path to the PDF file.
        use_pinecone (bool): Whether to store embeddings in Pinecone. Defaults to False.

    Raises:
        ValueError: If no text chunks could be produced from the PDF.
        RuntimeError: If the Pinecone upload reports an error.
    """
    print("\n--- Starting PDF Processing Pipeline ---\n")

    # Step 1: Extract text
    print("[1/4] Extracting text from PDF...")
    text = extract_text_from_pdf(filepath)
    print("Text extraction completed.")

    # Step 2: Chunk the text
    print("[2/4] Splitting text into chunks...")
    chunks = chunk_text(text)
    print(f"Text split into {len(chunks)} chunks.")
    if not chunks:
        # Nothing to embed or index; fail here with a clear message instead of
        # erroring obscurely in a later step.
        raise ValueError(f"No text could be extracted from {filepath}")

    # Step 3: Generate embeddings
    print("[3/4] Generating embeddings for chunks...")
    embeddings = generate_embeddings(chunks)
    print("Embeddings generation completed.")

    # Step 4: Store embeddings
    if use_pinecone:
        print("[4/4] Uploading embeddings to Pinecone...")
        # The chunk text is passed along so it is stored as metadata with each vector.
        result = upsert_to_pinecone(
            "pdf-compliance-index",
            embeddings,
            ids=range(len(chunks)),
            texts=chunks,
        )
        # upsert_to_pinecone reports failures via an "error" key instead of
        # raising, so check it before claiming success.
        if isinstance(result, dict) and "error" in result:
            raise RuntimeError(f"Pinecone upload failed: {result['error']}")
        print("Embeddings uploaded to Pinecone.")
    else:
        print("[4/4] Storing embeddings in a FAISS index...")
        index = create_faiss_index(embeddings)
        save_faiss_index(index, 'data/vector_store/index.faiss')
        print("FAISS index saved locally.")

    print("\n--- PDF Processing Pipeline Completed ---\n")


In [43]:
# Run the pipeline on a sample PDF. use_pinecone is left at its default (False),
# so the embeddings are written to the local FAISS index.
# NOTE(review): the path looks like a Colab Google Drive mount - adjust it for
# other environments.
process_pdf_pipeline("/content/drive/MyDrive/214M1A3108.pdf")


--- Starting PDF Processing Pipeline ---

[1/4] Extracting text from PDF...
Text extraction completed.
[2/4] Splitting text into chunks...
Text split into 7 chunks.
[3/4] Generating embeddings for chunks...
Embeddings generation completed.
[4/4] Storing embeddings in a FAISS index...
FAISS index saved locally.

--- PDF Processing Pipeline Completed ---

