In [14]:
import openai
import PyPDF2
import os
import re
import math
from nltk.tokenize import sent_tokenize
from tqdm import tqdm
# import spacy
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
# Configure the OpenAI client from the environment; the key is None when
# OPENAI_API_KEY is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [None]:
def load_pdf_text(pdf_path):
    """Return the extracted text of every page of a PDF, concatenated in order.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        A single string with the page texts joined without a separator.
        A page whose extraction yields a falsy value (``None`` or ``""``)
        contributes an empty string.
    """
    with open(pdf_path, "rb") as pdf_stream:
        pdf_reader = PyPDF2.PdfReader(pdf_stream)
        # Extract inside the ``with`` block so the file is still open while
        # the pages are being read.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    return "".join(page_texts)


def encode_chunks(chunks, nlp):
    """Turn each text chunk into the vector of its processed document.

    Args:
        chunks: Iterable of text chunks.
        nlp: Callable (for example a loaded spaCy pipeline) that takes a
            chunk and returns an object exposing a ``vector`` attribute.

    Returns:
        A list holding ``nlp(chunk).vector`` for every chunk, in input order.
    """
    return [nlp(piece).vector for piece in chunks]

In [None]:
def main(pdf_path, question="what is add/drop period?", top_k=5):
    """Build and print a retrieval prompt for a question about a PDF.

    The PDF text is split into overlapping chunks, each chunk and the
    question are embedded with the spaCy ``en_core_web_md`` model, and the
    chunks most similar to the question (cosine similarity) are listed in
    the printed prompt.

    Args:
        pdf_path: Path of the PDF file to read.
        question: Question to answer from the PDF. Defaults to the query
            that was previously hard-coded.
        top_k: Number of most similar chunks to include. Defaults to 5.

    Raises:
        ValueError: If no text chunks could be produced from the PDF.
    """
    # Imported locally: the module-level ``import spacy`` is commented out,
    # so ``spacy.load`` below would otherwise raise NameError.
    import spacy

    pdf_text = load_pdf_text(pdf_path)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,  # Adjust as needed
        chunk_overlap=200,  # Adjust as needed
    )
    text_chunks = text_splitter.split_text(pdf_text)

    # Fail early with a clear message instead of an opaque error from
    # cosine_similarity when there is nothing to compare against.
    if not text_chunks:
        raise ValueError(
            f"No extractable text found in {pdf_path!r}; nothing to search."
        )

    nlp = spacy.load("en_core_web_md")
    encoded_chunks = encode_chunks(text_chunks, nlp)
    question_vector = nlp(question).vector

    # One row of similarities: the question against every chunk.
    similarities = cosine_similarity([question_vector], encoded_chunks).flatten()

    # Indices sorted by descending similarity, truncated to the best top_k.
    top_idx = similarities.argsort()[::-1][:top_k]
    top_text = [text_chunks[i] for i in top_idx]

    prompt_parts = [
        "Please answer the following questions based on the provided text:\n\n",
        f"Q: {question}\n\n",
        f"Top {top_k} most relevant text chunks:\n\n",
    ]
    prompt_parts.extend(
        f"{rank}. {text}\n\n" for rank, text in enumerate(top_text, start=1)
    )
    prompt = "".join(prompt_parts)

    print(prompt)

    # # Print the results
    # for i, (chunk, encoded) in enumerate(zip(text_chunks, encoded_chunks)):
    #     print(f"Chunk {i + 1}:\n{chunk}\n")
    #     print(f"Encoded Vector {i + 1}:\n{encoded}\n")

# Script entry point: runs main() on one PDF when the file is executed
# directly (not when imported).
if __name__ == "__main__":
    # NOTE: this is an absolute path on one specific machine; it must be
    # changed before running anywhere else.
    pdf_file_path = "/Users/cpming/Downloads/Freshmen-Handbook_2022-23_Final.pdf"  # Replace with your PDF file path
    main(pdf_file_path)

