In [1]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
import fitz  # PyMuPDF
from langchain.text_splitter import TokenTextSplitter
import datetime
from rich.console import Console
from keybert import KeyBERT

In [2]:
# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text += page.get_text()
    return text

In [18]:
# Function to enrich metadata and log history
def enrich_metadata(pdf_path):
    # Extract text from the PDF
    pdf_text = extract_text_from_pdf(pdf_path)
    
    # Initialize the TokenTextSplitter with desired chunk size and overlap
    text_splitter = TokenTextSplitter(chunk_size=50, chunk_overlap=0)
    chunks = text_splitter.split_text(pdf_text)
    
    # Initialize the CountVectorizer with stop words removal
    vectorizer_model = CountVectorizer(stop_words="english")
    
    # Fit the vectorizer on the text to learn the vocabulary
    vectorizer_model.fit([pdf_text])
    
    # Create and fit the BERTopic model using the CountVectorizer
    topic_model = BERTopic(vectorizer_model=vectorizer_model)
    topics, probs = topic_model.fit_transform(chunks)
    
    # Get information about the identified topics
    topic_info = topic_model.get_topic_info()
    
    # Metadata for the PDF document
    title = 'The Solar System'
    author = 'xyz'
    url = 'http://localhost:8888/edit/Documents/Python/solarsystem.pdf'
    
    # Log metadata and topics into a list
    keys = []
    for i in range(len(chunks)):
        text = chunks[i]
        keywords = extract_keys(text, 1, 0.34)  # Extract keywords using KeyBERT
        
        keys.append({
            'document': pdf_path,
            'title': title,
            'author': author,
            'url': url,
            'doc': text,
            'keywords': keywords,
            'topics': topic_info
        })
        
    return keys

In [19]:
# Define the writehistory function 
def writehistory(logging_text):
    with open("history.log", "a", encoding="utf-8") as log_file:
        log_file.write(logging_text + "\n")

In [20]:
# Function to extract keywords using KeyBERT
def extract_keys(text, ngram, dvsity):
    kw_model = KeyBERT(model='intfloat/multilingual-e5-base')
    a = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, ngram), stop_words='english',
                                  use_mmr=True, diversity=dvsity, highlight=True)  # highlight=True
    tags = [kw[0] for kw in a]
    timestamped = datetime.datetime.now()
    # LOG THE TEXT AND THE METATAGS
    logging_text = f"LOGGED ON: {str(timestamped)}\nMETADATA: {str(tags)}\nsettings: keyphrase_ngram_range (1,{str(ngram)})  Diversity {str(dvsity)}\n---\nORIGINAL TEXT:\n{text}\n---\n\n"
    writehistory(logging_text)
    return tags

In [21]:
# Example usage
pdf_path = '/Users/pradhikshasuresh/Documents/Python/The Solar System.pdf'
enriched_metadata = enrich_metadata(pdf_path)
print(enriched_metadata[2])  # Print metadata and keywords for the third chunk

{'document': '/Users/pradhikshasuresh/Documents/Python/The Solar System.pdf', 'title': 'The Solar System', 'author': 'xyz', 'url': 'http://localhost:8888/edit/Documents/Python/solarsystem.pdf', 'doc': " and light to support life on Earth. It is by far the largest object in the solar system, containing more \nthan 99.8% of the total mass. The Sun's immense gravity keeps the planets and other objects in orbit around \n", 'keywords': ['planets', 'gravity', 'solar', 'mass', 'largest'], 'topics':    Topic  Count                         Name  \
0     -1     16  -1_planet_planets_solar_sun   

                                      Representation  \
0  [planet, planets, solar, sun, jupiter, dwarf, ...   

                                 Representative_Docs  
0  [planets, their moons, dwarf planets, and coun...  }
