In [3]:
#import fitz 
import re
from typing import Optional

from tqdm import tqdm 

from langchain.embeddings import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyMuPDFLoader

# Module-level splitter used by open_and_read_pdf to break page text into chunks.
# Because length_function=len, chunk_size and chunk_overlap are measured in
# characters: chunks of up to 1700 characters with a 150-character overlap.
chuck_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1700,
    chunk_overlap=150,
    length_function=len
)

# Path of the PDF to process. It is relative, so it depends on the directory
# the notebook is run from. The commented-out line is an alternative input.
#pdf_path = "../raw data/pdf/human-nutrition-text.pdf"
pdf_path = "../../raw data/pdf/CMH_Pub_11-1.pdf"

def text_formatter(text: str, min_length: int = 500) -> Optional[str]:
    """Clean the raw text of one PDF page, or reject the page if it is too short.

    The cleaning steps are, in order:
      1. drop everything up to and including the first newline (the first
         line of the page);
      2. collapse every run of whitespace (including newlines) to one space;
      3. expand the 'ﬁ' and 'ﬂ' ligature characters to 'fi' and 'fl'.

    Args:
        text: Raw page text.
        min_length: Minimum length, in characters, the cleaned text must have
            for the page to be kept. Defaults to 500.

    Returns:
        The cleaned text, or None when the cleaned text is shorter than
        ``min_length``.
    """
    # Remove everything before the first '\n'. When the text has no newline,
    # find() returns -1, the slice starts at 0 and the whole text is kept.
    cleaned_text = text[text.find('\n') + 1:]

    # Collapse whitespace runs (spaces, tabs, newlines) into a single space.
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)

    # Expand ligature characters; these are literal replacements, so plain
    # str.replace is enough.
    cleaned_text = cleaned_text.replace('ﬁ', 'fi').replace('ﬂ', 'fl')

    # The length check runs on the cleaned text, after ligature expansion.
    if len(cleaned_text) < min_length:
        return None

    return cleaned_text

def _chunk_record(page_number: int, chunk: str) -> dict:
    """Build the record stored for one chunk of page text, with simple size statistics."""
    return {
        "page_number": page_number,
        "page_char_count": len(chunk),
        "page_word_count": len(chunk.split(" ")),
        "page_sentence_count_raw": len(chunk.split(". ")),
        "page_token_count": len(chunk) / 4,  # 1 token = ~4 chars, see: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
        "text": chunk,
    }


def open_and_read_pdf(pdf_path: str, page_begin: int = 15,
                      page_end: int = 564, chunk_size: int = 200) -> list[dict]:
    """Load a PDF, clean each page in a page range and return chunk records.

    Args:
        pdf_path: Path of the PDF file to load.
        page_begin: Index (0-based position in the loaded page list) of the
            first page to process.
        page_end: Index of the last page to process (inclusive).
        chunk_size: Cleaned page texts longer than this many characters are
            split with the module-level ``chuck_splitter``; shorter texts are
            stored as a single chunk. Note this is only the split threshold,
            the size of the produced chunks is configured on the splitter.

    Returns:
        One dict per chunk with the keys "page_number" (page index relative
        to ``page_begin``), "page_char_count", "page_word_count",
        "page_sentence_count_raw", "page_token_count" and "text". Pages for
        which ``text_formatter`` returns None are skipped.
    """
    loader = PyMuPDFLoader(pdf_path)
    doc = loader.load()

    # Select the requested pages up front so the progress bar total matches
    # the number of pages actually visited. Negative bounds are clamped to 0
    # so they cannot be interpreted as "count from the end" by the slice.
    first = max(page_begin, 0)
    stop = max(page_end + 1, 0)  # page_end is inclusive
    selected_pages = doc[first:stop]

    pages_and_texts = []
    for page_number, page in enumerate(
            tqdm(selected_pages, desc="Processing PDF pages"), start=first):
        text = text_formatter(page.page_content)
        if text is None:
            # Page rejected by the formatter (too little text).
            continue

        if len(text) > chunk_size:
            chunks = chuck_splitter.split_text(text)
        else:
            chunks = [text]

        # Page numbers are reported relative to the first requested page.
        pages_and_texts.extend(
            _chunk_record(page_number - page_begin, chunk) for chunk in chunks
        )
    return pages_and_texts

def upload2mongodb(book_name: str, texts: list[str], embedding_model: OllamaEmbeddings, collection) -> None:
    """Embed each text and insert it into a collection, one document per text.

    Every inserted document has the fields "book_name", "text" and
    "embeddings" (the result of ``embedding_model.embed_query(text)``).
    Documents are inserted one at a time, so texts processed before a
    failure remain in the collection.

    Args:
        book_name: Value stored in the "book_name" field of every document.
        texts: The text chunks to embed and store.
        embedding_model: Object whose ``embed_query(text)`` returns the
            embedding for a text.
        collection: Object exposing ``insert_one(document)``.
    """
    # The context manager closes the progress bar even if embedding or
    # insertion raises part-way through.
    with tqdm(texts, desc="Uploading to MongoDB") as progress:
        for chunk_text in progress:
            collection.insert_one({
                "book_name": book_name,
                "text": chunk_text,
                "embeddings": embedding_model.embed_query(chunk_text),
            })

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
# Show only a short preview: displaying the full list would dump every chunk
# of the book into the cell output.
pages_and_texts[:3]

Processing PDF pages: 565it [00:00, 1291.12it/s]                         


[{'page_number': 0,
  'page_char_count': 1695,
  'page_word_count': 277,
  'page_sentence_count_raw': 19,
  'page_token_count': 423.75,
 {'page_number': 0,
  'page_char_count': 1698,
  'page_word_count': 270,
  'page_sentence_count_raw': 15,
  'page_token_count': 424.5,
  'text': '(Operation BARBA- ROSSA, begun on 22 July 1941 by Field Marshal Walther von Brauchitsch, CinC of the German Army) to crush Soviet forces has ground to a halt on broken line from Lake Ladoga on N to Sea of Azov on S. At the extremities of front, Soviet garrisons of Leningrad and Sevastopol are besieged; on central front Germans are at outskirts of Moscow. Red Army is conducting general counteroffensive (be- gun on 6 December) to drive enemy westward. 3 fresh Soviet armies are exerting pressure against enemy spearheads in vicinity of Moscow. Although assured the support of satellite nations (Finland, Rumania, Hungary), Germans are at a disadvantage because of overextended supply lines and battle exhaustion. WES

In [5]:
import pymongo 
import dotenv 
from langchain_community.embeddings import OllamaEmbeddings

# Read connection settings from the dotenv configuration.
config = dotenv.dotenv_values()
MONGO_URI = config['MONGO_URI']
# NOTE(review): DB_NAME, COLLECTION_NAME, MODEL_NAME and
# ATLAS_VECTOR_SEARCH_INDEX_NAME are read here but not used in this cell; the
# database, collection and model names below are hard-coded instead.
# Confirm which source is intended.
DB_NAME = config['DB_NAME']
COLLECTION_NAME = config['COLLECTION_NAME']
MODEL_NAME = config['MODEL_NAME']
ATLAS_VECTOR_SEARCH_INDEX_NAME = config['ATLAS_VECTOR_SEARCH_INDEX_NAME']

client = pymongo.MongoClient(MONGO_URI)

# Target database and collection for the uploaded chunks.
database = client['WorldWarIIDatabase']
collection = database['VectorCronology']

# Disabled alternative that kept page numbers alongside the text.
# NOTE(review): `insertions` is not defined in the visible cells.
#docs = [{key:d[key] for key in d if key in ["page_number", "text"]} for d in pages_and_texts]
#collection.insert_many(insertions)
# Keep only the chunk text; page numbers and size statistics are dropped here.
docs = [d['text'] for d in pages_and_texts]

# Embedding model passed to upload2mongodb.
embeddings_model = OllamaEmbeddings(model="llama3")

In [6]:
# Embed and upload every chunk. This is the expensive step: the recorded run
# took about 1h50m for 2026 chunks. upload2mongodb inserts one new document
# per text, so re-running this cell inserts the chunks again.
upload2mongodb(
    book_name="United States Army in World War II, Cronology 1941-1945",
    texts=docs,
    embedding_model=embeddings_model,
    collection=collection
)

Uploading to MongoDB: 100%|██████████| 2026/2026 [1:50:11<00:00,  3.26s/it]
