# In [21]:
from langchain_community.document_loaders.pdf import PyPDFLoader
import os
import re
import pandas as pd
import config
import cohere
import math
import pymongo

def batch_embed(client, objects, batch_size=96, *, model="embed-english-v3.0",
                input_type="search_document"):
    """Embed ``objects`` in consecutive batches and return all embeddings.

    Args:
        client: Object exposing ``embed(texts=..., model=..., input_type=...)``
            whose return value has an ``embeddings`` attribute (the caller in
            this file passes a ``cohere.Client``).
        objects: Sliceable sequence of texts to embed.
        batch_size: Maximum number of texts sent per ``embed`` call.
            NOTE(review): the default of 96 presumably mirrors the provider's
            per-request limit -- confirm against the API reference.
        model: Model name forwarded to ``client.embed``.
        input_type: Input type forwarded to ``client.embed``.

    Returns:
        A flat list with the embeddings of every batch, in input order.
        An empty ``objects`` yields an empty list without calling the client.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A zero batch size used to fail with ZeroDivisionError and a negative one
    # silently returned no embeddings at all; reject both explicitly.
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    embeds = []
    # Stepping the range by batch_size gives each batch's start index; slicing
    # clamps at the end of the sequence, so the last batch may be shorter.
    for start_index in range(0, len(objects), batch_size):
        response = client.embed(
            texts=objects[start_index:start_index + batch_size],
            model=model,
            input_type=input_type,
        )
        embeds.extend(response.embeddings)

    return embeds
        

files = os.listdir('transcripts/')  # replace with GET of MDB linktree
co = cohere.Client(config.cohere_key)

# Matches whitespace, a 1-2 digit number and trailing whitespace at the end of
# a line. The pattern requires whitespace after the number, so it has to be
# applied to the raw (unstripped) lines.
trailing_number = re.compile(r'\s+\d{1,2}\s+$')

# Use a single MongoDB client for the whole run and close it when done,
# instead of opening a new, never-closed client for every file.
with pymongo.MongoClient(config.mongodb_cs) as mongo:
    for file in files[0:1]:  # only the first listed file is processed
        loader = PyPDFLoader('transcripts/' + file)
        pages = loader.load_and_split()

        # insert_many() rejects an empty document list, so skip files that
        # produced no pages.
        if not pages:
            continue

        # clean the pages: remove the trailing number from each line, then
        # drop the first three and last two lines of every page
        # (presumably page header/footer lines -- confirm against the PDFs)
        for page in pages:
            lines = page.page_content.split('\n')
            cleaned_text = [trailing_number.sub('', s) for s in lines][3:-2]
            page.page_content = '\n'.join(cleaned_text)

        # create a Series from the pages
        spages = pd.Series([page.page_content for page in pages])

        # get embeddings
        embeds = pd.Series(batch_embed(co, spages.to_list(), 96))

        # add page numbers (1-based position in the split output)
        page_numbers = pd.Series(range(1, len(pages) + 1))

        # concat into dataframe and add parent doc name (file name without
        # its extension)
        df = pd.DataFrame({'page_number': page_numbers, 'text': spages, 'embeddings': embeds})
        df.insert(0, 'parent_doc', os.path.splitext(file)[0])

        # insert pages into database: one document per page
        mongo.Transcripts.pages.insert_many(df.to_dict(orient='records'))

        # with open('transcripts/'+file[:-3]+'txt', 'w') as f:
        #         for page in pages:
        #             f.write(page.page_content)
        #             f.write('\n')

# In [6]:
def pagechunker(chunk_size, overlap_size, pages): # takes a list of Documents (pages)
    """Concatenate page texts into sliding-window chunks.

    A chunk starts at page index 0 and then every ``overlap_size`` pages; each
    chunk is the concatenated ``page_content`` of up to ``chunk_size`` pages
    beginning at that index. Chunks near the end of ``pages`` are shorter.

    NOTE(review): ``overlap_size`` is used as the step between chunk starts,
    not as the number of pages shared by neighbouring chunks -- confirm this
    is the intended meaning.

    Args:
        chunk_size: Maximum number of pages per chunk.
        overlap_size: Number of pages to advance between chunk starts.
        pages: Sequence of objects exposing a ``page_content`` string.

    Returns:
        List of chunk strings, one per start index; empty if ``pages`` is
        empty.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    # A non-positive step would never advance the window (infinite loop).
    if overlap_size < 1:
        raise ValueError("overlap_size must be a positive integer")

    # Slice ends are exclusive and clamp at len(pages), so no manual bound is
    # needed; capping at len(pages) - 1 would drop the final page.
    return [
        ''.join(page.page_content for page in pages[start_page:start_page + chunk_size])
        for start_page in range(0, len(pages), overlap_size)
    ]

# Build chunks of up to 40 pages, starting a new chunk every 10 pages.
chunks = pagechunker(chunk_size=40, overlap_size=10, pages=pages)