In [None]:
from langchain_community.document_loaders.pdf import PyPDFLoader
import os
import re
import pandas as pd
import config
import cohere
import math
import pymongo
import requests

def batch_embed(client, objects, batch_size=96, *,
                model="embed-english-v3.0", input_type="search_document"):
    """Embed a list of texts by calling ``client.embed`` in fixed-size batches.

    Args:
        client: Object exposing ``embed(texts=..., model=..., input_type=...)``
            whose response carries an ``embeddings`` list (the notebook passes
            a ``cohere.Client``).
        objects: Sequence of texts to embed; it is sliced into batches.
        batch_size: Maximum number of texts sent per ``embed`` call.
        model: Model name forwarded unchanged to ``client.embed``.
        input_type: Input type forwarded unchanged to ``client.embed``. The
            default suits documents being indexed; pass a different value
            (e.g. for search queries) when embedding other kinds of text.

    Returns:
        A single flat list made by concatenating ``response.embeddings`` from
        every batch, in batch order. An empty ``objects`` yields ``[]`` without
        calling the client.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A zero batch size used to fail with ZeroDivisionError and a negative one
    # silently produced no embeddings at all; reject both explicitly.
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size!r}")

    embeds = []
    for start_index in range(0, len(objects), batch_size):
        # Slicing clamps at the end of the sequence, so the last batch may be
        # shorter than batch_size.
        batch = objects[start_index:start_index + batch_size]
        response = client.embed(texts=batch, model=model, input_type=input_type)
        embeds.extend(response.embeddings)

    return embeds
        
# ---------------------------------------------------------------------------
# Ingestion cell: look up transcript links in MongoDB, download each PDF,
# store its cleaned pages (with embeddings) in `Transcripts.pages`, ask Cohere
# for a summary, and store the link rows plus summaries in
# `Transcripts.transcripts`.
# ---------------------------------------------------------------------------

# Only the first few links are processed (test run); the same limit is used for
# the loop and for the rows written back, so the two always stay aligned.
MAX_DOCUMENTS = 3
# Without a timeout requests.get can block indefinitely on a stalled server.
DOWNLOAD_TIMEOUT_SECONDS = 60
NO_SUMMARY = 'No summary available'
# Strips a trailing 1-2 digit number surrounded by whitespace from a line
# (presumably transcript line numbers -- confirm against a sample PDF).
# The pattern needs trailing whitespace, so it must run on the untrimmed lines.
TRAILING_NUMBER_RE = re.compile(r'\s+\d{1,2}\s+$')

mongo = pymongo.MongoClient(config.mongodb_cs)
co = cohere.Client(config.cohere_key)

# Atlas Search query: every linktable row whose Hyperlink contains "TRANS".
linktable = mongo['Transcripts']['linktable'].aggregate([
    {
        '$search': {
            'index': 'linktable_index',
            'regex': {
                'path': 'Hyperlink',
                'query': '(.*)TRANS(.*)',
                "allowAnalyzedField": True
            },
        }
    }
]
)

linktable_list = list(linktable)
linktable_df = pd.DataFrame(linktable_list)
# An empty result has no 'Hyperlink' column; treat it as "nothing to do"
# instead of failing with KeyError.
urls = [] if linktable_df.empty else linktable_df['Hyperlink'].to_list()
summaries = []

# Loop-invariant system prompt for the summary request.
preamble = 'You are Carol RosenBot, a helpful AI language model. The following text is a transcript of a hearing in the military commission trial of several men accused of responsibility for plotting the 9/11 terror attacks. The Accused in this case are Khalid Sheikh Mohammed, Khallad Bin Attash, Mustafa al-Hawsawi, Ammar al-Baluchi, and Ramzi bin al-Shibh. Your task is to generate a summary of the below transcript. Your summary should be of moderate length and begin with "In this hearing". Try to sprinkle in some details.\n\n'

for url in urls[:MAX_DOCUMENTS]:

    # The local file name is everything after the first 'KSM2/' in the URL.
    # If the marker is missing, fall back to the last path segment rather than
    # slicing at a meaningless offset.
    _, marker, tail = url.partition('KSM2/')
    document_name = tail if marker else url.rsplit('/', 1)[-1]

    try:
        response = requests.get(url, timeout=DOWNLOAD_TIMEOUT_SECONDS)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f'Error downloading file {url}: {e}')
        # Nothing was written to disk, so there is nothing to parse. Keep one
        # summary entry per URL so the 'Summaries' column still lines up.
        summaries.append(NO_SUMMARY)
        continue

    with open(document_name, 'wb') as file:
        file.write(response.content)
    print(f"file {url} downloaded successfully")

    try:
        loader = PyPDFLoader(document_name)
        pages = loader.load_and_split()
    finally:
        # load_and_split returns a list of Documents, so the temporary PDF is
        # no longer needed; remove it even when parsing fails.
        os.remove(document_name)

    if not pages:
        # insert_many rejects an empty list and there is no text to summarise.
        print(f'No pages extracted from {document_name}')
        summaries.append(NO_SUMMARY)
        continue

    # clean the pages
    for page in pages:
        lines = page.page_content.split('\n')
        # [3:-2] drops the first three and last two lines of every page
        # (presumably header/footer boilerplate -- confirm against a sample).
        cleaned_text = [TRAILING_NUMBER_RE.sub('', line) for line in lines][3:-2]
        page.page_content = '\n'.join(cleaned_text)
    full_text = ''.join(page.page_content + '\n' for page in pages)

    # create a Series from the pages
    spages = pd.Series([page.page_content for page in pages])

    # get embeddings
    embeds = pd.Series(batch_embed(co, spages.to_list(), 96))

    # add page numbers
    # NOTE(review): these are 1-based positions in `pages`; load_and_split may
    # split a long PDF page into several Documents, in which case they are not
    # the PDF's own page numbers -- confirm.
    page_numbers = pd.Series(range(1, len(pages) + 1))

    # concat into dataframe and add parent doc name
    df = pd.DataFrame({'page_number': page_numbers, 'text': spages, 'embeddings': embeds})
    df.insert(0, 'parent_doc', document_name)

    # insert pages into database
    mongo.Transcripts.pages.insert_many(df.to_dict(orient='records'))

    # generate summary of full_text and add to summaries list
    try:
        # The API call itself is what can fail, so it belongs inside the try.
        summary = co.chat(
            preamble=preamble,
            message=full_text
        )
        summary_text = summary.text
    except Exception as e:
        # Best effort: a failed summary must not abort the whole ingestion run.
        print(f'Error generating summary for {document_name}: {e}')
        summary_text = NO_SUMMARY
    summaries.append(summary_text)

# .copy() makes the slice an independent frame, so adding a column neither
# triggers SettingWithCopyWarning nor depends on view/copy semantics.
linktable_df_test = linktable_df[0:MAX_DOCUMENTS].copy()
linktable_df_test['Summaries'] = summaries
transcript_records = linktable_df_test.to_dict(orient='records')
if transcript_records:
    mongo.Transcripts.transcripts.insert_many(transcript_records)

mongo.close()

In [6]:
def pagechunker(chunk_size, overlap_size, pages):
    """Concatenate pages into overlapping multi-page text chunks.

    Args:
        chunk_size: Number of pages per chunk (the last chunk may be shorter).
        overlap_size: Number of pages shared by two consecutive chunks; must be
            smaller than ``chunk_size``. ``0`` gives back-to-back chunks.
        pages: List of objects with a ``page_content`` string attribute
            (the notebook passes a list of Documents).

    Returns:
        A list of strings, each the concatenation of the ``page_content`` of
        up to ``chunk_size`` consecutive pages. Every page appears in at least
        one chunk; an empty ``pages`` yields ``[]``.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1 or ``overlap_size`` is
            not in ``[0, chunk_size)``.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be at least 1, got {chunk_size!r}")
    if not 0 <= overlap_size < chunk_size:
        raise ValueError(
            f"overlap_size must be in [0, chunk_size), got {overlap_size!r}"
        )

    # Advancing by chunk_size - overlap_size makes consecutive chunks share
    # exactly overlap_size pages. (Advancing by overlap_size itself, as before,
    # never terminated for overlap_size == 0.)
    step = chunk_size - overlap_size
    total_pages = len(pages)
    chunks = []
    for start_page in range(0, total_pages, step):
        end_page = start_page + chunk_size
        # Slicing clamps at the end of the list, so no explicit min() is
        # needed -- and the final page is no longer cut off.
        chunks.append(''.join(page.page_content for page in pages[start_page:end_page]))
        if end_page >= total_pages:
            # This chunk already reaches the last page; any further chunk
            # would only repeat a suffix of it.
            break

    return chunks

# Bundle the Documents currently held in `pages` (left behind by the ingestion
# cell's last loaded PDF) into multi-page text chunks.
chunks = pagechunker(chunk_size=40, overlap_size=10, pages=pages)