In [None]:
from langchain_community.document_loaders.pdf import PyPDFLoader
import os
import re
import pandas as pd
import keys
import cohere
import math
import requests
import psycopg2

def batch_embed(client, objects, batch_size=96,
                model="embed-english-v3.0", input_type="search_document"):
    """Embed a list of texts in fixed-size batches and return all embeddings.

    Args:
        client: Object exposing ``embed(texts=..., model=..., input_type=...)``
            whose return value has an ``embeddings`` list (the script passes
            a ``cohere.Client``).
        objects: Sequence of strings to embed; sliced into consecutive
            batches in order.
        batch_size: Maximum number of texts sent per ``embed`` call.
            NOTE(review): 96 presumably mirrors the embed endpoint's
            per-request limit -- confirm against the API reference.
        model: Embedding model name forwarded to ``client.embed``.
        input_type: ``input_type`` forwarded to ``client.embed``.

    Returns:
        A flat list with the embeddings of every batch concatenated in input
        order; an empty list when ``objects`` is empty.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    # A zero size used to fail with ZeroDivisionError and a negative size
    # silently returned no embeddings at all; reject both explicitly.
    if batch_size < 1:
        raise ValueError(
            f"batch_size must be a positive integer, got {batch_size}"
        )

    embeds = []
    for start in range(0, len(objects), batch_size):
        response = client.embed(
            texts=objects[start:start + batch_size],
            model=model,
            input_type=input_type,
        )
        embeds.extend(response.embeddings)

    return embeds

# Ingestion script: for every transcript link not yet present in the
# `transcripts` table, download the PDF, store its cleaned pages together with
# their embeddings in `pages`, generate a summary and record the transcript.
pgconn = psycopg2.connect(
    host="localhost",
    dbname='caroldb',
    user='postgres',
    password=keys.pg_pwd
)
co = cohere.Client(keys.cohere_key)
pgcur = pgconn.cursor()

# Placeholder stored when a document yields no pages or the summary call fails.
no_summary = 'No summary available'
# Whitespace + 1-2 digit number + whitespace at the end of a line.
trailing_number_re = re.compile(r'\s+\d{1,2}\s+$')
# Seconds to wait for the download server before giving up on a link.
download_timeout = 60

page_insert_sql = """
    INSERT INTO pages (parent_doc, page_number, text, embedding)
    VALUES (%s, %s, %s, %s)
"""
transcript_insert_sql = """
    INSERT INTO transcripts (transcript_id, filingname, filingdate, aenumber, attachment, hyperlink, summary)
    VALUES (%s, %s, %s, %s, %s, %s, %s)
"""

try:
    # get all links to transcripts
    pgcur.execute("""SELECT * FROM linktable WHERE hyperlink ILIKE '%trans%'""")
    linktable = pd.DataFrame(pgcur.fetchall(), columns=['record_id', 'filingname', 'filingdate', 'aenumber', 'attachment', 'hyperlink'])
    pgcur.execute("""SELECT hyperlink FROM transcripts""")
    trans_completed = pd.DataFrame(pgcur.fetchall(), columns=['hyperlink'])

    # select only uncompleted transcripts
    # An explicit anti-join keeps exactly the linktable rows whose hyperlink is
    # not in `transcripts`. (Concatenating both frames and dropping duplicates
    # with keep=False also discarded pending links that appear more than once
    # in linktable, and kept completed-only rows with NaN in every column.)
    is_pending = ~linktable['hyperlink'].isin(trans_completed['hyperlink'])
    dedup = linktable[is_pending].drop_duplicates(subset='hyperlink')

    batch_size = 10
    num_batches = math.ceil(len(dedup) / batch_size)
    print(f'Total links to process: {len(dedup)}')

    for i in range(num_batches):
        print(f'Begin processing batch {i+1}')
        batch_df = dedup.iloc[i*batch_size : (i+1)*batch_size]
        # Rows are collected as plain tuples instead of assigning a new column
        # onto a slice of `dedup`.
        transcript_records = []

        for row in batch_df.itertuples(index=False):
            url = row.hyperlink

            # The document name is everything after the first 'KSM2/' in the
            # URL; fall back to the last path segment when the marker is absent.
            _, marker, tail = url.partition('KSM2/')
            document_name = tail if marker else url.rsplit('/', 1)[-1]

            try:
                response = requests.get(url, timeout=download_timeout)
                response.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f'Error downloading file {url}: {e}')
                # Nothing was written for this link, so it is not recorded in
                # `transcripts` and will be picked up again on the next run.
                continue

            try:
                with open(document_name, 'wb') as file:
                    file.write(response.content)
                print(f"file {url} downloaded successfully")

                pages = PyPDFLoader(document_name).load_and_split()
            finally:
                # The pages are fully loaded into memory at this point; remove
                # the downloaded file even when writing or parsing failed.
                if os.path.exists(document_name):
                    os.remove(document_name)

            summary = no_summary
            if pages:
                # clean the pages: drop the first three and last two lines of
                # each page and strip trailing numbers from the remaining lines
                for page in pages:
                    lines = page.page_content.split('\n')[3:-2]
                    page.page_content = '\n'.join(
                        trailing_number_re.sub('', line) for line in lines
                    )

                page_texts = [page.page_content for page in pages]
                full_text = ''.join(text + '\n' for text in page_texts)

                # get embeddings
                print(f'fetching embeddings for {document_name}')
                embeds = batch_embed(co, page_texts, 96)

                # insert pages into database (page numbers start at 1)
                page_records = [
                    (document_name, page_number, text, embedding)
                    for page_number, (text, embedding)
                    in enumerate(zip(page_texts, embeds), start=1)
                ]
                pgcur.executemany(page_insert_sql, page_records)

                # generate summary of full_text
                print(f'generating summary for {document_name}')
                try:
                    summary = co.chat(
                        preamble=keys.preamble,
                        message=full_text,
                        max_tokens=500,
                        temperature=.2
                    ).text
                except Exception as e:
                    # Best effort: the transcript is still recorded, with the
                    # placeholder summary, and the failure is reported.
                    print(f'Error generating summary for {document_name}: {e}')
                    summary = no_summary

            transcript_records.append(tuple(row) + (summary,))

        if transcript_records:
            pgcur.executemany(transcript_insert_sql, transcript_records)
        # One commit per batch covers both the page and transcript inserts.
        pgconn.commit()
finally:
    # Closing without a commit discards any partially written batch.
    pgcur.close()
    pgconn.close()

In [25]:
# De-duplicate script: for every (parent_doc, page_number) pair that occurs
# more than once in `pages`, keep the row with the lowest page_id and delete
# the others.
pgconn = psycopg2.connect(
    host="localhost",
    dbname='caroldb',
    user='postgres',
    password=keys.pg_pwd
)

try:
    # `with pgconn` commits the transaction when the body succeeds and rolls
    # it back when it raises; the cursor context closes the cursor on exit.
    with pgconn, pgconn.cursor() as pgcur:
        pgcur.execute("""
            DELETE FROM pages p1
            USING pages p2
            WHERE p1.page_id > p2.page_id
            AND p1.parent_doc = p2.parent_doc
            AND p1.page_number = p2.page_number;
        """)
finally:
    # The connection context manager does not close the connection itself.
    pgconn.close()